diff --git a/src/xenia/app/xenia_main.cc b/src/xenia/app/xenia_main.cc
index 03cccc07c..4e644f777 100644
--- a/src/xenia/app/xenia_main.cc
+++ b/src/xenia/app/xenia_main.cc
@@ -379,6 +379,9 @@ std::vector<std::unique_ptr<hid::InputDriver>> EmulatorApp::CreateInputDrivers(
 }
 
 bool EmulatorApp::OnInitialize() {
+#if XE_ARCH_AMD64 == 1
+  amd64::InitFeatureFlags();
+#endif
   Profiler::Initialize();
   Profiler::ThreadEnter("Main");
 
diff --git a/src/xenia/base/clock.cc b/src/xenia/base/clock.cc
index 774f8e25d..593a4f39e 100644
--- a/src/xenia/base/clock.cc
+++ b/src/xenia/base/clock.cc
@@ -51,7 +51,7 @@
 uint64_t last_guest_tick_count_ = 0;
 uint64_t last_host_tick_count_ = Clock::QueryHostTickCount();
 
-using tick_mutex_type = xe_unlikely_mutex;
+using tick_mutex_type = std::mutex;
 
 // Mutex to ensure last_host_tick_count_ and last_guest_tick_count_ are in sync
 // std::mutex tick_mutex_;
 
diff --git a/src/xenia/base/dma.cc b/src/xenia/base/dma.cc
index ead0ac490..664f96054 100644
--- a/src/xenia/base/dma.cc
+++ b/src/xenia/base/dma.cc
@@ -1,7 +1,15 @@
 #include "dma.h"
 #include "logging.h"
+#include "mutex.h"
+#include "platform_win.h"
 #include "xbyak/xbyak/xbyak_util.h"
 
+XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
+                NtDelayExecutionPointer);
+XE_NTDLL_IMPORT(NtAlertThread, cls_NtAlertThread, NtAlertThreadPointer);
+XE_NTDLL_IMPORT(NtAlertThreadByThreadId, cls_NtAlertThreadByThreadId,
+                NtAlertThreadByThreadId);
+
 template <size_t N, typename... Ts>
 static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
   char buffer[1024];
@@ -213,320 +221,140 @@ void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
                        written_length);
 }
 
-#define XEDMA_NUM_WORKERS 4
-class alignas(256) XeDMACGeneric : public XeDMAC {
+#define MAX_INFLIGHT_DMAJOBS 65536
+#define INFLICT_DMAJOB_MASK (MAX_INFLIGHT_DMAJOBS - 1)
+class XeDMACGeneric : public XeDMAC {
+  std::unique_ptr<threading::Thread> thrd_;
+  XeDMAJob* jobs_ring_;
+  volatile std::atomic<size_t> write_ptr_;
+
   struct alignas(XE_HOST_CACHE_LINE_SIZE) {
-    std::atomic<uint64_t> free_job_slots_;
-    std::atomic<uint64_t> jobs_submitted_;
-    std::atomic<uint64_t> jobs_completed_;
-    std::atomic<uint32_t> num_workers_awoken_;
-    std::atomic<uint32_t> current_job_serial_;
-
-  } dma_volatile_;
-
-  alignas(XE_HOST_CACHE_LINE_SIZE) XeDMAJob jobs_[64];
-
-  volatile uint32_t jobserials_[64];
-
-  alignas(XE_HOST_CACHE_LINE_SIZE)
-      std::unique_ptr<threading::Event> job_done_signals_[64];
-  // really dont like using unique pointer for this...
-  std::unique_ptr<threading::Event> job_submitted_signal_;
-  std::unique_ptr<threading::Event> job_completed_signal_;
-
-  std::unique_ptr<threading::Thread> scheduler_thread_;
-  struct WorkSlice {
-    uint8_t* destination;
-    uint8_t* source;
-    size_t numbytes;
+    volatile std::atomic<size_t> read_ptr_;
+    xe_mutex push_into_ring_lock_;
   };
-  std::unique_ptr<threading::Thread> workers_[XEDMA_NUM_WORKERS];
-  std::unique_ptr<threading::Event> worker_has_work_;  //[XEDMA_NUM_WORKERS];
-  std::unique_ptr<threading::Event> worker_has_finished_[XEDMA_NUM_WORKERS];
-
-  threading::WaitHandle* worker_has_finished_nosafeptr_[XEDMA_NUM_WORKERS];
-  WorkSlice worker_workslice_[XEDMA_NUM_WORKERS];
-
-  // chrispy: this is bad
-  static uint32_t find_free_hole_in_dword(uint64_t dw) {
-    XEDMALOG("Finding free hole in 0x%llX", dw);
-
-    for (uint32_t i = 0; i < 64; ++i) {
-      if (dw & (1ULL << i)) {
-        continue;
-      }
-
-      return i;
-    }
-    return ~0U;
-  }
-
-  uint32_t allocate_free_dma_slot() {
-    XEDMALOG("Allocating free slot");
-    uint32_t got_slot = 0;
-    uint64_t slots;
-    uint64_t allocated_slot;
-
-    do {
-      slots = dma_volatile_.free_job_slots_.load();
-
-      got_slot = find_free_hole_in_dword(slots);
-      if (!~got_slot) {
-        XEDMALOG("Didn't get a slot!");
-        return ~0U;
-      }
-      allocated_slot = slots | (1ULL << got_slot);
-
-    } while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
-        slots, allocated_slot)));
-    XEDMALOG("Allocated slot %d", got_slot);
-    return got_slot;
-  }
-  // chrispy: on x86 this can just be interlockedbittestandreset...
-  void free_dma_slot(uint32_t slot) {
-    XEDMALOG("Freeing slot %d", slot);
-    uint64_t slots;
-
-    uint64_t deallocated_slot;
-
-    do {
-      slots = dma_volatile_.free_job_slots_.load();
-
-      deallocated_slot = slots & (~(1ULL << slot));
-
-    } while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
-        slots, deallocated_slot)));
-  }
-
-  void DoDMAJob(uint32_t idx) {
-    XeDMAJob& job = jobs_[idx];
-    if (job.precall) {
-      job.precall(&job);
-    }
-    // memcpy(job.destination, job.source, job.size);
-
-    size_t job_size = job.size;
-
-    size_t job_num_lines = job_size / XE_HOST_CACHE_LINE_SIZE;
-
-    size_t line_rounded = job_num_lines * XE_HOST_CACHE_LINE_SIZE;
-
-    size_t rem = job_size - line_rounded;
-
-    size_t num_per_worker = line_rounded / XEDMA_NUM_WORKERS;
-
-    XEDMALOG(
-        "Distributing %d bytes from %p to %p across %d workers, remainder is "
-        "%d",
-        line_rounded, job.source, job.destination, XEDMA_NUM_WORKERS, rem);
-    if (num_per_worker < 2048) {
-      XEDMALOG("not distributing across workers, num_per_worker < 8192");
-      // not worth splitting up
-      memcpy(job.destination, job.source, job.size);
-      job.signal_on_done->Set();
-    } else {
-      for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
-        worker_workslice_[i].destination =
-            (i * num_per_worker) + job.destination;
-        worker_workslice_[i].source = (i * num_per_worker) + job.source;
-
-        worker_workslice_[i].numbytes = num_per_worker;
-      }
-      if (rem) {
-        __movsb(job.destination + line_rounded, job.source + line_rounded, rem);
-      }
-      // wake them up
-      worker_has_work_->Set();
-      XEDMALOG("Starting waitall for job");
-      threading::WaitAll(worker_has_finished_nosafeptr_, XEDMA_NUM_WORKERS,
-                         false);
-
-      XEDMALOG("Waitall for job completed!");
-      job.signal_on_done->Set();
-    }
-    if (job.postcall) {
-      job.postcall(&job);
-    }
-    ++dma_volatile_.jobs_completed_;
-  }
-
-  void WorkerIter(uint32_t worker_index) {
-    xenia_assert(worker_index < XEDMA_NUM_WORKERS);
-    auto [dest, src, size] = worker_workslice_[worker_index];
-
-    // if (++dma_volatile_.num_workers_awoken_ == XEDMA_NUM_WORKERS ) {
-    worker_has_work_->Reset();
-    //}
-    xenia_assert(size < (1ULL << 32));
-    // memcpy(dest, src, size);
-    dma::vastcpy(dest, src, static_cast<uint32_t>(size));
-  }
-  XE_NOINLINE
-  void WorkerMainLoop(uint32_t worker_index) {
-    do {
-      XEDMALOG("Worker iter for worker %d", worker_index);
-      WorkerIter(worker_index);
-
-      XEDMALOG("Worker %d is done\n", worker_index);
-      threading::SignalAndWait(worker_has_finished_[worker_index].get(),
-                               worker_has_work_.get(), false);
-    } while (true);
-  }
-  void WorkerMain(uint32_t worker_index) {
-    XEDMALOG("Entered worker main loop, index %d", worker_index);
-    threading::Wait(worker_has_work_.get(), false);
-    XEDMALOG("First wait for worker %d completed, first job ever",
-             worker_index);
-    WorkerMainLoop(worker_index);
-  }
-
-  static void WorkerMainForwarder(void* ptr) {
-    // we aligned XeDma to 256 bytes and encode extra info in the low 8
-    uintptr_t uptr = (uintptr_t)ptr;
-
-    uint32_t worker_index = (uint8_t)uptr;
-
-    uptr &= ~0xFFULL;
-
-    char name_buffer[64];
-    sprintf_s(name_buffer, "dma_worker_%d", worker_index);
-
-    xe::threading::set_name(name_buffer);
-
-    reinterpret_cast<XeDMACGeneric*>(uptr)->WorkerMain(worker_index);
-  }
-
-  void DMAMain() {
-    XEDMALOG("DmaMain");
-    do {
-      threading::Wait(job_submitted_signal_.get(), false);
-
-      auto slots = dma_volatile_.free_job_slots_.load();
-
-      for (uint32_t i = 0; i < 64; ++i) {
-        if (slots & (1ULL << i)) {
-          XEDMALOG("Got new job at index %d in DMAMain", i);
-          DoDMAJob(i);
-
-          free_dma_slot(i);
-
-          job_completed_signal_->Set();
-          // break;
-        }
-      }
-
-    } while (true);
-  }
-
-  static void DMAMainForwarder(void* ud) {
-    xe::threading::set_name("dma_main");
-    reinterpret_cast<XeDMACGeneric*>(ud)->DMAMain();
-  }
+  HANDLE gotjob_event;
+  void WorkerWait();
 
  public:
-  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
-    XEDMALOG("New job, %p to %p with size %d", job->source, job->destination,
-             job->size);
-    uint32_t slot;
-    do {
-      slot = allocate_free_dma_slot();
-      if (!~slot) {
-        XEDMALOG(
-            "Didn't get a free slot, waiting for a job to complete before "
-            "resuming.");
-        threading::Wait(job_completed_signal_.get(), false);
+  virtual ~XeDMACGeneric() {}
+  void WorkerThreadMain();
+  XeDMACGeneric() {
+    threading::Thread::CreationParameters crparams;
+    crparams.create_suspended = true;
+    crparams.initial_priority = threading::ThreadPriority::kNormal;
+    crparams.stack_size = 65536;
+    gotjob_event = CreateEventA(nullptr, false, false, nullptr);
+    thrd_ = std::move(threading::Thread::Create(
+        crparams, [this]() { this->WorkerThreadMain(); }));
 
-      } else {
-        break;
-      }
+    jobs_ring_ = (XeDMAJob*)_aligned_malloc(
+        MAX_INFLIGHT_DMAJOBS * sizeof(XeDMAJob), XE_HOST_CACHE_LINE_SIZE);
 
-    } while (true);
-    jobs_[slot] = *job;
+    write_ptr_ = 0;
+    read_ptr_ = 0;
 
-    jobs_[slot].signal_on_done = job_done_signals_[slot].get();
-    jobs_[slot].signal_on_done->Reset();
-    XEDMALOG("Setting job submit signal, pushed into slot %d", slot);
-
-    uint32_t new_serial = dma_volatile_.current_job_serial_++;
-
-    jobserials_[slot] = new_serial;
-
-    ++dma_volatile_.jobs_submitted_;
-    job_submitted_signal_->Set();
-    return (static_cast<uint64_t>(new_serial) << 32) |
-           static_cast<uint64_t>(slot);
-
-    // return job_done_signals_[slot].get();
+    thrd_->Resume();
   }
 
-  bool AllJobsDone() {
-    return dma_volatile_.jobs_completed_ == dma_volatile_.jobs_submitted_;
+  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
+    // std::unique_lock<xe_mutex> pushlock{push_into_ring_lock_};
+    HANDLE dmacevent = CreateEventA(nullptr, true, false, nullptr);
+    {
+      job->dmac_specific_ = (uintptr_t)dmacevent;
+
+      jobs_ring_[write_ptr_ % MAX_INFLIGHT_DMAJOBS] = *job;
+      write_ptr_++;
+      SetEvent(gotjob_event);
+    }
+    return (DMACJobHandle)dmacevent;
   }
 
   virtual void WaitJobDone(DMACJobHandle handle) override {
-    uint32_t serial = static_cast<uint32_t>(handle >> 32);
-    uint32_t jobid = static_cast<uint32_t>(handle);
-    do {
-      if (jobserials_[jobid] != serial) {
-        return;  // done, our slot was reused
+    while (WaitForSingleObject((HANDLE)handle, 2) == WAIT_TIMEOUT) {
+      // NtAlertThreadByThreadId.invoke(thrd_->system_id());
+      // while (SignalObjectAndWait(gotjob_event, (HANDLE)handle, 2, false) ==
+      //   WAIT_TIMEOUT) {
+      //   ;
       }
+    //}
 
-      auto waitres = threading::Wait(job_done_signals_[jobid].get(), false,
-                                     std::chrono::milliseconds{1});
-
-      if (waitres == threading::WaitResult::kTimeout) {
-        continue;
-      } else {
-        return;
-      }
-    } while (true);
+    // SignalObjectAndWait(gotjob_event, (HANDLE)handle, INFINITE, false);
+    CloseHandle((HANDLE)handle);
   }
   virtual void WaitForIdle() override {
-    while (!AllJobsDone()) {
+    while (write_ptr_ != read_ptr_) {
       threading::MaybeYield();
     }
   }
-  XeDMACGeneric() {
-    XEDMALOG("Constructing xedma at addr %p", this);
-    dma_volatile_.free_job_slots_.store(0ULL);
-    dma_volatile_.jobs_submitted_.store(0ULL);
-    dma_volatile_.jobs_completed_.store(0ULL);
-    dma_volatile_.current_job_serial_.store(
-        1ULL);  // so that a jobhandle is never 0
-    std::memset(jobs_, 0, sizeof(jobs_));
-    job_submitted_signal_ = threading::Event::CreateAutoResetEvent(false);
-    job_completed_signal_ = threading::Event::CreateAutoResetEvent(false);
-    worker_has_work_ = threading::Event::CreateManualResetEvent(false);
-    threading::Thread::CreationParameters worker_params{};
-    worker_params.create_suspended = false;
-    worker_params.initial_priority = threading::ThreadPriority::kBelowNormal;
-    worker_params.stack_size = 65536;  // dont need much stack at all
-
-    for (uint32_t i = 0; i < 64; ++i) {
-      job_done_signals_[i] = threading::Event::CreateManualResetEvent(false);
-    }
-    for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
-      // worker_has_work_[i] = threading::Event::CreateAutoResetEvent(false);
-      worker_has_finished_[i] = threading::Event::CreateAutoResetEvent(false);
-      worker_has_finished_nosafeptr_[i] = worker_has_finished_[i].get();
-
-      uintptr_t encoded = reinterpret_cast<uintptr_t>(this);
-      xenia_assert(!(encoded & 0xFFULL));
-      xenia_assert(i < 256);
-
-      encoded |= i;
-
-      workers_[i] = threading::Thread::Create(worker_params, [encoded]() {
-        XeDMACGeneric::WorkerMainForwarder((void*)encoded);
-      });
-    }
-    threading::Thread::CreationParameters scheduler_params{};
-    scheduler_params.create_suspended = false;
-    scheduler_params.initial_priority = threading::ThreadPriority::kBelowNormal;
-    scheduler_params.stack_size = 65536;
-    scheduler_thread_ = threading::Thread::Create(scheduler_params, [this]() {
-      XeDMACGeneric::DMAMainForwarder((void*)this);
-    });
-  }
 };
+void XeDMACGeneric::WorkerWait() {
+  constexpr unsigned NUM_PAUSE_SPINS = 2048;
+  constexpr unsigned NUM_YIELD_SPINS = 8;
+#if 0
+
+  for (unsigned i = 0; i < NUM_PAUSE_SPINS; ++i) {
+    if (write_ptr_ == read_ptr_) {
+      _mm_pause();
+    } else {
+      break;
+    }
+  }
+  for (unsigned i = 0; i < NUM_YIELD_SPINS; ++i) {
+    if (write_ptr_ == read_ptr_) {
+      threading::MaybeYield();
+    } else {
+      break;
+    }
+  }
+  LARGE_INTEGER yield_execution_delay{};
+  yield_execution_delay.QuadPart =
+      -2000;  //-10000 == 1 ms, so -2000 means delay for 0.2 milliseconds
+  while (write_ptr_ == read_ptr_) {
+    NtDelayExecutionPointer.invoke(0, &yield_execution_delay);
+  }
+#else
+  do {
+    if (WaitForSingleObjectEx(gotjob_event, 1, TRUE) == WAIT_OBJECT_0) {
+      while (write_ptr_ == read_ptr_) {
+        _mm_pause();
+      }
+    }
+
+  } while (write_ptr_ == read_ptr_);
+#endif
+}
+void XeDMACGeneric::WorkerThreadMain() {
+  while (true) {
+    this->WorkerWait();
+
+    XeDMAJob current_job = jobs_ring_[read_ptr_ % MAX_INFLIGHT_DMAJOBS];
+    swcache::ReadFence();
+
+    if (current_job.precall) {
+      current_job.precall(&current_job);
+    }
+
+    size_t num_lines = current_job.size / XE_HOST_CACHE_LINE_SIZE;
+    size_t line_rounded = num_lines * XE_HOST_CACHE_LINE_SIZE;
+
+    size_t line_rem = current_job.size - line_rounded;
+
+    vastcpy(current_job.destination, current_job.source,
+            static_cast<uint32_t>(line_rounded));
+
+    if (line_rem) {
+      __movsb(current_job.destination + line_rounded,
+              current_job.source + line_rounded, line_rem);
+    }
+
+    if (current_job.postcall) {
+      current_job.postcall(&current_job);
+    }
+    read_ptr_++;
+    swcache::WriteFence();
+
+    SetEvent((HANDLE)current_job.dmac_specific_);
+  }
+}
+
 XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
 }  // namespace xe::dma
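Editorial note: the new XeDMACGeneric replaces the 64-slot/serial scheme with a single worker draining a 65536-entry ring. A minimal sketch of the caller's side of that contract, assuming only the XeDMAC/XeDMAJob definitions from this diff (dst, src, and the size here are hypothetical):

    // One producer publishes a job, then overlaps other work with the copy.
    xe::dma::XeDMAC* dmac = xe::dma::CreateDMAC();
    XeDMAJob job{};
    job.destination = dst;  // hypothetical destination buffer
    job.source = src;       // hypothetical source buffer
    job.size = 4 * 1024 * 1024;
    // PushDMAJob stores the job at jobs_ring_[write_ptr_ % 65536], increments
    // write_ptr_, and signals gotjob_event so WorkerWait wakes up.
    xe::dma::DMACJobHandle handle = dmac->PushDMAJob(&job);
    // ... work here runs concurrently with the worker's vastcpy ...
    dmac->WaitJobDone(handle);  // polls the per-job event, then closes it
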
diff --git a/src/xenia/base/dma.h b/src/xenia/base/dma.h
index e95639753..9406a31cd 100644
--- a/src/xenia/base/dma.h
+++ b/src/xenia/base/dma.h
@@ -16,7 +16,8 @@ struct XeDMAJob;
 using DmaPrecall = void (*)(XeDMAJob* job);
 using DmaPostcall = void (*)(XeDMAJob* job);
 struct XeDMAJob {
-  threading::Event* signal_on_done;
+  // threading::Event* signal_on_done;
+  uintptr_t dmac_specific_;
   uint8_t* destination;
   uint8_t* source;
   size_t size;
diff --git a/src/xenia/base/logging.cc b/src/xenia/base/logging.cc
index db6dfc5b7..08463bb0b 100644
--- a/src/xenia/base/logging.cc
+++ b/src/xenia/base/logging.cc
@@ -472,7 +472,7 @@ bool logging::internal::ShouldLog(LogLevel log_level) {
 std::pair<char*, size_t> logging::internal::GetThreadBuffer() {
   return {thread_log_buffer_, sizeof(thread_log_buffer_)};
 }
-
+XE_NOALIAS
 void logging::internal::AppendLogLine(LogLevel log_level,
                                       const char prefix_char, size_t written) {
   if (!logger_ || !ShouldLog(log_level) || !written) {
diff --git a/src/xenia/base/logging.h b/src/xenia/base/logging.h
index 6b3c41561..5f6331d48 100644
--- a/src/xenia/base/logging.h
+++ b/src/xenia/base/logging.h
@@ -74,11 +74,15 @@ namespace internal {
 
 bool ShouldLog(LogLevel log_level);
 std::pair<char*, size_t> GetThreadBuffer();
-
+XE_NOALIAS
 void AppendLogLine(LogLevel log_level, const char prefix_char, size_t written);
 
 }  // namespace internal
 
+// technically, noalias is incorrect here, these functions do in fact alias
+// global memory, but msvc will not optimize the calls away, and the global
+// memory modified by the calls is limited to internal logging variables,
+// so it might as well be noalias
 template <typename... Args>
+XE_NOALIAS
 XE_NOINLINE
 XE_COLD
 static void AppendLogLineFormat_Impl(LogLevel log_level, const char prefix_char,
                                      const char* format,
diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h
index 6e323ede8..ad2af543a 100644
--- a/src/xenia/base/math.h
+++ b/src/xenia/base/math.h
@@ -400,10 +400,91 @@ static float ArchReciprocal(float den) {
   return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
 }
 
+#if 0
+using ArchFloatMask = float;
+
+XE_FORCEINLINE
+static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
+  return _mm_cvtss_f32(_mm_cmpneq_ss(_mm_set_ss(x), _mm_set_ss(y)));
+}
+XE_FORCEINLINE
+static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
+  return _mm_cvtss_f32(_mm_or_ps(_mm_set_ss(x), _mm_set_ss(y)));
+}
+XE_FORCEINLINE
+static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
+  return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x), _mm_set_ss(y)));
+}
+
+XE_FORCEINLINE
+static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
+  return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x), _mm_set_ss(y)));
+}
+
+XE_FORCEINLINE
+static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
+  return static_cast<uint32_t>(_mm_movemask_ps(_mm_set_ss(x)));
+}
+
+constexpr ArchFloatMask floatmask_zero = .0f;
+#else
+using ArchFloatMask = __m128;
+
+XE_FORCEINLINE
+static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
+  return _mm_cmpneq_ss(_mm_set_ss(x), _mm_set_ss(y));
+}
+XE_FORCEINLINE
+static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
+  return _mm_or_ps(x, y);
+}
+XE_FORCEINLINE
+static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
+  return _mm_xor_ps(x, y);
+}
+
+XE_FORCEINLINE
+static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
+  return _mm_and_ps(x, y);
+}
+
+XE_FORCEINLINE
+static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
+  return static_cast<uint32_t>(_mm_movemask_ps(x) & 1);
+}
+
+constexpr ArchFloatMask floatmask_zero{.0f};
+#endif
 #else
 static float ArchMin(float x, float y) { return std::min(x, y); }
 static float ArchMax(float x, float y) { return std::max(x, y); }
 static float ArchReciprocal(float den) { return 1.0f / den; }
+
+using ArchFloatMask = unsigned;
+
+XE_FORCEINLINE
+static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
+  return static_cast<unsigned>(-static_cast<int>(x != y));
+}
+
+XE_FORCEINLINE
+static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
+  return x | y;
+}
+XE_FORCEINLINE
+static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
+  return x ^ y;
+}
+
+XE_FORCEINLINE
+static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
+  return x & y;
+}
+constexpr ArchFloatMask floatmask_zero = 0;
+
+XE_FORCEINLINE
+static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) { return x >> 31; }
+
 #endif
 XE_FORCEINLINE
 static float RefineReciprocal(float initial, float den) {
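Editorial note: ArchFloatMask exists so float comparisons can feed bitwise logic without bouncing through integer registers; on x64 the mask now stays in an XMM register (the rejected float-typed variant is kept under #if 0). A small usage sketch under those definitions (a0/a1/b0/b1 are hypothetical inputs):

    // Accumulate "did any pair differ?" entirely in the mask domain, then
    // extract a single bit at the end via the sign-bit helper.
    ArchFloatMask changed = floatmask_zero;
    changed = ArchORFloatMask(changed, ArchCmpneqFloatMask(a0, b0));
    changed = ArchORFloatMask(changed, ArchCmpneqFloatMask(a1, b1));
    if (ArchFloatMaskSignbit(changed)) {
      // at least one comparison was not-equal
    }
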
diff --git a/src/xenia/base/platform.h b/src/xenia/base/platform.h
index e0c42b42d..e99e8b83d 100644
--- a/src/xenia/base/platform.h
+++ b/src/xenia/base/platform.h
@@ -115,14 +115,17 @@
 #define XE_COLD __declspec(code_seg(".cold"))
 #define XE_LIKELY(...) (!!(__VA_ARGS__))
 #define XE_UNLIKELY(...) (!!(__VA_ARGS__))
-
+#define XE_MSVC_ASSUME(...) __assume(__VA_ARGS__)
+#define XE_NOALIAS __declspec(noalias)
 #elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
 #define XE_FORCEINLINE __attribute__((always_inline))
 #define XE_NOINLINE __attribute__((noinline))
 #define XE_COLD __attribute__((cold))
 #define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
 #define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)
-
+#define XE_NOALIAS
+// cant do unevaluated assume
+#define XE_MSVC_ASSUME(...) static_cast<void>(0)
 #else
 #define XE_FORCEINLINE inline
 #define XE_NOINLINE
@@ -130,6 +133,9 @@
 #define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) [[likely]]
 #define XE_UNLIKELY_IF(...) if (!!(__VA_ARGS__)) [[unlikely]]
 
+#define XE_NOALIAS
+#define XE_MSVC_ASSUME(...) static_cast<void>(0)
+
 #endif
 
 #if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
@@ -174,5 +180,7 @@ const char kPathSeparator = '/';
 const char kGuestPathSeparator = '\\';
 
 }  // namespace xe
-
+#if XE_ARCH_AMD64 == 1
+#include "platform_amd64.h"
+#endif
 #endif  // XENIA_BASE_PLATFORM_H_
diff --git a/src/xenia/base/platform_amd64.cc b/src/xenia/base/platform_amd64.cc
new file mode 100644
index 000000000..31df3c497
--- /dev/null
+++ b/src/xenia/base/platform_amd64.cc
@@ -0,0 +1,115 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include "xenia/base/cvar.h"
+#include "xenia/base/platform.h"
+
+#include "third_party/xbyak/xbyak/xbyak.h"
+#include "third_party/xbyak/xbyak/xbyak_util.h"
+DEFINE_int32(x64_extension_mask, -1,
+             "Allow the detection and utilization of specific instruction set "
+             "features.\n"
+             "    0 = x86_64 + AVX1\n"
+             "    1 = AVX2\n"
+             "    2 = FMA\n"
+             "    4 = LZCNT\n"
+             "    8 = BMI1\n"
+             "   16 = BMI2\n"
+             "   32 = F16C\n"
+             "   64 = Movbe\n"
+             "  128 = GFNI\n"
+             "  256 = AVX512F\n"
+             "  512 = AVX512VL\n"
+             " 1024 = AVX512BW\n"
+             " 2048 = AVX512DQ\n"
+             "   -1 = Detect and utilize all possible processor features\n",
+             "x64");
+namespace xe {
+namespace amd64 {
+static uint32_t g_feature_flags = 0U;
+static bool g_did_initialize_feature_flags = false;
+uint32_t GetFeatureFlags() {
+  xenia_assert(g_did_initialize_feature_flags);
+  return g_feature_flags;
+}
+XE_COLD
+XE_NOINLINE
+void InitFeatureFlags() {
+  uint32_t feature_flags_ = 0U;
+
+  Xbyak::util::Cpu cpu_;
+#define TEST_EMIT_FEATURE(emit, ext)                \
+  if ((cvars::x64_extension_mask & emit) == emit) { \
+    feature_flags_ |= (cpu_.has(ext) ? emit : 0);   \
+  }
+
+  TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
+  TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
+  TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
+  TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
+  TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
+  TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
+  TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
+  TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
+  TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
+  TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
+  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
+  TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
+  TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
+#undef TEST_EMIT_FEATURE
+  /*
+    fix for xbyak bug/omission, amd cpus are never checked for lzcnt.
+    fixed in latest version of xbyak
+  */
+  unsigned int data[4];
+  Xbyak::util::Cpu::getCpuid(0x80000001, data);
+  unsigned amd_flags = data[2];
+  if (amd_flags & (1U << 5)) {
+    if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
+      feature_flags_ |= kX64EmitLZCNT;
+    }
+  }
+  // todo: although not reported by cpuid, zen 1 and zen+ also have fma4
+  if (amd_flags & (1U << 16)) {
+    if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
+      feature_flags_ |= kX64EmitFMA4;
+    }
+  }
+  if (amd_flags & (1U << 21)) {
+    if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
+      feature_flags_ |= kX64EmitTBM;
+    }
+  }
+  if (amd_flags & (1U << 11)) {
+    if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
+      feature_flags_ |= kX64EmitXOP;
+    }
+  }
+  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
+    bool is_zennish = cpu_.displayFamily >= 0x17;
+    /*
+      chrispy: according to agner's tables, all amd architectures that
+      we support (ones with avx) have the same timings for
+      jrcxz/loop/loope/loopne as for other jmps
+    */
+    feature_flags_ |= kX64FastJrcx;
+    feature_flags_ |= kX64FastLoop;
+    if (is_zennish) {
+      // ik that i heard somewhere that this is the case for zen, but i need to
+      // verify. cant find my original source for that.
+      // todo: ask agner?
+      feature_flags_ |= kX64FlagsIndependentVars;
+    }
+  }
+  g_feature_flags = feature_flags_;
+  g_did_initialize_feature_flags = true;
+}
+}  // namespace amd64
+}  // namespace xe
\ No newline at end of file
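Editorial note on the cached-flag pattern: InitFeatureFlags() runs once at startup (see the xenia_main.cc hunk above), and everything else reads the saved mask instead of re-running cpuid. A sketch of a consumer, assuming the X64FeatureFlags enum from platform_amd64.h below:

    uint32_t flags = xe::amd64::GetFeatureFlags();
    // Composite masks like kX64EmitAVX512Ortho are multi-bit, so test with
    // equality against the mask rather than a simple nonzero AND.
    if ((flags & xe::amd64::kX64EmitAVX512Ortho) ==
        xe::amd64::kX64EmitAVX512Ortho) {
      // AVX512F and AVX512VL are both present and permitted by the cvar mask
    }
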
diff --git a/src/xenia/base/platform_amd64.h b/src/xenia/base/platform_amd64.h
new file mode 100644
index 000000000..326b69139
--- /dev/null
+++ b/src/xenia/base/platform_amd64.h
@@ -0,0 +1,61 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2019 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_BASE_PLATFORM_AMD64_H_
+#define XENIA_BASE_PLATFORM_AMD64_H_
+#include <cstdint>
+
+namespace xe {
+namespace amd64 {
+enum X64FeatureFlags {
+  kX64EmitAVX2 = 1 << 0,
+  kX64EmitFMA = 1 << 1,
+  kX64EmitLZCNT = 1 << 2,  // this is actually ABM and includes popcount
+  kX64EmitBMI1 = 1 << 3,
+  kX64EmitBMI2 = 1 << 4,
+  kX64EmitPrefetchW = 1 << 5,
+  kX64EmitMovbe = 1 << 6,
+  kX64EmitGFNI = 1 << 7,
+
+  kX64EmitAVX512F = 1 << 8,
+  kX64EmitAVX512VL = 1 << 9,
+
+  kX64EmitAVX512BW = 1 << 10,
+  kX64EmitAVX512DQ = 1 << 11,
+
+  kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
+  kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
+  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump ( >= Zen1)
+  kX64FastLoop =
+      1 << 13,  // loop/loope/loopne is as fast as any other jump ( >= Zen2)
+  kX64EmitAVX512VBMI = 1 << 14,
+  kX64FlagsIndependentVars =
+      1 << 15,  // if true, instructions that only modify some flags (like
+                // inc/dec) do not introduce false dependencies on EFLAGS
+                // because the individual flags are treated as different vars
+                // by the processor. (this applies to zen)
+  kX64EmitXOP = 1 << 16,   // chrispy: xop maps really well to many vmx
+                           // instructions, and FX users need the boost
+  kX64EmitFMA4 = 1 << 17,  // todo: also use on zen1?
+  kX64EmitTBM = 1 << 18,
+  // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
+  // 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
+  // instructions by using the legacy sse version if we recently cleared the
+  // high 128 bits of the
+};
+
+XE_NOALIAS
+uint32_t GetFeatureFlags();
+XE_COLD
+void InitFeatureFlags();
+
+}  // namespace amd64
+}  // namespace xe
+
+#endif  // XENIA_BASE_PLATFORM_AMD64_H_
diff --git a/src/xenia/base/simple_freelist.h b/src/xenia/base/simple_freelist.h
new file mode 100644
index 000000000..c0fd830be
--- /dev/null
+++ b/src/xenia/base/simple_freelist.h
@@ -0,0 +1,40 @@
+#pragma once
+namespace xe {
+/*
+    a very simple freelist, intended to be used with HIRFunction/Arena to
+    eliminate our last-level cache miss problems with HIR simplifications
+    not thread safe, doesnt need to be
+*/
+template <typename T>
+struct SimpleFreelist {
+  union Node {
+    union Node* next_;
+    T entry_;
+  };
+  Node* head_;
+
+  static_assert(sizeof(T) >= sizeof(void*));
+  SimpleFreelist() : head_(nullptr) {}
+  T* NewEntry() {
+    Node* result_node = head_;
+    if (!result_node) {
+      return nullptr;
+    } else {
+      head_ = result_node->next_;
+
+      memset(result_node, 0, sizeof(T));
+      return &result_node->entry_;
+      // return new (&result_node->entry_) T(args...);
+    }
+  }
+
+  void DeleteEntry(T* value) {
+    memset(value, 0, sizeof(T));
+    Node* node = reinterpret_cast<Node*>(value);
+    node->next_ = head_;
+    head_ = node;
+  }
+  void Reset() { head_ = nullptr; }
+};
+}  // namespace xe
\ No newline at end of file
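Editorial note: how SimpleFreelist is meant to sit in front of an arena. This mirrors the HIRBuilder::AllocateInstruction/DeallocateInstruction pair later in this diff; `arena` here is a hypothetical xe::Arena*:

    xe::SimpleFreelist<Instr> free_instrs;
    Instr* instr = free_instrs.NewEntry();  // zeroed recycled node, or nullptr
    if (!instr) {
      instr = arena->Alloc<Instr>();  // fall back to bump allocation
    }
    // ... use instr ...
    free_instrs.DeleteEntry(instr);  // zeroes it and pushes it onto the list
    free_instrs.Reset();  // drop all nodes; the arena still owns the memory
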
diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc
index 5c00400e2..32ddf7487 100644
--- a/src/xenia/base/threading_win.cc
+++ b/src/xenia/base/threading_win.cc
@@ -50,6 +50,9 @@ XE_NTDLL_IMPORT(NtPulseEvent, cls_NtPulseEvent, NtPulseEventPointer);
 // counts
 XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
                 NtReleaseSemaphorePointer);
+
+XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
+                NtDelayExecutionPointer);
 namespace xe {
 namespace threading {
 
@@ -109,13 +112,30 @@ void set_name(const std::string_view name) {
   set_name(GetCurrentThread(), name);
 }
 
+// checked ntoskrnl, it does not modify delay, so we can place this as a
+// constant and avoid creating a stack variable
+static const LARGE_INTEGER sleepdelay0_for_maybeyield{0LL};
+
 void MaybeYield() {
+#if 0
 #if defined(XE_USE_NTDLL_FUNCTIONS)
+
   NtYieldExecutionPointer.invoke();
 #else
   SwitchToThread();
 #endif
-
+#else
+  // chrispy: SwitchToThread will only switch to a ready thread on the current
+  // processor, so if one is not ready we end up spinning, constantly calling
+  // switchtothread without doing any work, heating up the users cpu sleep(0)
+  // however will yield to threads on other processors and surrenders the
+  // current timeslice
+#if defined(XE_USE_NTDLL_FUNCTIONS)
+  NtDelayExecutionPointer.invoke(0, &sleepdelay0_for_maybeyield);
+#else
+  ::Sleep(0);
+#endif
+#endif
+  // memorybarrier is really not necessary here...
   MemoryBarrier();
 }
 
diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc
index ba17c3caf..7b3e63222 100644
--- a/src/xenia/cpu/backend/x64/x64_backend.cc
+++ b/src/xenia/cpu/backend/x64/x64_backend.cc
@@ -26,24 +26,6 @@
 #include "xenia/cpu/processor.h"
 #include "xenia/cpu/stack_walker.h"
 #include "xenia/cpu/xex_module.h"
-DEFINE_int32(x64_extension_mask, -1,
-             "Allow the detection and utilization of specific instruction set "
-             "features.\n"
-             "    0 = x86_64 + AVX1\n"
-             "    1 = AVX2\n"
-             "    2 = FMA\n"
-             "    4 = LZCNT\n"
-             "    8 = BMI1\n"
-             "   16 = BMI2\n"
-             "   32 = F16C\n"
-             "   64 = Movbe\n"
-             "  128 = GFNI\n"
-             "  256 = AVX512F\n"
-             "  512 = AVX512VL\n"
-             " 1024 = AVX512BW\n"
-             " 2048 = AVX512DQ\n"
-             "   -1 = Detect and utilize all possible processor features\n",
-             "x64");
 
 DEFINE_bool(record_mmio_access_exceptions, true,
             "For guest addresses records whether we caught any mmio accesses "
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc
index 32fec4fe2..74515d38e 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -103,7 +103,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
         "FAQ for system requirements at https://xenia.jp");
     return;
   }
-
+#if 1
+  feature_flags_ = amd64::GetFeatureFlags();
+#else
 #define TEST_EMIT_FEATURE(emit, ext)                \
   if ((cvars::x64_extension_mask & emit) == emit) { \
     feature_flags_ |= (cpu_.has(ext) ? emit : 0);   \
@@ -168,6 +170,7 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
       feature_flags_ |= kX64FlagsIndependentVars;
     }
   }
+#endif
   may_use_membase32_as_zero_reg_ =
       static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
           processor()->memory()->virtual_membase())) == 0;
@@ -913,6 +916,8 @@ static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,
 static const vec128_t xmm_consts[] = {
     /* XMMZero */ vec128f(0.0f),
+    /* XMMByteSwapMask */
+    vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),
     /* XMMOne */ vec128f(1.0f),
     /* XMMOnePD */ vec128d(1.0),
     /* XMMNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f),
@@ -937,8 +942,7 @@ static const vec128_t xmm_consts[] = {
     vec128i(0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu),
     /* XMMAbsMaskPD */
     vec128i(0xFFFFFFFFu, 0x7FFFFFFFu, 0xFFFFFFFFu, 0x7FFFFFFFu),
-    /* XMMByteSwapMask */
-    vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),
+
     /* XMMByteOrderMask */
     vec128i(0x01000302u, 0x05040706u, 0x09080B0Au, 0x0D0C0F0Eu),
     /* XMMPermuteControl15 */ vec128b(15),
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h
index b31e7d4d3..69e3b80ec 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_emitter.h
@@ -34,7 +34,7 @@ namespace xe {
 namespace cpu {
 namespace backend {
 namespace x64 {
-
+using namespace amd64;
 class X64Backend;
 class X64CodeCache;
 
@@ -81,6 +81,7 @@ static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
 }
 enum XmmConst {
   XMMZero = 0,
+  XMMByteSwapMask,
   XMMOne,
   XMMOnePD,
   XMMNegativeOne,
@@ -97,7 +98,7 @@ enum XmmConst {
   XMMSignMaskPD,
   XMMAbsMaskPS,
   XMMAbsMaskPD,
-  XMMByteSwapMask,
+  XMMByteOrderMask,
   XMMPermuteControl15,
   XMMPermuteByteMask,
@@ -189,42 +190,6 @@ class XbyakAllocator : public Xbyak::Allocator {
   virtual bool useProtect() const { return false; }
 };
 
-enum X64EmitterFeatureFlags {
-  kX64EmitAVX2 = 1 << 0,
-  kX64EmitFMA = 1 << 1,
-  kX64EmitLZCNT = 1 << 2,  // this is actually ABM and includes popcount
-  kX64EmitBMI1 = 1 << 3,
-  kX64EmitBMI2 = 1 << 4,
-  kX64EmitPrefetchW = 1 << 5,
-  kX64EmitMovbe = 1 << 6,
-  kX64EmitGFNI = 1 << 7,
-
-  kX64EmitAVX512F = 1 << 8,
-  kX64EmitAVX512VL = 1 << 9,
-
-  kX64EmitAVX512BW = 1 << 10,
-  kX64EmitAVX512DQ = 1 << 11,
-
-  kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
-  kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
-  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump ( >= Zen1)
-  kX64FastLoop =
-      1 << 13,  // loop/loope/loopne is as fast as any other jump ( >= Zen2)
-  kX64EmitAVX512VBMI = 1 << 14,
-  kX64FlagsIndependentVars =
-      1 << 15,  // if true, instructions that only modify some flags (like
-                // inc/dec) do not introduce false dependencies on EFLAGS
-                // because the individual flags are treated as different vars
-                // by the processor. (this applies to zen)
-  kX64EmitXOP = 1 << 16,   // chrispy: xop maps really well to many vmx
-                           // instructions, and FX users need the boost
-  kX64EmitFMA4 = 1 << 17,  // todo: also use on zen1?
-  kX64EmitTBM = 1 << 18,
-  // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
-  // 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
-  // instructions by using the legacy sse version if we recently cleared the
-  // high 128 bits of the
-};
 class ResolvableGuestCall {
  public:
   bool is_jump_;
diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
index e5843f37d..146fc8ca1 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@@ -1354,15 +1354,17 @@ struct VECTOR_SHA_V128
   static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
     // TODO(benvanik): native version (with shift magic).
     if (i.src2.is_constant) {
-      if (e.IsFeatureEnabled(kX64EmitGFNI)) {
-        const auto& shamt = i.src2.constant();
-        bool all_same = true;
-        for (size_t n = 0; n < 16 - n; ++n) {
-          if (shamt.u8[n] != shamt.u8[n + 1]) {
-            all_same = false;
-            break;
-          }
+      const auto& shamt = i.src2.constant();
+      bool all_same = true;
+      for (size_t n = 0; n < 16 - n; ++n) {
+        if (shamt.u8[n] != shamt.u8[n + 1]) {
+          all_same = false;
+          break;
         }
+      }
+
+      if (e.IsFeatureEnabled(kX64EmitGFNI)) {
         if (all_same) {
           // Every count is the same, so we can use gf2p8affineqb.
           const uint8_t shift_amount = shamt.u8[0] & 0b111;
@@ -1375,6 +1377,19 @@ struct VECTOR_SHA_V128
           return;
         }
       }
+      else if (all_same) {
+        Xmm to_be_shifted = GetInputRegOrConstant(e, i.src1, e.xmm1);
+
+        e.vpmovsxbw(e.xmm0, to_be_shifted);  //_mm_srai_epi16 / psraw
+        e.vpunpckhqdq(e.xmm2, to_be_shifted, to_be_shifted);
+        e.vpmovsxbw(e.xmm1, e.xmm2);
+        e.vpsraw(e.xmm0, shamt.u8[0]);
+        e.vpsraw(e.xmm1, shamt.u8[0]);
+        e.vpacksswb(i.dest, e.xmm0, e.xmm1);
+        return;
+      }
+
       e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
     } else {
       e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
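Editorial note on the new non-GFNI VECTOR_SHA path above: x86 has no per-byte arithmetic shift, so when all sixteen shift counts match, the emitter sign-extends each half to 16-bit lanes (vpmovsxbw), shifts with vpsraw, and repacks with vpacksswb; the saturating pack is lossless here because a right-shifted int8 stays in int8 range. Per lane it computes the equivalent of this scalar model (the function name is illustrative):

    // Scalar model of the emitted sequence; the count is masked to 0..7,
    // matching the gf2p8affineqb path.
    int8_t sha_i8(int8_t v, uint8_t amount) {
      return static_cast<int8_t>(static_cast<int16_t>(v) >> (amount & 0b111));
    }
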
diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc
index bf5addd3d..06a37ab91 100644
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@@ -3234,7 +3234,17 @@ struct SET_ROUNDING_MODE_I32
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32);
-
+// ============================================================================
+// OPCODE_DELAY_EXECUTION
+// ============================================================================
+struct DELAY_EXECUTION
+    : Sequence<DELAY_EXECUTION, I<OPCODE_DELAY_EXECUTION>> {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    // todo: what if they dont have smt?
+    e.pause();
+  }
+};
+EMITTER_OPCODE_TABLE(OPCODE_DELAY_EXECUTION, DELAY_EXECUTION);
 // Include anchors to other sequence sources so they get included in the build.
 extern volatile int anchor_control;
 static int anchor_control_dest = anchor_control;
diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
index f7d91267b..4c1640302 100644
--- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
+++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
@@ -98,7 +98,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
           if (i->src1.value->IsConstantTrue()) {
             i->Replace(&OPCODE_DEBUG_BREAK_info, i->flags);
           } else {
-            i->Remove();
+            i->UnlinkAndNOP();
           }
           result = true;
         }
@@ -109,7 +109,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          if (i->src1.value->IsConstantTrue()) {
            i->Replace(&OPCODE_TRAP_info, i->flags);
          } else {
-            i->Remove();
+            i->UnlinkAndNOP();
          }
          result = true;
        }
@@ -122,7 +122,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
             i->Replace(&OPCODE_CALL_info, i->flags);
             i->src1.symbol = symbol;
           } else {
-            i->Remove();
+            i->UnlinkAndNOP();
           }
           result = true;
         }
@@ -146,7 +146,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
             i->Replace(&OPCODE_CALL_INDIRECT_info, i->flags);
             i->set_src1(value);
           } else {
-            i->Remove();
+            i->UnlinkAndNOP();
           }
           result = true;
         } else if (i->src2.value->IsConstant()) {  // chrispy: fix h3 bug from
@@ -172,7 +172,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
             i->Replace(&OPCODE_BRANCH_info, i->flags);
             i->src1.label = label;
           } else {
-            i->Remove();
+            i->UnlinkAndNOP();
           }
           result = true;
         }
@@ -184,7 +184,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
             i->Replace(&OPCODE_BRANCH_info, i->flags);
             i->src1.label = label;
           } else {
-            i->Remove();
+            i->UnlinkAndNOP();
           }
           result = true;
         }
@@ -195,7 +195,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
           TypeName target_type = v->type;
           v->set_from(i->src1.value);
           v->Cast(target_type);
-          i->Remove();
+          i->UnlinkAndNOP();
           result = true;
         }
         break;
@@ -204,7 +204,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->Convert(target_type, RoundMode(i->flags));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -212,7 +212,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Round(RoundMode(i->flags));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -221,7 +221,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->ZeroExtend(target_type);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -230,7 +230,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->SignExtend(target_type);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -239,7 +239,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->Truncate(target_type);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -247,7 +247,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          if (!(i->src1.value->AsUint32() & 0xF)) {
            v->set_zero(VEC128_TYPE);
-            i->Remove();
+            i->UnlinkAndNOP();
            result = true;
            break;
          }
@@ -281,22 +281,22 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            switch (v->type) {
              case INT8_TYPE:
                v->set_constant(xe::load<uint8_t>(host_addr));
-                i->Remove();
+                i->UnlinkAndNOP();
                result = true;
                break;
              case INT16_TYPE:
                v->set_constant(xe::load<uint16_t>(host_addr));
-                i->Remove();
+                i->UnlinkAndNOP();
                result = true;
                break;
              case INT32_TYPE:
                v->set_constant(xe::load<uint32_t>(host_addr));
-                i->Remove();
+                i->UnlinkAndNOP();
                result = true;
                break;
              case INT64_TYPE:
                v->set_constant(xe::load<uint64_t>(host_addr));
-                i->Remove();
+                i->UnlinkAndNOP();
                result = true;
                break;
              case VEC128_TYPE:
@@ -304,7 +304,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
                val.low = xe::load<uint64_t>(host_addr);
                val.high = xe::load<uint64_t>(host_addr + 8);
                v->set_constant(val);
-                i->Remove();
+                i->UnlinkAndNOP();
                result = true;
                break;
              default:
@@ -357,14 +357,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
              i->src3.value->IsConstant()) {
            v->set_from(i->src2.value);
            v->Select(i->src3.value, i->src1.value);
-            i->Remove();
+            i->UnlinkAndNOP();
            result = true;
          }
        } else {
          if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) {
            v->set_from(i->src2.value);
            v->Select(i->src3.value, i->src1.value);
-            i->Remove();
+            i->UnlinkAndNOP();
            result = true;
          }
        }
@@ -381,7 +381,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          } else {
            v->set_constant(uint8_t(0));
          }
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -391,7 +391,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantEQ(i->src2.value);
          i->dest->set_constant(uint8_t(value));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -399,7 +399,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantNE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -407,7 +407,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantSLT(i->src2.value);
          i->dest->set_constant(uint8_t(value));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -415,7 +415,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantSLE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -423,7 +423,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantSGT(i->src2.value);
          i->dest->set_constant(uint8_t(value));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -431,7 +431,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantSGE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -439,7 +439,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantULT(i->src2.value);
          i->dest->set_constant(uint8_t(value));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -447,7 +447,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantULE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -455,7 +455,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantUGT(i->src2.value);
          i->dest->set_constant(uint8_t(value));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -463,7 +463,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantUGE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -477,7 +477,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            !should_skip_because_of_float) {
          v->set_from(i->src1.value);
          v->Add(i->src2.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -489,7 +489,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            TypeName target_type = v->type;
            v->set_from(ca);
            v->ZeroExtend(target_type);
-            i->Remove();
+            i->UnlinkAndNOP();
          } else {
            if (i->dest->type == ca->type) {
              i->Replace(&OPCODE_ASSIGN_info, 0);
@@ -507,7 +507,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            !should_skip_because_of_float) {
          v->set_from(i->src1.value);
          v->Sub(i->src2.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -516,7 +516,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Mul(i->src2.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        } else if (i->src1.value->IsConstant() ||
                   i->src2.value->IsConstant()) {
@@ -548,7 +548,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->MulHi(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -557,13 +557,13 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        } else if (!i->src2.value->MaybeFloaty() &&
                   i->src2.value->IsConstantZero()) {
          // division by 0 == 0 every time,
          v->set_zero(i->src2.value->type);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        } else if (i->src2.value->IsConstant()) {
          // Division by one = no-op.
@@ -592,7 +592,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          }
          v->set_from(i->src1.value);
          v->Max(i->src2.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -600,7 +600,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Neg();
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -608,7 +608,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Abs();
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -616,7 +616,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Sqrt();
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -624,7 +624,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->RSqrt();
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -632,7 +632,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Recip();
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -640,7 +640,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->And(i->src2.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -648,7 +648,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->AndNot(i->src2.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -656,7 +656,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Or(i->src2.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -664,13 +664,13 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Xor(i->src2.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        } else if (!i->src1.value->IsConstant() &&
                   !i->src2.value->IsConstant() &&
                   i->src1.value == i->src2.value) {
          v->set_zero(v->type);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -678,7 +678,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Not();
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -687,7 +687,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Shl(i->src2.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        } else if (i->src2.value->IsConstantZero()) {
          auto src1 = i->src1.value;
@@ -702,7 +702,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Shr(i->src2.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        } else if (i->src2.value->IsConstantZero()) {
          auto src1 = i->src1.value;
@@ -716,7 +716,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Sha(i->src2.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -724,7 +724,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->RotateLeft(i->src2.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -732,7 +732,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->ByteSwap();
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -740,7 +740,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_zero(v->type);
          v->CountLeadingZeros(i->src1.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -751,7 +751,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            (i->flags == INT8_TYPE || i->flags == INT16_TYPE)) {
          v->set_from(i->src1.value);
          v->Permute(i->src2.value, i->src3.value, (TypeName)i->flags);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
@@ -765,7 +765,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          */
 
          v->set_zero(VEC128_TYPE);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
@@ -777,7 +777,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            i->src3.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Insert(i->src2.value, i->src3.value, (TypeName)i->flags);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -785,7 +785,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Swizzle((uint32_t)i->src2.offset, (TypeName)i->flags);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -793,7 +793,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_zero(v->type);
          v->Extract(i->src1.value, i->src2.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -801,7 +801,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_zero(v->type);
          v->Splat(i->src1.value);
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -809,7 +809,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->VectorCompareEQ(i->src2.value, hir::TypeName(i->flags));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -817,7 +817,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->VectorCompareSGT(i->src2.value, hir::TypeName(i->flags));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -825,7 +825,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->VectorCompareSGE(i->src2.value, hir::TypeName(i->flags));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -833,7 +833,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->VectorCompareUGT(i->src2.value, hir::TypeName(i->flags));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -841,7 +841,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->VectorCompareUGE(i->src2.value, hir::TypeName(i->flags));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -850,7 +850,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          v->set_zero(VEC128_TYPE);
          v->VectorConvertF2I(i->src1.value,
                              !!(i->flags & ARITHMETIC_UNSIGNED));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -859,7 +859,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          v->set_zero(VEC128_TYPE);
          v->VectorConvertI2F(i->src1.value,
                              !!(i->flags & ARITHMETIC_UNSIGNED));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -867,7 +867,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->VectorShl(i->src2.value, hir::TypeName(i->flags));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -875,7 +875,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->VectorShr(i->src2.value, hir::TypeName(i->flags));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -883,7 +883,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->VectorRol(i->src2.value, hir::TypeName(i->flags));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -894,7 +894,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          v->VectorAdd(i->src2.value, hir::TypeName(i->flags & 0xFF),
                       !!(arith_flags & ARITHMETIC_UNSIGNED),
                       !!(arith_flags & ARITHMETIC_SATURATE));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -905,7 +905,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          v->VectorSub(i->src2.value, hir::TypeName(i->flags & 0xFF),
                       !!(arith_flags & ARITHMETIC_UNSIGNED),
                       !!(arith_flags & ARITHMETIC_SATURATE));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -917,7 +917,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          v->VectorAverage(i->src2.value, hir::TypeName(i->flags & 0xFF),
                           !!(arith_flags & ARITHMETIC_UNSIGNED),
                           !!(arith_flags & ARITHMETIC_SATURATE));
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -926,7 +926,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->DenormalFlush();
-          i->Remove();
+          i->UnlinkAndNOP();
          result = true;
        }
        break;
diff --git a/src/xenia/cpu/compiler/passes/context_promotion_pass.cc b/src/xenia/cpu/compiler/passes/context_promotion_pass.cc
index 9e042bd91..f5cc368db 100644
--- a/src/xenia/cpu/compiler/passes/context_promotion_pass.cc
+++ b/src/xenia/cpu/compiler/passes/context_promotion_pass.cc
@@ -146,7 +146,7 @@ void ContextPromotionPass::RemoveDeadStoresBlock(Block* block) {
       validity.set(static_cast<size_t>(offset));
     } else {
       // Already written to. Remove this store.
-      i->Remove();
+      i->UnlinkAndNOP();
     }
   }
   i = prev;
 }
diff --git a/src/xenia/cpu/compiler/passes/dead_code_elimination_pass.cc b/src/xenia/cpu/compiler/passes/dead_code_elimination_pass.cc
index d7ff9a0fd..c778e45fa 100644
--- a/src/xenia/cpu/compiler/passes/dead_code_elimination_pass.cc
+++ b/src/xenia/cpu/compiler/passes/dead_code_elimination_pass.cc
@@ -120,7 +120,8 @@ bool DeadCodeEliminationPass::Run(HIRBuilder* builder) {
       Instr* next = i->next;
       if (i->opcode == &OPCODE_NOP_info) {
         // Nop - remove!
-        i->Remove();
+        i->UnlinkAndNOP();
+        i->Deallocate();
       }
       i = next;
     }
@@ -148,7 +149,9 @@ bool DeadCodeEliminationPass::Run(HIRBuilder* builder) {
 
 void DeadCodeEliminationPass::MakeNopRecursive(Instr* i) {
   i->opcode = &hir::OPCODE_NOP_info;
-  i->dest->def = NULL;
+  if (i->dest) {
+    i->dest->def = NULL;
+  }
   i->dest = NULL;
 
 #define MAKE_NOP_SRC(n)                                 \
@@ -163,7 +166,9 @@ void DeadCodeEliminationPass::MakeNopRecursive(Instr* i) {
       if (value->def && value->def != i) {              \
         MakeNopRecursive(value->def);                   \
       }                                                 \
+      HIRBuilder::GetCurrent()->DeallocateValue(value); \
     }                                                   \
+    HIRBuilder::GetCurrent()->DeallocateUse(use);       \
   }
   MAKE_NOP_SRC(1);
   MAKE_NOP_SRC(2);
@@ -189,7 +194,8 @@ void DeadCodeEliminationPass::ReplaceAssignment(Instr* i) {
     use = use->next;
   }
 
-  i->Remove();
+  i->UnlinkAndNOP();
+  i->Deallocate();
 }
 
 bool DeadCodeEliminationPass::CheckLocalUse(Instr* i) {
@@ -204,11 +210,11 @@ bool DeadCodeEliminationPass::CheckLocalUse(Instr* i) {
     }
 
     // Load/store are paired. They can both be removed.
-    use_instr->Remove();
+    use_instr->UnlinkAndNOP();
   }
 
-  i->Remove();
-
+  i->UnlinkAndNOP();
+  i->Deallocate();
   return false;
 }
diff --git a/src/xenia/cpu/compiler/passes/finalization_pass.cc b/src/xenia/cpu/compiler/passes/finalization_pass.cc
index 22b386799..1b409430c 100644
--- a/src/xenia/cpu/compiler/passes/finalization_pass.cc
+++ b/src/xenia/cpu/compiler/passes/finalization_pass.cc
@@ -61,7 +61,7 @@ bool FinalizationPass::Run(HIRBuilder* builder) {
         auto target = tail->src1.label;
         if (target->block == block->next) {
           // Jumping to subsequent block. Remove.
- tail->Remove(); + tail->UnlinkAndNOP(); } } diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index 82e3021f9..b98ae80d5 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -46,15 +46,27 @@ namespace hir { (value->type) == FLOAT64_TYPE || (value->type) == VEC128_TYPE) #define ASSERT_TYPES_EQUAL(value1, value2) \ assert_true((value1->type) == (value2->type)) - +thread_local HIRBuilder* thrd_current_hirfunction = nullptr; HIRBuilder::HIRBuilder() { arena_ = new Arena(); Reset(); } +HIRBuilder* HIRBuilder::GetCurrent() { return thrd_current_hirfunction; } + +void HIRBuilder::MakeCurrent() { thrd_current_hirfunction = this; } +void HIRBuilder::RemoveCurrent() { + if (thrd_current_hirfunction == this) { + thrd_current_hirfunction = nullptr; + } +} + HIRBuilder::~HIRBuilder() { Reset(); delete arena_; + if (thrd_current_hirfunction == this) { + thrd_current_hirfunction = nullptr; + } } void HIRBuilder::Reset() { @@ -105,7 +117,37 @@ bool HIRBuilder::Finalize() { } return true; } +Instr* HIRBuilder::AllocateInstruction() { + Instr* result = free_instrs_.NewEntry(); + if (result) { + return result; + } + return arena()->Alloc<Instr>(); +} +Value* HIRBuilder::AllocateValue() { + Value* result = free_values_.NewEntry(); + if (result) { + return result; + } + return arena()->Alloc<Value>(); +} +Value::Use* HIRBuilder::AllocateUse() { + Value::Use* result = free_uses_.NewEntry(); + if (result) { + return result; + } + return arena()->Alloc<Value::Use>(); +} +void HIRBuilder::DeallocateInstruction(Instr* instr) { + // free_instrs_.DeleteEntry(instr); +} +void HIRBuilder::DeallocateValue(Value* value) { + // free_values_.DeleteEntry(value); +} +void HIRBuilder::DeallocateUse(Value::Use* use) { + // free_uses_.DeleteEntry(use); +} void HIRBuilder::DumpValue(StringBuffer* str, Value* value) { if (value->IsConstant()) { switch (value->type) { @@ -545,12 +587,12 @@ void HIRBuilder::MergeAdjacentBlocks(Block* left, Block* right) { auto sig = left->instr_tail->opcode->signature; if (GET_OPCODE_SIG_TYPE_SRC1(sig) == OPCODE_SIG_TYPE_L) { if (left->instr_tail->src1.label->block == right) { - left->instr_tail->Remove(); + left->instr_tail->UnlinkAndNOP(); } } if (GET_OPCODE_SIG_TYPE_SRC2(sig) == OPCODE_SIG_TYPE_L) { if (left->instr_tail->src2.label->block == right) { - left->instr_tail->Remove(); + left->instr_tail->UnlinkAndNOP(); } } } @@ -678,7 +720,7 @@ Instr* HIRBuilder::AppendInstr(const OpcodeInfo& opcode_info, uint16_t flags, } Block* block = current_block_; - Instr* instr = arena_->Alloc<Instr>(); + Instr* instr = AllocateInstruction(); instr->next = NULL; instr->prev = block->instr_tail; if (block->instr_tail) { @@ -705,7 +747,7 @@ Instr* HIRBuilder::AppendInstr(const OpcodeInfo& opcode_info, uint16_t flags, } Value* HIRBuilder::AllocValue(TypeName type) { - Value* value = arena_->Alloc<Value>(); + Value* value = AllocateValue(); value->ordinal = next_value_ordinal_++; value->type = type; value->flags = 0; @@ -719,7 +761,7 @@ Value* HIRBuilder::CloneValue(Value* source) { - Value* value = arena_->Alloc<Value>(); + Value* value = AllocateValue(); value->ordinal = next_value_ordinal_++; value->type = source->type; value->flags = source->flags; @@ -1295,6 +1337,9 @@ void HIRBuilder::CacheControl(Value* address, size_t cache_line_size, void HIRBuilder::MemoryBarrier() { AppendInstr(OPCODE_MEMORY_BARRIER_info, 0); } +void HIRBuilder::DelayExecution() { + AppendInstr(OPCODE_DELAY_EXECUTION_info, 0); +} void
HIRBuilder::SetRoundingMode(Value* value) { ASSERT_INTEGER_TYPE(value); Instr* i = AppendInstr(OPCODE_SET_ROUNDING_MODE_info, 0); diff --git a/src/xenia/cpu/hir/hir_builder.h b/src/xenia/cpu/hir/hir_builder.h index 4a200e0e5..d83806cd9 100644 --- a/src/xenia/cpu/hir/hir_builder.h +++ b/src/xenia/cpu/hir/hir_builder.h @@ -15,6 +15,8 @@ #include "third_party/fmt/include/fmt/format.h" #include "xenia/base/arena.h" #include "xenia/base/string_buffer.h" + +#include "xenia/base/simple_freelist.h" #include "xenia/cpu/hir/block.h" #include "xenia/cpu/hir/instr.h" #include "xenia/cpu/hir/label.h" @@ -31,11 +33,20 @@ enum FunctionAttributes { }; class HIRBuilder { + SimpleFreelist<Instr> free_instrs_; + SimpleFreelist<Value> free_values_; + SimpleFreelist<Value::Use> free_uses_; + public: HIRBuilder(); virtual ~HIRBuilder(); + static HIRBuilder* GetCurrent(); + + void MakeCurrent(); + void RemoveCurrent(); virtual void Reset(); + virtual bool Finalize(); void Dump(StringBuffer* str); @@ -66,6 +77,18 @@ class HIRBuilder { void RemoveBlock(Block* block); void MergeAdjacentBlocks(Block* left, Block* right); + Instr* AllocateInstruction(); + + Value* AllocateValue(); + Value::Use* AllocateUse(); + void DeallocateInstruction(Instr* instr); + void DeallocateValue(Value* value); + void DeallocateUse(Value::Use* use); + void ResetPools() { + free_instrs_.Reset(); + free_uses_.Reset(); + free_values_.Reset(); + } // static allocations: // Value* AllocStatic(size_t length); @@ -176,7 +199,7 @@ class HIRBuilder { void CacheControl(Value* address, size_t cache_line_size, CacheControlType type); void MemoryBarrier(); - + void DelayExecution(); void SetRoundingMode(Value* value); Value* Max(Value* value1, Value* value2); Value* VectorMax(Value* value1, Value* value2, TypeName part_type, diff --git a/src/xenia/cpu/hir/instr.cc b/src/xenia/cpu/hir/instr.cc index 0e4a7c2fb..71f57a841 100644 --- a/src/xenia/cpu/hir/instr.cc +++ b/src/xenia/cpu/hir/instr.cc @@ -10,7 +10,7 @@ #include "xenia/cpu/hir/instr.h" #include "xenia/cpu/hir/block.h" - +#include "xenia/cpu/hir/hir_builder.h" namespace xe { namespace cpu { namespace hir { @@ -62,21 +62,35 @@ void Instr::Replace(const OpcodeInfo* new_opcode, uint16_t new_flags) { if (src1_use) { src1.value->RemoveUse(src1_use); src1.value = NULL; - src1_use = NULL; + // src1_use = NULL; } if (src2_use) { src2.value->RemoveUse(src2_use); src2.value = NULL; - src2_use = NULL; + // src2_use = NULL; } if (src3_use) { src3.value->RemoveUse(src3_use); src3.value = NULL; - src3_use = NULL; + // src3_use = NULL; + } + + if (src1_use) { + HIRBuilder::GetCurrent()->DeallocateUse(src1_use); + src1_use = nullptr; + } + if (src2_use) { + HIRBuilder::GetCurrent()->DeallocateUse(src2_use); + src2_use = nullptr; + } + + if (src3_use) { + HIRBuilder::GetCurrent()->DeallocateUse(src3_use); + src3_use = nullptr; } } -void Instr::Remove() { +void Instr::UnlinkAndNOP() { // Remove all srcs/dest.
Replace(&OPCODE_NOP_info, 0); @@ -91,6 +105,10 @@ void Instr::Remove() { block->instr_tail = prev; } } + +void Instr::Deallocate() { + HIRBuilder::GetCurrent()->DeallocateInstruction(this); +} Instr* Instr::GetDestDefSkipAssigns() { Instr* current_def = this; diff --git a/src/xenia/cpu/hir/instr.h b/src/xenia/cpu/hir/instr.h index a50219ceb..17c0607d1 100644 --- a/src/xenia/cpu/hir/instr.h +++ b/src/xenia/cpu/hir/instr.h @@ -78,7 +78,12 @@ class Instr { void MoveBefore(Instr* other); void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags); - void Remove(); + void UnlinkAndNOP(); + // chrispy: the old Remove() name is retired rather than repurposed. If + // changes from master that call Remove() are ported over, they now fail to + // compile instead of silently picking up new semantics, which would cause + // bugs that are very difficult to track down. + void Deallocate(); const OpcodeInfo* GetOpcodeInfo() const { return opcode; } // if opcode is null, we have bigger problems Opcode GetOpcodeNum() const { return GetOpcodeInfo()->num; } diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 3254586e5..6d90f1811 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -292,7 +292,7 @@ enum Opcode { // as we already have OPCODE_ROUND. round double to float ( // ppc "single" fpu instruction result rounding behavior ) OPCODE_SET_NJM, - + OPCODE_DELAY_EXECUTION, // for db16cyc __OPCODE_MAX_VALUE, // Keep at end. }; diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl index 783e9a439..e27f30b46 100644 --- a/src/xenia/cpu/hir/opcodes.inl +++ b/src/xenia/cpu/hir/opcodes.inl @@ -218,7 +218,7 @@ DEFINE_OPCODE( "context_barrier", OPCODE_SIG_X, 0) - +DEFINE_OPCODE(OPCODE_DELAY_EXECUTION, "delay_execution", OPCODE_SIG_X, 0) DEFINE_OPCODE( OPCODE_LOAD_MMIO, "load_mmio", diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc index 473dc5d90..84b28a8f2 100644 --- a/src/xenia/cpu/hir/value.cc +++ b/src/xenia/cpu/hir/value.cc @@ -16,13 +16,13 @@ #include "xenia/base/assert.h" #include "xenia/base/byte_order.h" #include "xenia/base/math.h" - +#include "xenia/cpu/hir/hir_builder.h" namespace xe { namespace cpu { namespace hir { Value::Use* Value::AddUse(Arena* arena, Instr* instr) { - Use* use = arena->Alloc<Use>(); + Use* use = HIRBuilder::GetCurrent()->AllocateUse(); use->instr = instr; use->prev = NULL; use->next = use_head; @@ -42,6 +42,8 @@ void Value::RemoveUse(Use* use) { if (use->next) { use->next->prev = use->prev; } + + // HIRBuilder::GetCurrent()->DeallocateUse(use); } uint32_t Value::AsUint32() { diff --git a/src/xenia/cpu/ppc/ppc_emit_alu.cc b/src/xenia/cpu/ppc/ppc_emit_alu.cc index 9e030a460..03098585e 100644 --- a/src/xenia/cpu/ppc/ppc_emit_alu.cc +++ b/src/xenia/cpu/ppc/ppc_emit_alu.cc @@ -789,8 +789,15 @@ int InstrEmit_norx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_orx(PPCHIRBuilder& f, const InstrData& i) { // RA <- (RS) | (RB) if (i.X.RT == i.X.RB && i.X.RT == i.X.RA && !i.X.Rc) { - // Sometimes used as no-op. - f.Nop(); + // chrispy: this special version of orx is db16cyc (0x7FFFFB78 decodes to + // or r31,r31,r31) and is heavily used in spinlocks. since we emit no code + // for it we end up wasting a ton of power + if (i.code == 0x7FFFFB78) { + f.DelayExecution(); + } else { + // Sometimes used as no-op.
+ f.Nop(); + } return 0; } Value* ra; diff --git a/src/xenia/cpu/ppc/ppc_frontend.cc b/src/xenia/cpu/ppc/ppc_frontend.cc index 33d6f892d..7b7617368 100644 --- a/src/xenia/cpu/ppc/ppc_frontend.cc +++ b/src/xenia/cpu/ppc/ppc_frontend.cc @@ -117,6 +117,7 @@ bool PPCFrontend::DefineFunction(GuestFunction* function, uint32_t debug_info_flags) { auto translator = translator_pool_.Allocate(this); bool result = translator->Translate(function, debug_info_flags); + translator->Reset(); translator_pool_.Release(translator); return result; } diff --git a/src/xenia/cpu/ppc/ppc_translator.cc b/src/xenia/cpu/ppc/ppc_translator.cc index 99c258a32..af8159d97 100644 --- a/src/xenia/cpu/ppc/ppc_translator.cc +++ b/src/xenia/cpu/ppc/ppc_translator.cc @@ -96,10 +96,25 @@ PPCTranslator::PPCTranslator(PPCFrontend* frontend) : frontend_(frontend) { PPCTranslator::~PPCTranslator() = default; +class HirBuilderScope { + PPCHIRBuilder* builder_; + + public: + HirBuilderScope(PPCHIRBuilder* builder) : builder_(builder) { + builder_->MakeCurrent(); + } + + ~HirBuilderScope() { + if (builder_) { + builder_->RemoveCurrent(); + } + } +}; + bool PPCTranslator::Translate(GuestFunction* function, uint32_t debug_info_flags) { SCOPE_profile_cpu_f("cpu"); - + HirBuilderScope hir_build_scope{builder_.get()}; // Reset() all caching when we leave. xe::make_reset_scope(builder_); xe::make_reset_scope(compiler_); @@ -196,7 +211,7 @@ bool PPCTranslator::Translate(GuestFunction* function, return true; } - +void PPCTranslator::Reset() { builder_->ResetPools(); } void PPCTranslator::DumpSource(GuestFunction* function, StringBuffer* string_buffer) { Memory* memory = frontend_->memory(); diff --git a/src/xenia/cpu/ppc/ppc_translator.h b/src/xenia/cpu/ppc/ppc_translator.h index 56cec59ab..483be9c95 100644 --- a/src/xenia/cpu/ppc/ppc_translator.h +++ b/src/xenia/cpu/ppc/ppc_translator.h @@ -31,7 +31,7 @@ class PPCTranslator { ~PPCTranslator(); bool Translate(GuestFunction* function, uint32_t debug_info_flags); - + void Reset(); private: void DumpSource(GuestFunction* function, StringBuffer* string_buffer); diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index 109636bb1..ab54438d7 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -36,7 +36,8 @@ using namespace xe::gpu::xenos; CommandProcessor::CommandProcessor(GraphicsSystem* graphics_system, kernel::KernelState* kernel_state) - : memory_(graphics_system->memory()), + : reader_(nullptr, 0), + memory_(graphics_system->memory()), kernel_state_(kernel_state), graphics_system_(graphics_system), register_file_(graphics_system_->register_file()), @@ -45,6 +46,11 @@ CommandProcessor::CommandProcessor(GraphicsSystem* graphics_system, write_ptr_index_event_(xe::threading::Event::CreateAutoResetEvent(false)), write_ptr_index_(0) { assert_not_null(write_ptr_index_event_); +#if 0 + dmac_ = dma::CreateDMAC(); +#else + dmac_ = nullptr; +#endif } CommandProcessor::~CommandProcessor() = default; @@ -217,12 +223,12 @@ void CommandProcessor::WorkerThreadMain() { do { // If we spin around too much, revert to a "low-power" state. 
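Reset() on the translator, wired up above, exists to call ResetPools() between functions. The diff references xenia/base/simple_freelist.h without including it; from the NewEntry()/DeleteEntry()/Reset() calls in hir_builder.cc, a plausible minimal shape is the intrusive LIFO below (an inferred sketch, not the actual header). Note that as committed the Deallocate* bodies are stubbed out, so NewEntry() always misses and allocation falls through to the arena; the freelists are scaffolding for later reuse.

// Assumed sketch of SimpleFreelist<T>: recycled nodes threaded through their
// own storage. NewEntry() pops a node or returns null (the caller then
// allocates from the arena); Reset() abandons the list wholesale, which is
// safe because the arena that owns the memory is rewound at the same time.
template <typename T>
class SimpleFreelist {
  struct Node {
    Node* next;
  };
  static_assert(sizeof(T) >= sizeof(Node*), "entries must fit a link");
  Node* head_ = nullptr;

 public:
  T* NewEntry() {
    Node* node = head_;
    if (!node) {
      return nullptr;
    }
    head_ = node->next;
    return reinterpret_cast<T*>(node);  // caller reinitializes all fields
  }
  void DeleteEntry(T* entry) {
    Node* node = reinterpret_cast<Node*>(entry);
    node->next = head_;
    head_ = node;
  }
  void Reset() { head_ = nullptr; }
};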
if (loop_count > 500) { - const int wait_time_ms = 5; + const int wait_time_ms = 2; xe::threading::Wait(write_ptr_index_event_.get(), true, std::chrono::milliseconds(wait_time_ms)); + } else { + xe::threading::MaybeYield(); } - - xe::threading::MaybeYield(); loop_count++; write_ptr_index = write_ptr_index_.load(); } while (worker_running_ && pending_fns_.empty() && @@ -509,11 +515,78 @@ void CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring, } } -void CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring, - uint32_t base, +void CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring, + uint32_t base, + uint32_t num_times) { + WriteRegisterRangeFromRing(ring, base + 0x4000, num_times); +} + +void CommandProcessor::WriteFetchRangeFromRing(xe::RingBuffer* ring, + uint32_t base, + uint32_t num_times) { + WriteRegisterRangeFromRing(ring, base + 0x4800, num_times); +} + +XE_FORCEINLINE +void CommandProcessor::WriteBoolRangeFromRing(xe::RingBuffer* ring, + uint32_t base, + uint32_t num_times) { + WriteRegisterRangeFromRing(ring, base + 0x4900, num_times); +} + +XE_FORCEINLINE +void CommandProcessor::WriteLoopRangeFromRing(xe::RingBuffer* ring, + uint32_t base, + uint32_t num_times) { + WriteRegisterRangeFromRing(ring, base + 0x4908, num_times); +} + +XE_FORCEINLINE +void CommandProcessor::WriteREGISTERSRangeFromRing(xe::RingBuffer* ring, + uint32_t base, + uint32_t num_times) { + WriteRegisterRangeFromRing(ring, base + 0x2000, num_times); +} + +XE_FORCEINLINE +void CommandProcessor::WriteALURangeFromMem(uint32_t start_index, + uint32_t* base, + uint32_t num_registers) { + WriteRegistersFromMem(start_index + 0x4000, base, num_registers); +} + +XE_FORCEINLINE +void CommandProcessor::WriteFetchRangeFromMem(uint32_t start_index, + uint32_t* base, + uint32_t num_registers) { + WriteRegistersFromMem(start_index + 0x4800, base, num_registers); +} + +XE_FORCEINLINE +void CommandProcessor::WriteBoolRangeFromMem(uint32_t start_index, + uint32_t* base, + uint32_t num_registers) { + WriteRegistersFromMem(start_index + 0x4900, base, num_registers); +} + +XE_FORCEINLINE +void CommandProcessor::WriteLoopRangeFromMem(uint32_t start_index, + uint32_t* base, + uint32_t num_registers) { + WriteRegistersFromMem(start_index + 0x4908, base, num_registers); +} + +XE_FORCEINLINE +void CommandProcessor::WriteREGISTERSRangeFromMem(uint32_t start_index, + uint32_t* base, + uint32_t num_registers) { + WriteRegistersFromMem(start_index + 0x2000, base, num_registers); +} +XE_NOINLINE +void CommandProcessor::WriteOneRegisterFromRing(uint32_t base, uint32_t num_times) { for (uint32_t m = 0; m < num_times; m++) { - uint32_t reg_data = ring->ReadAndSwap(); + uint32_t reg_data = reader_.ReadAndSwap(); uint32_t target_index = base; WriteRegister(target_index, reg_data); } @@ -585,1104 +658,47 @@ uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index, trace_writer_.WritePrimaryBufferStart(start_ptr, write_index - read_index); // Execute commands! 
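The new bank-relative helpers above hard-code where each PM4 constant type lives in the unified register space; the same bases reappear in the SET_CONSTANT and LOAD_ALU_CONSTANT handlers further down. Collected in one place for reference (the enumerator names are mine; the values are from the diff):

// Base added to a packet-relative index for each PM4 constant type.
enum : uint32_t {
  kBaseRegisters = 0x2000,       // type 4: REGISTERS
  kBaseAluConstants = 0x4000,    // type 0: ALU constants
  kBaseFetchConstants = 0x4800,  // type 1: fetch constants
  kBaseBoolConstants = 0x4900,   // type 2: bool constants
  kBaseLoopConstants = 0x4908,   // type 3: loop constants
};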
- RingBuffer reader(memory_->TranslatePhysical(primary_buffer_ptr_), - primary_buffer_size_); - reader.set_read_offset(read_index * sizeof(uint32_t)); - reader.set_write_offset(write_index * sizeof(uint32_t)); + + RingBuffer old_reader = reader_; + new (&reader_) RingBuffer(memory_->TranslatePhysical(primary_buffer_ptr_), + primary_buffer_size_); + + reader_.set_read_offset(read_index * sizeof(uint32_t)); + reader_.set_write_offset(write_index * sizeof(uint32_t)); do { - if (!ExecutePacket(&reader)) { + if (!ExecutePacket()) { // This probably should be fatal - but we're going to continue anyways. XELOGE("**** PRIMARY RINGBUFFER: Failed to execute packet."); assert_always(); break; } - } while (reader.read_count()); + } while (reader_.read_count()); OnPrimaryBufferEnd(); trace_writer_.WritePrimaryBufferEnd(); + reader_ = old_reader; return write_index; } -void CommandProcessor::ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) { - SCOPE_profile_cpu_f("gpu"); - - trace_writer_.WriteIndirectBufferStart(ptr, count * sizeof(uint32_t)); - - // Execute commands! - RingBuffer reader(memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)); - reader.set_write_offset(count * sizeof(uint32_t)); - do { - if (!ExecutePacket(&reader)) { - // Return up a level if we encounter a bad packet. - XELOGE("**** INDIRECT RINGBUFFER: Failed to execute packet."); - assert_always(); - // break; - } - } while (reader.read_count()); - - trace_writer_.WriteIndirectBufferEnd(); -} - void CommandProcessor::ExecutePacket(uint32_t ptr, uint32_t count) { // Execute commands! - RingBuffer reader(memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)); - reader.set_write_offset(count * sizeof(uint32_t)); + RingBuffer old_reader = reader_; + + new (&reader_) + RingBuffer{memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)}; + + reader_.set_write_offset(count * sizeof(uint32_t)); + do { - if (!ExecutePacket(&reader)) { + if (!ExecutePacket()) { XELOGE("**** ExecutePacket: Failed to execute packet."); assert_always(); break; } - } while (reader.read_count()); -} - -bool CommandProcessor::ExecutePacket(RingBuffer* reader) { - // prefetch the wraparound range - // it likely is already in L3 cache, but in a zen system it may be another - // chiplets l3 - reader->BeginPrefetchedRead( - reader->read_count()); - const uint32_t packet = reader->ReadAndSwap(); - const uint32_t packet_type = packet >> 30; - if (XE_UNLIKELY(packet == 0 || packet == 0x0BADF00D)) { - trace_writer_.WritePacketStart(uint32_t(reader->read_ptr() - 4), 1); - trace_writer_.WritePacketEnd(); - return true; - } - - if (XE_LIKELY(packet != 0xCDCDCDCD)) { - actually_execute_packet: - switch (packet_type) { - case 0x00: - return ExecutePacketType0(reader, packet); - case 0x01: - return ExecutePacketType1(reader, packet); - case 0x02: - return ExecutePacketType2(reader, packet); - case 0x03: - return ExecutePacketType3(reader, packet); - default: - assert_unhandled_case(packet_type); - return false; - } - } else { - XELOGW("GPU packet is CDCDCDCD - probably read uninitialized memory!"); - goto actually_execute_packet; - } -} - -bool CommandProcessor::ExecutePacketType0(RingBuffer* reader, uint32_t packet) { - // Type-0 packet. - // Write count registers in sequence to the registers starting at - // (base_index << 2). 
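With reader_ now a class member, ExecutePrimaryBuffer and ExecutePacket above must save the previous RingBuffer and restore it after nested execution, which they do by hand with old_reader. The same invariant written as a scope guard (a hypothetical helper, not part of this diff):

#include <new>  // placement new

// Hypothetical RAII equivalent of the old_reader save/restore above: park
// the current reader_, point it at a new buffer for this scope, then put
// the saved one back even on early return.
class ScopedReaderSwap {
  xe::RingBuffer* slot_;
  xe::RingBuffer saved_;

 public:
  ScopedReaderSwap(xe::RingBuffer* slot, uint8_t* data, size_t capacity)
      : slot_(slot), saved_(*slot) {
    new (slot) xe::RingBuffer(data, capacity);
  }
  ~ScopedReaderSwap() { *slot_ = saved_; }
};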
- - uint32_t count = ((packet >> 16) & 0x3FFF) + 1; - if (reader->read_count() < count * sizeof(uint32_t)) { - XELOGE( - "ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})", - reader->read_count(), count * sizeof(uint32_t)); - return false; - } - - trace_writer_.WritePacketStart(uint32_t(reader->read_ptr() - 4), 1 + count); - - uint32_t base_index = (packet & 0x7FFF); - uint32_t write_one_reg = (packet >> 15) & 0x1; - - if (!write_one_reg) { - if (count == 1) { - WriteRegister(base_index, reader->ReadAndSwap()); - } else { - WriteRegisterRangeFromRing(reader, base_index, count); - } - } else { - WriteOneRegisterFromRing(reader, base_index, count); - } - - trace_writer_.WritePacketEnd(); - return true; -} - -bool CommandProcessor::ExecutePacketType1(RingBuffer* reader, uint32_t packet) { - // Type-1 packet. - // Contains two registers of data. Type-0 should be more common. - trace_writer_.WritePacketStart(uint32_t(reader->read_ptr() - 4), 3); - uint32_t reg_index_1 = packet & 0x7FF; - uint32_t reg_index_2 = (packet >> 11) & 0x7FF; - uint32_t reg_data_1 = reader->ReadAndSwap(); - uint32_t reg_data_2 = reader->ReadAndSwap(); - WriteRegister(reg_index_1, reg_data_1); - WriteRegister(reg_index_2, reg_data_2); - trace_writer_.WritePacketEnd(); - return true; -} - -bool CommandProcessor::ExecutePacketType2(RingBuffer* reader, uint32_t packet) { - // Type-2 packet. - // No-op. Do nothing. - trace_writer_.WritePacketStart(uint32_t(reader->read_ptr() - 4), 1); - trace_writer_.WritePacketEnd(); - return true; -} - -bool CommandProcessor::ExecutePacketType3(RingBuffer* reader, uint32_t packet) { - // Type-3 packet. - uint32_t opcode = (packet >> 8) & 0x7F; - uint32_t count = ((packet >> 16) & 0x3FFF) + 1; - auto data_start_offset = reader->read_offset(); - - XE_UNLIKELY_IF(reader->read_count() < count * sizeof(uint32_t)) { - XELOGE( - "ExecutePacketType3 overflow (read count {:08X}, packet count {:08X})", - reader->read_count(), count * sizeof(uint32_t)); - return false; - } - - // To handle nesting behavior when tracing we special case indirect buffers. - if (opcode == PM4_INDIRECT_BUFFER) { - trace_writer_.WritePacketStart(uint32_t(reader->read_ptr() - 4), 2); - } else { - trace_writer_.WritePacketStart(uint32_t(reader->read_ptr() - 4), 1 + count); - } - - // & 1 == predicate - when set, we do bin check to see if we should execute - // the packet. Only type 3 packets are affected. - // We also skip predicated swaps, as they are never valid (probably?). 
- if (packet & 1) { - bool any_pass = (bin_select_ & bin_mask_) != 0; - if (!any_pass || opcode == PM4_XE_SWAP) { - reader->AdvanceRead(count * sizeof(uint32_t)); - trace_writer_.WritePacketEnd(); - return true; - } - } - - bool result = false; - switch (opcode) { - case PM4_ME_INIT: - result = ExecutePacketType3_ME_INIT(reader, packet, count); - break; - case PM4_NOP: - result = ExecutePacketType3_NOP(reader, packet, count); - break; - case PM4_INTERRUPT: - result = ExecutePacketType3_INTERRUPT(reader, packet, count); - break; - case PM4_XE_SWAP: - result = ExecutePacketType3_XE_SWAP(reader, packet, count); - break; - case PM4_INDIRECT_BUFFER: - case PM4_INDIRECT_BUFFER_PFD: - result = ExecutePacketType3_INDIRECT_BUFFER(reader, packet, count); - break; - case PM4_WAIT_REG_MEM: - result = ExecutePacketType3_WAIT_REG_MEM(reader, packet, count); - break; - case PM4_REG_RMW: - result = ExecutePacketType3_REG_RMW(reader, packet, count); - break; - case PM4_REG_TO_MEM: - result = ExecutePacketType3_REG_TO_MEM(reader, packet, count); - break; - case PM4_MEM_WRITE: - result = ExecutePacketType3_MEM_WRITE(reader, packet, count); - break; - case PM4_COND_WRITE: - result = ExecutePacketType3_COND_WRITE(reader, packet, count); - break; - case PM4_EVENT_WRITE: - result = ExecutePacketType3_EVENT_WRITE(reader, packet, count); - break; - case PM4_EVENT_WRITE_SHD: - result = ExecutePacketType3_EVENT_WRITE_SHD(reader, packet, count); - break; - case PM4_EVENT_WRITE_EXT: - result = ExecutePacketType3_EVENT_WRITE_EXT(reader, packet, count); - break; - case PM4_EVENT_WRITE_ZPD: - result = ExecutePacketType3_EVENT_WRITE_ZPD(reader, packet, count); - break; - case PM4_DRAW_INDX: - result = ExecutePacketType3_DRAW_INDX(reader, packet, count); - break; - case PM4_DRAW_INDX_2: - result = ExecutePacketType3_DRAW_INDX_2(reader, packet, count); - break; - case PM4_SET_CONSTANT: - result = ExecutePacketType3_SET_CONSTANT(reader, packet, count); - break; - case PM4_SET_CONSTANT2: - result = ExecutePacketType3_SET_CONSTANT2(reader, packet, count); - break; - case PM4_LOAD_ALU_CONSTANT: - result = ExecutePacketType3_LOAD_ALU_CONSTANT(reader, packet, count); - break; - case PM4_SET_SHADER_CONSTANTS: - result = ExecutePacketType3_SET_SHADER_CONSTANTS(reader, packet, count); - break; - case PM4_IM_LOAD: - result = ExecutePacketType3_IM_LOAD(reader, packet, count); - break; - case PM4_IM_LOAD_IMMEDIATE: - result = ExecutePacketType3_IM_LOAD_IMMEDIATE(reader, packet, count); - break; - case PM4_INVALIDATE_STATE: - result = ExecutePacketType3_INVALIDATE_STATE(reader, packet, count); - break; - case PM4_VIZ_QUERY: - result = ExecutePacketType3_VIZ_QUERY(reader, packet, count); - break; - - case PM4_SET_BIN_MASK_LO: { - uint32_t value = reader->ReadAndSwap(); - bin_mask_ = (bin_mask_ & 0xFFFFFFFF00000000ull) | value; - result = true; - } break; - case PM4_SET_BIN_MASK_HI: { - uint32_t value = reader->ReadAndSwap(); - bin_mask_ = - (bin_mask_ & 0xFFFFFFFFull) | (static_cast(value) << 32); - result = true; - } break; - case PM4_SET_BIN_SELECT_LO: { - uint32_t value = reader->ReadAndSwap(); - bin_select_ = (bin_select_ & 0xFFFFFFFF00000000ull) | value; - result = true; - } break; - case PM4_SET_BIN_SELECT_HI: { - uint32_t value = reader->ReadAndSwap(); - bin_select_ = - (bin_select_ & 0xFFFFFFFFull) | (static_cast(value) << 32); - result = true; - } break; - case PM4_SET_BIN_MASK: { - assert_true(count == 2); - uint64_t val_hi = reader->ReadAndSwap(); - uint64_t val_lo = reader->ReadAndSwap(); - bin_mask_ = (val_hi << 32) | 
val_lo; - result = true; - } break; - case PM4_SET_BIN_SELECT: { - assert_true(count == 2); - uint64_t val_hi = reader->ReadAndSwap(); - uint64_t val_lo = reader->ReadAndSwap(); - bin_select_ = (val_hi << 32) | val_lo; - result = true; - } break; - case PM4_CONTEXT_UPDATE: { - assert_true(count == 1); - uint32_t value = reader->ReadAndSwap(); - XELOGGPU("GPU context update = {:08X}", value); - assert_true(value == 0); - result = true; - break; - } - case PM4_WAIT_FOR_IDLE: { - // This opcode is used by 5454084E while going / being ingame. - assert_true(count == 1); - uint32_t value = reader->ReadAndSwap(); - XELOGGPU("GPU wait for idle = {:08X}", value); - result = true; - break; - } - - default: - XELOGGPU("Unimplemented GPU OPCODE: 0x{:02X}\t\tCOUNT: {}\n", opcode, - count); - assert_always(); - reader->AdvanceRead(count * sizeof(uint32_t)); - break; - } - - trace_writer_.WritePacketEnd(); -#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1 - - if (opcode == PM4_XE_SWAP) { - // End the trace writer frame. - if (trace_writer_.is_open()) { - trace_writer_.WriteEvent(EventCommand::Type::kSwap); - trace_writer_.Flush(); - if (trace_state_ == TraceState::kSingleFrame) { - trace_state_ = TraceState::kDisabled; - trace_writer_.Close(); - } - } else if (trace_state_ == TraceState::kSingleFrame) { - // New trace request - we only start tracing at the beginning of a frame. - uint32_t title_id = kernel_state_->GetExecutableModule()->title_id(); - auto file_name = fmt::format("{:08X}_{}.xtr", title_id, counter_ - 1); - auto path = trace_frame_path_ / file_name; - trace_writer_.Open(path, title_id); - InitializeTrace(); - } - } -#endif - - assert_true(reader->read_offset() == - (data_start_offset + (count * sizeof(uint32_t))) % - reader->capacity()); - return result; -} - -bool CommandProcessor::ExecutePacketType3_ME_INIT(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // initialize CP's micro-engine - me_bin_.clear(); - for (uint32_t i = 0; i < count; i++) { - me_bin_.push_back(reader->ReadAndSwap()); - } - - return true; -} - -bool CommandProcessor::ExecutePacketType3_NOP(RingBuffer* reader, - uint32_t packet, uint32_t count) { - // skip N 32-bit words to get to the next packet - // No-op, ignore some data. - reader->AdvanceRead(count * sizeof(uint32_t)); - return true; -} - -bool CommandProcessor::ExecutePacketType3_INTERRUPT(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - SCOPE_profile_cpu_f("gpu"); - - // generate interrupt from the command stream - uint32_t cpu_mask = reader->ReadAndSwap(); - for (int n = 0; n < 6; n++) { - if (cpu_mask & (1 << n)) { - graphics_system_->DispatchInterruptCallback(1, n); - } - } - return true; -} - -bool CommandProcessor::ExecutePacketType3_XE_SWAP(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - SCOPE_profile_cpu_f("gpu"); - - XELOGD("XE_SWAP"); - - Profiler::Flip(); - - // Xenia-specific VdSwap hook. - // VdSwap will post this to tell us we need to swap the screen/fire an - // interrupt. - // 63 words here, but only the first has any data. - uint32_t magic = reader->ReadAndSwap(); - assert_true(magic == kSwapSignature); - - // TODO(benvanik): only swap frontbuffer ptr. 
- uint32_t frontbuffer_ptr = reader->ReadAndSwap(); - uint32_t frontbuffer_width = reader->ReadAndSwap(); - uint32_t frontbuffer_height = reader->ReadAndSwap(); - reader->AdvanceRead((count - 4) * sizeof(uint32_t)); - - IssueSwap(frontbuffer_ptr, frontbuffer_width, frontbuffer_height); - - ++counter_; - return true; -} - -bool CommandProcessor::ExecutePacketType3_INDIRECT_BUFFER(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // indirect buffer dispatch - uint32_t list_ptr = CpuToGpu(reader->ReadAndSwap()); - uint32_t list_length = reader->ReadAndSwap(); - assert_zero(list_length & ~0xFFFFF); - list_length &= 0xFFFFF; - ExecuteIndirectBuffer(GpuToCpu(list_ptr), list_length); - return true; -} - -bool CommandProcessor::ExecutePacketType3_WAIT_REG_MEM(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - SCOPE_profile_cpu_f("gpu"); - - // wait until a register or memory location is a specific value - uint32_t wait_info = reader->ReadAndSwap(); - uint32_t poll_reg_addr = reader->ReadAndSwap(); - uint32_t ref = reader->ReadAndSwap(); - uint32_t mask = reader->ReadAndSwap(); - uint32_t wait = reader->ReadAndSwap(); - bool matched = false; - do { - uint32_t value; - if (wait_info & 0x10) { - // Memory. - auto endianness = static_cast(poll_reg_addr & 0x3); - poll_reg_addr &= ~0x3; - value = xe::load(memory_->TranslatePhysical(poll_reg_addr)); - value = GpuSwap(value, endianness); - trace_writer_.WriteMemoryRead(CpuToGpu(poll_reg_addr), 4); - } else { - // Register. - assert_true(poll_reg_addr < RegisterFile::kRegisterCount); - value = register_file_->values[poll_reg_addr].u32; - if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) { - MakeCoherent(); - value = register_file_->values[poll_reg_addr].u32; - } - } - switch (wait_info & 0x7) { - case 0x0: // Never. - matched = false; - break; - case 0x1: // Less than reference. - matched = (value & mask) < ref; - break; - case 0x2: // Less than or equal to reference. - matched = (value & mask) <= ref; - break; - case 0x3: // Equal to reference. - matched = (value & mask) == ref; - break; - case 0x4: // Not equal to reference. - matched = (value & mask) != ref; - break; - case 0x5: // Greater than or equal to reference. - matched = (value & mask) >= ref; - break; - case 0x6: // Greater than reference. - matched = (value & mask) > ref; - break; - case 0x7: // Always - matched = true; - break; - } - if (!matched) { - // Wait. - if (wait >= 0x100) { - PrepareForWait(); - if (!cvars::vsync) { - // User wants it fast and dangerous. - xe::threading::MaybeYield(); - } else { - xe::threading::Sleep(std::chrono::milliseconds(wait / 0x100)); - } - xe::threading::SyncMemory(); - ReturnFromWait(); - - if (!worker_running_) { - // Short-circuited exit. - return false; - } - } else { - xe::threading::MaybeYield(); - } - } - } while (!matched); - - return true; -} - -bool CommandProcessor::ExecutePacketType3_REG_RMW(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // register read/modify/write - // ? 
(used during shader upload and edram setup) - uint32_t rmw_info = reader->ReadAndSwap(); - uint32_t and_mask = reader->ReadAndSwap(); - uint32_t or_mask = reader->ReadAndSwap(); - uint32_t value = register_file_->values[rmw_info & 0x1FFF].u32; - if ((rmw_info >> 31) & 0x1) { - // & reg - value &= register_file_->values[and_mask & 0x1FFF].u32; - } else { - // & imm - value &= and_mask; - } - if ((rmw_info >> 30) & 0x1) { - // | reg - value |= register_file_->values[or_mask & 0x1FFF].u32; - } else { - // | imm - value |= or_mask; - } - WriteRegister(rmw_info & 0x1FFF, value); - return true; -} - -bool CommandProcessor::ExecutePacketType3_REG_TO_MEM(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // Copy Register to Memory (?) - // Count is 2, assuming a Register Addr and a Memory Addr. - - uint32_t reg_addr = reader->ReadAndSwap(); - uint32_t mem_addr = reader->ReadAndSwap(); - - uint32_t reg_val; - - assert_true(reg_addr < RegisterFile::kRegisterCount); - reg_val = register_file_->values[reg_addr].u32; - - auto endianness = static_cast(mem_addr & 0x3); - mem_addr &= ~0x3; - reg_val = GpuSwap(reg_val, endianness); - xe::store(memory_->TranslatePhysical(mem_addr), reg_val); - trace_writer_.WriteMemoryWrite(CpuToGpu(mem_addr), 4); - - return true; -} - -bool CommandProcessor::ExecutePacketType3_MEM_WRITE(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - uint32_t write_addr = reader->ReadAndSwap(); - for (uint32_t i = 0; i < count - 1; i++) { - uint32_t write_data = reader->ReadAndSwap(); - - auto endianness = static_cast(write_addr & 0x3); - auto addr = write_addr & ~0x3; - write_data = GpuSwap(write_data, endianness); - xe::store(memory_->TranslatePhysical(addr), write_data); - trace_writer_.WriteMemoryWrite(CpuToGpu(addr), 4); - write_addr += 4; - } - - return true; -} - -bool CommandProcessor::ExecutePacketType3_COND_WRITE(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // conditional write to memory or register - uint32_t wait_info = reader->ReadAndSwap(); - uint32_t poll_reg_addr = reader->ReadAndSwap(); - uint32_t ref = reader->ReadAndSwap(); - uint32_t mask = reader->ReadAndSwap(); - uint32_t write_reg_addr = reader->ReadAndSwap(); - uint32_t write_data = reader->ReadAndSwap(); - uint32_t value; - if (wait_info & 0x10) { - // Memory. - auto endianness = static_cast(poll_reg_addr & 0x3); - poll_reg_addr &= ~0x3; - trace_writer_.WriteMemoryRead(CpuToGpu(poll_reg_addr), 4); - value = xe::load(memory_->TranslatePhysical(poll_reg_addr)); - value = GpuSwap(value, endianness); - } else { - // Register. - assert_true(poll_reg_addr < RegisterFile::kRegisterCount); - value = register_file_->values[poll_reg_addr].u32; - } - bool matched = false; - switch (wait_info & 0x7) { - case 0x0: // Never. - matched = false; - break; - case 0x1: // Less than reference. - matched = (value & mask) < ref; - break; - case 0x2: // Less than or equal to reference. - matched = (value & mask) <= ref; - break; - case 0x3: // Equal to reference. - matched = (value & mask) == ref; - break; - case 0x4: // Not equal to reference. - matched = (value & mask) != ref; - break; - case 0x5: // Greater than or equal to reference. - matched = (value & mask) >= ref; - break; - case 0x6: // Greater than reference. - matched = (value & mask) > ref; - break; - case 0x7: // Always - matched = true; - break; - } - if (matched) { - // Write. - if (wait_info & 0x100) { - // Memory. 
- auto endianness = static_cast(write_reg_addr & 0x3); - write_reg_addr &= ~0x3; - write_data = GpuSwap(write_data, endianness); - xe::store(memory_->TranslatePhysical(write_reg_addr), write_data); - trace_writer_.WriteMemoryWrite(CpuToGpu(write_reg_addr), 4); - } else { - // Register. - WriteRegister(write_reg_addr, write_data); - } - } - return true; -} - -bool CommandProcessor::ExecutePacketType3_EVENT_WRITE(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // generate an event that creates a write to memory when completed - uint32_t initiator = reader->ReadAndSwap(); - // Writeback initiator. - WriteRegister(XE_GPU_REG_VGT_EVENT_INITIATOR, initiator & 0x3F); - if (count == 1) { - // Just an event flag? Where does this write? - } else { - // Write to an address. - assert_always(); - reader->AdvanceRead((count - 1) * sizeof(uint32_t)); - } - return true; -} - -bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_SHD(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // generate a VS|PS_done event - uint32_t initiator = reader->ReadAndSwap(); - uint32_t address = reader->ReadAndSwap(); - uint32_t value = reader->ReadAndSwap(); - // Writeback initiator. - WriteRegister(XE_GPU_REG_VGT_EVENT_INITIATOR, initiator & 0x3F); - uint32_t data_value; - if ((initiator >> 31) & 0x1) { - // Write counter (GPU vblank counter?). - data_value = counter_; - } else { - // Write value. - data_value = value; - } - auto endianness = static_cast(address & 0x3); - address &= ~0x3; - data_value = GpuSwap(data_value, endianness); - xe::store(memory_->TranslatePhysical(address), data_value); - trace_writer_.WriteMemoryWrite(CpuToGpu(address), 4); - return true; -} - -bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_EXT(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // generate a screen extent event - uint32_t initiator = reader->ReadAndSwap(); - uint32_t address = reader->ReadAndSwap(); - // Writeback initiator. - WriteRegister(XE_GPU_REG_VGT_EVENT_INITIATOR, initiator & 0x3F); - auto endianness = static_cast(address & 0x3); - address &= ~0x3; - - // Let us hope we can fake this. - // This callback tells the driver the xy coordinates affected by a previous - // drawcall. - // https://www.google.com/patents/US20060055701 - uint16_t extents[] = { - 0 >> 3, // min x - xenos::kTexture2DCubeMaxWidthHeight >> 3, // max x - 0 >> 3, // min y - xenos::kTexture2DCubeMaxWidthHeight >> 3, // max y - 0, // min z - 1, // max z - }; - assert_true(endianness == xenos::Endian::k8in16); - xe::copy_and_swap_16_unaligned(memory_->TranslatePhysical(address), extents, - xe::countof(extents)); - trace_writer_.WriteMemoryWrite(CpuToGpu(address), sizeof(extents)); - return true; -} - -bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // Set by D3D as BE but struct ABI is LE - const uint32_t kQueryFinished = xe::byte_swap(0xFFFFFEED); - assert_true(count == 1); - uint32_t initiator = reader->ReadAndSwap(); - // Writeback initiator. - WriteRegister(XE_GPU_REG_VGT_EVENT_INITIATOR, initiator & 0x3F); - - // Occlusion queries: - // This command is send on query begin and end. - // As a workaround report some fixed amount of passed samples. 
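The WAIT_REG_MEM and COND_WRITE handlers above duplicate the same switch over wait_info & 0x7. Wherever they land in pm4_command_processor_implement.h, the encoding itself is fixed, so the comparison can be stated once; a sketch of a shared helper (not code from the diff):

// Shared evaluation of the PM4 compare function used by both WAIT_REG_MEM
// and COND_WRITE. 'func' is wait_info & 0x7.
static bool EvaluatePm4Compare(uint32_t func, uint32_t value, uint32_t mask,
                               uint32_t ref) {
  value &= mask;
  switch (func) {
    case 0x0: return false;         // never
    case 0x1: return value < ref;   // less than reference
    case 0x2: return value <= ref;  // less than or equal
    case 0x3: return value == ref;  // equal
    case 0x4: return value != ref;  // not equal
    case 0x5: return value >= ref;  // greater than or equal
    case 0x6: return value > ref;   // greater than
    default:  return true;          // 0x7: always
  }
}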
- auto fake_sample_count = cvars::query_occlusion_fake_sample_count; - if (fake_sample_count >= 0) { - auto* pSampleCounts = - memory_->TranslatePhysical( - register_file_->values[XE_GPU_REG_RB_SAMPLE_COUNT_ADDR].u32); - // 0xFFFFFEED is written to this two locations by D3D only on D3DISSUE_END - // and used to detect a finished query. - bool is_end_via_z_pass = pSampleCounts->ZPass_A == kQueryFinished && - pSampleCounts->ZPass_B == kQueryFinished; - // Older versions of D3D also checks for ZFail (4D5307D5). - bool is_end_via_z_fail = pSampleCounts->ZFail_A == kQueryFinished && - pSampleCounts->ZFail_B == kQueryFinished; - std::memset(pSampleCounts, 0, sizeof(xe_gpu_depth_sample_counts)); - if (is_end_via_z_pass || is_end_via_z_fail) { - pSampleCounts->ZPass_A = fake_sample_count; - pSampleCounts->Total_A = fake_sample_count; - } - } - - return true; -} - -bool CommandProcessor::ExecutePacketType3Draw(RingBuffer* reader, - uint32_t packet, - const char* opcode_name, - uint32_t viz_query_condition, - uint32_t count_remaining) { - // if viz_query_condition != 0, this is a conditional draw based on viz query. - // This ID matches the one issued in PM4_VIZ_QUERY - // uint32_t viz_id = viz_query_condition & 0x3F; - // when true, render conditionally based on query result - // uint32_t viz_use = viz_query_condition & 0x100; - - assert_not_zero(count_remaining); - if (!count_remaining) { - XELOGE("{}: Packet too small, can't read VGT_DRAW_INITIATOR", opcode_name); - return false; - } - reg::VGT_DRAW_INITIATOR vgt_draw_initiator; - vgt_draw_initiator.value = reader->ReadAndSwap(); - --count_remaining; - WriteRegister(XE_GPU_REG_VGT_DRAW_INITIATOR, vgt_draw_initiator.value); - - bool draw_succeeded = true; - // TODO(Triang3l): Remove IndexBufferInfo and replace handling of all this - // with PrimitiveProcessor when the old Vulkan renderer is removed. - bool is_indexed = false; - IndexBufferInfo index_buffer_info; - switch (vgt_draw_initiator.source_select) { - case xenos::SourceSelect::kDMA: { - // Indexed draw. - is_indexed = true; - - // Two separate bounds checks so if there's only one missing register - // value out of two, one uint32_t will be skipped in the command buffer, - // not two. - assert_not_zero(count_remaining); - if (!count_remaining) { - XELOGE("{}: Packet too small, can't read VGT_DMA_BASE", opcode_name); - return false; - } - uint32_t vgt_dma_base = reader->ReadAndSwap(); - --count_remaining; - WriteRegister(XE_GPU_REG_VGT_DMA_BASE, vgt_dma_base); - reg::VGT_DMA_SIZE vgt_dma_size; - assert_not_zero(count_remaining); - if (!count_remaining) { - XELOGE("{}: Packet too small, can't read VGT_DMA_SIZE", opcode_name); - return false; - } - vgt_dma_size.value = reader->ReadAndSwap(); - --count_remaining; - WriteRegister(XE_GPU_REG_VGT_DMA_SIZE, vgt_dma_size.value); - - uint32_t index_size_bytes = - vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16 - ? sizeof(uint16_t) - : sizeof(uint32_t); - // The base address must already be word-aligned according to the R6xx - // documentation, but for safety. - index_buffer_info.guest_base = vgt_dma_base & ~(index_size_bytes - 1); - index_buffer_info.endianness = vgt_dma_size.swap_mode; - index_buffer_info.format = vgt_draw_initiator.index_size; - index_buffer_info.length = vgt_dma_size.num_words * index_size_bytes; - index_buffer_info.count = vgt_draw_initiator.num_indices; - } break; - case xenos::SourceSelect::kImmediate: { - // TODO(Triang3l): VGT_IMMED_DATA. 
- XELOGE( - "{}: Using immediate vertex indices, which are not supported yet. " - "Report the game to Xenia developers!", - opcode_name, uint32_t(vgt_draw_initiator.source_select)); - draw_succeeded = false; - assert_always(); - } break; - case xenos::SourceSelect::kAutoIndex: { - // Auto draw. - index_buffer_info.guest_base = 0; - index_buffer_info.length = 0; - } break; - default: { - // Invalid source selection. - draw_succeeded = false; - assert_unhandled_case(vgt_draw_initiator.source_select); - } break; - } - - // Skip to the next command, for example, if there are immediate indexes that - // we don't support yet. - reader->AdvanceRead(count_remaining * sizeof(uint32_t)); - - if (draw_succeeded) { - auto viz_query = register_file_->Get(); - if (!(viz_query.viz_query_ena && viz_query.kill_pix_post_hi_z)) { - // TODO(Triang3l): Don't drop the draw call completely if the vertex - // shader has memexport. - // TODO(Triang3l || JoelLinn): Handle this properly in the render - // backends. - draw_succeeded = IssueDraw( - vgt_draw_initiator.prim_type, vgt_draw_initiator.num_indices, - is_indexed ? &index_buffer_info : nullptr, - xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode, - vgt_draw_initiator.prim_type)); - if (!draw_succeeded) { - XELOGE("{}({}, {}, {}): Failed in backend", opcode_name, - vgt_draw_initiator.num_indices, - uint32_t(vgt_draw_initiator.prim_type), - uint32_t(vgt_draw_initiator.source_select)); - } - } - } - - // If read the packed correctly, but merely couldn't execute it (because of, - // for instance, features not supported by the host), don't terminate command - // buffer processing as that would leave rendering in a way more inconsistent - // state than just a single dropped draw command. - return true; -} - -bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // "initiate fetch of index buffer and draw" - // Generally used by Xbox 360 Direct3D 9 for kDMA and kAutoIndex sources. - // With a viz query token as the first one. - uint32_t count_remaining = count; - assert_not_zero(count_remaining); - if (!count_remaining) { - XELOGE("PM4_DRAW_INDX: Packet too small, can't read the viz query token"); - return false; - } - uint32_t viz_query_condition = reader->ReadAndSwap(); - --count_remaining; - return ExecutePacketType3Draw(reader, packet, "PM4_DRAW_INDX", - viz_query_condition, count_remaining); -} - -bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // "draw using supplied indices in packet" - // Generally used by Xbox 360 Direct3D 9 for kAutoIndex source. - // No viz query token. 
- return ExecutePacketType3Draw(reader, packet, "PM4_DRAW_INDX_2", 0, count); -} - -bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // load constant into chip and to memory - // PM4_REG(reg) ((0x4 << 16) | (GSL_HAL_SUBBLOCK_OFFSET(reg))) - // reg - 0x2000 - uint32_t offset_type = reader->ReadAndSwap(); - uint32_t index = offset_type & 0x7FF; - uint32_t type = (offset_type >> 16) & 0xFF; - switch (type) { - case 0: // ALU - index += 0x4000; - break; - case 1: // FETCH - index += 0x4800; - break; - case 2: // BOOL - index += 0x4900; - break; - case 3: // LOOP - index += 0x4908; - break; - case 4: // REGISTERS - index += 0x2000; - break; - default: - assert_always(); - reader->AdvanceRead((count - 1) * sizeof(uint32_t)); - return true; - } - uint32_t countm1 = count - 1; - - if (countm1 != 1) { - WriteRegisterRangeFromRing(reader, index, countm1); - } else { - WriteRegister(index, reader->ReadAndSwap()); - } - return true; -} - -bool CommandProcessor::ExecutePacketType3_SET_CONSTANT2(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - uint32_t offset_type = reader->ReadAndSwap(); - uint32_t index = offset_type & 0xFFFF; - uint32_t countm1 = count - 1; - - if (countm1 != 1) { - WriteRegisterRangeFromRing(reader, index, countm1); - } else { - WriteRegister(index, reader->ReadAndSwap()); - } - return true; -} - -bool CommandProcessor::ExecutePacketType3_LOAD_ALU_CONSTANT(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // load constants from memory - uint32_t address = reader->ReadAndSwap(); - address &= 0x3FFFFFFF; - uint32_t offset_type = reader->ReadAndSwap(); - uint32_t index = offset_type & 0x7FF; - uint32_t size_dwords = reader->ReadAndSwap(); - size_dwords &= 0xFFF; - uint32_t type = (offset_type >> 16) & 0xFF; - switch (type) { - case 0: // ALU - index += 0x4000; - break; - case 1: // FETCH - index += 0x4800; - break; - case 2: // BOOL - index += 0x4900; - break; - case 3: // LOOP - index += 0x4908; - break; - case 4: // REGISTERS - index += 0x2000; - break; - default: - assert_always(); - return true; - } - - trace_writer_.WriteMemoryRead(CpuToGpu(address), size_dwords * 4); - - WriteRegistersFromMem(index, (uint32_t*)memory_->TranslatePhysical(address), - size_dwords); - - return true; -} - -bool CommandProcessor::ExecutePacketType3_SET_SHADER_CONSTANTS( - RingBuffer* reader, uint32_t packet, uint32_t count) { - uint32_t offset_type = reader->ReadAndSwap(); - uint32_t index = offset_type & 0xFFFF; - uint32_t countm1 = count - 1; - if (countm1 != 1) { - WriteRegisterRangeFromRing(reader, index, countm1); - } else { - WriteRegister(index, reader->ReadAndSwap()); - } - - return true; -} - -bool CommandProcessor::ExecutePacketType3_IM_LOAD(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - SCOPE_profile_cpu_f("gpu"); - - // load sequencer instruction memory (pointer-based) - uint32_t addr_type = reader->ReadAndSwap(); - auto shader_type = static_cast(addr_type & 0x3); - uint32_t addr = addr_type & ~0x3; - uint32_t start_size = reader->ReadAndSwap(); - uint32_t start = start_size >> 16; - uint32_t size_dwords = start_size & 0xFFFF; // dwords - assert_true(start == 0); - trace_writer_.WriteMemoryRead(CpuToGpu(addr), size_dwords * 4); - auto shader = - LoadShader(shader_type, addr, memory_->TranslatePhysical(addr), - size_dwords); - switch (shader_type) { - case xenos::ShaderType::kVertex: - active_vertex_shader_ = shader; - break; - case xenos::ShaderType::kPixel: - active_pixel_shader_ = 
shader; - break; - default: - assert_unhandled_case(shader_type); - return false; - } - return true; -} - -bool CommandProcessor::ExecutePacketType3_IM_LOAD_IMMEDIATE(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - SCOPE_profile_cpu_f("gpu"); - - // load sequencer instruction memory (code embedded in packet) - uint32_t dword0 = reader->ReadAndSwap(); - uint32_t dword1 = reader->ReadAndSwap(); - auto shader_type = static_cast(dword0); - uint32_t start_size = dword1; - uint32_t start = start_size >> 16; - uint32_t size_dwords = start_size & 0xFFFF; // dwords - assert_true(start == 0); - assert_true(reader->read_count() >= size_dwords * 4); - assert_true(count - 2 >= size_dwords); - auto shader = - LoadShader(shader_type, uint32_t(reader->read_ptr()), - reinterpret_cast(reader->read_ptr()), size_dwords); - switch (shader_type) { - case xenos::ShaderType::kVertex: - active_vertex_shader_ = shader; - break; - case xenos::ShaderType::kPixel: - active_pixel_shader_ = shader; - break; - default: - assert_unhandled_case(shader_type); - return false; - } - reader->AdvanceRead(size_dwords * sizeof(uint32_t)); - return true; -} - -bool CommandProcessor::ExecutePacketType3_INVALIDATE_STATE(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // selective invalidation of state pointers - /*uint32_t mask =*/reader->ReadAndSwap(); - // driver_->InvalidateState(mask); - return true; -} - -bool CommandProcessor::ExecutePacketType3_VIZ_QUERY(RingBuffer* reader, - uint32_t packet, - uint32_t count) { - // begin/end initiator for viz query extent processing - // https://www.google.com/patents/US20050195186 - assert_true(count == 1); - - uint32_t dword0 = reader->ReadAndSwap(); - - uint32_t id = dword0 & 0x3F; - uint32_t end = dword0 & 0x100; - if (!end) { - // begin a new viz query @ id - // On hardware this clears the internal state of the scan converter (which - // is different to the register) - WriteRegister(XE_GPU_REG_VGT_EVENT_INITIATOR, VIZQUERY_START); - XELOGGPU("Begin viz query ID {:02X}", id); - } else { - // end the viz query - WriteRegister(XE_GPU_REG_VGT_EVENT_INITIATOR, VIZQUERY_END); - XELOGGPU("End viz query ID {:02X}", id); - // The scan converter writes the internal result back to the register here. - // We just fake it and say it was visible in case it is read back. 
- if (id < 32) { - register_file_->values[XE_GPU_REG_PA_SC_VIZ_QUERY_STATUS_0].u32 |= - uint32_t(1) << id; - } else { - register_file_->values[XE_GPU_REG_PA_SC_VIZ_QUERY_STATUS_1].u32 |= - uint32_t(1) << (id - 32); - } - } - - return true; + } while (reader_.read_count()); + reader_ = old_reader; } void CommandProcessor::InitializeTrace() { @@ -1696,6 +712,7 @@ void CommandProcessor::InitializeTrace() { trace_writer_.WriteGammaRamp(gamma_ramp_256_entry_table(), gamma_ramp_pwl_rgb(), gamma_ramp_rw_component_); } - +#define COMMAND_PROCESSOR CommandProcessor +#include "pm4_command_processor_implement.h" } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h index cde2f4fdb..72c3258f1 100644 --- a/src/xenia/gpu/command_processor.h +++ b/src/xenia/gpu/command_processor.h @@ -19,6 +19,7 @@ #include #include +#include "xenia/base/dma.h" #include "xenia/base/ring_buffer.h" #include "xenia/base/threading.h" #include "xenia/gpu/register_file.h" @@ -66,6 +67,11 @@ enum class GammaRampType { }; class CommandProcessor { + protected: + RingBuffer + reader_; // chrispy: instead of having ringbuffer on stack, have it near + // the start of the class so we can access it via rel8. This + // also reduces the number of params we need to pass public: enum class SwapPostEffect { kNone, @@ -76,7 +82,7 @@ class CommandProcessor { CommandProcessor(GraphicsSystem* graphics_system, kernel::KernelState* kernel_state); virtual ~CommandProcessor(); - + dma::XeDMAC* GetDMAC() const { return dmac_; } uint32_t counter() const { return counter_; } void increment_counter() { counter_++; } @@ -101,7 +107,7 @@ class CommandProcessor { // screen right in the beginning of 4D530AA4 is not a resolved render target, // for instance). 
virtual void IssueSwap(uint32_t frontbuffer_ptr, uint32_t frontbuffer_width, - uint32_t frontbuffer_height) = 0; + uint32_t frontbuffer_height) {} // May be called not only from the command processor thread when the command // processor is paused, and the termination of this function may be explicitly @@ -153,7 +159,7 @@ class CommandProcessor { // rarely needed, most register writes have no special logic here XE_NOINLINE void HandleSpecialRegisterWrite(uint32_t index, uint32_t value); - XE_FORCEINLINE + virtual void WriteRegister(uint32_t index, uint32_t value); // mem has big-endian register values @@ -165,12 +171,53 @@ class CommandProcessor { virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base, uint32_t num_registers); - XE_FORCEINLINE + XE_NOINLINE virtual void WriteOneRegisterFromRing( - xe::RingBuffer* ring, uint32_t base, + uint32_t base, uint32_t num_times); // repeatedly write a value to one register, presumably a // register with special handling for writes + + XE_FORCEINLINE + void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base, + uint32_t num_times); + + XE_FORCEINLINE + void WriteFetchRangeFromRing(xe::RingBuffer* ring, uint32_t base, + uint32_t num_times); + + XE_FORCEINLINE + void WriteBoolRangeFromRing(xe::RingBuffer* ring, uint32_t base, + uint32_t num_times); + + XE_FORCEINLINE + void WriteLoopRangeFromRing(xe::RingBuffer* ring, uint32_t base, + uint32_t num_times); + + XE_FORCEINLINE + void WriteREGISTERSRangeFromRing(xe::RingBuffer* ring, uint32_t base, + uint32_t num_times); + + XE_FORCEINLINE + void WriteALURangeFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); + + XE_FORCEINLINE + void WriteFetchRangeFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); + + XE_FORCEINLINE + void WriteBoolRangeFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); + + XE_FORCEINLINE + void WriteLoopRangeFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); + + XE_FORCEINLINE + void WriteREGISTERSRangeFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); + const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const { return gamma_ramp_256_entry_table_; } @@ -186,75 +233,22 @@ class CommandProcessor { uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index); virtual void OnPrimaryBufferEnd() {} - void ExecuteIndirectBuffer(uint32_t ptr, uint32_t length); - bool ExecutePacket(RingBuffer* reader); - bool ExecutePacketType0(RingBuffer* reader, uint32_t packet); - bool ExecutePacketType1(RingBuffer* reader, uint32_t packet); - bool ExecutePacketType2(RingBuffer* reader, uint32_t packet); - bool ExecutePacketType3(RingBuffer* reader, uint32_t packet); - bool ExecutePacketType3_ME_INIT(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_NOP(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_INTERRUPT(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_XE_SWAP(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_INDIRECT_BUFFER(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_WAIT_REG_MEM(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_REG_RMW(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_REG_TO_MEM(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_MEM_WRITE(RingBuffer* reader, uint32_t packet, 
- uint32_t count); - bool ExecutePacketType3_COND_WRITE(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_EVENT_WRITE(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_EVENT_WRITE_SHD(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_EVENT_WRITE_EXT(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3Draw(RingBuffer* reader, uint32_t packet, - const char* opcode_name, - uint32_t viz_query_condition, - uint32_t count_remaining); - bool ExecutePacketType3_DRAW_INDX(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_DRAW_INDX_2(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_SET_CONSTANT(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_SET_CONSTANT2(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_LOAD_ALU_CONSTANT(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_SET_SHADER_CONSTANTS(RingBuffer* reader, - uint32_t packet, uint32_t count); - bool ExecutePacketType3_IM_LOAD(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_IM_LOAD_IMMEDIATE(RingBuffer* reader, - uint32_t packet, uint32_t count); - bool ExecutePacketType3_INVALIDATE_STATE(RingBuffer* reader, uint32_t packet, - uint32_t count); - bool ExecutePacketType3_VIZ_QUERY(RingBuffer* reader, uint32_t packet, - uint32_t count); +#include "pm4_command_processor_declare.h" virtual Shader* LoadShader(xenos::ShaderType shader_type, uint32_t guest_address, const uint32_t* host_address, - uint32_t dword_count) = 0; + uint32_t dword_count) { + return nullptr; + } virtual bool IssueDraw(xenos::PrimitiveType prim_type, uint32_t index_count, IndexBufferInfo* index_buffer_info, - bool major_mode_explicit) = 0; - virtual bool IssueCopy() = 0; + bool major_mode_explicit) { + return false; + } + virtual bool IssueCopy() { return false; } // "Actual" is for the command processor thread, to be read by the // implementations. 
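All of the handler bodies deleted above move into pm4_command_processor_implement.h, included at the bottom of command_processor.cc with COMMAND_PROCESSOR defined first, while the matching declarations come from pm4_command_processor_declare.h here in the header. Neither header appears in this diff, but the pattern presumably expands along these lines (illustrative sketch):

// Each including translation unit does:
//   #define COMMAND_PROCESSOR D3D12CommandProcessor
//   #include "pm4_command_processor_implement.h"
// so the same source text compiles as member functions of each concrete
// class. With reader_ a member, no RingBuffer* is threaded through calls,
// and references to WriteRegister etc. can bind to the including class's
// own overrides without virtual dispatch.
bool COMMAND_PROCESSOR::ExecutePacketType2(uint32_t packet) {
  // Type-2 packet: a no-op, exactly as in the removed generic version.
  trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1);
  trace_writer_.WritePacketEnd();
  return true;
}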
@@ -267,7 +261,7 @@ class CommandProcessor {
   Memory* memory_ = nullptr;
   kernel::KernelState* kernel_state_ = nullptr;
   GraphicsSystem* graphics_system_ = nullptr;
-  RegisterFile* register_file_ = nullptr;
+  RegisterFile* XE_RESTRICT register_file_ = nullptr;
   TraceWriter trace_writer_;
   enum class TraceState {
@@ -316,6 +310,7 @@ class CommandProcessor {
   reg::DC_LUT_30_COLOR gamma_ramp_256_entry_table_[256] = {};
   reg::DC_LUT_PWL_DATA gamma_ramp_pwl_rgb_[128][3] = {};
   uint32_t gamma_ramp_rw_component_ = 0;
+  dma::XeDMAC* dmac_ = nullptr;
 };
 }  // namespace gpu
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index 03cccc07c..4e7ee919c 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -705,13 +705,34 @@ void D3D12CommandProcessor::SetExternalGraphicsRootSignature(
 }
 void D3D12CommandProcessor::SetViewport(const D3D12_VIEWPORT& viewport) {
+#if XE_ARCH_AMD64 == 1
+  __m128 zero_register = _mm_setzero_ps();
+  __m128 ff_viewport_low4 = _mm_loadu_ps(&ff_viewport_.TopLeftX);
+  __m128 ff_viewport_high2 =
+      _mm_loadl_pi(zero_register, (const __m64*)&ff_viewport_.MinDepth);
+
+  __m128 viewport_low4 = _mm_loadu_ps(&viewport.TopLeftX);
+  __m128 viewport_high2 =
+      _mm_loadl_pi(zero_register, (const __m64*)&viewport.MinDepth);
+
+  __m128 first_four_cmp = _mm_cmpeq_ps(ff_viewport_low4, viewport_low4);
+  __m128 last_two_cmp = _mm_cmpeq_ps(ff_viewport_high2, viewport_high2);
+
+  __m128 combined_condition = _mm_and_ps(first_four_cmp, last_two_cmp);
+
+  int movmask = _mm_movemask_ps(combined_condition);
+
+  XE_UNLIKELY_IF(ff_viewport_update_needed_ || movmask != 0b1111)
+#else
   ff_viewport_update_needed_ |= ff_viewport_.TopLeftX != viewport.TopLeftX;
   ff_viewport_update_needed_ |= ff_viewport_.TopLeftY != viewport.TopLeftY;
   ff_viewport_update_needed_ |= ff_viewport_.Width != viewport.Width;
   ff_viewport_update_needed_ |= ff_viewport_.Height != viewport.Height;
   ff_viewport_update_needed_ |= ff_viewport_.MinDepth != viewport.MinDepth;
   ff_viewport_update_needed_ |= ff_viewport_.MaxDepth != viewport.MaxDepth;
-  if (XE_UNLIKELY(ff_viewport_update_needed_)) {
+  if (XE_UNLIKELY(ff_viewport_update_needed_))
+#endif
+  {
     ff_viewport_ = viewport;
     deferred_command_list_.RSSetViewport(ff_viewport_);
     ff_viewport_update_needed_ = false;
@@ -719,11 +740,23 @@ void D3D12CommandProcessor::SetViewport(const D3D12_VIEWPORT& viewport) {
 }
 void D3D12CommandProcessor::SetScissorRect(const D3D12_RECT& scissor_rect) {
+#if XE_ARCH_AMD64 == 1
+  // vtune suggested that this and SetViewport be vectorized; both showed a
+  // high retiring figure
+  __m128i scissor_m128 = _mm_loadu_si128((const __m128i*)&scissor_rect);
+  __m128i ff_scissor_m128 = _mm_loadu_si128((const __m128i*)&ff_scissor_);
+  __m128i comparison_result = _mm_cmpeq_epi32(scissor_m128, ff_scissor_m128);
+  if (ff_scissor_update_needed_ ||
+      _mm_movemask_epi8(comparison_result) != 0xFFFF)
+#else
   ff_scissor_update_needed_ |= ff_scissor_.left != scissor_rect.left;
   ff_scissor_update_needed_ |= ff_scissor_.top != scissor_rect.top;
   ff_scissor_update_needed_ |= ff_scissor_.right != scissor_rect.right;
   ff_scissor_update_needed_ |= ff_scissor_.bottom != scissor_rect.bottom;
-  if (ff_scissor_update_needed_) {
+
+  if (ff_scissor_update_needed_)
+#endif
+  {
     ff_scissor_ = scissor_rect;
     deferred_command_list_.RSSetScissorRect(ff_scissor_);
     ff_scissor_update_needed_ = false;
@@ -1186,13 +1219,15 @@ bool D3D12CommandProcessor::SetupContext() {
   }
   // The upload buffer is frame-buffered.
   gamma_ramp_buffer_desc.Width *= kQueueFrames;
-  if (FAILED(device->CreateCommittedResource(
-          &ui::d3d12::util::kHeapPropertiesUpload, heap_flag_create_not_zeroed,
-          &gamma_ramp_buffer_desc, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
-          IID_PPV_ARGS(&gamma_ramp_upload_buffer_)))) {
+
+  if (!GetD3D12Provider().CreateUploadResource(
+          heap_flag_create_not_zeroed, &gamma_ramp_buffer_desc,
+          D3D12_RESOURCE_STATE_GENERIC_READ,
+          IID_PPV_ARGS(&gamma_ramp_upload_buffer_))) {
     XELOGE("Failed to create the gamma ramp upload buffer");
     return false;
   }
+
   if (FAILED(gamma_ramp_upload_buffer_->Map(
           0, nullptr,
           reinterpret_cast<void**>(&gamma_ramp_upload_buffer_mapping_)))) {
@@ -1678,9 +1713,6 @@ void D3D12CommandProcessor::ShutdownContext() {
 }
 // todo: bit-pack the bools and use bitarith to reduce branches
 void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
-#if XE_ARCH_AMD64 == 1
-  // CommandProcessor::WriteRegister(index, value);
-
   __m128i to_rangecheck = _mm_set1_epi16(static_cast<short>(index));
 
   __m128i lower_bounds = _mm_setr_epi16(
@@ -1713,9 +1745,7 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
   uint32_t movmask = static_cast<uint32_t>(_mm_movemask_epi8(is_within_range));
-  if (!movmask) {
-    return;
-  } else {
+  if (movmask) {
     if (movmask & (1 << 3)) {
       if (frame_open_) {
         uint32_t float_constant_index =
@@ -1747,45 +1777,12 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
     } else {
       HandleSpecialRegisterWrite(index, value);
     }
-  }
-#else
-
-  CommandProcessor::WriteRegister(index, value);
-
-  if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
-      index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
-    cbuffer_binding_fetch_.up_to_date = false;
-    // texture cache is never nullptr
-    // if (texture_cache_ != nullptr) {
-    texture_cache_->TextureFetchConstantWritten(
-        (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
-    // }
   } else {
-    if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
-        index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
-      if (frame_open_) {
-        uint32_t float_constant_index =
-            (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
-        if (float_constant_index >= 256) {
-          float_constant_index -= 256;
-          if (current_float_constant_map_pixel_[float_constant_index >> 6] &
-              (1ull << (float_constant_index & 63))) {
-            cbuffer_binding_float_pixel_.up_to_date = false;
-          }
-        } else {
-          if (current_float_constant_map_vertex_[float_constant_index >> 6] &
-              (1ull << (float_constant_index & 63))) {
-            cbuffer_binding_float_vertex_.up_to_date = false;
-          }
-        }
-      }
-    } else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 &&
-               index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) {
-      cbuffer_binding_bool_loop_.up_to_date = false;
-    }
+    _ReadWriteBarrier();
+    return;
   }
-#endif
 }
+
 void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
                                                   uint32_t* base,
                                                   uint32_t num_registers) {
@@ -1794,6 +1791,95 @@ void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
     D3D12CommandProcessor::WriteRegister(start_index + i, data);
   }
 }
+
+void D3D12CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring,
+                                                  uint32_t base,
+                                                  uint32_t num_times) {
+  WriteRegisterRangeFromRing_WithKnownBound<
+      XE_GPU_REG_SHADER_CONSTANT_000_X, XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0>(
+      ring, base + XE_GPU_REG_SHADER_CONSTANT_000_X, num_times);
+}
+
+void D3D12CommandProcessor::WriteFetchRangeFromRing(xe::RingBuffer* ring,
+                                                    uint32_t base,
+                                                    uint32_t num_times) {
+  WriteRegisterRangeFromRing_WithKnownBound<0x4800, 0x5002>(ring, base + 0x4800,
+                                                            num_times);
+}
+
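SetViewport, SetScissorRect, and WriteRegister above all lean on the same SSE idiom: load both operands, compare every lane at once, and reduce the lane results to a scalar with a movemask, so one branch replaces a chain of field-by-field compares and flag merges. A self-contained illustration (the Rect type and function name are mine, not xenia's); the same idiom reappears further down in PipelineCache::PipelineDescription::operator==:

#include <emmintrin.h>  // SSE2
#include <cstdint>

struct Rect {
  int32_t left, top, right, bottom;  // 16 bytes, laid out like D3D12_RECT
};

// One vector compare + one movemask + one branch instead of four scalar
// compares whose results must be OR-merged.
inline bool RectsEqual(const Rect& a, const Rect& b) {
  __m128i va = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a));
  __m128i vb = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b));
  // cmpeq yields 0xFFFFFFFF per equal lane; movemask gathers one bit per
  // byte, so 0xFFFF means all 16 bytes compared equal.
  return _mm_movemask_epi8(_mm_cmpeq_epi32(va, vb)) == 0xFFFF;
}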
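The *_WithKnownBound templates defined just below take the register interval as compile-time parameters, so each thin Write*RangeFrom* wrapper instantiates a copy in which range checks that can never match simply fold away. A reduced sketch of the mechanism under those assumptions (the register number 0x4800 and the names MayContain/WriteRange are illustrative only):

#include <cstdint>

template <uint32_t kLowerBound, uint32_t kUpperBound>
constexpr bool MayContain(uint32_t reg) {
  return reg >= kLowerBound && reg < kUpperBound;
}

template <uint32_t kLowerBound, uint32_t kUpperBound>
void WriteRange(uint32_t base, const uint32_t* values, uint32_t count) {
  for (uint32_t i = 0; i < count; ++i) {
    uint32_t index = base + i;
    // The whole special-case test is compiled out of instantiations whose
    // bounds cannot reach the special register.
    if constexpr (MayContain<kLowerBound, kUpperBound>(0x4800)) {
      if (index == 0x4800) {
        continue;  // special handling would go here
      }
    }
    (void)values[i];  // the real code stores into the register file
  }
}

// WriteRange<0x4800, 0x5002> keeps the test; WriteRange<0x2000, 0x2800>
// compiles down to a plain copy loop.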
+XE_FORCEINLINE
+void D3D12CommandProcessor::WriteBoolRangeFromRing(xe::RingBuffer* ring,
+                                                   uint32_t base,
+                                                   uint32_t num_times) {
+  // D3D12CommandProcessor::WriteRegisterRangeFromRing(ring, base + 0x4900,
+  // num_times);
+
+  WriteRegisterRangeFromRing_WithKnownBound<0x4900, 0x5002>(ring, base + 0x4900,
+                                                            num_times);
+}
+
+XE_FORCEINLINE
+void D3D12CommandProcessor::WriteLoopRangeFromRing(xe::RingBuffer* ring,
+                                                   uint32_t base,
+                                                   uint32_t num_times) {
+  // D3D12CommandProcessor::WriteRegisterRangeFromRing(ring, base + 0x4908,
+  // num_times);
+
+  WriteRegisterRangeFromRing_WithKnownBound<0x4908, 0x5002>(ring, base + 0x4908,
+                                                            num_times);
+}
+
+XE_FORCEINLINE
+void D3D12CommandProcessor::WriteREGISTERSRangeFromRing(xe::RingBuffer* ring,
+                                                        uint32_t base,
+                                                        uint32_t num_times) {
+  // D3D12CommandProcessor::WriteRegisterRangeFromRing(ring, base + 0x2000,
+  // num_times);
+
+  WriteRegisterRangeFromRing_WithKnownBound<0x2000, 0x2000 + 0x800>(
+      ring, base + 0x2000, num_times);
+}
+
+XE_FORCEINLINE
+void D3D12CommandProcessor::WriteALURangeFromMem(uint32_t start_index,
+                                                 uint32_t* base,
+                                                 uint32_t num_registers) {
+  WriteRegisterRangeFromMem_WithKnownBound<
+      XE_GPU_REG_SHADER_CONSTANT_000_X, XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0>(
+      start_index + 0x4000, base, num_registers);
+}
+
+XE_FORCEINLINE
+void D3D12CommandProcessor::WriteFetchRangeFromMem(uint32_t start_index,
+                                                   uint32_t* base,
+                                                   uint32_t num_registers) {
+  WriteRegisterRangeFromMem_WithKnownBound<0x4800, 0x5002>(start_index + 0x4800,
+                                                           base, num_registers);
+}
+
+XE_FORCEINLINE
+void D3D12CommandProcessor::WriteBoolRangeFromMem(uint32_t start_index,
+                                                  uint32_t* base,
+                                                  uint32_t num_registers) {
+  WriteRegisterRangeFromMem_WithKnownBound<0x4900, 0x5002>(start_index + 0x4900,
+                                                           base, num_registers);
+}
+
+XE_FORCEINLINE
+void D3D12CommandProcessor::WriteLoopRangeFromMem(uint32_t start_index,
+                                                  uint32_t* base,
+                                                  uint32_t num_registers) {
+  WriteRegisterRangeFromMem_WithKnownBound<0x4908, 0x5002>(start_index + 0x4908,
+                                                           base, num_registers);
+}
+
+XE_FORCEINLINE
+void D3D12CommandProcessor::WriteREGISTERSRangeFromMem(uint32_t start_index,
+                                                       uint32_t* base,
+                                                       uint32_t num_registers) {
+  WriteRegisterRangeFromMem_WithKnownBound<0x2000, 0x2000 + 0x800>(
+      start_index + 0x2000, base, num_registers);
+}
+/*
+        wraparound rarely happens, so it's best to hoist this out of
+        writeregisterrangefromring, and structure the two functions so that
+        this can be
@@ -1835,14 +1921,147 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
         base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
         num_regs_firstrange);
     ring->EndRead(range);
-  } else {
+  }
+  else {
     return WriteRegisterRangeFromRing_WraparoundCase(ring, base,
                                                      num_registers);
   }
 }
-void D3D12CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring,
-                                                     uint32_t base,
+
+template <uint32_t register_lower_bound, uint32_t register_upper_bound>
+constexpr bool bounds_may_have_reg(uint32_t reg) {
+  return reg >= register_lower_bound && reg < register_upper_bound;
+}
+
+template <uint32_t register_lower_bound, uint32_t register_upper_bound>
+constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
+  return bounds_may_have_reg<register_lower_bound, register_upper_bound>(
+             reg) ||
+         bounds_may_have_reg<register_lower_bound, register_upper_bound>(
+             last_reg);
+}
+template <uint32_t register_lower_bound, uint32_t register_upper_bound>
+XE_FORCEINLINE void
+D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
+    uint32_t base, uint32_t* range, uint32_t num_registers) {
+  constexpr auto bounds_has_reg =
+      bounds_may_have_reg<register_lower_bound, register_upper_bound>;
+  constexpr auto bounds_has_bounds =
+      bounds_may_have_bounds<register_lower_bound, register_upper_bound>;
+
+  for (uint32_t i = 0; i < num_registers; ++i) {
+    uint32_t data = xe::load_and_swap<uint32_t>(range + i);
+
+    {
+      uint32_t index = base + i;
+      uint32_t value = data;
+      XE_MSVC_ASSUME(index >= register_lower_bound &&
+                     index < register_upper_bound);
+      register_file_->values[index].u32 = value;
+
+      unsigned expr = 0;
+
+      if constexpr (bounds_has_bounds(XE_GPU_REG_SCRATCH_REG0,
+                                      XE_GPU_REG_SCRATCH_REG7)) {
+        expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8);
+      }
+      if constexpr (bounds_has_reg(XE_GPU_REG_COHER_STATUS_HOST)) {
+        expr |= (index == XE_GPU_REG_COHER_STATUS_HOST);
+      }
+      if constexpr (bounds_has_bounds(XE_GPU_REG_DC_LUT_RW_INDEX,
+                                      XE_GPU_REG_DC_LUT_30_COLOR)) {
+        expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
+                 (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
+      }
+      // chrispy: reordered for msvc branch probability (assumes the if is
+      // taken and the else is not)
+      if (XE_LIKELY(expr == 0)) {
+        XE_MSVC_REORDER_BARRIER();
+
+      } else {
+        HandleSpecialRegisterWrite(index, value);
+        goto write_done;
+      }
+      XE_MSVC_ASSUME(index >= register_lower_bound &&
+                     index < register_upper_bound);
+      if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
+                                      XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) {
+        if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
+            index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
+          cbuffer_binding_fetch_.up_to_date = false;
+          // texture cache is never nullptr
+          texture_cache_->TextureFetchConstantWritten(
+              (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
+
+          goto write_done;
+        }
+      }
+      XE_MSVC_ASSUME(index >= register_lower_bound &&
+                     index < register_upper_bound);
+      if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X,
+                                      XE_GPU_REG_SHADER_CONSTANT_511_W)) {
+        if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
+            index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
+          if (frame_open_) {
+            uint32_t float_constant_index =
+                (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
+            if (float_constant_index >= 256) {
+              float_constant_index -= 256;
+              if (current_float_constant_map_pixel_[float_constant_index >> 6] &
+                  (1ull << (float_constant_index & 63))) {
+                cbuffer_binding_float_pixel_.up_to_date = false;
+              }
+            } else {
+              if (current_float_constant_map_vertex_[float_constant_index >>
+                                                     6] &
+                  (1ull << (float_constant_index & 63))) {
+                cbuffer_binding_float_vertex_.up_to_date = false;
+              }
+            }
+          }
+          goto write_done;
+        }
+      }
+      XE_MSVC_ASSUME(index >= register_lower_bound &&
+                     index < register_upper_bound);
+      if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
+                                      XE_GPU_REG_SHADER_CONSTANT_LOOP_31)) {
+        if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 &&
+            index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) {
+          cbuffer_binding_bool_loop_.up_to_date = false;
+          goto write_done;
+        }
+      }
+    }
+  write_done:;
+  }
+}
+template <uint32_t register_lower_bound, uint32_t register_upper_bound>
+XE_FORCEINLINE void
+D3D12CommandProcessor::WriteRegisterRangeFromRing_WithKnownBound(
+    xe::RingBuffer* ring, uint32_t base, uint32_t num_registers) {
+  RingBuffer::ReadRange range =
+      ring->BeginRead(num_registers * sizeof(uint32_t));
+
+  constexpr auto bounds_has_reg =
+      bounds_may_have_reg<register_lower_bound, register_upper_bound>;
+  constexpr auto bounds_has_bounds =
+      bounds_may_have_bounds<register_lower_bound, register_upper_bound>;
+
+  XE_LIKELY_IF(!range.second) {
+    WriteRegisterRangeFromMem_WithKnownBound<register_lower_bound,
+                                             register_upper_bound>(
+        base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
+        num_registers);
+
+    ring->EndRead(range);
+  }
+  else {
+    return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers);
+  }
+}
+XE_NOINLINE
+void D3D12CommandProcessor::WriteOneRegisterFromRing(uint32_t base,
                                                      uint32_t num_times) {
-  auto read = ring->BeginPrefetchedRead(
+  auto read = reader_.BeginPrefetchedRead(
       num_times * sizeof(uint32_t));
   uint32_t first_length = read.first_length / sizeof(uint32_t);
@@ -1852,7 +2071,7 @@ void D3D12CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring,
         base,
         xe::load_and_swap<uint32_t>(read.first + (sizeof(uint32_t) * i)));
   }
-  XE_UNLIKELY_IF (read.second) {
+  XE_UNLIKELY_IF(read.second) {
     uint32_t second_length = read.second_length / sizeof(uint32_t);
     for (uint32_t i = 0; i < second_length; ++i) {
@@ -1861,7 +2080,7 @@ void D3D12CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring,
         xe::load_and_swap<uint32_t>(read.second + (sizeof(uint32_t) * i)));
     }
   }
-  ring->EndRead(read);
+  reader_.EndRead(read);
 }
 void D3D12CommandProcessor::OnGammaRamp256EntryTableValueWritten() {
   gamma_ramp_256_entry_table_up_to_date_ = false;
@@ -2510,9 +2729,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
             GetSupportedMemExportFormatSize(memexport_stream.format);
         if (memexport_format_size == 0) {
           XELOGE("Unsupported memexport format {}",
-                 FormatInfo::Get(
-                     xenos::TextureFormat(uint32_t(memexport_stream.format)))
-                     ->name);
+                 FormatInfo::GetName(
+                     xenos::TextureFormat(uint32_t(memexport_stream.format))));
           return false;
         }
         uint32_t memexport_size_dwords =
@@ -2551,9 +2769,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
             GetSupportedMemExportFormatSize(memexport_stream.format);
         if (memexport_format_size == 0) {
           XELOGE("Unsupported memexport format {}",
-                 FormatInfo::Get(
-                     xenos::TextureFormat(uint32_t(memexport_stream.format)))
-                     ->name);
+                 FormatInfo::GetName(
+                     xenos::TextureFormat(uint32_t(memexport_stream.format))));
           return false;
         }
         uint32_t memexport_size_dwords =
@@ -3353,17 +3570,12 @@ void D3D12CommandProcessor::UpdateFixedFunctionState(
       }
     }
   }
-
-void D3D12CommandProcessor::UpdateSystemConstantValues(
-    bool shared_memory_is_uav, bool primitive_polygonal,
-    uint32_t line_loop_closing_index, xenos::Endian index_endian,
-    const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask,
-    reg::RB_DEPTHCONTROL normalized_depth_control,
+template <bool primitive_polygonal, bool edram_rov_used>
+XE_NOINLINE void D3D12CommandProcessor::UpdateSystemConstantValues_Impl(
+    bool shared_memory_is_uav, uint32_t line_loop_closing_index,
+    xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info,
+    uint32_t used_texture_mask, reg::RB_DEPTHCONTROL normalized_depth_control,
     uint32_t normalized_color_mask) {
-#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
-  SCOPE_profile_cpu_f("gpu");
-#endif  // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
-
   const RegisterFile& regs = *register_file_;
   auto pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>();
   auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
@@ -3382,8 +3594,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
   uint32_t vgt_max_vtx_indx = regs.Get<reg::VGT_MAX_VTX_INDX>().max_indx;
   uint32_t vgt_min_vtx_indx = regs.Get<reg::VGT_MIN_VTX_INDX>().min_indx;
-  bool edram_rov_used = render_target_cache_->GetPath() ==
-                        RenderTargetCache::Path::kPixelShaderInterlock;
   uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x();
   uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y();
@@ -3426,7 +3636,21 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
     }
   }
-  bool dirty = false;
+  uint32_t dirty = 0u;
+  ArchFloatMask dirty_float_mask = floatmask_zero;
+
+  auto update_dirty_floatmask = [&dirty_float_mask](float x, float y) {
+    dirty_float_mask =
+        ArchORFloatMask(dirty_float_mask, ArchCmpneqFloatMask(x, y));
+  };
+  /*
+    chrispy: instead of (cmp x, y; setnz lobyte; or mask, lobyte) we can do
+    (xor z, x, y; or mask, z); this ought to have much better throughput on
+    all processors
+  */
+  auto update_dirty_uint32_cmp = [&dirty](uint32_t
x, uint32_t y) { + dirty |= (x ^ y); + }; // Flags. uint32_t flags = 0; @@ -3454,7 +3678,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( flags |= DxbcShaderTranslator::kSysFlag_WNotReciprocal; } // Whether the primitive is polygonal and SV_IsFrontFace matters. - if (primitive_polygonal) { + if constexpr (primitive_polygonal) { flags |= DxbcShaderTranslator::kSysFlag_PrimitivePolygonal; } // Primitive type. @@ -3480,31 +3704,33 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( } } } - if (edram_rov_used && depth_stencil_enabled) { - flags |= DxbcShaderTranslator::kSysFlag_ROVDepthStencil; - if (normalized_depth_control.z_enable) { - flags |= uint32_t(normalized_depth_control.zfunc) - << DxbcShaderTranslator::kSysFlag_ROVDepthPassIfLess_Shift; - if (normalized_depth_control.z_write_enable) { - flags |= DxbcShaderTranslator::kSysFlag_ROVDepthWrite; + if constexpr (edram_rov_used) { + if (depth_stencil_enabled) { + flags |= DxbcShaderTranslator::kSysFlag_ROVDepthStencil; + if (normalized_depth_control.z_enable) { + flags |= uint32_t(normalized_depth_control.zfunc) + << DxbcShaderTranslator::kSysFlag_ROVDepthPassIfLess_Shift; + if (normalized_depth_control.z_write_enable) { + flags |= DxbcShaderTranslator::kSysFlag_ROVDepthWrite; + } + } else { + // In case stencil is used without depth testing - always pass, and + // don't modify the stored depth. + flags |= DxbcShaderTranslator::kSysFlag_ROVDepthPassIfLess | + DxbcShaderTranslator::kSysFlag_ROVDepthPassIfEqual | + DxbcShaderTranslator::kSysFlag_ROVDepthPassIfGreater; + } + if (normalized_depth_control.stencil_enable) { + flags |= DxbcShaderTranslator::kSysFlag_ROVStencilTest; + } + // Hint - if not applicable to the shader, will not have effect. + if (alpha_test_function == xenos::CompareFunction::kAlways && + !rb_colorcontrol.alpha_to_mask_enable) { + flags |= DxbcShaderTranslator::kSysFlag_ROVDepthStencilEarlyWrite; } - } else { - // In case stencil is used without depth testing - always pass, and - // don't modify the stored depth. - flags |= DxbcShaderTranslator::kSysFlag_ROVDepthPassIfLess | - DxbcShaderTranslator::kSysFlag_ROVDepthPassIfEqual | - DxbcShaderTranslator::kSysFlag_ROVDepthPassIfGreater; - } - if (normalized_depth_control.stencil_enable) { - flags |= DxbcShaderTranslator::kSysFlag_ROVStencilTest; - } - // Hint - if not applicable to the shader, will not have effect. 
-      if (alpha_test_function == xenos::CompareFunction::kAlways &&
-          !rb_colorcontrol.alpha_to_mask_enable) {
-        flags |= DxbcShaderTranslator::kSysFlag_ROVDepthStencilEarlyWrite;
     }
   }
-  dirty |= system_constants_.flags != flags;
+  update_dirty_uint32_cmp(system_constants_.flags, flags);
   system_constants_.flags = flags;
 
   // Tessellation factor range, plus 1.0 according to the images in
@@ -3513,29 +3739,39 @@
       regs[XE_GPU_REG_VGT_HOS_MIN_TESS_LEVEL].f32 + 1.0f;
   float tessellation_factor_max =
       regs[XE_GPU_REG_VGT_HOS_MAX_TESS_LEVEL].f32 + 1.0f;
-  dirty |= system_constants_.tessellation_factor_range_min !=
-           tessellation_factor_min;
+
+  update_dirty_floatmask(system_constants_.tessellation_factor_range_min,
+                         tessellation_factor_min);
+
   system_constants_.tessellation_factor_range_min = tessellation_factor_min;
-  dirty |= system_constants_.tessellation_factor_range_max !=
-           tessellation_factor_max;
+  update_dirty_floatmask(system_constants_.tessellation_factor_range_max,
+                         tessellation_factor_max);
   system_constants_.tessellation_factor_range_max = tessellation_factor_max;
 
   // Line loop closing index (or 0 when drawing other primitives or using an
   // index buffer).
-  dirty |= system_constants_.line_loop_closing_index != line_loop_closing_index;
+
+  update_dirty_uint32_cmp(system_constants_.line_loop_closing_index,
+                          line_loop_closing_index);
   system_constants_.line_loop_closing_index = line_loop_closing_index;
 
   // Index or tessellation edge factor buffer endianness.
-  dirty |= system_constants_.vertex_index_endian != index_endian;
+  update_dirty_uint32_cmp(
+      static_cast<uint32_t>(system_constants_.vertex_index_endian),
+      static_cast<uint32_t>(index_endian));
   system_constants_.vertex_index_endian = index_endian;
 
   // Vertex index offset.
-  dirty |= system_constants_.vertex_index_offset != vgt_indx_offset;
+
+  update_dirty_uint32_cmp(system_constants_.vertex_index_offset,
+                          vgt_indx_offset);
   system_constants_.vertex_index_offset = vgt_indx_offset;
   // Vertex index range.
-  dirty |= system_constants_.vertex_index_min != vgt_min_vtx_indx;
-  dirty |= system_constants_.vertex_index_max != vgt_max_vtx_indx;
+
+  update_dirty_uint32_cmp(system_constants_.vertex_index_min, vgt_min_vtx_indx);
+  update_dirty_uint32_cmp(system_constants_.vertex_index_max, vgt_max_vtx_indx);
+
   system_constants_.vertex_index_min = vgt_min_vtx_indx;
   system_constants_.vertex_index_max = vgt_max_vtx_indx;
@@ -3563,8 +3799,12 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
   // Conversion to Direct3D 12 normalized device coordinates.
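The update_dirty_uint32_cmp/update_dirty_floatmask pair used throughout this function accumulates differences instead of branching per field: integers XOR into a single dirty word (nonzero means something changed), while floats accumulate a compare mask that is folded into dirty once at the very end. A compact sketch of the same bookkeeping; ArchFloatMask/ArchCmpneqFloatMask are xenia's abstractions, approximated here with plain SSE:

#include <xmmintrin.h>  // SSE
#include <cstdint>

struct DirtyTracker {
  uint32_t dirty = 0;
  __m128 float_mask = _mm_setzero_ps();

  // xor+or: no setcc or branch per field, just one OR chain.
  void UpdateU32(uint32_t stored, uint32_t incoming) {
    dirty |= stored ^ incoming;
  }
  // cmpneq is an unordered predicate, so a stored NaN always reads as
  // "changed"; harmless for these fields, but it is why the rt_clamp values
  // further down are compared as raw bytes instead.
  void UpdateF32(float stored, float incoming) {
    float_mask = _mm_or_ps(
        float_mask, _mm_cmpneq_ss(_mm_set_ss(stored), _mm_set_ss(incoming)));
  }
  // Mirrors the final "dirty |= ArchFloatMaskSignbit(dirty_float_mask)".
  bool Changed() const {
    return (dirty | (_mm_movemask_ps(float_mask) & 1)) != 0;
  }
};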
for (uint32_t i = 0; i < 3; ++i) { - dirty |= system_constants_.ndc_scale[i] != viewport_info.ndc_scale[i]; - dirty |= system_constants_.ndc_offset[i] != viewport_info.ndc_offset[i]; + update_dirty_floatmask(system_constants_.ndc_scale[i], + viewport_info.ndc_scale[i]); + + update_dirty_floatmask(system_constants_.ndc_offset[i], + viewport_info.ndc_offset[i]); + system_constants_.ndc_scale[i] = viewport_info.ndc_scale[i]; system_constants_.ndc_offset[i] = viewport_info.ndc_offset[i]; } @@ -3581,14 +3821,18 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( float(pa_su_point_size.width) * (2.0f / 16.0f); float point_constant_diameter_y = float(pa_su_point_size.height) * (2.0f / 16.0f); - dirty |= system_constants_.point_vertex_diameter_min != - point_vertex_diameter_min; - dirty |= system_constants_.point_vertex_diameter_max != - point_vertex_diameter_max; - dirty |= system_constants_.point_constant_diameter[0] != - point_constant_diameter_x; - dirty |= system_constants_.point_constant_diameter[1] != - point_constant_diameter_y; + + update_dirty_floatmask(system_constants_.point_vertex_diameter_min, + point_vertex_diameter_min); + + update_dirty_floatmask(system_constants_.point_vertex_diameter_max, + point_vertex_diameter_max); + + update_dirty_floatmask(system_constants_.point_constant_diameter[0], + point_constant_diameter_x); + update_dirty_floatmask(system_constants_.point_constant_diameter[1], + point_constant_diameter_y); + system_constants_.point_vertex_diameter_min = point_vertex_diameter_min; system_constants_.point_vertex_diameter_max = point_vertex_diameter_max; system_constants_.point_constant_diameter[0] = point_constant_diameter_x; @@ -3602,10 +3846,15 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( float point_screen_diameter_to_ndc_radius_y = (/* 0.5f * 2.0f * */ float(draw_resolution_scale_y)) / std::max(viewport_info.xy_extent[1], uint32_t(1)); - dirty |= system_constants_.point_screen_diameter_to_ndc_radius[0] != - point_screen_diameter_to_ndc_radius_x; - dirty |= system_constants_.point_screen_diameter_to_ndc_radius[1] != - point_screen_diameter_to_ndc_radius_y; + + update_dirty_floatmask( + system_constants_.point_screen_diameter_to_ndc_radius[0], + point_screen_diameter_to_ndc_radius_x); + + update_dirty_floatmask( + system_constants_.point_screen_diameter_to_ndc_radius[1], + point_screen_diameter_to_ndc_radius_y); + system_constants_.point_screen_diameter_to_ndc_radius[0] = point_screen_diameter_to_ndc_radius_x; system_constants_.point_screen_diameter_to_ndc_radius[1] = @@ -3628,14 +3877,20 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( uint32_t texture_signs_shifted = uint32_t(texture_signs) << texture_signs_shift; uint32_t texture_signs_mask = uint32_t(0b11111111) << texture_signs_shift; - dirty |= (texture_signs_uint & texture_signs_mask) != texture_signs_shifted; + + update_dirty_uint32_cmp((texture_signs_uint & texture_signs_mask), + texture_signs_shifted); + texture_signs_uint = (texture_signs_uint & ~texture_signs_mask) | texture_signs_shifted; + // cache misses here, we're accessing the texture bindings out of order textures_resolved |= uint32_t(texture_cache_->IsActiveTextureResolved(texture_index)) << texture_index; } - dirty |= system_constants_.textures_resolved != textures_resolved; + + update_dirty_uint32_cmp(system_constants_.textures_resolved, + textures_resolved); system_constants_.textures_resolved = textures_resolved; // Log2 of sample count, for alpha to mask and with ROV, for EDRAM address @@ -3644,18 +3899,22 @@ 
void D3D12CommandProcessor::UpdateSystemConstantValues( rb_surface_info.msaa_samples >= xenos::MsaaSamples::k4X ? 1 : 0; uint32_t sample_count_log2_y = rb_surface_info.msaa_samples >= xenos::MsaaSamples::k2X ? 1 : 0; - dirty |= system_constants_.sample_count_log2[0] != sample_count_log2_x; - dirty |= system_constants_.sample_count_log2[1] != sample_count_log2_y; + + update_dirty_uint32_cmp(system_constants_.sample_count_log2[0], + sample_count_log2_x); + update_dirty_uint32_cmp(system_constants_.sample_count_log2[1], + sample_count_log2_y); system_constants_.sample_count_log2[0] = sample_count_log2_x; system_constants_.sample_count_log2[1] = sample_count_log2_y; // Alpha test and alpha to coverage. - dirty |= system_constants_.alpha_test_reference != rb_alpha_ref; + update_dirty_floatmask(system_constants_.alpha_test_reference, rb_alpha_ref); system_constants_.alpha_test_reference = rb_alpha_ref; uint32_t alpha_to_mask = rb_colorcontrol.alpha_to_mask_enable ? (rb_colorcontrol.value >> 24) | (1 << 8) : 0; - dirty |= system_constants_.alpha_to_mask != alpha_to_mask; + + update_dirty_uint32_cmp(system_constants_.alpha_to_mask, alpha_to_mask); system_constants_.alpha_to_mask = alpha_to_mask; uint32_t edram_tile_dwords_scaled = @@ -3663,19 +3922,23 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( (draw_resolution_scale_x * draw_resolution_scale_y); // EDRAM pitch for ROV writing. - if (edram_rov_used) { + if constexpr (edram_rov_used) { // Align, then multiply by 32bpp tile size in dwords. uint32_t edram_32bpp_tile_pitch_dwords_scaled = ((rb_surface_info.surface_pitch * (rb_surface_info.msaa_samples >= xenos::MsaaSamples::k4X ? 2 : 1)) + (xenos::kEdramTileWidthSamples - 1)) / xenos::kEdramTileWidthSamples * edram_tile_dwords_scaled; - dirty |= system_constants_.edram_32bpp_tile_pitch_dwords_scaled != - edram_32bpp_tile_pitch_dwords_scaled; + update_dirty_uint32_cmp( + system_constants_.edram_32bpp_tile_pitch_dwords_scaled, + edram_32bpp_tile_pitch_dwords_scaled); system_constants_.edram_32bpp_tile_pitch_dwords_scaled = edram_32bpp_tile_pitch_dwords_scaled; } +#if XE_ARCH_AMD64 == 1 + __m128i rt_clamp_dirty = _mm_set1_epi8((char)0xff); +#endif // Color exponent bias and ROV render target writing. 
   for (uint32_t i = 0; i < 4; ++i) {
     reg::RB_COLOR_INFO color_info = color_infos[i];
@@ -3695,47 +3958,80 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
       float color_exp_bias_scale;
       *reinterpret_cast<int32_t*>(&color_exp_bias_scale) =
           0x3F800000 + (color_exp_bias << 23);
-    dirty |= system_constants_.color_exp_bias[i] != color_exp_bias_scale;
+
+    update_dirty_floatmask(system_constants_.color_exp_bias[i],
+                           color_exp_bias_scale);
+
     system_constants_.color_exp_bias[i] = color_exp_bias_scale;
-    if (edram_rov_used) {
-      dirty |=
-          system_constants_.edram_rt_keep_mask[i][0] != rt_keep_masks[i][0];
+    if constexpr (edram_rov_used) {
+      update_dirty_uint32_cmp(system_constants_.edram_rt_keep_mask[i][0],
+                              rt_keep_masks[i][0]);
+
       system_constants_.edram_rt_keep_mask[i][0] = rt_keep_masks[i][0];
-      dirty |=
-          system_constants_.edram_rt_keep_mask[i][1] != rt_keep_masks[i][1];
+
+      update_dirty_uint32_cmp(system_constants_.edram_rt_keep_mask[i][1],
+                              rt_keep_masks[i][1]);
+
       system_constants_.edram_rt_keep_mask[i][1] = rt_keep_masks[i][1];
       if (rt_keep_masks[i][0] != UINT32_MAX ||
           rt_keep_masks[i][1] != UINT32_MAX) {
         uint32_t rt_base_dwords_scaled =
             color_info.color_base * edram_tile_dwords_scaled;
-        dirty |= system_constants_.edram_rt_base_dwords_scaled[i] !=
-                 rt_base_dwords_scaled;
+        update_dirty_uint32_cmp(
+            system_constants_.edram_rt_base_dwords_scaled[i],
+            rt_base_dwords_scaled);
         system_constants_.edram_rt_base_dwords_scaled[i] =
             rt_base_dwords_scaled;
         uint32_t format_flags = DxbcShaderTranslator::ROV_AddColorFormatFlags(
             color_info.color_format);
-        dirty |= system_constants_.edram_rt_format_flags[i] != format_flags;
+        update_dirty_uint32_cmp(system_constants_.edram_rt_format_flags[i],
+                                format_flags);
+
        system_constants_.edram_rt_format_flags[i] = format_flags;
         // Can't do float comparisons here because NaNs would result in always
         // setting the dirty flag.
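Comparing the clamp values as raw bytes sidesteps exactly the NaN problem the comment above describes: a bitwise compare treats a stored NaN as equal to an identical incoming NaN, so the constants are not re-marked dirty on every draw. Standalone form of the check (note that bitwise identity does distinguish +0.0f from -0.0f):

#include <emmintrin.h>  // SSE2

// True when all 16 bytes of the four floats match bit-for-bit.
inline bool FloatBitsEqual4(const float stored[4], const float incoming[4]) {
  __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(stored));
  __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(incoming));
  return _mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) == 0xFFFF;
}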
+
+#if XE_ARCH_AMD64 == 1
+
+        __m128i edram_rt_clamp_loaded = _mm_loadu_si128(
+            (const __m128i*)&system_constants_.edram_rt_clamp[i]);
+        __m128i rt_clamp_loaded = _mm_loadu_si128((const __m128i*)&rt_clamp[i]);
+
+        rt_clamp_dirty = _mm_and_si128(
+            rt_clamp_dirty,
+            _mm_cmpeq_epi8(edram_rt_clamp_loaded, rt_clamp_loaded));
+        _mm_storeu_si128((__m128i*)&system_constants_.edram_rt_clamp[i],
+                         rt_clamp_loaded);
+#else
        dirty |= std::memcmp(system_constants_.edram_rt_clamp[i], rt_clamp[i],
                             4 * sizeof(float)) != 0;
        std::memcpy(system_constants_.edram_rt_clamp[i], rt_clamp[i],
                    4 * sizeof(float));
+
+#endif
        uint32_t blend_factors_ops =
            regs[reg::RB_BLENDCONTROL::rt_register_indices[i]].u32 & 0x1FFF1FFF;
-        dirty |= system_constants_.edram_rt_blend_factors_ops[i] !=
-                 blend_factors_ops;
+
+        update_dirty_uint32_cmp(system_constants_.edram_rt_blend_factors_ops[i],
+                                blend_factors_ops);
+
        system_constants_.edram_rt_blend_factors_ops[i] = blend_factors_ops;
      }
    }
  }
+#if XE_ARCH_AMD64 == 1
+  if constexpr (edram_rov_used) {
+    update_dirty_uint32_cmp(
+        static_cast<uint32_t>(_mm_movemask_epi8(rt_clamp_dirty)), 0xFFFFU);
+  }
 
-  if (edram_rov_used) {
+#endif
+  if constexpr (edram_rov_used) {
    uint32_t depth_base_dwords_scaled =
        rb_depth_info.depth_base * edram_tile_dwords_scaled;
-    dirty |= system_constants_.edram_depth_base_dwords_scaled !=
-             depth_base_dwords_scaled;
+    update_dirty_uint32_cmp(system_constants_.edram_depth_base_dwords_scaled,
+                            depth_base_dwords_scaled);
+
    system_constants_.edram_depth_base_dwords_scaled = depth_base_dwords_scaled;
 
    // For non-polygons, front polygon offset is used, and it's enabled if
@@ -3775,55 +4071,59 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
        std::max(draw_resolution_scale_x, draw_resolution_scale_y);
    poly_offset_front_scale *= poly_offset_scale_factor;
    poly_offset_back_scale *= poly_offset_scale_factor;
-    dirty |= system_constants_.edram_poly_offset_front_scale !=
-             poly_offset_front_scale;
+    update_dirty_floatmask(system_constants_.edram_poly_offset_front_scale,
+                           poly_offset_front_scale);
+
    system_constants_.edram_poly_offset_front_scale = poly_offset_front_scale;
-    dirty |= system_constants_.edram_poly_offset_front_offset !=
-             poly_offset_front_offset;
+
+    update_dirty_floatmask(system_constants_.edram_poly_offset_front_offset,
+                           poly_offset_front_offset);
+
    system_constants_.edram_poly_offset_front_offset = poly_offset_front_offset;
-    dirty |= system_constants_.edram_poly_offset_back_scale !=
-             poly_offset_back_scale;
+    update_dirty_floatmask(system_constants_.edram_poly_offset_back_scale,
+                           poly_offset_back_scale);
    system_constants_.edram_poly_offset_back_scale = poly_offset_back_scale;
-    dirty |= system_constants_.edram_poly_offset_back_offset !=
-             poly_offset_back_offset;
+    update_dirty_floatmask(system_constants_.edram_poly_offset_back_offset,
+                           poly_offset_back_offset);
    system_constants_.edram_poly_offset_back_offset = poly_offset_back_offset;
 
    if (depth_stencil_enabled && normalized_depth_control.stencil_enable) {
-      dirty |= system_constants_.edram_stencil_front_reference !=
-               rb_stencilrefmask.stencilref;
+      update_dirty_uint32_cmp(system_constants_.edram_stencil_front_reference,
+                              rb_stencilrefmask.stencilref);
+
      system_constants_.edram_stencil_front_reference =
          rb_stencilrefmask.stencilref;
-      dirty |= system_constants_.edram_stencil_front_read_mask !=
-               rb_stencilrefmask.stencilmask;
+      update_dirty_uint32_cmp(system_constants_.edram_stencil_front_read_mask,
+                              rb_stencilrefmask.stencilmask);
      system_constants_.edram_stencil_front_read_mask =
rb_stencilrefmask.stencilmask; - dirty |= system_constants_.edram_stencil_front_write_mask != - rb_stencilrefmask.stencilwritemask; + update_dirty_uint32_cmp(system_constants_.edram_stencil_front_write_mask, + rb_stencilrefmask.stencilwritemask); system_constants_.edram_stencil_front_write_mask = rb_stencilrefmask.stencilwritemask; uint32_t stencil_func_ops = (normalized_depth_control.value >> 8) & ((1 << 12) - 1); - dirty |= - system_constants_.edram_stencil_front_func_ops != stencil_func_ops; + update_dirty_uint32_cmp(system_constants_.edram_stencil_front_func_ops, + stencil_func_ops); system_constants_.edram_stencil_front_func_ops = stencil_func_ops; if (primitive_polygonal && normalized_depth_control.backface_enable) { - dirty |= system_constants_.edram_stencil_back_reference != - rb_stencilrefmask_bf.stencilref; + update_dirty_uint32_cmp(system_constants_.edram_stencil_back_reference, + rb_stencilrefmask_bf.stencilref); system_constants_.edram_stencil_back_reference = rb_stencilrefmask_bf.stencilref; - dirty |= system_constants_.edram_stencil_back_read_mask != - rb_stencilrefmask_bf.stencilmask; + update_dirty_uint32_cmp(system_constants_.edram_stencil_back_read_mask, + rb_stencilrefmask_bf.stencilmask); system_constants_.edram_stencil_back_read_mask = rb_stencilrefmask_bf.stencilmask; - dirty |= system_constants_.edram_stencil_back_write_mask != - rb_stencilrefmask_bf.stencilwritemask; + update_dirty_uint32_cmp(system_constants_.edram_stencil_back_write_mask, + rb_stencilrefmask_bf.stencilwritemask); system_constants_.edram_stencil_back_write_mask = rb_stencilrefmask_bf.stencilwritemask; uint32_t stencil_func_ops_bf = (normalized_depth_control.value >> 20) & ((1 << 12) - 1); - dirty |= system_constants_.edram_stencil_back_func_ops != - stencil_func_ops_bf; + update_dirty_uint32_cmp(system_constants_.edram_stencil_back_func_ops, + stencil_func_ops_bf); system_constants_.edram_stencil_back_func_ops = stencil_func_ops_bf; } else { dirty |= std::memcmp(system_constants_.edram_stencil_back, @@ -3834,28 +4134,69 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( 4 * sizeof(uint32_t)); } } + update_dirty_floatmask(system_constants_.edram_blend_constant[0], + regs[XE_GPU_REG_RB_BLEND_RED].f32); - dirty |= system_constants_.edram_blend_constant[0] != - regs[XE_GPU_REG_RB_BLEND_RED].f32; system_constants_.edram_blend_constant[0] = regs[XE_GPU_REG_RB_BLEND_RED].f32; - dirty |= system_constants_.edram_blend_constant[1] != - regs[XE_GPU_REG_RB_BLEND_GREEN].f32; + + update_dirty_floatmask(system_constants_.edram_blend_constant[1], + regs[XE_GPU_REG_RB_BLEND_GREEN].f32); + system_constants_.edram_blend_constant[1] = regs[XE_GPU_REG_RB_BLEND_GREEN].f32; - dirty |= system_constants_.edram_blend_constant[2] != - regs[XE_GPU_REG_RB_BLEND_BLUE].f32; + update_dirty_floatmask(system_constants_.edram_blend_constant[2], + regs[XE_GPU_REG_RB_BLEND_BLUE].f32); + system_constants_.edram_blend_constant[2] = regs[XE_GPU_REG_RB_BLEND_BLUE].f32; - dirty |= system_constants_.edram_blend_constant[3] != - regs[XE_GPU_REG_RB_BLEND_ALPHA].f32; + update_dirty_floatmask(system_constants_.edram_blend_constant[3], + regs[XE_GPU_REG_RB_BLEND_ALPHA].f32); + system_constants_.edram_blend_constant[3] = regs[XE_GPU_REG_RB_BLEND_ALPHA].f32; } + dirty |= ArchFloatMaskSignbit(dirty_float_mask); cbuffer_binding_system_.up_to_date &= !dirty; } +void D3D12CommandProcessor::UpdateSystemConstantValues( + bool shared_memory_is_uav, bool primitive_polygonal, + uint32_t line_loop_closing_index, xenos::Endian index_endian, + const 
draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask,
+    reg::RB_DEPTHCONTROL normalized_depth_control,
+    uint32_t normalized_color_mask) {
+  bool edram_rov_used = render_target_cache_->GetPath() ==
+                        RenderTargetCache::Path::kPixelShaderInterlock;
+
+  if (!edram_rov_used) {
+    if (primitive_polygonal) {
+      UpdateSystemConstantValues_Impl<true, false>(
+          shared_memory_is_uav, line_loop_closing_index, index_endian,
+          viewport_info, used_texture_mask, normalized_depth_control,
+          normalized_color_mask);
+    } else {
+      UpdateSystemConstantValues_Impl<false, false>(
+          shared_memory_is_uav, line_loop_closing_index, index_endian,
+          viewport_info, used_texture_mask, normalized_depth_control,
+          normalized_color_mask);
+    }
+  } else {
+    if (primitive_polygonal) {
+      UpdateSystemConstantValues_Impl<true, true>(
+          shared_memory_is_uav, line_loop_closing_index, index_endian,
+          viewport_info, used_texture_mask, normalized_depth_control,
+          normalized_color_mask);
+    } else {
+      UpdateSystemConstantValues_Impl<false, true>(
+          shared_memory_is_uav, line_loop_closing_index, index_endian,
+          viewport_info, used_texture_mask, normalized_depth_control,
+          normalized_color_mask);
+    }
+  }
+}
+
 bool D3D12CommandProcessor::UpdateBindings(
     const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader,
     ID3D12RootSignature* root_signature) {
@@ -4081,6 +4422,9 @@ bool D3D12CommandProcessor::UpdateBindings(
   current_samplers_vertex_.resize(
       std::max(current_samplers_vertex_.size(), sampler_count_vertex));
   for (size_t i = 0; i < sampler_count_vertex; ++i) {
+    if (i + 2 < sampler_count_vertex) {
+      texture_cache_->PrefetchSamplerParameters(samplers_vertex[i + 2]);
+    }
     D3D12TextureCache::SamplerParameters parameters =
         texture_cache_->GetSamplerParameters(samplers_vertex[i]);
     if (current_samplers_vertex_[i] != parameters) {
@@ -4112,9 +4456,15 @@ bool D3D12CommandProcessor::UpdateBindings(
   }
   current_samplers_pixel_.resize(std::max(current_samplers_pixel_.size(),
                                           size_t(sampler_count_pixel)));
+  const auto samplers_pixel_derefed = samplers_pixel->data();
+
   for (uint32_t i = 0; i < sampler_count_pixel; ++i) {
+    if (i + 2 < sampler_count_pixel) {
+      texture_cache_->PrefetchSamplerParameters(
+          samplers_pixel_derefed[i + 2]);
+    }
     D3D12TextureCache::SamplerParameters parameters =
-        texture_cache_->GetSamplerParameters((*samplers_pixel)[i]);
+        texture_cache_->GetSamplerParameters(samplers_pixel_derefed[i]);
     if (current_samplers_pixel_[i] != parameters) {
       current_samplers_pixel_[i] = parameters;
       cbuffer_binding_descriptor_indices_pixel_.up_to_date = false;
@@ -4293,6 +4643,10 @@ bool D3D12CommandProcessor::UpdateBindings(
       return false;
     }
     for (size_t i = 0; i < texture_count_vertex; ++i) {
+      if (i + 8 < texture_count_vertex) {
+        texture_cache_->PrefetchTextureBinding(
+            textures_vertex[i + 8].fetch_constant);
+      }
       const D3D12Shader::TextureBinding& texture = textures_vertex[i];
       descriptor_indices[texture.bindless_descriptor_index] =
           texture_cache_->GetActiveTextureBindlessSRVIndex(texture) -
@@ -4740,6 +5094,9 @@ void D3D12CommandProcessor::WriteGammaRampSRV(
   device->CreateShaderResourceView(gamma_ramp_buffer_.Get(), &desc, handle);
 }
+#define COMMAND_PROCESSOR D3D12CommandProcessor
+
+#include "../pm4_command_processor_implement.h"
 }  // namespace d3d12
 }  // namespace gpu
 }  // namespace xe
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h
index 998141f49..37d048d29 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.h
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h
@@ -35,6 +36,7 @@
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/kernel/kernel_state.h"
+#include "xenia/kernel/user_module.h"
 #include "xenia/ui/d3d12/d3d12_descriptor_heap_pool.h"
 #include "xenia/ui/d3d12/d3d12_provider.h"
 #include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
@@ -46,6 +48,7 @@ namespace d3d12 {
 class D3D12CommandProcessor final : public CommandProcessor {
  public:
+#include "../pm4_command_processor_declare.h"
   explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system,
                                  kernel::KernelState* kernel_state);
   ~D3D12CommandProcessor();
@@ -205,22 +208,70 @@ class D3D12CommandProcessor final : public CommandProcessor {
  protected:
   bool SetupContext() override;
   void ShutdownContext() override;
-  XE_FORCEINLINE
+
   void WriteRegister(uint32_t index, uint32_t value) override;
   XE_FORCEINLINE
   virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
                                      uint32_t num_registers) override;
+
+  template <uint32_t register_lower_bound, uint32_t register_upper_bound>
+  XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
+      uint32_t start_index, uint32_t* base, uint32_t num_registers);
   XE_FORCEINLINE
   virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
                                           uint32_t num_registers) override;
+  template <uint32_t register_lower_bound, uint32_t register_upper_bound>
+  XE_FORCEINLINE void WriteRegisterRangeFromRing_WithKnownBound(
+      xe::RingBuffer* ring, uint32_t base, uint32_t num_registers);
   XE_NOINLINE
   void WriteRegisterRangeFromRing_WraparoundCase(xe::RingBuffer* ring,
                                                  uint32_t base,
-                                                 uint32_t num_registers);
-  XE_FORCEINLINE
-  virtual void WriteOneRegisterFromRing(xe::RingBuffer* ring, uint32_t base,
+                                                 uint32_t num_registers);
+  XE_NOINLINE
+  virtual void WriteOneRegisterFromRing(uint32_t base,
                                         uint32_t num_times) override;
+
+  XE_FORCEINLINE
+  void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                             uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteFetchRangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                               uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteBoolRangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                              uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteLoopRangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                              uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteREGISTERSRangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                                   uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteALURangeFromMem(uint32_t start_index, uint32_t* base,
+                            uint32_t num_registers);
+
+  XE_FORCEINLINE
+  void WriteFetchRangeFromMem(uint32_t start_index, uint32_t* base,
+                              uint32_t num_registers);
+
+  XE_FORCEINLINE
+  void WriteBoolRangeFromMem(uint32_t start_index, uint32_t* base,
+                             uint32_t num_registers);
+
+  XE_FORCEINLINE
+  void WriteLoopRangeFromMem(uint32_t start_index, uint32_t* base,
+                             uint32_t num_registers);
+
+  XE_FORCEINLINE
+  void WriteREGISTERSRangeFromMem(uint32_t start_index, uint32_t* base,
+                                  uint32_t num_registers);
+
   void OnGammaRamp256EntryTableValueWritten() override;
   void OnGammaRampPWLValueWritten() override;
@@ -367,6 +418,14 @@ class D3D12CommandProcessor final : public CommandProcessor {
       const draw_util::Scissor& scissor, bool primitive_polygonal,
       reg::RB_DEPTHCONTROL normalized_depth_control);
+
+  template <bool primitive_polygonal, bool edram_rov_used>
+  XE_NOINLINE void UpdateSystemConstantValues_Impl(
+      bool shared_memory_is_uav, uint32_t line_loop_closing_index,
+      xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info,
+      uint32_t used_texture_mask, reg::RB_DEPTHCONTROL normalized_depth_control,
+      uint32_t normalized_color_mask);
+
   void UpdateSystemConstantValues(bool shared_memory_is_uav,
                                   bool primitive_polygonal,
                                   uint32_t line_loop_closing_index,
@@ -619,8 +678,8 @@ class D3D12CommandProcessor final : public CommandProcessor {
   uint32_t current_graphics_root_up_to_date_;
 
   // System shader constants.
-  alignas(XE_HOST_CACHE_LINE_SIZE)
-      DxbcShaderTranslator::SystemConstants system_constants_;
+  alignas(XE_HOST_CACHE_LINE_SIZE)
+      DxbcShaderTranslator::SystemConstants system_constants_;
 
   // Float constant usage masks of the last draw call.
   // chrispy: make sure accesses to these can't cross cacheline boundaries
diff --git a/src/xenia/gpu/d3d12/d3d12_nvapi.hpp b/src/xenia/gpu/d3d12/d3d12_nvapi.hpp
new file mode 100644
index 000000000..023699375
--- /dev/null
+++ b/src/xenia/gpu/d3d12/d3d12_nvapi.hpp
@@ -0,0 +1,122 @@
+#pragma once
+// requires windows.h
+#include <d3d12.h>
+
+namespace lightweight_nvapi {
+
+using nvstatus_t = int;
+
+using nvintfid_t = unsigned int;
+
+#ifndef LIGHTWEIGHT_NVAPI_EXCLUDE_D3D12
+constexpr nvintfid_t id_NvAPI_D3D12_QueryCpuVisibleVidmem = 0x26322BC3;
+
+using cb_NvAPI_D3D12_QueryCpuVisibleVidmem = nvstatus_t (*)(
+    ID3D12Device* pDevice, uint64_t* pTotalBytes, uint64_t* pFreeBytes);
+
+constexpr nvintfid_t id_NvAPI_D3D12_UseDriverHeapPriorities = 0xF0D978A8;
+using cb_NvAPI_D3D12_UseDriverHeapPriorities =
+    nvstatus_t (*)(ID3D12Device* pDevice);
+enum NV_D3D12_RESOURCE_FLAGS {
+  NV_D3D12_RESOURCE_FLAG_NONE = 0,
+  NV_D3D12_RESOURCE_FLAG_HTEX = 1,  //!< Create HTEX texture
+  NV_D3D12_RESOURCE_FLAG_CPUVISIBLE_VIDMEM =
+      2,  //!< Hint to create resource in cpuvisible vidmem
+};
+
+struct NV_RESOURCE_PARAMS {
+  uint32_t version;  //!< Version of structure. Must always be first member
+  NV_D3D12_RESOURCE_FLAGS
+      NVResourceFlags;  //!< Additional NV specific flags (set the
+                        //!< NV_D3D12_RESOURCE_FLAG_HTEX bit to create HTEX
+                        //!< texture)
+};
+
+using cb_NvAPI_D3D12_CreateCommittedResource = nvstatus_t (*)(
+    ID3D12Device* pDevice, const D3D12_HEAP_PROPERTIES* pHeapProperties,
+    D3D12_HEAP_FLAGS HeapFlags, const D3D12_RESOURCE_DESC* pDesc,
+    D3D12_RESOURCE_STATES InitialState,
+    const D3D12_CLEAR_VALUE* pOptimizedClearValue,
+    const NV_RESOURCE_PARAMS* pNVResourceParams, REFIID riid,
+    void** ppvResource, bool* pSupported);
+constexpr nvintfid_t id_NvAPI_D3D12_CreateCommittedResource = 0x27E98AEu;
+#endif
+class nvapi_state_t {
+  HMODULE nvapi64_;
+  void* (*queryinterface_)(unsigned int intfid);
+  bool available_;
+  bool init_ptrs();
+
+  bool call_init_interface();
+  void call_deinit_interface();
+
+ public:
+  nvapi_state_t() : nvapi64_(LoadLibraryA("nvapi64.dll")), available_(false) {
+    available_ = init_ptrs();
+  }
+  ~nvapi_state_t();
+  template <typename T>
+  T* query_interface(unsigned int intfid) {
+    if (queryinterface_ == nullptr) {
+      return nullptr;
+    }
+    return reinterpret_cast<T*>(queryinterface_(intfid));
+  }
+
+  bool is_available() const { return available_; }
+};
+inline bool nvapi_state_t::call_init_interface() {
+  int result = -1;
+  auto initInterfaceEx = query_interface<nvstatus_t(int)>(0xAD298D3F);
+  if (!initInterfaceEx) {
+    auto initInterface = query_interface<nvstatus_t()>(0x150E828u);
+    if (initInterface) {
+      result = initInterface();
+    }
+  } else {
+    result = initInterfaceEx(0);
+  }
+  return result == 0;
+}
+inline void nvapi_state_t::call_deinit_interface() {
+  auto deinitinterfaceex = query_interface<nvstatus_t(int)>(0xD7C61344);
+  if (deinitinterfaceex) {
+    deinitinterfaceex(1);  // or 0? I'm not sure what the proper value is
+  } else {
+    auto deinitinterface = query_interface<nvstatus_t()>(0xD22BDD7E);
+    if (deinitinterface) {
+      deinitinterface();
+    }
+  }
+}
+inline bool nvapi_state_t::init_ptrs() {
+  if (!nvapi64_) return false;
+  queryinterface_ = reinterpret_cast<decltype(queryinterface_)>(
+      GetProcAddress(nvapi64_, "nvapi_QueryInterface"));
+
+  if (!queryinterface_) {
+    return false;
+  }
+  if (!call_init_interface()) {
+    return false;
+  }
+
+  return true;
+}
+inline nvapi_state_t::~nvapi_state_t() {
+  if (available_) {
+    call_deinit_interface();
+  }
+}
+inline void init_nvapi() {
+  /// HMODULE moddy = LoadLibraryA("nvapi64.dll");
+
+  // FARPROC quif = GetProcAddress(moddy, "nvapi_QueryInterface");
+
+  nvapi_state_t nvapi{};
+
+  auto queryvisible =
+      nvapi.query_interface<cb_NvAPI_D3D12_QueryCpuVisibleVidmem>(0x26322BC3);
+  return;
+}
+
+}  // namespace lightweight_nvapi
\ No newline at end of file
diff --git a/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc b/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc
index 03e67d9ac..f65379ea3 100644
--- a/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc
@@ -108,12 +108,11 @@ bool D3D12PrimitiveProcessor::InitializeBuiltinIndexBuffer(
         size_bytes);
     return false;
   }
+
   Microsoft::WRL::ComPtr<ID3D12Resource> upload_resource;
-  if (FAILED(device->CreateCommittedResource(
-          &ui::d3d12::util::kHeapPropertiesUpload,
+  if (!provider.CreateUploadResource(
           provider.GetHeapFlagCreateNotZeroed(), &resource_desc,
-          D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
-          IID_PPV_ARGS(&upload_resource)))) {
+          D3D12_RESOURCE_STATE_GENERIC_READ, IID_PPV_ARGS(&upload_resource))) {
     XELOGE(
         "D3D12 primitive processor: Failed to create the built-in index "
         "buffer upload resource with {} bytes",
diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
index 0ae6c8552..238bbea05 100644
--- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
@@ -5492,11 +5492,19 @@ void D3D12RenderTargetCache::SetCommandListRenderTargets(
   }
 
   // Bind the render targets.
-  if (are_current_command_list_render_targets_valid_ &&
-      std::memcmp(current_command_list_render_targets_,
-                  depth_and_color_render_targets,
-                  sizeof(current_command_list_render_targets_))) {
-    are_current_command_list_render_targets_valid_ = false;
+  if (are_current_command_list_render_targets_valid_) {
+    // chrispy: the small memcmp doesn't get optimized by msvc
+
+    for (unsigned i = 0;
+         i < sizeof(current_command_list_render_targets_) /
+                 sizeof(current_command_list_render_targets_[0]);
+         ++i) {
+      if ((const void*)current_command_list_render_targets_[i] !=
+          (const void*)depth_and_color_render_targets[i]) {
+        are_current_command_list_render_targets_valid_ = false;
+        break;
+      }
+    }
   }
   uint32_t render_targets_are_srgb;
   if (gamma_render_target_as_srgb_) {
diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc
index 94e21a7e0..0cfaaaa41 100644
--- a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc
+++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc
@@ -467,7 +467,7 @@ void D3D12TextureCache::EndFrame() {
         XELOGE("Unsupported texture formats used in the frame:");
         unsupported_header_written = true;
       }
-      XELOGE("* {}{}{}{}", FormatInfo::Get(xenos::TextureFormat(i))->name,
+      XELOGE("* {}{}{}{}", FormatInfo::GetName(xenos::TextureFormat(i)),
             unsupported_features & kUnsupportedResourceBit ? " resource" : "",
             unsupported_features & kUnsupportedUnormBit ? " unsigned" : "",
             unsupported_features & kUnsupportedSnormBit ? " signed" : "");
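The PrefetchSamplerParameters/PrefetchTextureBinding calls added throughout this change all follow one pattern: while element i is being processed, issue an L1 software prefetch for element i+K (K = 2 for samplers, 8 for texture bindings) so the fetch constants are already in cache when the loop reaches them. Generic form of the loop, assuming only the standard _mm_prefetch intrinsic; kLookahead and the function names are illustrative, not xenia API:

#include <xmmintrin.h>  // _mm_prefetch, _MM_HINT_T0
#include <cstddef>

template <typename T, typename Fn>
void ForEachWithPrefetch(const T* items, size_t count, Fn process_one) {
  constexpr size_t kLookahead = 2;  // matches the sampler loops' i + 2
  for (size_t i = 0; i < count; ++i) {
    if (i + kLookahead < count) {
      // Hint the line into L1 (like swcache::PrefetchL1); purely advisory.
      _mm_prefetch(reinterpret_cast<const char*>(items + i + kLookahead),
                   _MM_HINT_T0);
    }
    process_one(items[i]);
  }
}

The lookahead distance is a tuning knob: it should roughly cover the memory latency divided by the per-element work, which is why the heavier texture-binding loop uses a larger K than the sampler loop.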
" signed" : ""); @@ -523,12 +523,16 @@ void D3D12TextureCache::RequestTextures(uint32_t used_texture_mask) { } } } - +// chrispy: optimize this further bool D3D12TextureCache::AreActiveTextureSRVKeysUpToDate( const TextureSRVKey* keys, const D3D12Shader::TextureBinding* host_shader_bindings, size_t host_shader_binding_count) const { for (size_t i = 0; i < host_shader_binding_count; ++i) { + if (i + 8 < host_shader_binding_count) { + PrefetchTextureBinding( + host_shader_bindings[i + 8].fetch_constant); + } const TextureSRVKey& key = keys[i]; const TextureBinding* binding = GetValidTextureBinding(host_shader_bindings[i].fetch_constant); @@ -538,8 +542,9 @@ bool D3D12TextureCache::AreActiveTextureSRVKeysUpToDate( } continue; } - if (key.key != binding->key || key.host_swizzle != binding->host_swizzle || - key.swizzled_signs != binding->swizzled_signs) { + if ((key.key != binding->key) | + (key.host_swizzle != binding->host_swizzle) | + (key.swizzled_signs != binding->swizzled_signs)) { return false; } } @@ -666,8 +671,12 @@ uint32_t D3D12TextureCache::GetActiveTextureBindlessSRVIndex( } return descriptor_index; } - -D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters( +void D3D12TextureCache::PrefetchSamplerParameters( + const D3D12Shader::SamplerBinding& binding) const { + swcache::PrefetchL1(®ister_file()[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + + binding.fetch_constant * 6]); +} + D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters( const D3D12Shader::SamplerBinding& binding) const { const auto& regs = register_file(); const auto& fetch = regs.Get( @@ -694,7 +703,7 @@ D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters( nullptr, nullptr, nullptr, &mip_min_level, nullptr); parameters.mip_min_level = mip_min_level; - + //high cache miss count here, prefetch fetch earlier // TODO(Triang3l): Disable filtering for texture formats not supporting it. 
xenos::AnisoFilter aniso_filter = binding.aniso_filter == xenos::AnisoFilter::kUseFetchConst diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.h b/src/xenia/gpu/d3d12/d3d12_texture_cache.h index d5aacd617..e70954fb9 100644 --- a/src/xenia/gpu/d3d12/d3d12_texture_cache.h +++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.h @@ -119,7 +119,8 @@ class D3D12TextureCache final : public TextureCache { D3D12_CPU_DESCRIPTOR_HANDLE handle); uint32_t GetActiveTextureBindlessSRVIndex( const D3D12Shader::TextureBinding& host_shader_binding); - + void PrefetchSamplerParameters( + const D3D12Shader::SamplerBinding& binding) const; SamplerParameters GetSamplerParameters( const D3D12Shader::SamplerBinding& binding) const; void WriteSampler(SamplerParameters parameters, @@ -712,7 +713,7 @@ class D3D12TextureCache final : public TextureCache { } LoadShaderIndex GetLoadShaderIndex(TextureKey key) const; - + // chrispy: todo, can use simple branchless tests here static constexpr bool AreDimensionsCompatible( xenos::FetchOpDimension binding_dimension, xenos::DataDimension resource_dimension) { diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index 2176e777a..d9914e566 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -1047,8 +1047,7 @@ bool PipelineCache::ConfigurePipeline( PipelineDescription& description = runtime_description.description; if (current_pipeline_ != nullptr && - !std::memcmp(¤t_pipeline_->description.description, &description, - sizeof(description))) { + current_pipeline_->description.description == description) { *pipeline_handle_out = current_pipeline_; *root_signature_out = runtime_description.root_signature; return true; @@ -1059,8 +1058,7 @@ bool PipelineCache::ConfigurePipeline( auto found_range = pipelines_.equal_range(hash); for (auto it = found_range.first; it != found_range.second; ++it) { Pipeline* found_pipeline = it->second; - if (!std::memcmp(&found_pipeline->description.description, &description, - sizeof(description))) { + if (found_pipeline->description.description == description) { current_pipeline_ = found_pipeline; *pipeline_handle_out = found_pipeline; *root_signature_out = found_pipeline->description.root_signature; diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index 30969e7c4..37e73cae4 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -226,6 +226,7 @@ class PipelineCache { PipelineRenderTarget render_targets[xenos::kMaxColorRenderTargets]; + inline bool operator==(const PipelineDescription& other) const; static constexpr uint32_t kVersion = 0x20210425; }); @@ -424,7 +425,34 @@ class PipelineCache { size_t creation_threads_shutdown_from_ = SIZE_MAX; std::vector> creation_threads_; }; +inline bool PipelineCache::PipelineDescription::operator==( + const PipelineDescription& other) const { + constexpr size_t cmp_size = sizeof(PipelineDescription); +#if XE_ARCH_AMD64 == 1 + if constexpr (cmp_size == 64) { + if (vertex_shader_hash != other.vertex_shader_hash || + vertex_shader_modification != other.vertex_shader_modification) { + return false; + } + const __m128i* thiz = (const __m128i*)this; + const __m128i* thoze = (const __m128i*)&other; + __m128i cmp32 = + _mm_cmpeq_epi8(_mm_loadu_si128(thiz + 1), _mm_loadu_si128(thoze + 1)); + cmp32 = _mm_and_si128(cmp32, _mm_cmpeq_epi8(_mm_loadu_si128(thiz + 2), + _mm_loadu_si128(thoze + 2))); + + cmp32 = _mm_and_si128(cmp32, _mm_cmpeq_epi8(_mm_loadu_si128(thiz + 
3), + _mm_loadu_si128(thoze + 3))); + + return _mm_movemask_epi8(cmp32) == 0xFFFF; + + } else +#endif + { + return !memcmp(this, &other, cmp_size); + } +} } // namespace d3d12 } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/draw_extent_estimator.cc b/src/xenia/gpu/draw_extent_estimator.cc index fb65fb96b..31e94dcbb 100644 --- a/src/xenia/gpu/draw_extent_estimator.cc +++ b/src/xenia/gpu/draw_extent_estimator.cc @@ -320,22 +320,38 @@ uint32_t DrawExtentEstimator::EstimateMaxY(bool try_to_estimate_vertex_max_y, // scissor (it's set by Direct3D 9 when a viewport is used), on hosts, it // usually exists and can't be disabled. auto pa_cl_vte_cntl = regs.Get(); + float viewport_bottom = 0.0f; + uint32_t enable_window_offset = + regs.Get().vtx_window_offset_enable; + + bool not_pix_center = !regs.Get().pix_center; + + float window_y_offset_f = float(window_y_offset); + + float yoffset = regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; + // First calculate all the integer.0 or integer.5 offsetting exactly at full // precision. - if (regs.Get().vtx_window_offset_enable) { - viewport_bottom += float(window_y_offset); + // chrispy: branch mispredicts here causing some pain according to vtune + float sm1 = .0f, sm2 = .0f, sm3 = .0f, sm4 = .0f; + + if (enable_window_offset) { + sm1 = window_y_offset_f; } - if (!regs.Get().pix_center) { - viewport_bottom += 0.5f; + if (not_pix_center) { + sm2 = 0.5f; } // Then apply the floating-point viewport offset. if (pa_cl_vte_cntl.vport_y_offset_ena) { - viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; + sm3 = yoffset; } - viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena - ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32) - : 1.0f; + sm4 = pa_cl_vte_cntl.vport_y_scale_ena + ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32) + : 1.0f; + + viewport_bottom = sm1 + sm2 + sm3 + sm4; + // Using floor, or, rather, truncation (because maxing with zero anyway) // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia // GPUs on Direct3D 12 (but not WARP), also like in diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 02d1f6750..24b1eefdc 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -929,8 +929,8 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, XELOGW( "Resolving to format {}, which is untested - treating like {}. 
" "Report the game to Xenia developers!", - FormatInfo::Get(dest_format)->name, - FormatInfo::Get(dest_closest_format)->name); + FormatInfo::GetName(dest_format), + FormatInfo::GetName(dest_closest_format)); } } @@ -1002,7 +1002,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, } } else { XELOGE("Tried to resolve to format {}, which is not a ColorFormat", - dest_format_info.name); + FormatInfo::GetName(dest_format)); copy_dest_extent_start = copy_dest_base_adjusted; copy_dest_extent_end = copy_dest_base_adjusted; } @@ -1117,7 +1117,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, xenos::DepthRenderTargetFormat(depth_edram_info.format)) : xenos::GetColorRenderTargetFormatName( xenos::ColorRenderTargetFormat(color_edram_info.format)), - dest_format_info.name, rb_copy_dest_base, copy_dest_extent_start, + FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start, copy_dest_extent_end); return true; diff --git a/src/xenia/gpu/pm4_command_processor_declare.h b/src/xenia/gpu/pm4_command_processor_declare.h new file mode 100644 index 000000000..2da2fb72c --- /dev/null +++ b/src/xenia/gpu/pm4_command_processor_declare.h @@ -0,0 +1,106 @@ + + +void ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT; + +virtual bool ExecutePacket(); +XE_NOINLINE +bool ExecutePacketType0( uint32_t packet) XE_RESTRICT; +XE_NOINLINE +bool ExecutePacketType1( uint32_t packet) XE_RESTRICT; + +bool ExecutePacketType2( uint32_t packet) XE_RESTRICT; +XE_NOINLINE +bool ExecutePacketType3( uint32_t packet) XE_RESTRICT; +XE_NOINLINE +bool ExecutePacketType3_ME_INIT( uint32_t packet, + uint32_t count) XE_RESTRICT; +bool ExecutePacketType3_NOP( uint32_t packet, + uint32_t count) XE_RESTRICT; +XE_NOINLINE +bool ExecutePacketType3_INTERRUPT( uint32_t packet, + uint32_t count) XE_RESTRICT; +XE_NOINLINE +bool ExecutePacketType3_XE_SWAP( uint32_t packet, + uint32_t count) XE_RESTRICT; + +bool ExecutePacketType3_INDIRECT_BUFFER( uint32_t packet, + uint32_t count) XE_RESTRICT; +XE_NOINLINE +bool ExecutePacketType3_WAIT_REG_MEM( uint32_t packet, + uint32_t count) XE_RESTRICT; +XE_NOINLINE +bool ExecutePacketType3_REG_RMW( uint32_t packet, + uint32_t count) XE_RESTRICT; + +bool ExecutePacketType3_REG_TO_MEM( uint32_t packet, + uint32_t count) XE_RESTRICT; +XE_NOINLINE +bool ExecutePacketType3_MEM_WRITE( uint32_t packet, + uint32_t count) XE_RESTRICT; +XE_NOINLINE +bool ExecutePacketType3_COND_WRITE( uint32_t packet, + uint32_t count) XE_RESTRICT; + +bool ExecutePacketType3_EVENT_WRITE( uint32_t packet, + uint32_t count) XE_RESTRICT; +XE_NOINLINE +bool ExecutePacketType3_EVENT_WRITE_SHD( uint32_t packet, + uint32_t count) XE_RESTRICT; + +bool ExecutePacketType3_EVENT_WRITE_EXT( uint32_t packet, + uint32_t count) XE_RESTRICT; +XE_NOINLINE +bool ExecutePacketType3_EVENT_WRITE_ZPD( uint32_t packet, + uint32_t count) XE_RESTRICT; + +bool ExecutePacketType3Draw( uint32_t packet, + const char* opcode_name, + uint32_t viz_query_condition, + uint32_t count_remaining) XE_RESTRICT; + +bool ExecutePacketType3_DRAW_INDX( uint32_t packet, + uint32_t count) XE_RESTRICT; + +bool ExecutePacketType3_DRAW_INDX_2( uint32_t packet, + uint32_t count) XE_RESTRICT; +XE_FORCEINLINE +bool ExecutePacketType3_SET_CONSTANT( uint32_t packet, + uint32_t count) XE_RESTRICT; +XE_NOINLINE +bool ExecutePacketType3_SET_CONSTANT2( uint32_t packet, + uint32_t count) XE_RESTRICT; +XE_FORCEINLINE +bool ExecutePacketType3_LOAD_ALU_CONSTANT( uint32_t packet, + uint32_t count) XE_RESTRICT; + +bool 
ExecutePacketType3_SET_SHADER_CONSTANTS( + uint32_t packet, + uint32_t count) XE_RESTRICT; + +bool ExecutePacketType3_IM_LOAD( uint32_t packet, + uint32_t count) XE_RESTRICT; + +bool ExecutePacketType3_IM_LOAD_IMMEDIATE( uint32_t packet, + uint32_t count) XE_RESTRICT; + +bool ExecutePacketType3_INVALIDATE_STATE( uint32_t packet, + uint32_t count) XE_RESTRICT; + +bool ExecutePacketType3_VIZ_QUERY( uint32_t packet, + uint32_t count) XE_RESTRICT; + + +XE_FORCEINLINE +void WriteEventInitiator(uint32_t value) XE_RESTRICT; + +XE_NOINLINE +XE_COLD +bool HitUnimplementedOpcode(uint32_t opcode, uint32_t count) XE_RESTRICT; + +XE_NOINLINE +XE_NOALIAS +uint32_t GetCurrentRingReadCount(); + +XE_NOINLINE +XE_COLD +bool ExecutePacketType3_CountOverflow(uint32_t count); \ No newline at end of file diff --git a/src/xenia/gpu/pm4_command_processor_implement.h b/src/xenia/gpu/pm4_command_processor_implement.h new file mode 100644 index 000000000..53b81b888 --- /dev/null +++ b/src/xenia/gpu/pm4_command_processor_implement.h @@ -0,0 +1,1123 @@ +#pragma once +using namespace xe::gpu::xenos; +void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr, + uint32_t count) XE_RESTRICT { + SCOPE_profile_cpu_f("gpu"); + + trace_writer_.WriteIndirectBufferStart(ptr, count * sizeof(uint32_t)); + + RingBuffer old_reader = reader_; + + // Execute commands! + // RingBuffer reader(memory_->TranslatePhysical(ptr), count * + // sizeof(uint32_t)); reader.set_write_offset(count * sizeof(uint32_t)); + new (&reader_) + RingBuffer(memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)); + reader_.set_write_offset(count * sizeof(uint32_t)); + do { + if (COMMAND_PROCESSOR::ExecutePacket()) { + continue; + } else { + // Return up a level if we encounter a bad packet. + XELOGE("**** INDIRECT RINGBUFFER: Failed to execute packet."); + assert_always(); + // break; + } + } while (reader_.read_count()); + + trace_writer_.WriteIndirectBufferEnd(); + reader_ = old_reader; +} + +bool COMMAND_PROCESSOR::ExecutePacket() { + // prefetch the wraparound range + // it likely is already in L3 cache, but in a zen system it may be another + // chiplets l3 + reader_.BeginPrefetchedRead( + COMMAND_PROCESSOR::GetCurrentRingReadCount()); + const uint32_t packet = reader_.ReadAndSwap(); + const uint32_t packet_type = packet >> 30; + + XE_LIKELY_IF(packet && packet != 0x0BADF00D) { + XE_LIKELY_IF((packet != 0xCDCDCDCD)) { + actually_execute_packet: + // chrispy: reorder checks by probability + XE_LIKELY_IF(packet_type == 3) { + return COMMAND_PROCESSOR::ExecutePacketType3(packet); + } + else { + if (packet_type == + 0) { // dont know whether 0 or 1 are the next most frequent + return COMMAND_PROCESSOR::ExecutePacketType0(packet); + } else { + if (packet_type == 1) { + return COMMAND_PROCESSOR::ExecutePacketType1(packet); + } else { + // originally there was a default case that msvc couldn't optimize + // away because it doesnt have value range analysis but in reality + // there is no default, a uint32_t >> 30 only has 4 possible values + // and all are covered here + // return COMMAND_PROCESSOR::ExecutePacketType2(packet); + // executepackettype2 is identical + goto handle_bad_packet; + } + } + } + } + else { + XELOGW("GPU packet is CDCDCDCD - probably read uninitialized memory!"); + goto actually_execute_packet; + } + } + else { + handle_bad_packet: + trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1); + trace_writer_.WritePacketEnd(); + return true; + } +} +XE_NOINLINE +bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) 
XE_RESTRICT { + // Type-0 packet. + // Write count registers in sequence to the registers starting at + // (base_index << 2). + + uint32_t count = ((packet >> 16) & 0x3FFF) + 1; + if (COMMAND_PROCESSOR::GetCurrentRingReadCount() < count * sizeof(uint32_t)) { + XELOGE( + "ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})", + COMMAND_PROCESSOR::GetCurrentRingReadCount(), count * sizeof(uint32_t)); + return false; + } + + trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count); + + uint32_t base_index = (packet & 0x7FFF); + uint32_t write_one_reg = (packet >> 15) & 0x1; + + if (!write_one_reg) { + COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, base_index, count); + + } else { + COMMAND_PROCESSOR::WriteOneRegisterFromRing(base_index, count); + } + + trace_writer_.WritePacketEnd(); + return true; +} +XE_NOINLINE +bool COMMAND_PROCESSOR::ExecutePacketType1(uint32_t packet) XE_RESTRICT { + // Type-1 packet. + // Contains two registers of data. Type-0 should be more common. + trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 3); + uint32_t reg_index_1 = packet & 0x7FF; + uint32_t reg_index_2 = (packet >> 11) & 0x7FF; + uint32_t reg_data_1 = reader_.ReadAndSwap(); + uint32_t reg_data_2 = reader_.ReadAndSwap(); + COMMAND_PROCESSOR::WriteRegister(reg_index_1, reg_data_1); + COMMAND_PROCESSOR::WriteRegister(reg_index_2, reg_data_2); + trace_writer_.WritePacketEnd(); + return true; +} + +bool COMMAND_PROCESSOR::ExecutePacketType2(uint32_t packet) XE_RESTRICT { + // Type-2 packet. + // No-op. Do nothing. + trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1); + trace_writer_.WritePacketEnd(); + return true; +} +XE_NOINLINE +XE_NOALIAS +uint32_t COMMAND_PROCESSOR::GetCurrentRingReadCount() { + return reader_.read_count(); +} +XE_NOINLINE +XE_COLD +bool COMMAND_PROCESSOR::ExecutePacketType3_CountOverflow(uint32_t count) { + XELOGE("ExecutePacketType3 overflow (read count {:08X}, packet count {:08X})", + COMMAND_PROCESSOR::GetCurrentRingReadCount(), + count * sizeof(uint32_t)); + return false; +} +XE_NOINLINE +bool COMMAND_PROCESSOR::ExecutePacketType3(uint32_t packet) XE_RESTRICT { + // Type-3 packet. + uint32_t opcode = (packet >> 8) & 0x7F; + uint32_t count = ((packet >> 16) & 0x3FFF) + 1; + auto data_start_offset = reader_.read_offset(); + + if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >= + count * sizeof(uint32_t)) { + // To handle nesting behavior when tracing we special case indirect buffers. + if (opcode == PM4_INDIRECT_BUFFER) { + trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 2); + } else { + trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), + 1 + count); + } + + // & 1 == predicate - when set, we do bin check to see if we should execute + // the packet. Only type 3 packets are affected. + // We also skip predicated swaps, as they are never valid (probably?). 
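// Factored out for illustration (this helper is hypothetical; the code below
// performs the test inline): a type-3 packet with bit 0 set is predicated and
// executes only if the current bin select intersects the bin mask, and a
// predicated swap is always skipped.
//
//   bool SkipPredicated(uint32_t packet, uint32_t opcode,
//                       uint64_t bin_select, uint64_t bin_mask) {
//     if (!(packet & 1)) return false;  // not predicated, always execute
//     bool any_pass = (bin_select & bin_mask) != 0;
//     return !any_pass || opcode == PM4_XE_SWAP;
//   }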
+ if (packet & 1) { + bool any_pass = (bin_select_ & bin_mask_) != 0; + if (!any_pass || opcode == PM4_XE_SWAP) { + reader_.AdvanceRead(count * sizeof(uint32_t)); + trace_writer_.WritePacketEnd(); + return true; + } + } + + bool result = false; + switch (opcode) { + case PM4_ME_INIT: + result = COMMAND_PROCESSOR::ExecutePacketType3_ME_INIT(packet, count); + break; + case PM4_NOP: + result = COMMAND_PROCESSOR::ExecutePacketType3_NOP(packet, count); + break; + case PM4_INTERRUPT: + result = COMMAND_PROCESSOR::ExecutePacketType3_INTERRUPT(packet, count); + break; + case PM4_XE_SWAP: + result = COMMAND_PROCESSOR::ExecutePacketType3_XE_SWAP(packet, count); + break; + case PM4_INDIRECT_BUFFER: + case PM4_INDIRECT_BUFFER_PFD: + result = COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER(packet, + count); + break; + case PM4_WAIT_REG_MEM: + result = + COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(packet, count); + break; + case PM4_REG_RMW: + result = COMMAND_PROCESSOR::ExecutePacketType3_REG_RMW(packet, count); + break; + case PM4_REG_TO_MEM: + result = + COMMAND_PROCESSOR::ExecutePacketType3_REG_TO_MEM(packet, count); + break; + case PM4_MEM_WRITE: + result = COMMAND_PROCESSOR::ExecutePacketType3_MEM_WRITE(packet, count); + break; + case PM4_COND_WRITE: + result = + COMMAND_PROCESSOR::ExecutePacketType3_COND_WRITE(packet, count); + break; + case PM4_EVENT_WRITE: + result = + COMMAND_PROCESSOR::ExecutePacketType3_EVENT_WRITE(packet, count); + break; + case PM4_EVENT_WRITE_SHD: + result = COMMAND_PROCESSOR::ExecutePacketType3_EVENT_WRITE_SHD(packet, + count); + break; + case PM4_EVENT_WRITE_EXT: + result = COMMAND_PROCESSOR::ExecutePacketType3_EVENT_WRITE_EXT(packet, + count); + break; + case PM4_EVENT_WRITE_ZPD: + result = COMMAND_PROCESSOR::ExecutePacketType3_EVENT_WRITE_ZPD(packet, + count); + break; + case PM4_DRAW_INDX: + result = COMMAND_PROCESSOR::ExecutePacketType3_DRAW_INDX(packet, count); + break; + case PM4_DRAW_INDX_2: + result = + COMMAND_PROCESSOR::ExecutePacketType3_DRAW_INDX_2(packet, count); + break; + case PM4_SET_CONSTANT: + result = + COMMAND_PROCESSOR::ExecutePacketType3_SET_CONSTANT(packet, count); + break; + case PM4_SET_CONSTANT2: + result = + COMMAND_PROCESSOR::ExecutePacketType3_SET_CONSTANT2(packet, count); + break; + case PM4_LOAD_ALU_CONSTANT: + result = COMMAND_PROCESSOR::ExecutePacketType3_LOAD_ALU_CONSTANT(packet, + count); + break; + case PM4_SET_SHADER_CONSTANTS: + result = COMMAND_PROCESSOR::ExecutePacketType3_SET_SHADER_CONSTANTS( + packet, count); + break; + case PM4_IM_LOAD: + result = COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD(packet, count); + break; + case PM4_IM_LOAD_IMMEDIATE: + result = COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE(packet, + count); + break; + case PM4_INVALIDATE_STATE: + result = COMMAND_PROCESSOR::ExecutePacketType3_INVALIDATE_STATE(packet, + count); + break; + case PM4_VIZ_QUERY: + result = COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY(packet, count); + break; + + case PM4_SET_BIN_MASK_LO: { + uint32_t value = reader_.ReadAndSwap(); + bin_mask_ = (bin_mask_ & 0xFFFFFFFF00000000ull) | value; + result = true; + } break; + case PM4_SET_BIN_MASK_HI: { + uint32_t value = reader_.ReadAndSwap(); + bin_mask_ = + (bin_mask_ & 0xFFFFFFFFull) | (static_cast(value) << 32); + result = true; + } break; + case PM4_SET_BIN_SELECT_LO: { + uint32_t value = reader_.ReadAndSwap(); + bin_select_ = (bin_select_ & 0xFFFFFFFF00000000ull) | value; + result = true; + } break; + case PM4_SET_BIN_SELECT_HI: { + uint32_t value = 
reader_.ReadAndSwap(); + bin_select_ = (bin_select_ & 0xFFFFFFFFull) | + (static_cast(value) << 32); + result = true; + } break; + case PM4_SET_BIN_MASK: { + assert_true(count == 2); + uint64_t val_hi = reader_.ReadAndSwap(); + uint64_t val_lo = reader_.ReadAndSwap(); + bin_mask_ = (val_hi << 32) | val_lo; + result = true; + } break; + case PM4_SET_BIN_SELECT: { + assert_true(count == 2); + uint64_t val_hi = reader_.ReadAndSwap(); + uint64_t val_lo = reader_.ReadAndSwap(); + bin_select_ = (val_hi << 32) | val_lo; + result = true; + } break; + case PM4_CONTEXT_UPDATE: { + assert_true(count == 1); + uint32_t value = reader_.ReadAndSwap(); + XELOGGPU("GPU context update = {:08X}", value); + assert_true(value == 0); + result = true; + break; + } + case PM4_WAIT_FOR_IDLE: { + // This opcode is used by 5454084E while going / being ingame. + assert_true(count == 1); + uint32_t value = reader_.ReadAndSwap(); + XELOGGPU("GPU wait for idle = {:08X}", value); + result = true; + break; + } + + default: + return COMMAND_PROCESSOR::HitUnimplementedOpcode(opcode, count); + } + + trace_writer_.WritePacketEnd(); +#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1 + + if (opcode == PM4_XE_SWAP) { + // End the trace writer frame. + if (trace_writer_.is_open()) { + trace_writer_.WriteEvent(EventCommand::Type::kSwap); + trace_writer_.Flush(); + if (trace_state_ == TraceState::kSingleFrame) { + trace_state_ = TraceState::kDisabled; + trace_writer_.Close(); + } + } else if (trace_state_ == TraceState::kSingleFrame) { + // New trace request - we only start tracing at the beginning of a + // frame. + uint32_t title_id = kernel_state_->GetExecutableModule()->title_id(); + auto file_name = fmt::format("{:08X}_{}.xtr", title_id, counter_ - 1); + auto path = trace_frame_path_ / file_name; + trace_writer_.Open(path, title_id); + InitializeTrace(); + } + } +#endif + + assert_true(reader_.read_offset() == + (data_start_offset + (count * sizeof(uint32_t))) % + reader_.capacity()); + return result; + } else { + return COMMAND_PROCESSOR::ExecutePacketType3_CountOverflow(count); + } +} + +XE_NOINLINE +XE_COLD +bool COMMAND_PROCESSOR::HitUnimplementedOpcode(uint32_t opcode, + uint32_t count) XE_RESTRICT { + XELOGGPU("Unimplemented GPU OPCODE: 0x{:02X}\t\tCOUNT: {}\n", opcode, count); + assert_always(); + reader_.AdvanceRead(count * sizeof(uint32_t)); + trace_writer_.WritePacketEnd(); + return false; +} +XE_NOINLINE +bool COMMAND_PROCESSOR::ExecutePacketType3_ME_INIT(uint32_t packet, + uint32_t count) XE_RESTRICT { + // initialize CP's micro-engine + me_bin_.resize(count); + for (uint32_t i = 0; i < count; i++) { + me_bin_[i] = reader_.ReadAndSwap(); + } + return true; +} + +bool COMMAND_PROCESSOR::ExecutePacketType3_NOP(uint32_t packet, + uint32_t count) XE_RESTRICT { + // skip N 32-bit words to get to the next packet + // No-op, ignore some data. + reader_.AdvanceRead(count * sizeof(uint32_t)); + return true; +} +XE_NOINLINE +bool COMMAND_PROCESSOR::ExecutePacketType3_INTERRUPT( + uint32_t packet, uint32_t count) XE_RESTRICT { + SCOPE_profile_cpu_f("gpu"); + + // generate interrupt from the command stream + uint32_t cpu_mask = reader_.ReadAndSwap(); + for (int n = 0; n < 6; n++) { + if (cpu_mask & (1 << n)) { + graphics_system_->DispatchInterruptCallback(1, n); + } + } + return true; +} +XE_NOINLINE +bool COMMAND_PROCESSOR::ExecutePacketType3_XE_SWAP(uint32_t packet, + uint32_t count) XE_RESTRICT { + SCOPE_profile_cpu_f("gpu"); + + Profiler::Flip(); + + // Xenia-specific VdSwap hook. 
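// Payload layout implied by the reads below (illustrative only; the code
// consumes the words one by one and skips the remainder):
//
//   struct XeSwapPayload {
//     uint32_t magic;               // must equal kSwapSignature
//     uint32_t frontbuffer_ptr;
//     uint32_t frontbuffer_width;
//     uint32_t frontbuffer_height;
//     // (count - 4) trailing words, dropped via AdvanceRead
//   };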
+ // VdSwap will post this to tell us we need to swap the screen/fire an + // interrupt. + // 63 words here, but only the first has any data. + uint32_t magic = reader_.ReadAndSwap(); + assert_true(magic == kSwapSignature); + + // TODO(benvanik): only swap frontbuffer ptr. + uint32_t frontbuffer_ptr = reader_.ReadAndSwap(); + uint32_t frontbuffer_width = reader_.ReadAndSwap(); + uint32_t frontbuffer_height = reader_.ReadAndSwap(); + reader_.AdvanceRead((count - 4) * sizeof(uint32_t)); + + COMMAND_PROCESSOR::IssueSwap(frontbuffer_ptr, frontbuffer_width, + frontbuffer_height); + + ++counter_; + return true; +} + +bool COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER( + uint32_t packet, uint32_t count) XE_RESTRICT { + // indirect buffer dispatch + uint32_t list_ptr = CpuToGpu(reader_.ReadAndSwap()); + uint32_t list_length = reader_.ReadAndSwap(); + assert_zero(list_length & ~0xFFFFF); + list_length &= 0xFFFFF; + COMMAND_PROCESSOR::ExecuteIndirectBuffer(GpuToCpu(list_ptr), list_length); + return true; +} +XE_NOINLINE +bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM( + uint32_t packet, uint32_t count) XE_RESTRICT { + SCOPE_profile_cpu_f("gpu"); + + // wait until a register or memory location is a specific value + uint32_t wait_info = reader_.ReadAndSwap(); + uint32_t poll_reg_addr = reader_.ReadAndSwap(); + uint32_t ref = reader_.ReadAndSwap(); + uint32_t mask = reader_.ReadAndSwap(); + uint32_t wait = reader_.ReadAndSwap(); + bool matched = false; + do { + uint32_t value; + if (wait_info & 0x10) { + // Memory. + auto endianness = static_cast(poll_reg_addr & 0x3); + poll_reg_addr &= ~0x3; + value = xe::load(memory_->TranslatePhysical(poll_reg_addr)); + value = GpuSwap(value, endianness); + trace_writer_.WriteMemoryRead(CpuToGpu(poll_reg_addr), 4); + } else { + // Register. + assert_true(poll_reg_addr < RegisterFile::kRegisterCount); + value = register_file_->values[poll_reg_addr].u32; + if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) { + MakeCoherent(); + value = register_file_->values[poll_reg_addr].u32; + } + } + switch (wait_info & 0x7) { + case 0x0: // Never. + matched = false; + break; + case 0x1: // Less than reference. + matched = (value & mask) < ref; + break; + case 0x2: // Less than or equal to reference. + matched = (value & mask) <= ref; + break; + case 0x3: // Equal to reference. + matched = (value & mask) == ref; + break; + case 0x4: // Not equal to reference. + matched = (value & mask) != ref; + break; + case 0x5: // Greater than or equal to reference. + matched = (value & mask) >= ref; + break; + case 0x6: // Greater than reference. + matched = (value & mask) > ref; + break; + case 0x7: // Always + matched = true; + break; + } + if (!matched) { + // Wait. + if (wait >= 0x100) { + PrepareForWait(); + if (!cvars::vsync) { + // User wants it fast and dangerous. + xe::threading::MaybeYield(); + } else { + xe::threading::Sleep(std::chrono::milliseconds(wait / 0x100)); + } + xe::threading::SyncMemory(); + ReturnFromWait(); + + if (!worker_running_) { + // Short-circuited exit. + return false; + } + } else { + xe::threading::MaybeYield(); + } + } + } while (!matched); + + return true; +} +XE_NOINLINE +bool COMMAND_PROCESSOR::ExecutePacketType3_REG_RMW(uint32_t packet, + uint32_t count) XE_RESTRICT { + // register read/modify/write + // ? 
(used during shader upload and edram setup) + uint32_t rmw_info = reader_.ReadAndSwap(); + uint32_t and_mask = reader_.ReadAndSwap(); + uint32_t or_mask = reader_.ReadAndSwap(); + uint32_t value = register_file_->values[rmw_info & 0x1FFF].u32; + if ((rmw_info >> 31) & 0x1) { + // & reg + value &= register_file_->values[and_mask & 0x1FFF].u32; + } else { + // & imm + value &= and_mask; + } + if ((rmw_info >> 30) & 0x1) { + // | reg + value |= register_file_->values[or_mask & 0x1FFF].u32; + } else { + // | imm + value |= or_mask; + } + COMMAND_PROCESSOR::WriteRegister(rmw_info & 0x1FFF, value); + return true; +} + +bool COMMAND_PROCESSOR::ExecutePacketType3_REG_TO_MEM( + uint32_t packet, uint32_t count) XE_RESTRICT { + // Copy Register to Memory (?) + // Count is 2, assuming a Register Addr and a Memory Addr. + + uint32_t reg_addr = reader_.ReadAndSwap(); + uint32_t mem_addr = reader_.ReadAndSwap(); + + uint32_t reg_val; + + assert_true(reg_addr < RegisterFile::kRegisterCount); + reg_val = register_file_->values[reg_addr].u32; + + auto endianness = static_cast(mem_addr & 0x3); + mem_addr &= ~0x3; + reg_val = GpuSwap(reg_val, endianness); + xe::store(memory_->TranslatePhysical(mem_addr), reg_val); + trace_writer_.WriteMemoryWrite(CpuToGpu(mem_addr), 4); + + return true; +} +XE_NOINLINE +bool COMMAND_PROCESSOR::ExecutePacketType3_MEM_WRITE( + uint32_t packet, uint32_t count) XE_RESTRICT { + uint32_t write_addr = reader_.ReadAndSwap(); + for (uint32_t i = 0; i < count - 1; i++) { + uint32_t write_data = reader_.ReadAndSwap(); + + auto endianness = static_cast(write_addr & 0x3); + auto addr = write_addr & ~0x3; + write_data = GpuSwap(write_data, endianness); + xe::store(memory_->TranslatePhysical(addr), write_data); + trace_writer_.WriteMemoryWrite(CpuToGpu(addr), 4); + write_addr += 4; + } + + return true; +} +XE_NOINLINE +bool COMMAND_PROCESSOR::ExecutePacketType3_COND_WRITE( + uint32_t packet, uint32_t count) XE_RESTRICT { + // conditional write to memory or register + uint32_t wait_info = reader_.ReadAndSwap(); + uint32_t poll_reg_addr = reader_.ReadAndSwap(); + uint32_t ref = reader_.ReadAndSwap(); + uint32_t mask = reader_.ReadAndSwap(); + uint32_t write_reg_addr = reader_.ReadAndSwap(); + uint32_t write_data = reader_.ReadAndSwap(); + uint32_t value; + if (wait_info & 0x10) { + // Memory. + auto endianness = static_cast(poll_reg_addr & 0x3); + poll_reg_addr &= ~0x3; + trace_writer_.WriteMemoryRead(CpuToGpu(poll_reg_addr), 4); + value = xe::load(memory_->TranslatePhysical(poll_reg_addr)); + value = GpuSwap(value, endianness); + } else { + // Register. + assert_true(poll_reg_addr < RegisterFile::kRegisterCount); + value = register_file_->values[poll_reg_addr].u32; + } + bool matched = false; + switch (wait_info & 0x7) { + case 0x0: // Never. + matched = false; + break; + case 0x1: // Less than reference. + matched = (value & mask) < ref; + break; + case 0x2: // Less than or equal to reference. + matched = (value & mask) <= ref; + break; + case 0x3: // Equal to reference. + matched = (value & mask) == ref; + break; + case 0x4: // Not equal to reference. + matched = (value & mask) != ref; + break; + case 0x5: // Greater than or equal to reference. + matched = (value & mask) >= ref; + break; + case 0x6: // Greater than reference. + matched = (value & mask) > ref; + break; + case 0x7: // Always + matched = true; + break; + } + if (matched) { + // Write. + if (wait_info & 0x100) { + // Memory. 
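// The poll comparison a few lines above (and its twin in PM4_WAIT_REG_MEM) is
// a 3-bit compare function; written out as a hypothetical helper for clarity:
//
//   bool CompareRef(uint32_t func, uint32_t value, uint32_t mask,
//                   uint32_t ref) {
//     switch (func & 0x7) {
//       case 0x0: return false;                  // never
//       case 0x1: return (value & mask) < ref;
//       case 0x2: return (value & mask) <= ref;
//       case 0x3: return (value & mask) == ref;
//       case 0x4: return (value & mask) != ref;
//       case 0x5: return (value & mask) >= ref;
//       case 0x6: return (value & mask) > ref;
//       default:  return true;                   // 0x7: always
//     }
//   }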
+ auto endianness = static_cast(write_reg_addr & 0x3); + write_reg_addr &= ~0x3; + write_data = GpuSwap(write_data, endianness); + xe::store(memory_->TranslatePhysical(write_reg_addr), write_data); + trace_writer_.WriteMemoryWrite(CpuToGpu(write_reg_addr), 4); + } else { + // Register. + COMMAND_PROCESSOR::WriteRegister(write_reg_addr, write_data); + } + } + return true; +} +XE_FORCEINLINE +void COMMAND_PROCESSOR::WriteEventInitiator(uint32_t value) XE_RESTRICT { + register_file_->values[XE_GPU_REG_VGT_EVENT_INITIATOR].u32 = value; +} +bool COMMAND_PROCESSOR::ExecutePacketType3_EVENT_WRITE( + uint32_t packet, uint32_t count) XE_RESTRICT { + // generate an event that creates a write to memory when completed + uint32_t initiator = reader_.ReadAndSwap(); + // Writeback initiator. + + COMMAND_PROCESSOR::WriteEventInitiator(initiator & 0x3f); + if (count == 1) { + // Just an event flag? Where does this write? + } else { + // Write to an address. + assert_always(); + reader_.AdvanceRead((count - 1) * sizeof(uint32_t)); + } + return true; +} +XE_NOINLINE +bool COMMAND_PROCESSOR::ExecutePacketType3_EVENT_WRITE_SHD( + uint32_t packet, uint32_t count) XE_RESTRICT { + // generate a VS|PS_done event + uint32_t initiator = reader_.ReadAndSwap(); + uint32_t address = reader_.ReadAndSwap(); + uint32_t value = reader_.ReadAndSwap(); + // Writeback initiator. + COMMAND_PROCESSOR::WriteEventInitiator(initiator & 0x3F); + uint32_t data_value; + if ((initiator >> 31) & 0x1) { + // Write counter (GPU vblank counter?). + data_value = counter_; + } else { + // Write value. + data_value = value; + } + auto endianness = static_cast(address & 0x3); + address &= ~0x3; + data_value = GpuSwap(data_value, endianness); + xe::store(memory_->TranslatePhysical(address), data_value); + trace_writer_.WriteMemoryWrite(CpuToGpu(address), 4); + return true; +} + +bool COMMAND_PROCESSOR::ExecutePacketType3_EVENT_WRITE_EXT( + uint32_t packet, uint32_t count) XE_RESTRICT { + // generate a screen extent event + uint32_t initiator = reader_.ReadAndSwap(); + uint32_t address = reader_.ReadAndSwap(); + // Writeback initiator. + COMMAND_PROCESSOR::WriteEventInitiator(initiator & 0x3F); + auto endianness = static_cast(address & 0x3); + address &= ~0x3; + + // Let us hope we can fake this. + // This callback tells the driver the xy coordinates affected by a previous + // drawcall. + // https://www.google.com/patents/US20060055701 + uint16_t extents[] = { + byte_swap(0 >> 3), // min x + byte_swap(xenos::kTexture2DCubeMaxWidthHeight >> + 3), // max x + byte_swap(0 >> 3), // min y + byte_swap(xenos::kTexture2DCubeMaxWidthHeight >> + 3), // max y + byte_swap(0), // min z + byte_swap(1), // max z + }; + assert_true(endianness == xenos::Endian::k8in16); + + uint16_t* destination = (uint16_t*)memory_->TranslatePhysical(address); + + for (unsigned i = 0; i < 6; ++i) { + destination[i] = extents[i]; + } + // xe::copy_and_swap_16_unaligned(memory_->TranslatePhysical(address), + // extents, + // xe::countof(extents)); + + trace_writer_.WriteMemoryWrite(CpuToGpu(address), sizeof(extents)); + return true; +} +XE_NOINLINE +bool COMMAND_PROCESSOR::ExecutePacketType3_EVENT_WRITE_ZPD( + uint32_t packet, uint32_t count) XE_RESTRICT { + // Set by D3D as BE but struct ABI is LE + const uint32_t kQueryFinished = xe::byte_swap(0xFFFFFEED); + assert_true(count == 1); + uint32_t initiator = reader_.ReadAndSwap(); + // Writeback initiator. 
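// (Only the low six bits of the initiator word form the VGT_EVENT_INITIATOR
// event type; the upper bits carry flags - bit 31 in EVENT_WRITE_SHD, for
// example, selects the counter write - and are dropped by the & 0x3F mask.)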
+  COMMAND_PROCESSOR::WriteEventInitiator(initiator & 0x3F);
+
+  // Occlusion queries:
+  // This command is sent on query begin and end.
+  // As a workaround, report some fixed amount of passed samples.
+  auto fake_sample_count = cvars::query_occlusion_fake_sample_count;
+  if (fake_sample_count >= 0) {
+    auto* pSampleCounts =
+        memory_->TranslatePhysical<xe_gpu_depth_sample_counts*>(
+            register_file_->values[XE_GPU_REG_RB_SAMPLE_COUNT_ADDR].u32);
+    // 0xFFFFFEED is written to these two locations by D3D only on D3DISSUE_END
+    // and used to detect a finished query.
+    bool is_end_via_z_pass = pSampleCounts->ZPass_A == kQueryFinished &&
+                             pSampleCounts->ZPass_B == kQueryFinished;
+    // Older versions of D3D also check for ZFail (4D5307D5).
+    bool is_end_via_z_fail = pSampleCounts->ZFail_A == kQueryFinished &&
+                             pSampleCounts->ZFail_B == kQueryFinished;
+    std::memset(pSampleCounts, 0, sizeof(xe_gpu_depth_sample_counts));
+    if (is_end_via_z_pass || is_end_via_z_fail) {
+      pSampleCounts->ZPass_A = fake_sample_count;
+      pSampleCounts->Total_A = fake_sample_count;
+    }
+  }
+
+  return true;
+}
+
+bool COMMAND_PROCESSOR::ExecutePacketType3Draw(
+    uint32_t packet, const char* opcode_name, uint32_t viz_query_condition,
+    uint32_t count_remaining) XE_RESTRICT {
+  // If viz_query_condition != 0, this is a conditional draw based on a viz
+  // query. This ID matches the one issued in PM4_VIZ_QUERY:
+  // uint32_t viz_id = viz_query_condition & 0x3F;
+  // When set, render conditionally based on the query result:
+  // uint32_t viz_use = viz_query_condition & 0x100;
+
+  assert_not_zero(count_remaining);
+  if (!count_remaining) {
+    XELOGE("{}: Packet too small, can't read VGT_DRAW_INITIATOR", opcode_name);
+    return false;
+  }
+  reg::VGT_DRAW_INITIATOR vgt_draw_initiator;
+  vgt_draw_initiator.value = reader_.ReadAndSwap<uint32_t>();
+  --count_remaining;
+
+  register_file_->values[XE_GPU_REG_VGT_DRAW_INITIATOR].u32 =
+      vgt_draw_initiator.value;
+  bool draw_succeeded = true;
+  // TODO(Triang3l): Remove IndexBufferInfo and replace handling of all this
+  // with PrimitiveProcessor when the old Vulkan renderer is removed.
+  bool is_indexed = false;
+  IndexBufferInfo index_buffer_info;
+  switch (vgt_draw_initiator.source_select) {
+    case xenos::SourceSelect::kDMA: {
+      // Indexed draw.
+      is_indexed = true;
+
+      // Two separate bounds checks so if there's only one missing register
+      // value out of two, one uint32_t will be skipped in the command buffer,
+      // not two.
+      assert_not_zero(count_remaining);
+      if (!count_remaining) {
+        XELOGE("{}: Packet too small, can't read VGT_DMA_BASE", opcode_name);
+        return false;
+      }
+      uint32_t vgt_dma_base = reader_.ReadAndSwap<uint32_t>();
+      --count_remaining;
+      register_file_->values[XE_GPU_REG_VGT_DMA_BASE].u32 = vgt_dma_base;
+      reg::VGT_DMA_SIZE vgt_dma_size;
+      assert_not_zero(count_remaining);
+      if (!count_remaining) {
+        XELOGE("{}: Packet too small, can't read VGT_DMA_SIZE", opcode_name);
+        return false;
+      }
+      vgt_dma_size.value = reader_.ReadAndSwap<uint32_t>();
+      --count_remaining;
+      register_file_->values[XE_GPU_REG_VGT_DMA_SIZE].u32 = vgt_dma_size.value;
+
+      uint32_t index_size_bytes =
+          vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16
+              ? sizeof(uint16_t)
+              : sizeof(uint32_t);
+      // The base address must already be word-aligned according to the R6xx
+      // documentation, but it is masked below anyway for safety.
+      index_buffer_info.guest_base = vgt_dma_base & ~(index_size_bytes - 1);
+      index_buffer_info.endianness = vgt_dma_size.swap_mode;
+      index_buffer_info.format = vgt_draw_initiator.index_size;
+      index_buffer_info.length = vgt_dma_size.num_words * index_size_bytes;
+      index_buffer_info.count = vgt_draw_initiator.num_indices;
+    } break;
+    case xenos::SourceSelect::kImmediate: {
+      // TODO(Triang3l): VGT_IMMED_DATA.
+      XELOGE(
+          "{}: Using immediate vertex indices, which are not supported yet. "
+          "Report the game to Xenia developers!",
+          opcode_name, uint32_t(vgt_draw_initiator.source_select));
+      draw_succeeded = false;
+      assert_always();
+    } break;
+    case xenos::SourceSelect::kAutoIndex: {
+      // Auto draw.
+      index_buffer_info.guest_base = 0;
+      index_buffer_info.length = 0;
+    } break;
+    default: {
+      // Invalid source selection.
+      draw_succeeded = false;
+      assert_unhandled_case(vgt_draw_initiator.source_select);
+    } break;
+  }
+
+  // Skip to the next command, for example, if there are immediate indices
+  // that we don't support yet.
+  reader_.AdvanceRead(count_remaining * sizeof(uint32_t));
+
+  if (draw_succeeded) {
+    auto viz_query = register_file_->Get<reg::PA_SC_VIZ_QUERY>();
+    if (!(viz_query.viz_query_ena && viz_query.kill_pix_post_hi_z)) {
+      // TODO(Triang3l): Don't drop the draw call completely if the vertex
+      // shader has memexport.
+      // TODO(Triang3l || JoelLinn): Handle this properly in the render
+      // backends.
+      draw_succeeded = COMMAND_PROCESSOR::IssueDraw(
+          vgt_draw_initiator.prim_type, vgt_draw_initiator.num_indices,
+          is_indexed ? &index_buffer_info : nullptr,
+          xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
+                                     vgt_draw_initiator.prim_type));
+      if (!draw_succeeded) {
+        XELOGE("{}({}, {}, {}): Failed in backend", opcode_name,
+               vgt_draw_initiator.num_indices,
+               uint32_t(vgt_draw_initiator.prim_type),
+               uint32_t(vgt_draw_initiator.source_select));
+      }
+    }
+  }
+
+  // If we read the packet correctly but merely couldn't execute it (because
+  // of, for instance, features not supported by the host), don't terminate
+  // command buffer processing, as that would leave rendering in a far more
+  // inconsistent state than a single dropped draw command.
+  return true;
+}
+
+bool COMMAND_PROCESSOR::ExecutePacketType3_DRAW_INDX(
+    uint32_t packet, uint32_t count) XE_RESTRICT {
+  // "initiate fetch of index buffer and draw"
+  // Generally used by Xbox 360 Direct3D 9 for kDMA and kAutoIndex sources,
+  // with a viz query token as the first word.
+  uint32_t count_remaining = count;
+  assert_not_zero(count_remaining);
+  if (!count_remaining) {
+    XELOGE("PM4_DRAW_INDX: Packet too small, can't read the viz query token");
+    return false;
+  }
+  uint32_t viz_query_condition = reader_.ReadAndSwap<uint32_t>();
+  --count_remaining;
+  return COMMAND_PROCESSOR::ExecutePacketType3Draw(
+      packet, "PM4_DRAW_INDX", viz_query_condition, count_remaining);
+}
+
+bool COMMAND_PROCESSOR::ExecutePacketType3_DRAW_INDX_2(
+    uint32_t packet, uint32_t count) XE_RESTRICT {
+  // "draw using supplied indices in packet"
+  // Generally used by Xbox 360 Direct3D 9 for the kAutoIndex source.
+  // No viz query token.
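// Unlike PM4_DRAW_INDX above, there is no leading viz query token to consume
// here, so a zero condition is passed through and the entire remaining count
// belongs to the shared ExecutePacketType3Draw path.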
+ return COMMAND_PROCESSOR::ExecutePacketType3Draw(packet, "PM4_DRAW_INDX_2", 0, + count); +} +XE_FORCEINLINE +bool COMMAND_PROCESSOR::ExecutePacketType3_SET_CONSTANT( + uint32_t packet, uint32_t count) XE_RESTRICT { + // load constant into chip and to memory + // PM4_REG(reg) ((0x4 << 16) | (GSL_HAL_SUBBLOCK_OFFSET(reg))) + // reg - 0x2000 + uint32_t offset_type = reader_.ReadAndSwap(); + uint32_t index = offset_type & 0x7FF; + uint32_t type = (offset_type >> 16) & 0xFF; + uint32_t countm1 = count - 1; + switch (type) { + case 0: // ALU + // index += 0x4000; + // COMMAND_PROCESSOR::WriteRegisterRangeFromRing( index, countm1); + COMMAND_PROCESSOR::WriteALURangeFromRing(&reader_, index, countm1); + break; + case 1: // FETCH + + COMMAND_PROCESSOR::WriteFetchRangeFromRing(&reader_, index, countm1); + + break; + case 2: // BOOL + COMMAND_PROCESSOR::WriteBoolRangeFromRing(&reader_, index, countm1); + + break; + case 3: // LOOP + + COMMAND_PROCESSOR::WriteLoopRangeFromRing(&reader_, index, countm1); + + break; + case 4: // REGISTERS + + COMMAND_PROCESSOR::WriteREGISTERSRangeFromRing(&reader_, index, countm1); + + break; + default: + assert_always(); + reader_.AdvanceRead((count - 1) * sizeof(uint32_t)); + return true; + } + + return true; +} +XE_NOINLINE +bool COMMAND_PROCESSOR::ExecutePacketType3_SET_CONSTANT2( + uint32_t packet, uint32_t count) XE_RESTRICT { + uint32_t offset_type = reader_.ReadAndSwap(); + uint32_t index = offset_type & 0xFFFF; + uint32_t countm1 = count - 1; + + COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, index, countm1); + + return true; +} +XE_FORCEINLINE +bool COMMAND_PROCESSOR::ExecutePacketType3_LOAD_ALU_CONSTANT( + uint32_t packet, uint32_t count) XE_RESTRICT { + // load constants from memory + uint32_t address = reader_.ReadAndSwap(); + address &= 0x3FFFFFFF; + uint32_t offset_type = reader_.ReadAndSwap(); + uint32_t index = offset_type & 0x7FF; + uint32_t size_dwords = reader_.ReadAndSwap(); + size_dwords &= 0xFFF; + uint32_t type = (offset_type >> 16) & 0xFF; + + auto xlat_address = (uint32_t*)memory_->TranslatePhysical(address); + + switch (type) { + case 0: // ALU + trace_writer_.WriteMemoryRead(CpuToGpu(address), size_dwords * 4); + COMMAND_PROCESSOR::WriteALURangeFromMem(index, xlat_address, size_dwords); + + break; + case 1: // FETCH + trace_writer_.WriteMemoryRead(CpuToGpu(address), size_dwords * 4); + COMMAND_PROCESSOR::WriteFetchRangeFromMem(index, xlat_address, + size_dwords); + break; + case 2: // BOOL + trace_writer_.WriteMemoryRead(CpuToGpu(address), size_dwords * 4); + + COMMAND_PROCESSOR::WriteBoolRangeFromMem(index, xlat_address, + size_dwords); + break; + case 3: // LOOP + trace_writer_.WriteMemoryRead(CpuToGpu(address), size_dwords * 4); + + COMMAND_PROCESSOR::WriteLoopRangeFromMem(index, xlat_address, + size_dwords); + + break; + case 4: // REGISTERS + // chrispy: todo, REGISTERS cannot write any special regs, so optimize for + // that + trace_writer_.WriteMemoryRead(CpuToGpu(address), size_dwords * 4); + + COMMAND_PROCESSOR::WriteREGISTERSRangeFromMem(index, xlat_address, + size_dwords); + break; + default: + assert_always(); + return true; + } + + return true; +} + +bool COMMAND_PROCESSOR::ExecutePacketType3_SET_SHADER_CONSTANTS( + uint32_t packet, uint32_t count) XE_RESTRICT { + uint32_t offset_type = reader_.ReadAndSwap(); + uint32_t index = offset_type & 0xFFFF; + uint32_t countm1 = count - 1; + COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, index, countm1); + + return true; +} + +bool 
COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD(uint32_t packet, + uint32_t count) XE_RESTRICT { + SCOPE_profile_cpu_f("gpu"); + + // load sequencer instruction memory (pointer-based) + uint32_t addr_type = reader_.ReadAndSwap(); + auto shader_type = static_cast(addr_type & 0x3); + uint32_t addr = addr_type & ~0x3; + uint32_t start_size = reader_.ReadAndSwap(); + uint32_t start = start_size >> 16; + uint32_t size_dwords = start_size & 0xFFFF; // dwords + assert_true(start == 0); + trace_writer_.WriteMemoryRead(CpuToGpu(addr), size_dwords * 4); + auto shader = COMMAND_PROCESSOR::LoadShader( + shader_type, addr, memory_->TranslatePhysical(addr), + size_dwords); + switch (shader_type) { + case xenos::ShaderType::kVertex: + active_vertex_shader_ = shader; + break; + case xenos::ShaderType::kPixel: + active_pixel_shader_ = shader; + break; + default: + assert_unhandled_case(shader_type); + return false; + } + return true; +} + +bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE( + uint32_t packet, uint32_t count) XE_RESTRICT { + SCOPE_profile_cpu_f("gpu"); + + // load sequencer instruction memory (code embedded in packet) + uint32_t dword0 = reader_.ReadAndSwap(); + uint32_t dword1 = reader_.ReadAndSwap(); + auto shader_type = static_cast(dword0); + uint32_t start_size = dword1; + uint32_t start = start_size >> 16; + uint32_t size_dwords = start_size & 0xFFFF; // dwords + assert_true(start == 0); + assert_true(reader_.read_count() >= size_dwords * 4); + assert_true(count - 2 >= size_dwords); + auto shader = COMMAND_PROCESSOR::LoadShader( + shader_type, uint32_t(reader_.read_ptr()), + reinterpret_cast(reader_.read_ptr()), size_dwords); + switch (shader_type) { + case xenos::ShaderType::kVertex: + active_vertex_shader_ = shader; + break; + case xenos::ShaderType::kPixel: + active_pixel_shader_ = shader; + break; + default: + assert_unhandled_case(shader_type); + return false; + } + reader_.AdvanceRead(size_dwords * sizeof(uint32_t)); + return true; +} + +bool COMMAND_PROCESSOR::ExecutePacketType3_INVALIDATE_STATE( + uint32_t packet, uint32_t count) XE_RESTRICT { + // selective invalidation of state pointers + /*uint32_t mask =*/reader_.ReadAndSwap(); + // driver_->InvalidateState(mask); + return true; +} + +bool COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY( + uint32_t packet, uint32_t count) XE_RESTRICT { + // begin/end initiator for viz query extent processing + // https://www.google.com/patents/US20050195186 + assert_true(count == 1); + + uint32_t dword0 = reader_.ReadAndSwap(); + + uint32_t id = dword0 & 0x3F; + uint32_t end = dword0 & 0x100; + if (!end) { + // begin a new viz query @ id + // On hardware this clears the internal state of the scan converter (which + // is different to the register) + COMMAND_PROCESSOR::WriteEventInitiator(VIZQUERY_START); + // XELOGGPU("Begin viz query ID {:02X}", id); + } else { + // end the viz query + COMMAND_PROCESSOR::WriteEventInitiator(VIZQUERY_END); + // XELOGGPU("End viz query ID {:02X}", id); + // The scan converter writes the internal result back to the register here. + // We just fake it and say it was visible in case it is read back. 
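// Equivalent formulation of the branch below: 64 possible query ids map onto
// two 32-bit status registers, with id >> 5 selecting the register and
// id & 31 selecting the bit:
//
//   uint32_t bit = uint32_t(1) << (id & 31);
//   // id < 32 -> PA_SC_VIZ_QUERY_STATUS_0, else PA_SC_VIZ_QUERY_STATUS_1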
+ if (id < 32) { + register_file_->values[XE_GPU_REG_PA_SC_VIZ_QUERY_STATUS_0].u32 |= + uint32_t(1) << id; + } else { + register_file_->values[XE_GPU_REG_PA_SC_VIZ_QUERY_STATUS_1].u32 |= + uint32_t(1) << (id - 32); + } + } + + return true; +} diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc index 428a18f78..ffd77246e 100644 --- a/src/xenia/gpu/shared_memory.cc +++ b/src/xenia/gpu/shared_memory.cc @@ -233,15 +233,27 @@ void SharedMemory::FireWatches(uint32_t page_first, uint32_t page_last, // Fire per-range watches. for (uint32_t i = bucket_first; i <= bucket_last; ++i) { WatchNode* node = watch_buckets_[i]; + if (i + 1 <= bucket_last) { + WatchNode* nextnode = watch_buckets_[i + 1]; + if (nextnode) { + swcache::PrefetchL1(nextnode->range); + } + } while (node != nullptr) { WatchRange* range = node->range; // Store the next node now since when the callback is triggered, the links // will be broken. node = node->bucket_node_next; + if (node) { + swcache::PrefetchL1(node); + } if (page_first <= range->page_last && page_last >= range->page_first) { range->callback(global_lock, range->callback_context, range->callback_data, range->callback_argument, invalidated_by_gpu); + if (node && node->range) { + swcache::PrefetchL1(node->range); + } UnlinkWatchRange(range); } } diff --git a/src/xenia/gpu/texture_cache.cc b/src/xenia/gpu/texture_cache.cc index b697ff73c..dfc3264ca 100644 --- a/src/xenia/gpu/texture_cache.cc +++ b/src/xenia/gpu/texture_cache.cc @@ -440,7 +440,7 @@ void TextureCache::TextureKey::LogAction(const char* action) const { "base at 0x{:08X} (pitch {}), mips at 0x{:08X}", action, tiled ? "tiled" : "linear", scaled_resolve ? "scaled " : "", GetWidth(), GetHeight(), GetDepthOrArraySize(), GetLogDimensionName(), - FormatInfo::Get(format)->name, mip_max_level + 1, packed_mips ? "" : "un", + FormatInfo::GetName(format), mip_max_level + 1, packed_mips ? "" : "un", mip_max_level != 0 ? "s" : "", base_page << 12, pitch << 5, mip_page << 12); } @@ -453,7 +453,7 @@ void TextureCache::Texture::LogAction(const char* action) const { action, key_.tiled ? "tiled" : "linear", key_.scaled_resolve ? "scaled " : "", key_.GetWidth(), key_.GetHeight(), key_.GetDepthOrArraySize(), key_.GetLogDimensionName(), - FormatInfo::Get(key_.format)->name, key_.mip_max_level + 1, + FormatInfo::GetName(key_.format), key_.mip_max_level + 1, key_.packed_mips ? "" : "un", key_.mip_max_level != 0 ? 
"s" : "", key_.base_page << 12, key_.pitch << 5, GetGuestBaseSize(), key_.mip_page << 12, GetGuestMipsSize()); diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h index c028c1be4..197124e37 100644 --- a/src/xenia/gpu/texture_cache.h +++ b/src/xenia/gpu/texture_cache.h @@ -128,6 +128,14 @@ class TextureCache { return (binding->texture && binding->texture->IsResolved()) || (binding->texture_signed && binding->texture_signed->IsResolved()); } + template + void PrefetchTextureBinding(uint32_t fetch_constant_index) const { + swcache::Prefetch(&texture_bindings_[fetch_constant_index]); + swcache::Prefetch( + &texture_bindings_[fetch_constant_index + + 1]); // we may cross a cache line boundary :( size + // of the structure is 0x28 + } protected: struct TextureKey { diff --git a/src/xenia/gpu/texture_dump.cc b/src/xenia/gpu/texture_dump.cc index d105d3cbf..53ecb58a6 100644 --- a/src/xenia/gpu/texture_dump.cc +++ b/src/xenia/gpu/texture_dump.cc @@ -85,7 +85,7 @@ void TextureDump(const TextureInfo& src, void* buffer, size_t length) { assert_unhandled_case(src.format); std::memset(&dds_header.pixel_format, 0xCD, sizeof(dds_header.pixel_format)); - XELOGW("Skipping {} for texture dump.", src.format_info()->name); + XELOGW("Skipping {} for texture dump.", src.format_name()); return; } } @@ -96,7 +96,7 @@ void TextureDump(const TextureInfo& src, void* buffer, size_t length) { std::filesystem::path path = "texture_dumps"; path /= fmt::format("{:05d}_{:08X}_{:08X}_{:08X}.dds", dump_counter++, src.memory.base_address, src.memory.mip_address, - src.format_info()->name); + src.format_name()); FILE* handle = filesystem::OpenFile(path, "wb"); if (handle) { diff --git a/src/xenia/gpu/texture_info.cc b/src/xenia/gpu/texture_info.cc index 4522c5cfa..67c465b33 100644 --- a/src/xenia/gpu/texture_info.cc +++ b/src/xenia/gpu/texture_info.cc @@ -159,151 +159,6 @@ void TextureInfo::GetMipSize(uint32_t mip, uint32_t* out_width, *out_height = std::max(height_pow2 >> mip, 1u); } -uint32_t TextureInfo::GetMipLocation(uint32_t mip, uint32_t* offset_x, - uint32_t* offset_y, bool is_guest) const { - if (mip == 0) { - // Short-circuit. Mip 0 is always stored in base_address. - if (!has_packed_mips) { - *offset_x = 0; - *offset_y = 0; - } else { - GetPackedTileOffset(0, offset_x, offset_y); - } - return memory.base_address; - } - - if (!memory.mip_address) { - // Short-circuit. There is no mip data. - *offset_x = 0; - *offset_y = 0; - return 0; - } - - uint32_t address_base, address_offset; - address_base = memory.mip_address; - address_offset = 0; - - auto bytes_per_block = format_info()->bytes_per_block(); - - if (!has_packed_mips) { - for (uint32_t i = 1; i < mip; i++) { - address_offset += - GetMipExtent(i, is_guest).all_blocks() * bytes_per_block; - } - *offset_x = 0; - *offset_y = 0; - return address_base + address_offset; - } - - uint32_t width_pow2 = xe::next_pow2(width + 1); - uint32_t height_pow2 = xe::next_pow2(height + 1); - - // Walk forward to find the address of the mip. - uint32_t packed_mip_base = 1; - for (uint32_t i = packed_mip_base; i < mip; i++, packed_mip_base++) { - uint32_t mip_width = std::max(width_pow2 >> i, 1u); - uint32_t mip_height = std::max(height_pow2 >> i, 1u); - if (std::min(mip_width, mip_height) <= 16) { - // We've reached the point where the mips are packed into a single tile. - break; - } - address_offset += GetMipExtent(i, is_guest).all_blocks() * bytes_per_block; - } - - // Now, check if the mip is packed at an offset. 
- GetPackedTileOffset(width_pow2 >> mip, height_pow2 >> mip, format_info(), - mip - packed_mip_base, offset_x, offset_y); - return address_base + address_offset; -} - -bool TextureInfo::GetPackedTileOffset(uint32_t width, uint32_t height, - const FormatInfo* format_info, - int packed_tile, uint32_t* offset_x, - uint32_t* offset_y) { - // Tile size is 32x32, and once textures go <=16 they are packed into a - // single tile together. The math here is insane. Most sourced - // from graph paper and looking at dds dumps. - // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - // 0 +.4x4.+ +.....8x8.....+ +............16x16............+ - // 1 +.4x4.+ +.....8x8.....+ +............16x16............+ - // 2 +.4x4.+ +.....8x8.....+ +............16x16............+ - // 3 +.4x4.+ +.....8x8.....+ +............16x16............+ - // 4 x +.....8x8.....+ +............16x16............+ - // 5 +.....8x8.....+ +............16x16............+ - // 6 +.....8x8.....+ +............16x16............+ - // 7 +.....8x8.....+ +............16x16............+ - // 8 2x2 +............16x16............+ - // 9 2x2 +............16x16............+ - // 0 +............16x16............+ - // ... ..... - // This only works for square textures, or textures that are some non-pot - // <= square. As soon as the aspect ratio goes weird, the textures start to - // stretch across tiles. - // - // The 2x2 and 1x1 squares are packed in their specific positions because - // each square is the size of at least one block (which is 4x4 pixels max) - // - // if (tile_aligned(w) > tile_aligned(h)) { - // // wider than tall, so packed horizontally - // } else if (tile_aligned(w) < tile_aligned(h)) { - // // taller than wide, so packed vertically - // } else { - // square - // } - // It's important to use logical sizes here, as the input sizes will be - // for the entire packed tile set, not the actual texture. - // The minimum dimension is what matters most: if either width or height - // is <= 16 this mode kicks in. - - uint32_t log2_width = xe::log2_ceil(width); - uint32_t log2_height = xe::log2_ceil(height); - if (std::min(log2_width, log2_height) > 4) { - // Too big, not packed. - *offset_x = 0; - *offset_y = 0; - return false; - } - - // Find the block offset of the mip. - if (packed_tile < 3) { - if (log2_width > log2_height) { - // Wider than tall. Laid out vertically. - *offset_x = 0; - *offset_y = 16 >> packed_tile; - } else { - // Taller than wide. Laid out horizontally. - *offset_x = 16 >> packed_tile; - *offset_y = 0; - } - } else { - if (log2_width > log2_height) { - // Wider than tall. Laid out vertically. - *offset_x = 16 >> (packed_tile - 2); - *offset_y = 0; - } else { - // Taller than wide. Laid out horizontally. 
- *offset_x = 0; - *offset_y = 16 >> (packed_tile - 2); - } - } - - *offset_x /= format_info->block_width; - *offset_y /= format_info->block_height; - return true; -} - -bool TextureInfo::GetPackedTileOffset(int packed_tile, uint32_t* offset_x, - uint32_t* offset_y) const { - if (!has_packed_mips) { - *offset_x = 0; - *offset_y = 0; - return false; - } - return GetPackedTileOffset(xe::next_pow2(width + 1), - xe::next_pow2(height + 1), format_info(), - packed_tile, offset_x, offset_y); -} - uint64_t TextureInfo::hash() const { return XXH3_64bits(this, sizeof(TextureInfo)); } diff --git a/src/xenia/gpu/texture_info.h b/src/xenia/gpu/texture_info.h index 43124470d..5c514f61a 100644 --- a/src/xenia/gpu/texture_info.h +++ b/src/xenia/gpu/texture_info.h @@ -181,7 +181,7 @@ inline xenos::TextureFormat DepthRenderTargetToTextureFormat( } } -enum class FormatType { +enum class FormatType : uint32_t { // Uncompressed, and is also a ColorFormat. kResolvable, // Uncompressed, but resolve or memory export cannot be done to the format. @@ -190,12 +190,12 @@ enum class FormatType { }; struct FormatInfo { - xenos::TextureFormat format; - const char* name; - FormatType type; - uint32_t block_width; - uint32_t block_height; - uint32_t bits_per_pixel; + const xenos::TextureFormat format; + + const FormatType type; + const uint32_t block_width; + const uint32_t block_height; + const uint32_t bits_per_pixel; uint32_t bytes_per_block() const { return block_width * block_height * bits_per_pixel / 8; @@ -203,6 +203,20 @@ struct FormatInfo { static const FormatInfo* Get(uint32_t gpu_format); + static const char* GetName(uint32_t gpu_format); + static const char* GetName(xenos::TextureFormat format) { + return GetName(static_cast(format)); + } + + static unsigned char GetWidthShift(uint32_t gpu_format); + static unsigned char GetHeightShift(uint32_t gpu_format); + + static unsigned char GetWidthShift(xenos::TextureFormat gpu_format) { + return GetWidthShift(static_cast(gpu_format)); + } + static unsigned char GetHeightShift(xenos::TextureFormat gpu_format) { + return GetHeightShift(static_cast(gpu_format)); + } static const FormatInfo* Get(xenos::TextureFormat format) { return Get(static_cast(format)); } @@ -259,7 +273,9 @@ struct TextureInfo { const FormatInfo* format_info() const { return FormatInfo::Get(static_cast(format)); } - + const char* format_name() const { + return FormatInfo::GetName(static_cast(format)); + } bool is_compressed() const { return format_info()->type == FormatType::kCompressed; } @@ -281,18 +297,6 @@ struct TextureInfo { void GetMipSize(uint32_t mip, uint32_t* width, uint32_t* height) const; - // Get the memory location of a mip. offset_x and offset_y are in blocks. 
- uint32_t GetMipLocation(uint32_t mip, uint32_t* offset_x, uint32_t* offset_y, - bool is_guest) const; - - static bool GetPackedTileOffset(uint32_t width, uint32_t height, - const FormatInfo* format_info, - int packed_tile, uint32_t* offset_x, - uint32_t* offset_y); - - bool GetPackedTileOffset(int packed_tile, uint32_t* offset_x, - uint32_t* offset_y) const; - uint64_t hash() const; bool operator==(const TextureInfo& other) const { return std::memcmp(this, &other, sizeof(TextureInfo)) == 0; diff --git a/src/xenia/gpu/texture_info_formats.cc b/src/xenia/gpu/texture_info_formats.cc index 1ca4e1567..2ef0ed330 100644 --- a/src/xenia/gpu/texture_info_formats.cc +++ b/src/xenia/gpu/texture_info_formats.cc @@ -17,77 +17,60 @@ namespace gpu { using namespace xe::gpu::xenos; #define FORMAT_INFO(texture_format, format, block_width, block_height, bits_per_pixel) \ - {xenos::TextureFormat::texture_format, #texture_format, FormatType::format, block_width, block_height, bits_per_pixel} + {xenos::TextureFormat::texture_format, FormatType::format, block_width, block_height, bits_per_pixel} const FormatInfo* FormatInfo::Get(uint32_t gpu_format) { static const FormatInfo format_infos[64] = { - FORMAT_INFO(k_1_REVERSE , kUncompressed, 1, 1, 1), - FORMAT_INFO(k_1 , kUncompressed, 1, 1, 1), - FORMAT_INFO(k_8 , kResolvable, 1, 1, 8), - FORMAT_INFO(k_1_5_5_5 , kResolvable, 1, 1, 16), - FORMAT_INFO(k_5_6_5 , kResolvable, 1, 1, 16), - FORMAT_INFO(k_6_5_5 , kResolvable, 1, 1, 16), - FORMAT_INFO(k_8_8_8_8 , kResolvable, 1, 1, 32), - FORMAT_INFO(k_2_10_10_10 , kResolvable, 1, 1, 32), - FORMAT_INFO(k_8_A , kResolvable, 1, 1, 8), - FORMAT_INFO(k_8_B , kResolvable, 1, 1, 8), - FORMAT_INFO(k_8_8 , kResolvable, 1, 1, 16), - FORMAT_INFO(k_Cr_Y1_Cb_Y0_REP , kCompressed, 2, 1, 16), - FORMAT_INFO(k_Y1_Cr_Y0_Cb_REP , kCompressed, 2, 1, 16), - FORMAT_INFO(k_16_16_EDRAM , kUncompressed, 1, 1, 32), - FORMAT_INFO(k_8_8_8_8_A , kResolvable, 1, 1, 32), - FORMAT_INFO(k_4_4_4_4 , kResolvable, 1, 1, 16), - FORMAT_INFO(k_10_11_11 , kResolvable, 1, 1, 32), - FORMAT_INFO(k_11_11_10 , kResolvable, 1, 1, 32), - FORMAT_INFO(k_DXT1 , kCompressed, 4, 4, 4), - FORMAT_INFO(k_DXT2_3 , kCompressed, 4, 4, 8), - FORMAT_INFO(k_DXT4_5 , kCompressed, 4, 4, 8), - FORMAT_INFO(k_16_16_16_16_EDRAM , kUncompressed, 1, 1, 64), - FORMAT_INFO(k_24_8 , kUncompressed, 1, 1, 32), - FORMAT_INFO(k_24_8_FLOAT , kUncompressed, 1, 1, 32), - FORMAT_INFO(k_16 , kResolvable, 1, 1, 16), - FORMAT_INFO(k_16_16 , kResolvable, 1, 1, 32), - FORMAT_INFO(k_16_16_16_16 , kResolvable, 1, 1, 64), - FORMAT_INFO(k_16_EXPAND , kUncompressed, 1, 1, 16), - FORMAT_INFO(k_16_16_EXPAND , kUncompressed, 1, 1, 32), - FORMAT_INFO(k_16_16_16_16_EXPAND , kUncompressed, 1, 1, 64), - FORMAT_INFO(k_16_FLOAT , kResolvable, 1, 1, 16), - FORMAT_INFO(k_16_16_FLOAT , kResolvable, 1, 1, 32), - FORMAT_INFO(k_16_16_16_16_FLOAT , kResolvable, 1, 1, 64), - FORMAT_INFO(k_32 , kUncompressed, 1, 1, 32), - FORMAT_INFO(k_32_32 , kUncompressed, 1, 1, 64), - FORMAT_INFO(k_32_32_32_32 , kUncompressed, 1, 1, 128), - FORMAT_INFO(k_32_FLOAT , kResolvable, 1, 1, 32), - FORMAT_INFO(k_32_32_FLOAT , kResolvable, 1, 1, 64), - FORMAT_INFO(k_32_32_32_32_FLOAT , kResolvable, 1, 1, 128), - FORMAT_INFO(k_32_AS_8 , kCompressed, 4, 1, 8), - FORMAT_INFO(k_32_AS_8_8 , kCompressed, 2, 1, 16), - FORMAT_INFO(k_16_MPEG , kUncompressed, 1, 1, 16), - FORMAT_INFO(k_16_16_MPEG , kUncompressed, 1, 1, 32), - FORMAT_INFO(k_8_INTERLACED , kUncompressed, 1, 1, 8), - FORMAT_INFO(k_32_AS_8_INTERLACED , kCompressed, 4, 1, 8), - 
-      FORMAT_INFO(k_32_AS_8_8_INTERLACED   , kCompressed, 1, 1, 16),
-      FORMAT_INFO(k_16_INTERLACED          , kUncompressed, 1, 1, 16),
-      FORMAT_INFO(k_16_MPEG_INTERLACED     , kUncompressed, 1, 1, 16),
-      FORMAT_INFO(k_16_16_MPEG_INTERLACED  , kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_DXN                    , kCompressed, 4, 4, 8),
-      FORMAT_INFO(k_8_8_8_8_AS_16_16_16_16 , kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_DXT1_AS_16_16_16_16    , kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_DXT2_3_AS_16_16_16_16  , kCompressed, 4, 4, 8),
-      FORMAT_INFO(k_DXT4_5_AS_16_16_16_16  , kCompressed, 4, 4, 8),
-      FORMAT_INFO(k_2_10_10_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_10_11_11_AS_16_16_16_16, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_11_11_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_32_32_32_FLOAT         , kUncompressed, 1, 1, 96),
-      FORMAT_INFO(k_DXT3A                  , kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_DXT5A                  , kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_CTX1                   , kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_DXT3A_AS_1_1_1_1       , kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_8_8_8_8_GAMMA_EDRAM    , kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_2_10_10_10_FLOAT_EDRAM , kUncompressed, 1, 1, 32),
+      #include "texture_info_formats.inl"
   };
   return &format_infos[gpu_format];
 }
 #undef FORMAT_INFO
+
+constexpr unsigned char GetShift(unsigned pow) {
+  unsigned char sh = 0;
+
+  while (!(pow & 1)) {
+    pow >>= 1;
+    sh++;
+  }
+
+  return sh;
+}
+/*
+  todo: GetWidthShift and GetHeightShift should not need a full 64-byte table
+  each. There are only 15 elements for GetWidthShift where the shift is
+  nonzero, the maximum shift that will be returned is 2, and there are 64
+  elements total. This means we could use a boolean table that also acts as a
+  sparse indexer (popcnt the preceding bits to get the index), then shift and
+  mask a 32-bit word to get the shift.
+*/
+unsigned char FormatInfo::GetWidthShift(uint32_t gpu_format) {
+#define FORMAT_INFO(texture_format, format, block_width, block_height, bits_per_pixel) GetShift(block_width)
+  alignas(XE_HOST_CACHE_LINE_SIZE)
+  constexpr unsigned char wshift_table[64] = {
+      #include "texture_info_formats.inl"
+  };
+#undef FORMAT_INFO
+
+  return wshift_table[gpu_format];
+}
+unsigned char FormatInfo::GetHeightShift(uint32_t gpu_format) {
+#define FORMAT_INFO(texture_format, format, block_width, block_height, bits_per_pixel) GetShift(block_height)
+  alignas(XE_HOST_CACHE_LINE_SIZE)
+  constexpr unsigned char hshift_table[64] = {
+      #include "texture_info_formats.inl"
+  };
+#undef FORMAT_INFO
+
+  return hshift_table[gpu_format];
+}
+#define FORMAT_INFO(texture_format, ...) #texture_format
+static constexpr const char* const format_name_table[64] = {
+    #include "texture_info_formats.inl"
+
+};
+#undef FORMAT_INFO
+const char* FormatInfo::GetName(uint32_t gpu_format) {
+
+  return format_name_table[gpu_format];
+}
 }  // namespace gpu
 }  // namespace xe
diff --git a/src/xenia/gpu/texture_info_formats.inl b/src/xenia/gpu/texture_info_formats.inl
new file mode 100644
index 000000000..2db7d7cf7
--- /dev/null
+++ b/src/xenia/gpu/texture_info_formats.inl
@@ -0,0 +1,64 @@
+FORMAT_INFO(k_1_REVERSE, kUncompressed, 1, 1, 1),
+FORMAT_INFO(k_1, kUncompressed, 1, 1, 1),
+FORMAT_INFO(k_8, kResolvable, 1, 1, 8),
+FORMAT_INFO(k_1_5_5_5, kResolvable, 1, 1, 16),
+FORMAT_INFO(k_5_6_5, kResolvable, 1, 1, 16),
+FORMAT_INFO(k_6_5_5, kResolvable, 1, 1, 16),
+FORMAT_INFO(k_8_8_8_8, kResolvable, 1, 1, 32),
+FORMAT_INFO(k_2_10_10_10, kResolvable, 1, 1, 32),
+FORMAT_INFO(k_8_A, kResolvable, 1, 1, 8),
+FORMAT_INFO(k_8_B, kResolvable, 1, 1, 8),
+FORMAT_INFO(k_8_8, kResolvable, 1, 1, 16),
+FORMAT_INFO(k_Cr_Y1_Cb_Y0_REP, kCompressed, 2, 1, 16),
+FORMAT_INFO(k_Y1_Cr_Y0_Cb_REP, kCompressed, 2, 1, 16),
+FORMAT_INFO(k_16_16_EDRAM, kUncompressed, 1, 1, 32),
+FORMAT_INFO(k_8_8_8_8_A, kResolvable, 1, 1, 32),
+FORMAT_INFO(k_4_4_4_4, kResolvable, 1, 1, 16),
+FORMAT_INFO(k_10_11_11, kResolvable, 1, 1, 32),
+FORMAT_INFO(k_11_11_10, kResolvable, 1, 1, 32),
+FORMAT_INFO(k_DXT1, kCompressed, 4, 4, 4),
+FORMAT_INFO(k_DXT2_3, kCompressed, 4, 4, 8),
+FORMAT_INFO(k_DXT4_5, kCompressed, 4, 4, 8),
+FORMAT_INFO(k_16_16_16_16_EDRAM, kUncompressed, 1, 1, 64),
+FORMAT_INFO(k_24_8, kUncompressed, 1, 1, 32),
+FORMAT_INFO(k_24_8_FLOAT, kUncompressed, 1, 1, 32),
+FORMAT_INFO(k_16, kResolvable, 1, 1, 16),
+FORMAT_INFO(k_16_16, kResolvable, 1, 1, 32),
+FORMAT_INFO(k_16_16_16_16, kResolvable, 1, 1, 64),
+FORMAT_INFO(k_16_EXPAND, kUncompressed, 1, 1, 16),
+FORMAT_INFO(k_16_16_EXPAND, kUncompressed, 1, 1, 32),
+FORMAT_INFO(k_16_16_16_16_EXPAND, kUncompressed, 1, 1, 64),
+FORMAT_INFO(k_16_FLOAT, kResolvable, 1, 1, 16),
+FORMAT_INFO(k_16_16_FLOAT, kResolvable, 1, 1, 32),
+FORMAT_INFO(k_16_16_16_16_FLOAT, kResolvable, 1, 1, 64),
+FORMAT_INFO(k_32, kUncompressed, 1, 1, 32),
+FORMAT_INFO(k_32_32, kUncompressed, 1, 1, 64),
+FORMAT_INFO(k_32_32_32_32, kUncompressed, 1, 1, 128),
+FORMAT_INFO(k_32_FLOAT, kResolvable, 1, 1, 32),
+FORMAT_INFO(k_32_32_FLOAT, kResolvable, 1, 1, 64),
+FORMAT_INFO(k_32_32_32_32_FLOAT, kResolvable, 1, 1, 128),
+FORMAT_INFO(k_32_AS_8, kCompressed, 4, 1, 8),
+FORMAT_INFO(k_32_AS_8_8, kCompressed, 2, 1, 16),
+FORMAT_INFO(k_16_MPEG, kUncompressed, 1, 1, 16),
+FORMAT_INFO(k_16_16_MPEG, kUncompressed, 1, 1, 32),
+FORMAT_INFO(k_8_INTERLACED, kUncompressed, 1, 1, 8),
+FORMAT_INFO(k_32_AS_8_INTERLACED, kCompressed, 4, 1, 8),
+FORMAT_INFO(k_32_AS_8_8_INTERLACED, kCompressed, 1, 1, 16),
+FORMAT_INFO(k_16_INTERLACED, kUncompressed, 1, 1, 16),
+FORMAT_INFO(k_16_MPEG_INTERLACED, kUncompressed, 1, 1, 16),
+FORMAT_INFO(k_16_16_MPEG_INTERLACED, kUncompressed, 1, 1, 32),
+FORMAT_INFO(k_DXN, kCompressed, 4, 4, 8),
+FORMAT_INFO(k_8_8_8_8_AS_16_16_16_16, kResolvable, 1, 1, 32),
+FORMAT_INFO(k_DXT1_AS_16_16_16_16, kCompressed, 4, 4, 4),
+FORMAT_INFO(k_DXT2_3_AS_16_16_16_16, kCompressed, 4, 4, 8),
+FORMAT_INFO(k_DXT4_5_AS_16_16_16_16, kCompressed, 4, 4, 8),
+FORMAT_INFO(k_2_10_10_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
+FORMAT_INFO(k_10_11_11_AS_16_16_16_16, kResolvable, 1, 1, 32),
+FORMAT_INFO(k_11_11_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
+FORMAT_INFO(k_32_32_32_FLOAT, kUncompressed, 1, 1, 96),
+FORMAT_INFO(k_DXT3A, kCompressed, 4, 4, 4),
+FORMAT_INFO(k_DXT5A, kCompressed, 4, 4, 4),
+FORMAT_INFO(k_CTX1, kCompressed, 4, 4, 4),
+FORMAT_INFO(k_DXT3A_AS_1_1_1_1, kCompressed, 4, 4, 4),
+FORMAT_INFO(k_8_8_8_8_GAMMA_EDRAM, kUncompressed, 1, 1, 32),
+FORMAT_INFO(k_2_10_10_10_FLOAT_EDRAM, kUncompressed, 1, 1, 32),
\ No newline at end of file
diff --git a/src/xenia/gpu/texture_util.cc b/src/xenia/gpu/texture_util.cc
index ada6cf140..b20194a78 100644
--- a/src/xenia/gpu/texture_util.cc
+++ b/src/xenia/gpu/texture_util.cc
@@ -199,9 +199,8 @@ bool GetPackedMipOffset(uint32_t width, uint32_t height, uint32_t depth,
     }
   }
 
-  const FormatInfo* format_info = FormatInfo::Get(format);
-  x_blocks /= format_info->block_width;
-  y_blocks /= format_info->block_height;
+  x_blocks >>= FormatInfo::GetWidthShift(format);
+  y_blocks >>= FormatInfo::GetHeightShift(format);
   return true;
 }
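The hunk above is the payoff of the shift tables: every block dimension in the format table is a power of two, so `x_blocks /= block_width` and `x_blocks >>= GetWidthShift(format)` are equivalent, and the shift avoids both a division and a second FormatInfo load. The todo in texture_info_formats.cc sketches an even denser encoding; one possible reading of it, as a hypothetical helper (the two constants are placeholders that would be generated from texture_info_formats.inl, and xe::bit_count from base/math.h is assumed):

unsigned char GetWidthShiftCompact(uint32_t gpu_format) {
  // Placeholder masks: bit i of kHasShift set => format i has a nonzero
  // shift; kPackedShifts stores 2 bits per set bit (max shift is 2, and at
  // most 15 formats are shifted, so 30 bits fit in one 32-bit word).
  constexpr uint64_t kHasShift = 0;
  constexpr uint32_t kPackedShifts = 0;
  uint64_t bit = 1ull << gpu_format;
  if (!(kHasShift & bit)) {
    return 0;
  }
  // popcnt of the preceding bits turns the boolean table into a sparse index.
  unsigned sparse_index = xe::bit_count(kHasShift & (bit - 1));
  return (kPackedShifts >> (sparse_index * 2)) & 3;
}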
@@ -273,9 +272,10 @@ TextureGuestLayout GetGuestTextureLayout(
   }
   layout.mips_total_extent_bytes = 0;
 
-  const FormatInfo* format_info = FormatInfo::Get(format);
-  uint32_t bytes_per_block = format_info->bytes_per_block();
-
+  const FormatInfo* const format_info = FormatInfo::Get(format);
+  const uint32_t bytes_per_block = format_info->bytes_per_block();
+  const unsigned char block_width_sh = FormatInfo::GetWidthShift(format);
+  const unsigned char block_height_sh = FormatInfo::GetHeightShift(format);
   // The loop counter can mean two things depending on whether the packed mip
   // tail is stored as mip 0, because in this case, it would be ambiguous since
   // both the base and the mips would be on "level 0", but stored separately and
@@ -320,10 +320,13 @@ TextureGuestLayout GetGuestTextureLayout(
       z_slice_stride_texel_rows_unaligned =
           std::max(xe::next_pow2(height_texels) >> level, uint32_t(1));
     }
-    uint32_t row_pitch_blocks_tile_aligned = xe::align(
-        xe::align(row_pitch_texels_unaligned, format_info->block_width) /
-            format_info->block_width,
-        xenos::kTextureTileWidthHeight);
+    // maybe do 1 << block_width_sh instead of format_info->block_width, since
+    // we'll have cl loaded with the shift anyway
+    uint32_t row_pitch_blocks_tile_aligned =
+        xe::align(xe::align(row_pitch_texels_unaligned,
+                            format_info->block_width) >>
+                      block_width_sh,
+                  xenos::kTextureTileWidthHeight);
     level_layout.row_pitch_bytes =
         row_pitch_blocks_tile_aligned * bytes_per_block;
     // Assuming the provided pitch is already 256-byte-aligned for linear, but
@@ -335,10 +338,11 @@ TextureGuestLayout GetGuestTextureLayout(
     }
     level_layout.z_slice_stride_block_rows =
         dimension != xenos::DataDimension::k1D
-            ? xe::align(xe::align(z_slice_stride_texel_rows_unaligned,
-                                  format_info->block_height) /
-                            format_info->block_height,
-                        xenos::kTextureTileWidthHeight)
+            ? xe::align(
+                  xe::align(z_slice_stride_texel_rows_unaligned,
+                            format_info->block_height) >>
+                      block_height_sh,
+                  xenos::kTextureTileWidthHeight)
             : 1;
     level_layout.array_slice_stride_bytes =
         level_layout.row_pitch_bytes * level_layout.z_slice_stride_block_rows;
@@ -358,13 +362,13 @@ TextureGuestLayout GetGuestTextureLayout(
     // the stride. For tiled textures, this is the dimensions aligned to 32x32x4
    // blocks (or x1 for the missing dimensions).
     uint32_t level_width_blocks =
-        xe::align(std::max(width_texels >> level, uint32_t(1)),
-                  format_info->block_width) /
-        format_info->block_width;
+        xe::align(std::max(width_texels >> level, uint32_t(1)),
+                  format_info->block_width) >>
+        block_width_sh;
     uint32_t level_height_blocks =
-        xe::align(std::max(height_texels >> level, uint32_t(1)),
-                  format_info->block_height) /
-        format_info->block_height;
+        xe::align(std::max(height_texels >> level, uint32_t(1)),
+                  format_info->block_height) >>
+        block_height_sh;
     uint32_t level_depth = std::max(depth >> level, uint32_t(1));
     if (is_tiled) {
       level_layout.x_extent_blocks =
@@ -415,20 +419,20 @@ TextureGuestLayout GetGuestTextureLayout(
         GetPackedMipOffset(width_texels, height_texels, depth, format,
                            packed_sublevel, packed_sublevel_x_blocks,
                            packed_sublevel_y_blocks, packed_sublevel_z);
-        level_layout.x_extent_blocks = std::max(
+        level_layout.x_extent_blocks = std::max(
             level_layout.x_extent_blocks,
             packed_sublevel_x_blocks +
-                xe::align(
-                    std::max(width_texels >> packed_sublevel, uint32_t(1)),
-                    format_info->block_width) /
-                    format_info->block_width);
+                (xe::align(
+                     std::max(width_texels >> packed_sublevel, uint32_t(1)),
+                     format_info->block_width) >>
+                 block_width_sh));
-        level_layout.y_extent_blocks = std::max(
+        level_layout.y_extent_blocks = std::max(
            level_layout.y_extent_blocks,
            packed_sublevel_y_blocks +
-                xe::align(
-                    std::max(height_texels >> packed_sublevel, uint32_t(1)),
-                    format_info->block_height) /
-                    format_info->block_height);
+                (xe::align(
+                     std::max(height_texels >> packed_sublevel, uint32_t(1)),
+                     format_info->block_height) >>
+                 block_height_sh));
         level_layout.z_extent =
             std::max(level_layout.z_extent,
                      packed_sublevel_z +
diff --git a/src/xenia/gpu/trace_viewer.cc b/src/xenia/gpu/trace_viewer.cc
index 85ba32c18..58df2b8ea 100644
--- a/src/xenia/gpu/trace_viewer.cc
+++ b/src/xenia/gpu/trace_viewer.cc
@@ -743,7 +743,7 @@ void TraceViewer::DrawTextureInfo(
   ImGui::NextColumn();
   ImGui::Text("Fetch Slot: %u", texture_binding.fetch_constant);
   ImGui::Text("Guest Address: %.8X", texture_info.memory.base_address);
-  ImGui::Text("Format: %s", texture_info.format_info()->name);
+  ImGui::Text("Format: %s", texture_info.format_name());
   switch (texture_info.dimension) {
     case xenos::DataDimension::k1D:
       ImGui::Text("1D: %dpx", texture_info.width + 1);
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
index 9ac9f13e2..095979515 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
@@ -32,10 +32,11 @@
 #include "xenia/gpu/vulkan/vulkan_shader.h"
 #include "xenia/gpu/vulkan/vulkan_shared_memory.h"
 #include "xenia/gpu/xenos.h"
+#include "xenia/kernel/kernel_state.h"
+#include "xenia/kernel/user_module.h"
 #include "xenia/ui/vulkan/vulkan_presenter.h"
 #include "xenia/ui/vulkan/vulkan_provider.h"
 #include "xenia/ui/vulkan/vulkan_util.h"
-
 namespace xe {
 namespace gpu {
 namespace vulkan {
@@ -4171,6 +4172,8 @@ uint32_t VulkanCommandProcessor::WriteTransientTextureBindings(
   return descriptor_set_write_count;
 }
 
+#define COMMAND_PROCESSOR VulkanCommandProcessor
+#include "../pm4_command_processor_implement.h"
 }  // namespace vulkan
 }  // namespace gpu
 }  // namespace xe
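A note on the pm4_command_processor includes here and in the next file: the PM4 packet-processing code is written once against a COMMAND_PROCESSOR macro and textually included into each backend, giving every command processor its own concrete, non-virtual copy of the parser. A reduced model of the shape (all names hypothetical, not the actual headers' contents):

// In the class body, the declare header contributes member declarations:
class MyBackendCommandProcessor {
 public:
  void ExecutePacket(const uint32_t* data, uint32_t count);
};
// In one .cc per backend, the implement header expands the definitions:
#define COMMAND_PROCESSOR MyBackendCommandProcessor
void COMMAND_PROCESSOR::ExecutePacket(const uint32_t* data, uint32_t count) {
  // shared PM4 parsing, compiled per backend, inlinable, no virtual dispatch
}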
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h
index 3b09e0fce..b1b2eb1cd 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.h
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h
@@ -53,6 +53,7 @@ class VulkanCommandProcessor final : public CommandProcessor {
     kStorageBufferCompute,
     kCount,
   };
+#include "../pm4_command_processor_declare.h"
 
   class ScratchBufferAcquisition {
    public:
diff --git a/src/xenia/gpu/vulkan/vulkan_texture_cache.cc b/src/xenia/gpu/vulkan/vulkan_texture_cache.cc
index 014f9abe2..5bbeba116 100644
--- a/src/xenia/gpu/vulkan/vulkan_texture_cache.cc
+++ b/src/xenia/gpu/vulkan/vulkan_texture_cache.cc
@@ -2020,7 +2020,7 @@ bool VulkanTextureCache::Initialize() {
     // Log which formats are not supported or supported via fallbacks.
     const HostFormatPair& best_host_format = kBestHostFormats[i];
     const char* guest_format_name =
-        FormatInfo::Get(xenos::TextureFormat(i))->name;
+        FormatInfo::GetName(xenos::TextureFormat(i));
     if (best_host_format.format_unsigned.format != VK_FORMAT_UNDEFINED) {
       assert_not_null(guest_format_name);
       if (host_format.format_unsigned.format != VK_FORMAT_UNDEFINED) {
diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h
index df59b561c..8c03be479 100644
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@@ -1045,8 +1045,9 @@ inline uint16_t GpuSwap(uint16_t value, Endian endianness) {
       return value;
   }
 }
-
-inline uint32_t GpuSwap(uint32_t value, Endian endianness) {
+XE_NOINLINE
+XE_NOALIAS
+static uint32_t GpuSwap(uint32_t value, Endian endianness) {
   switch (endianness) {
     default:
     case Endian::kNone:
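With GpuSwap now static, XE_NOINLINE and XE_NOALIAS, each translation unit keeps one out-of-line copy that the optimizer may treat as a pure call rather than inlining the switch everywhere. For reference, the expected behavior of the swap modes on a sample word (a self-check sketch; `using namespace xe::gpu::xenos` assumed, values per the usual xenos::Endian semantics):

void CheckGpuSwap() {
  uint32_t v = 0xAABBCCDDu;
  assert(GpuSwap(v, Endian::kNone) == 0xAABBCCDDu);    // no swap
  assert(GpuSwap(v, Endian::k8in16) == 0xBBAADDCCu);   // bytes within halves
  assert(GpuSwap(v, Endian::k8in32) == 0xDDCCBBAAu);   // full byte reverse
  assert(GpuSwap(v, Endian::k16in32) == 0xCCDDAABBu);  // halfword swap
}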
diff --git a/src/xenia/kernel/util/shim_utils.h b/src/xenia/kernel/util/shim_utils.h
index 8a0411b0b..b1b85e5a8 100644
--- a/src/xenia/kernel/util/shim_utils.h
+++ b/src/xenia/kernel/util/shim_utils.h
@@ -511,7 +511,8 @@ template <typename T>
 StringBuffer* thread_local_string_buffer();
 
 template <typename Tuple>
-void PrintKernelCall(cpu::Export* export_entry, const Tuple& params) {
+XE_NOALIAS void PrintKernelCall(cpu::Export* export_entry,
+                                const Tuple& params) {
   auto& string_buffer = *thread_local_string_buffer();
   string_buffer.Reset();
   string_buffer.Append(export_entry->name);
@@ -526,58 +527,89 @@ void PrintKernelCall(cpu::Export* export_entry, const Tuple& params) {
                string_buffer.to_string_view());
   }
 }
+/*
+  todo: need faster string formatting/concatenation (all arguments are
+  always turned into strings except if kHighFrequency)
+*/
 template <typename F, typename Tuple, size_t... I>
-auto KernelTrampoline(F&& f, Tuple&& t, std::index_sequence<I...>) {
+XE_FORCEINLINE static auto KernelTrampoline(F&& f, Tuple&& t,
+                                            std::index_sequence<I...>) {
   return std::forward<F>(f)(std::get<I>(std::forward<Tuple>(t))...);
 }
 
 template <KernelModuleId MODULE, uint16_t ORDINAL, typename R, typename... Ps>
-xe::cpu::Export* RegisterExport(R (*fn)(Ps&...), const char* name,
-                                xe::cpu::ExportTag::type tags) {
-  static_assert(
-      std::is_void<R>::value || std::is_base_of<shim::Result, R>::value,
-      "R must be void or derive from shim::Result");
-  static_assert((std::is_base_of_v<shim::Param, Ps> && ...),
-                "Ps must derive from shim::Param");
-  static const auto export_entry = new cpu::Export(
-      ORDINAL, xe::cpu::Export::Type::kFunction, name,
-      tags | xe::cpu::ExportTag::kImplemented | xe::cpu::ExportTag::kLog);
-  static R (*FN)(Ps & ...) = fn;
-  struct X {
-    static void Trampoline(PPCContext* ppc_context) {
-      ++export_entry->function_data.call_count;
-      Param::Init init = {
-          ppc_context,
-          0,
-      };
-      // Using braces initializer instead of make_tuple because braces
-      // enforce execution order across compilers.
-      // The make_tuple order is undefined per the C++ standard and
-      // cause inconsitencies between msvc and clang.
-      std::tuple<Ps...> params = {Ps(init)...};
-      if (export_entry->tags & xe::cpu::ExportTag::kLog &&
-          (!(export_entry->tags & xe::cpu::ExportTag::kHighFrequency) ||
-           cvars::log_high_frequency_kernel_calls)) {
-        PrintKernelCall(export_entry, params);
-      }
-      if constexpr (std::is_void<R>::value) {
-        KernelTrampoline(FN, std::forward<std::tuple<Ps...>>(params),
-                         std::make_index_sequence<sizeof...(Ps)>());
-      } else {
-        auto result =
-            KernelTrampoline(FN, std::forward<std::tuple<Ps...>>(params),
-                             std::make_index_sequence<sizeof...(Ps)>());
-        result.Store(ppc_context);
-        if (export_entry->tags &
-            (xe::cpu::ExportTag::kLog | xe::cpu::ExportTag::kLogResult)) {
-          // TODO(benvanik): log result.
+struct ExportRegistrerHelper {
+  template <R (*fn)(Ps&...), xe::cpu::ExportTag::type tags>
+  static xe::cpu::Export* RegisterExport(const char* name) {
+    static_assert(
+        std::is_void<R>::value || std::is_base_of<shim::Result, R>::value,
+        "R must be void or derive from shim::Result");
+    static_assert((std::is_base_of_v<shim::Param, Ps> && ...),
+                  "Ps must derive from shim::Param");
+    constexpr auto TAGS =
+        tags | xe::cpu::ExportTag::kImplemented | xe::cpu::ExportTag::kLog;
+
+    static const auto export_entry =
+        new cpu::Export(ORDINAL, xe::cpu::Export::Type::kFunction, name, TAGS);
+    struct X {
+      static void Trampoline(PPCContext* ppc_context) {
+        ++export_entry->function_data.call_count;
+        Param::Init init = {
+            ppc_context,
+            0,
+        };
+        // Using braces initializer instead of make_tuple because braces
+        // enforce execution order across compilers.
+        // The make_tuple order is undefined per the C++ standard and
+        // causes inconsistencies between msvc and clang.
+        std::tuple<Ps...> params = {Ps(init)...};
+        if (TAGS & xe::cpu::ExportTag::kLog &&
+            (!(TAGS & xe::cpu::ExportTag::kHighFrequency) ||
+             cvars::log_high_frequency_kernel_calls)) {
+          PrintKernelCall(export_entry, params);
+        }
+        if constexpr (std::is_void<R>::value) {
+          KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
+                           std::make_index_sequence<sizeof...(Ps)>());
+        } else {
+          auto result =
+              KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
+                               std::make_index_sequence<sizeof...(Ps)>());
+          result.Store(ppc_context);
+          if (TAGS &
+              (xe::cpu::ExportTag::kLog | xe::cpu::ExportTag::kLogResult)) {
+            // TODO(benvanik): log result.
+          }
+        }
+      }
-        }
-      }
-    }
-  };
-  export_entry->function_data.trampoline = &X::Trampoline;
-  return export_entry;
-}
+    };
+    struct Y {
+      static void Trampoline(PPCContext* ppc_context) {
+        Param::Init init = {
+            ppc_context,
+            0,
+        };
+        std::tuple<Ps...> params = {Ps(init)...};
+        if constexpr (std::is_void<R>::value) {
+          KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
+                           std::make_index_sequence<sizeof...(Ps)>());
+        } else {
+          auto result =
+              KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
+                               std::make_index_sequence<sizeof...(Ps)>());
+          result.Store(ppc_context);
+        }
+      }
+    };
+    export_entry->function_data.trampoline = &X::Trampoline;
+    return export_entry;
+  }
+};
+template <KernelModuleId MODULE, uint16_t ORDINAL, typename R, typename... Ps>
+auto GetRegister(R (*fngetter)(Ps&...)) {
+  return static_cast<ExportRegistrerHelper<MODULE, ORDINAL, R, Ps...>*>(
+      nullptr);
+}
 }  // namespace shim
 
@@ -585,13 +617,17 @@ xe::cpu::Export* RegisterExport(R (*fn)(Ps&...), const char* name,
 using xe::cpu::ExportTag;
 
 #define DECLARE_EXPORT(module_name, name, category, tags)                     \
+  using _register_##module_name##_##name =                                    \
+      std::remove_cv_t<std::remove_reference_t<decltype(                      \
+          *xe::kernel::shim::GetRegister<                                      \
+              xe::kernel::shim::KernelModuleId::module_name,                   \
+              ordinals::name>(&name##_entry))>>;                               \
   const auto EXPORT_##module_name##_##name = RegisterExport_##module_name(    \
-      xe::kernel::shim::RegisterExport<                                       \
-          xe::kernel::shim::KernelModuleId::module_name, ordinals::name>(     \
-          &name##_entry, #name,                                               \
-          tags | (static_cast<xe::cpu::ExportTag::type>(                      \
-                      xe::cpu::ExportCategory::category)                      \
-                  << xe::cpu::ExportTag::CategoryShift)));
+      _register_##module_name##_##name ::RegisterExport<                      \
+          &name##_entry, tags | (static_cast<xe::cpu::ExportTag::type>(       \
+                                     xe::cpu::ExportCategory::category)       \
+                                 << xe::cpu::ExportTag::CategoryShift)>(      \
+          #name));
 
 #define DECLARE_EMPTY_REGISTER_EXPORTS(module_name, group_name)               \
   void xe::kernel::module_name::Register##group_name##Exports(                \
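The shape of this shim_utils.h change, reduced to its essentials: the export function pointer and its tags move from runtime arguments into non-type template parameters, so each export gets its own specialized trampoline and every `TAGS & ...` test constant-folds at compile time; the Y trampoline is the resulting log-free variant. A stripped-down model of the technique (all names hypothetical):

using Fn = int (*)(int);
template <Fn kFn, uint32_t kTags>
int Trampoline(int arg) {
  if constexpr ((kTags & 0x1) != 0) {
    // logging path is only compiled into specializations that request it
  }
  return kFn(arg);  // direct call through a compile-time-known pointer
}
int Double(int x) { return x * 2; }
int (*entry)(int) = &Trampoline<&Double, 0>;  // specialized, log-free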
diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc
index f94b0c469..16e2b8336 100644
--- a/src/xenia/memory.cc
+++ b/src/xenia/memory.cc
@@ -316,8 +316,46 @@ void Memory::Reset() {
   heaps_.v90000000.Reset();
   heaps_.physical.Reset();
 }
-
+XE_NOALIAS
 const BaseHeap* Memory::LookupHeap(uint32_t address) const {
+#if 1
+#define HEAP_INDEX(name) \
+  offsetof(Memory, heaps_.name) - offsetof(Memory, heaps_)
+
+  const char* heap_select = (const char*)&this->heaps_;
+
+  unsigned selected_heap_offset = 0;
+  unsigned high_nibble = address >> 28;
+
+  if (high_nibble < 0x4) {
+    selected_heap_offset = HEAP_INDEX(v00000000);
+  } else if (address < 0x7F000000) {
+    selected_heap_offset = HEAP_INDEX(v40000000);
+  } else if (high_nibble < 0x8) {
+    heap_select = nullptr;
+    // return nullptr;
+  } else if (high_nibble < 0x9) {
+    selected_heap_offset = HEAP_INDEX(v80000000);
+    // return &heaps_.v80000000;
+  } else if (high_nibble < 0xA) {
+    // return &heaps_.v90000000;
+    selected_heap_offset = HEAP_INDEX(v90000000);
+  } else if (high_nibble < 0xC) {
+    // return &heaps_.vA0000000;
+    selected_heap_offset = HEAP_INDEX(vA0000000);
+  } else if (high_nibble < 0xE) {
+    // return &heaps_.vC0000000;
+    selected_heap_offset = HEAP_INDEX(vC0000000);
+  } else if (address < 0xFFD00000) {
+    // return &heaps_.vE0000000;
+    selected_heap_offset = HEAP_INDEX(vE0000000);
+  } else {
+    // return nullptr;
+    heap_select = nullptr;
+  }
+  return reinterpret_cast<const BaseHeap*>(selected_heap_offset + heap_select);
+
+#else
   if (address < 0x40000000) {
     return &heaps_.v00000000;
   } else if (address < 0x7F000000) {
@@ -337,6 +375,7 @@ const BaseHeap* Memory::LookupHeap(uint32_t address) const {
   } else {
     return nullptr;
   }
+#endif
 }
 
 BaseHeap* Memory::LookupHeapByType(bool physical, uint32_t page_size) {
@@ -465,8 +504,8 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) {
 }
 
 bool Memory::AccessViolationCallback(
-    global_unique_lock_type global_lock_locked_once,
-    void* host_address, bool is_write) {
+    global_unique_lock_type global_lock_locked_once, void* host_address,
+    bool is_write) {
   // Access via physical_membase_ is special, when need to bypass everything
   // (for instance, for a data provider to actually write the data) so only
   // triggering callbacks on virtual memory regions.
@@ -493,16 +532,15 @@ bool Memory::AccessViolationCallback(
 }
 
 bool Memory::AccessViolationCallbackThunk(
-    global_unique_lock_type global_lock_locked_once,
-    void* context, void* host_address, bool is_write) {
+    global_unique_lock_type global_lock_locked_once, void* context,
+    void* host_address, bool is_write) {
   return reinterpret_cast<Memory*>(context)->AccessViolationCallback(
       std::move(global_lock_locked_once), host_address, is_write);
 }
 
 bool Memory::TriggerPhysicalMemoryCallbacks(
-    global_unique_lock_type global_lock_locked_once,
-    uint32_t virtual_address, uint32_t length, bool is_write,
-    bool unwatch_exact_range, bool unprotect) {
+    global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
+    uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {
   BaseHeap* heap = LookupHeap(virtual_address);
   if (heap->heap_type() == HeapType::kGuestPhysical) {
     auto physical_heap = static_cast<PhysicalHeap*>(heap);
@@ -1711,9 +1749,8 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
 }
 
 bool PhysicalHeap::TriggerCallbacks(
-    global_unique_lock_type global_lock_locked_once,
-    uint32_t virtual_address, uint32_t length, bool is_write,
-    bool unwatch_exact_range, bool unprotect) {
+    global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
+    uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {
   // TODO(Triang3l): Support read watches.
   assert_true(is_write);
   if (!is_write) {
diff --git a/src/xenia/memory.h b/src/xenia/memory.h
index ebbc814e6..3d4cf5637 100644
--- a/src/xenia/memory.h
+++ b/src/xenia/memory.h
@@ -473,8 +473,9 @@ class Memory {
   void SystemHeapFree(uint32_t address);
 
   // Gets the heap for the address space containing the given address.
+  XE_NOALIAS
   const BaseHeap* LookupHeap(uint32_t address) const;
-
+  XE_NOALIAS
   inline BaseHeap* LookupHeap(uint32_t address) {
    return const_cast<BaseHeap*>(
        const_cast<const Memory*>(this)->LookupHeap(address));
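On the LookupHeap rewrite: rather than returning through eight separate exits, the new code selects a byte offset into heaps_ and materializes the pointer once; both null cases leave heap_select == nullptr with offset 0, so a single tail yields either a valid heap pointer or nullptr. The generic shape of the trick, as a standalone model (not the Memory class itself):

#include <cstddef>
struct Fields { int a, b, c; };
const int* SelectField(const Fields* fields, unsigned key) {
  const char* base = reinterpret_cast<const char*>(fields);
  size_t offset = 0;
  if (key < 0x40) {
    offset = offsetof(Fields, a);
  } else if (key < 0x80) {
    offset = offsetof(Fields, b);
  } else if (key < 0xC0) {
    offset = offsetof(Fields, c);
  } else {
    base = nullptr;  // miss: offset stays 0, so the result below stays null
  }
  // One exit; the compiler can lower the offset selection to conditional moves.
  return reinterpret_cast<const int*>(base + offset);
}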
diff --git a/src/xenia/ui/d3d12/d3d12_provider.cc b/src/xenia/ui/d3d12/d3d12_provider.cc
index 287b64cb8..1b059dcf6 100644
--- a/src/xenia/ui/d3d12/d3d12_provider.cc
+++ b/src/xenia/ui/d3d12/d3d12_provider.cc
@@ -17,7 +17,7 @@
 #include "xenia/base/math.h"
 #include "xenia/ui/d3d12/d3d12_immediate_drawer.h"
 #include "xenia/ui/d3d12/d3d12_presenter.h"
-
+#include "xenia/ui/d3d12/d3d12_util.h"
 DEFINE_bool(d3d12_debug, false, "Enable Direct3D 12 and DXGI debug layer.",
             "D3D12");
 DEFINE_bool(d3d12_break_on_error, false,
@@ -35,6 +35,8 @@ DEFINE_int32(
     "system responsibility)",
     "D3D12");
 
+DEFINE_bool(d3d12_nvapi_use_driver_heap_priorities, false,
+            "Use NVIDIA driver-managed heap priorities (NvAPI).", "D3D12");
 namespace xe {
 namespace ui {
 namespace d3d12 {
@@ -61,6 +63,7 @@ std::unique_ptr<D3D12Provider> D3D12Provider::Create() {
         "supported GPUs.");
     return nullptr;
   }
+
   return provider;
 }
 
@@ -476,10 +479,69 @@ bool D3D12Provider::Initialize() {
   // Get the graphics analysis interface, will silently fail if PIX is not
   // attached.
   pfn_dxgi_get_debug_interface1_(0, IID_PPV_ARGS(&graphics_analysis_));
+  if (GetAdapterVendorID() == ui::GraphicsProvider::GpuVendorID::kNvidia) {
+    nvapi_ = new lightweight_nvapi::nvapi_state_t();
+    if (!nvapi_->is_available()) {
+      delete nvapi_;
+      nvapi_ = nullptr;
+    } else {
+      using namespace lightweight_nvapi;
+      nvapi_createcommittedresource_ =
+          (cb_NvAPI_D3D12_CreateCommittedResource)nvapi_->query_interface(
+              id_NvAPI_D3D12_CreateCommittedResource);
+      nvapi_querycpuvisiblevidmem_ =
+          (cb_NvAPI_D3D12_QueryCpuVisibleVidmem)nvapi_->query_interface(
+              id_NvAPI_D3D12_QueryCpuVisibleVidmem);
+      nvapi_usedriverheappriorities_ =
+          (cb_NvAPI_D3D12_UseDriverHeapPriorities)nvapi_->query_interface(
+              id_NvAPI_D3D12_UseDriverHeapPriorities);
+
+      if (nvapi_usedriverheappriorities_) {
+        if (cvars::d3d12_nvapi_use_driver_heap_priorities) {
+          if (nvapi_usedriverheappriorities_(device_) != 0) {
+            XELOGI("Failed to enable driver heap priorities");
+          }
+        }
+      }
+    }
+  }
 
   return true;
 }
 
+uint32_t D3D12Provider::CreateUploadResource(
+    D3D12_HEAP_FLAGS HeapFlags, _In_ const D3D12_RESOURCE_DESC* pDesc,
+    D3D12_RESOURCE_STATES InitialResourceState, REFIID riidResource,
+    void** ppvResource, bool try_create_cpuvisible,
+    const D3D12_CLEAR_VALUE* pOptimizedClearValue) const {
+  auto device = GetDevice();
+  if (try_create_cpuvisible && nvapi_createcommittedresource_) {
+    lightweight_nvapi::NV_RESOURCE_PARAMS nvrp;
+    nvrp.NVResourceFlags =
+        lightweight_nvapi::NV_D3D12_RESOURCE_FLAG_CPUVISIBLE_VIDMEM;
+    nvrp.version = 0;  // nothing checks the version
+
+    if (nvapi_createcommittedresource_(
+            device, &ui::d3d12::util::kHeapPropertiesUpload, HeapFlags, pDesc,
+            InitialResourceState, pOptimizedClearValue, &nvrp, riidResource,
+            ppvResource, nullptr) != 0) {
+      XELOGI(
+          "Failed to create CPUVISIBLE_VIDMEM upload resource, will just do "
+          "normal CreateCommittedResource");
+    } else {
+      return UPLOAD_RESULT_CREATE_CPUVISIBLE;
+    }
+  }
+  if (FAILED(device->CreateCommittedResource(
+          &ui::d3d12::util::kHeapPropertiesUpload, HeapFlags, pDesc,
+          InitialResourceState, pOptimizedClearValue, riidResource,
+          ppvResource))) {
+    XELOGE("Failed to create an upload buffer resource");
+    return UPLOAD_RESULT_CREATE_FAILED;
+  }
+
+  return UPLOAD_RESULT_CREATE_SUCCESS;
+}
 std::unique_ptr<Presenter> D3D12Provider::CreatePresenter(
     Presenter::HostGpuLossCallback host_gpu_loss_callback) {
   return D3D12Presenter::Create(host_gpu_loss_callback, *this);
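CreateUploadResource folds the NvAPI path behind a tri-state result so callers can distinguish a plain upload-heap allocation from one that landed in CPU-visible VRAM. A hypothetical call site (descriptor setup reduced to the minimum):

Microsoft::WRL::ComPtr<ID3D12Resource> buffer;
D3D12_RESOURCE_DESC desc;
util::FillBufferResourceDesc(desc, 64 * 1024, D3D12_RESOURCE_FLAG_NONE);
uint32_t result = provider.CreateUploadResource(
    D3D12_HEAP_FLAG_NONE, &desc, D3D12_RESOURCE_STATE_GENERIC_READ,
    IID_PPV_ARGS(&buffer), /*try_create_cpuvisible=*/true);
if (result == UPLOAD_RESULT_CREATE_FAILED) {
  // allocation failed outright
} else if (result == UPLOAD_RESULT_CREATE_CPUVISIBLE) {
  // writes through the mapped pointer go directly to VRAM; a later GPU copy
  // out of the staging buffer may be avoidable
}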
diff --git a/src/xenia/ui/d3d12/d3d12_provider.h b/src/xenia/ui/d3d12/d3d12_provider.h
index 36164eaba..5136672c1 100644
--- a/src/xenia/ui/d3d12/d3d12_provider.h
+++ b/src/xenia/ui/d3d12/d3d12_provider.h
@@ -12,15 +12,19 @@
 
 #include <memory>
 
+#include "xenia/gpu/d3d12/d3d12_nvapi.hpp"
 #include "xenia/ui/d3d12/d3d12_api.h"
 #include "xenia/ui/graphics_provider.h"
-
 #define XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES 1
 
 namespace xe {
 namespace ui {
 namespace d3d12 {
-
+enum {
+  UPLOAD_RESULT_CREATE_FAILED = 0,
+  UPLOAD_RESULT_CREATE_SUCCESS = 1,
+  UPLOAD_RESULT_CREATE_CPUVISIBLE = 2
+};
 class D3D12Provider : public GraphicsProvider {
  public:
  ~D3D12Provider();
@@ -34,6 +38,11 @@ class D3D12Provider : public GraphicsProvider {
       Presenter::FatalErrorHostGpuLossCallback) override;
   std::unique_ptr<ImmediateDrawer> CreateImmediateDrawer() override;
 
+  uint32_t CreateUploadResource(
+      D3D12_HEAP_FLAGS HeapFlags, _In_ const D3D12_RESOURCE_DESC* pDesc,
+      D3D12_RESOURCE_STATES InitialResourceState, REFIID riidResource,
+      void** ppvResource, bool try_create_cpuvisible = false,
+      const D3D12_CLEAR_VALUE* pOptimizedClearValue = nullptr) const;
   IDXGIFactory2* GetDXGIFactory() const { return dxgi_factory_; }
 
   // nullptr if PIX not attached.
@@ -193,6 +202,14 @@ class D3D12Provider : public GraphicsProvider {
   bool ps_specified_stencil_reference_supported_;
   bool rasterizer_ordered_views_supported_;
   bool unaligned_block_textures_supported_;
+
+  lightweight_nvapi::nvapi_state_t* nvapi_;
+  lightweight_nvapi::cb_NvAPI_D3D12_CreateCommittedResource
+      nvapi_createcommittedresource_ = nullptr;
+  lightweight_nvapi::cb_NvAPI_D3D12_UseDriverHeapPriorities
+      nvapi_usedriverheappriorities_ = nullptr;
+  lightweight_nvapi::cb_NvAPI_D3D12_QueryCpuVisibleVidmem
+      nvapi_querycpuvisiblevidmem_ = nullptr;
 };
 
 }  // namespace d3d12
diff --git a/src/xenia/ui/d3d12/d3d12_upload_buffer_pool.cc b/src/xenia/ui/d3d12/d3d12_upload_buffer_pool.cc
index b50edfd6e..e5eb9a9ed 100644
--- a/src/xenia/ui/d3d12/d3d12_upload_buffer_pool.cc
+++ b/src/xenia/ui/d3d12/d3d12_upload_buffer_pool.cc
@@ -81,10 +81,10 @@ D3D12UploadBufferPool::CreatePageImplementation() {
   util::FillBufferResourceDesc(buffer_desc, page_size_,
                                D3D12_RESOURCE_FLAG_NONE);
   Microsoft::WRL::ComPtr<ID3D12Resource> buffer;
-  if (FAILED(provider_.GetDevice()->CreateCommittedResource(
-          &util::kHeapPropertiesUpload, provider_.GetHeapFlagCreateNotZeroed(),
-          &buffer_desc, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
-          IID_PPV_ARGS(&buffer)))) {
+
+  if (!provider_.CreateUploadResource(
+          provider_.GetHeapFlagCreateNotZeroed(), &buffer_desc,
+          D3D12_RESOURCE_STATE_GENERIC_READ, IID_PPV_ARGS(&buffer))) {
     XELOGE("Failed to create a D3D upload buffer with {} bytes", page_size_);
     return nullptr;
   }
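One subtlety in the hunk above: the `!provider_.CreateUploadResource(...)` test works only because UPLOAD_RESULT_CREATE_FAILED is the zero enumerator and both success codes are nonzero. An equivalent, more explicit spelling of the same check:

if (provider_.CreateUploadResource(
        provider_.GetHeapFlagCreateNotZeroed(), &buffer_desc,
        D3D12_RESOURCE_STATE_GENERIC_READ,
        IID_PPV_ARGS(&buffer)) == UPLOAD_RESULT_CREATE_FAILED) {
  XELOGE("Failed to create a D3D upload buffer with {} bytes", page_size_);
  return nullptr;
}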