diff --git a/src/xenia/apu/conversion.h b/src/xenia/apu/conversion.h
index 211243348..0f807d67b 100644
--- a/src/xenia/apu/conversion.h
+++ b/src/xenia/apu/conversion.h
@@ -20,6 +20,8 @@ namespace apu {
 namespace conversion {
 #if XE_ARCH_AMD64
+
+#if 0
 inline void sequential_6_BE_to_interleaved_6_LE(float* output,
                                                 const float* input,
                                                 size_t ch_sample_count) {
@@ -41,7 +43,44 @@ inline void sequential_6_BE_to_interleaved_6_LE(float* output,
     out[sample * 6 + 5] = sample2;
   }
 }
+#else
+XE_NOINLINE
+static void _generic_sequential_6_BE_to_interleaved_6_LE(
+    float* XE_RESTRICT output, const float* XE_RESTRICT input,
+    unsigned ch_sample_count) {
+  for (unsigned sample = 0; sample < ch_sample_count; sample++) {
+    for (unsigned channel = 0; channel < 6; channel++) {
+      unsigned int value = *reinterpret_cast<const unsigned int*>(
+          &input[channel * ch_sample_count + sample]);
+      *reinterpret_cast<unsigned int*>(&output[sample * 6 + channel]) =
+          xe::byte_swap(value);
+    }
+  }
+}
+XE_NOINLINE
+static void _movbe_sequential_6_BE_to_interleaved_6_LE(
+    float* XE_RESTRICT output, const float* XE_RESTRICT input,
+    unsigned ch_sample_count) {
+  for (unsigned sample = 0; sample < ch_sample_count; sample++) {
+    for (unsigned channel = 0; channel < 6; channel++) {
+      *reinterpret_cast<unsigned int*>(&output[sample * 6 + channel]) =
+          _load_be_u32(reinterpret_cast<const unsigned int*>(
+              &input[channel * ch_sample_count + sample]));
+    }
+  }
+}
+
+inline static void sequential_6_BE_to_interleaved_6_LE(
+    float* output, const float* input, unsigned ch_sample_count) {
+  if (amd64::GetFeatureFlags() & amd64::kX64EmitMovbe) {
+    _movbe_sequential_6_BE_to_interleaved_6_LE(output, input, ch_sample_count);
+  } else {
+    _generic_sequential_6_BE_to_interleaved_6_LE(output, input,
+                                                 ch_sample_count);
+  }
+}
+#endif
 inline void sequential_6_BE_to_interleaved_2_LE(float* output,
                                                 const float* input,
                                                 size_t ch_sample_count) {
diff --git a/src/xenia/base/cvar.h b/src/xenia/base/cvar.h
index 61b8faf11..144703665 100644
--- a/src/xenia/base/cvar.h
+++ b/src/xenia/base/cvar.h
@@ -335,7 +335,8 @@ ICommandVar* define_cmdvar(const char* name, T* default_value,
 #define DEFINE_uint64(name, default_value, description, category) \
   DEFINE_CVar(name, default_value, description, category, false, uint64_t)
-
+#define DEFINE_int64(name, default_value, description, category) \
+  DEFINE_CVar(name, default_value, description, category, false, int64_t)
 #define DEFINE_double(name, default_value, description, category) \
   DEFINE_CVar(name, default_value, description, category, false, double)
@@ -383,7 +384,7 @@ ICommandVar* define_cmdvar(const char* name, T* default_value,
 #define DECLARE_uint32(name) DECLARE_CVar(name, uint32_t)
 #define DECLARE_uint64(name) DECLARE_CVar(name, uint64_t)
-
+#define DECLARE_int64(name) DECLARE_CVar(name, int64_t)
 #define DECLARE_double(name) DECLARE_CVar(name, double)
 #define DECLARE_string(name) DECLARE_CVar(name, std::string)
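Note on the new cvar macros above: DEFINE_int64/DECLARE_int64 just instantiate the existing DEFINE_CVar/DECLARE_CVar machinery for int64_t, so usage mirrors the other scalar cvars. A minimal sketch follows; the flag name example_mask is hypothetical, and the real first user is the widened x64_extension_mask later in this diff.

// defining translation unit (hypothetical flag, sketch only)
DEFINE_int64(example_mask, -1LL,
             "Hypothetical signed 64-bit mask; -1 enables everything.",
             "Other");

// consuming translation unit
DECLARE_int64(example_mask);

static bool ExampleBitEnabled(int64_t bit) {
  // cvars::example_mask is generated by the macro machinery above
  return (cvars::example_mask & bit) == bit;
}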
diff --git a/src/xenia/base/mutex.cc b/src/xenia/base/mutex.cc
index 027cd7882..b975e4bc3 100644
--- a/src/xenia/base/mutex.cc
+++ b/src/xenia/base/mutex.cc
@@ -26,7 +26,7 @@ check this and release the mutex
 one way to do this is by using FlsAlloc and PFLS_CALLBACK_FUNCTION, which gets
 called with the fiber local data when a thread exits
 */
-thread_local unsigned global_mutex_depth = 0;
+
 static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) {
   return reinterpret_cast<CRITICAL_SECTION*>(mutex);
 }
@@ -38,29 +38,16 @@ xe_global_mutex::xe_global_mutex() {
 xe_global_mutex ::~xe_global_mutex() {
   DeleteCriticalSection(global_critical_section(this));
 }
+
 void xe_global_mutex::lock() {
-  if (global_mutex_depth) {
-  } else {
-    EnterCriticalSection(global_critical_section(this));
-  }
-  global_mutex_depth++;
+  EnterCriticalSection(global_critical_section(this));
 }
 void xe_global_mutex::unlock() {
-  if (--global_mutex_depth == 0) {
-    LeaveCriticalSection(global_critical_section(this));
-  }
+  LeaveCriticalSection(global_critical_section(this));
 }
 bool xe_global_mutex::try_lock() {
-  if (global_mutex_depth) {
-    ++global_mutex_depth;
-    return true;
-  } else {
-    BOOL success = TryEnterCriticalSection(global_critical_section(this));
-    if (success) {
-      ++global_mutex_depth;
-    }
-    return success;
-  }
+  BOOL success = TryEnterCriticalSection(global_critical_section(this));
+  return success;
 }
 CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) {
diff --git a/src/xenia/base/platform.h b/src/xenia/base/platform.h
index e99e8b83d..61749e4c7 100644
--- a/src/xenia/base/platform.h
+++ b/src/xenia/base/platform.h
@@ -116,15 +116,15 @@
 #define XE_LIKELY(...) (!!(__VA_ARGS__))
 #define XE_UNLIKELY(...) (!!(__VA_ARGS__))
 #define XE_MSVC_ASSUME(...) __assume(__VA_ARGS__)
-#define XE_NOALIAS __declspec(noalias) 
+#define XE_NOALIAS __declspec(noalias)
 #elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
 #define XE_FORCEINLINE __attribute__((always_inline))
 #define XE_NOINLINE __attribute__((noinline))
 #define XE_COLD __attribute__((cold))
 #define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
 #define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)
-#define XE_NOALIAS 
-//cant do unevaluated assume
+#define XE_NOALIAS
+// can't do an unevaluated assume
 #define XE_MSVC_ASSUME(...) static_cast<void>(0)
 #else
 #define XE_FORCEINLINE inline
@@ -137,7 +137,13 @@
 #define XE_MSVC_ASSUME(...) static_cast<void>(0)
 #endif
-
+#if XE_COMPILER_HAS_MSVC_EXTENSIONS == 1
+#define XE_MSVC_OPTIMIZE_SMALL() __pragma(optimize("s", on))
+#define XE_MSVC_OPTIMIZE_REVERT() __pragma(optimize("", on))
+#else
+#define XE_MSVC_OPTIMIZE_SMALL()
+#define XE_MSVC_OPTIMIZE_REVERT()
+#endif
 #if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
 #define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__))
 #define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__))
@@ -180,7 +186,7 @@
 const char kPathSeparator = '/';
 const char kGuestPathSeparator = '\\';
 }  // namespace xe
-#if XE_ARCH_AMD64==1
+#if XE_ARCH_AMD64 == 1
 #include "platform_amd64.h"
 #endif
 #endif  // XENIA_BASE_PLATFORM_H_
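The XE_MSVC_OPTIMIZE_SMALL()/XE_MSVC_OPTIMIZE_REVERT() pair added above wraps MSVC's #pragma optimize in macros that compile to nothing on other toolchains; x64_sequences.cc later in this diff applies it at file scope. A sketch of the intended bracketing, with a hypothetical function name:

#include "xenia/base/platform.h"

XE_MSVC_OPTIMIZE_SMALL()  // on MSVC: optimize("s", on), favor size from here
static void RarelyTakenSlowPath() {
  // cold code where icache footprint matters more than straight-line speed
}
XE_MSVC_OPTIMIZE_REVERT()  // optimize("", on): back to command-line settings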
diff --git a/src/xenia/base/platform_amd64.cc b/src/xenia/base/platform_amd64.cc
index 31df3c497..7005420e5 100644
--- a/src/xenia/base/platform_amd64.cc
+++ b/src/xenia/base/platform_amd64.cc
@@ -7,13 +7,12 @@
  ******************************************************************************
  */
-
 #include "xenia/base/cvar.h"
 #include "xenia/base/platform.h"
 #include "third_party/xbyak/xbyak/xbyak.h"
 #include "third_party/xbyak/xbyak/xbyak_util.h"
-DEFINE_int32(x64_extension_mask, -1,
+DEFINE_int64(x64_extension_mask, -1LL,
              "Allow the detection and utilization of specific instruction set "
              "features.\n"
              " 0 = x86_64 + AVX1\n"
@@ -33,79 +32,92 @@ DEFINE_int32(x64_extension_mask, -1,
              "x64");
 namespace xe {
 namespace amd64 {
-static uint32_t g_feature_flags = 0U;
+static uint64_t g_feature_flags = 0U;
 static bool g_did_initialize_feature_flags = false;
-uint32_t GetFeatureFlags() {
-  xenia_assert(g_did_initialize_feature_flags);
-  return g_feature_flags;
+uint64_t GetFeatureFlags() {
+  xenia_assert(g_did_initialize_feature_flags);
+  return g_feature_flags;
 }
 XE_COLD
 XE_NOINLINE
 void InitFeatureFlags() {
-  uint32_t feature_flags_ = 0U;
-
-  Xbyak::util::Cpu cpu_;
+  uint64_t feature_flags_ = 0U;
+  {
+    Xbyak::util::Cpu cpu_;
 #define TEST_EMIT_FEATURE(emit, ext)                  \
   if ((cvars::x64_extension_mask & emit) == emit) {   \
     feature_flags_ |= (cpu_.has(ext) ? emit : 0);     \
   }
-  TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
-  TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
-  TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
-  TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
-  TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
-  TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
-  TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
-  TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
-  TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
-  TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
-  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
-  TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
-  TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
+    TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
+    TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
+    TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
+    TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
+    TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
+    TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
+    TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
+    TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
+    TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
+    TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
+    TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
+    TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
+    TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
 #undef TEST_EMIT_FEATURE
-  /*
-     fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
-     latest version of xbyak
-*/
-  unsigned int data[4];
-  Xbyak::util::Cpu::getCpuid(0x80000001, data);
-  unsigned amd_flags = data[2];
-  if (amd_flags & (1U << 5)) {
-    if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
-      feature_flags_ |= kX64EmitLZCNT;
-    }
-  }
-  // todo: although not reported by cpuid, zen 1 and zen+ also have fma4
-  if (amd_flags & (1U << 16)) {
-    if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
-      feature_flags_ |= kX64EmitFMA4;
-    }
-  }
-  if (amd_flags & (1U << 21)) {
-    if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
-      feature_flags_ |= kX64EmitTBM;
-    }
-  }
-  if (amd_flags & (1U << 11)) {
-    if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
-      feature_flags_ |= kX64EmitXOP;
-    }
-  }
-  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
-    bool is_zennish = cpu_.displayFamily >= 0x17; /*
-        chrispy: according to agner's tables, all amd architectures that
-        we support (ones with avx) have the same timings for
-        jrcxz/loop/loope/loopne as for other jmps
-     */
-    feature_flags_ |= kX64FastJrcx;
-    feature_flags_ |= kX64FastLoop;
-    if (is_zennish) {
-      // ik that i heard somewhere that this is the case for zen, but i need to
-      // verify. cant find my original source for that.
-      // todo: ask agner?
+    /*
+      fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
+      latest version of xbyak
+    */
+    unsigned int data[4];
+    Xbyak::util::Cpu::getCpuid(0x80000001, data);
+    unsigned amd_flags = data[2];
+    if (amd_flags & (1U << 5)) {
+      if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
+        feature_flags_ |= kX64EmitLZCNT;
+      }
+    }
+    // todo: although not reported by cpuid, zen 1 and zen+ also have fma4
+    if (amd_flags & (1U << 16)) {
+      if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
+        feature_flags_ |= kX64EmitFMA4;
+      }
+    }
+    if (amd_flags & (1U << 21)) {
+      if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
+        feature_flags_ |= kX64EmitTBM;
+      }
+    }
+    if (amd_flags & (1U << 11)) {
+      if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
+        feature_flags_ |= kX64EmitXOP;
+      }
+    }
+    if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
+      bool is_zennish = cpu_.displayFamily >= 0x17;
+      /*
+        chrispy: according to agner's tables, all amd architectures
+        that we support (ones with avx) have the same timings for
+        jrcxz/loop/loope/loopne as for other jmps
+      */
+      feature_flags_ |= kX64FastJrcx;
+      feature_flags_ |= kX64FastLoop;
+      if (is_zennish) {
+        // ik that i heard somewhere that this is the case for zen, but i need
+        // to verify. cant find my original source for that. todo: ask agner?
+        feature_flags_ |= kX64FlagsIndependentVars;
+      }
+    }
+  }
+  {
+    unsigned int data[4];
+    memset(data, 0, sizeof(data));
+    // intel extended features
+    Xbyak::util::Cpu::getCpuidEx(7, 0, data);
+    if ((data[2] & (1 << 28)) &&
+        (cvars::x64_extension_mask & kX64EmitMovdir64M)) {
+      feature_flags_ |= kX64EmitMovdir64M;
+    }
+    if ((data[1] & (1 << 9)) && (cvars::x64_extension_mask & kX64FastRepMovs)) {
+      feature_flags_ |= kX64FastRepMovs;
+    }
+  }
   g_feature_flags = feature_flags_;
diff --git a/src/xenia/base/platform_amd64.h b/src/xenia/base/platform_amd64.h
index 326b69139..e5c20c670 100644
--- a/src/xenia/base/platform_amd64.h
+++ b/src/xenia/base/platform_amd64.h
@@ -13,7 +13,7 @@
 namespace xe {
 namespace amd64 {
-enum X64FeatureFlags {
+enum X64FeatureFlags : uint64_t {
   kX64EmitAVX2 = 1 << 0,
   kX64EmitFMA = 1 << 1,
   kX64EmitLZCNT = 1 << 2,  // this is actually ABM and includes popcount
@@ -44,14 +44,13 @@ enum X64FeatureFlags {
   // instructions, and FX users need the boost
   kX64EmitFMA4 = 1 << 17,  // todo: also use on zen1?
   kX64EmitTBM = 1 << 18,
-  // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
-  // 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
-  // instructions by using the legacy sse version if we recently cleared the
-  // high 128 bits of the
+  kX64EmitMovdir64M = 1 << 19,
+  kX64FastRepMovs = 1 << 20
+
 };
 XE_NOALIAS
-uint32_t GetFeatureFlags();
+uint64_t GetFeatureFlags();
 XE_COLD
 void InitFeatureFlags();
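For reference on the two new bits: CPUID leaf 7 ECX bit 28 is MOVDIR64B and EBX bit 9 is ERMS (enhanced REP MOVSB/STOSB), which is what kX64FastRepMovs latches. Callers keep the same query shape with the widened 64-bit mask; a minimal sketch of a consumer (the helper name is hypothetical):

#include "xenia/base/platform_amd64.h"

// GetFeatureFlags() asserts that InitFeatureFlags() already ran, so this is
// just a cheap load-and-test on the hot path.
inline bool CpuHasFastRepMovs() {
  return (xe::amd64::GetFeatureFlags() & xe::amd64::kX64FastRepMovs) != 0;
}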
diff --git a/src/xenia/base/threading.h b/src/xenia/base/threading.h
index 67297716b..604819950 100644
--- a/src/xenia/base/threading.h
+++ b/src/xenia/base/threading.h
@@ -299,6 +299,12 @@ class Event : public WaitHandle {
   // the nonsignaled state after releasing the appropriate number of waiting
   // threads.
   virtual void Pulse() = 0;
+#if XE_PLATFORM_WIN32 == 1
+  // SetEvent, but if there is a waiter we immediately transfer execution to it
+  virtual void SetBoostPriority() = 0;
+#else
+  void SetBoostPriority() { Set(); }
+#endif
 };
 
 // Models a Win32-like semaphore object.
diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc
index 32ddf7487..01a4eb9be 100644
--- a/src/xenia/base/threading_win.cc
+++ b/src/xenia/base/threading_win.cc
@@ -39,6 +39,8 @@ XE_NTDLL_IMPORT(NtWaitForSingleObject, cls_NtWaitForSingleObject,
                 NtWaitForSingleObjectPointer);
 XE_NTDLL_IMPORT(NtSetEvent, cls_NtSetEvent, NtSetEventPointer);
+XE_NTDLL_IMPORT(NtSetEventBoostPriority, cls_NtSetEventBoostPriority,
+                NtSetEventBoostPriorityPointer);
 // difference between NtClearEvent and NtResetEvent is that NtResetEvent returns
 // the events state prior to the call, but we dont need that. might need to
 // check whether one or the other is faster in the kernel though yeah, just
@@ -53,6 +55,7 @@ XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
 XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
                 NtDelayExecutionPointer);
+
 namespace xe {
 namespace threading {
@@ -137,7 +140,7 @@ void MaybeYield() {
 #endif
 #endif
   // memorybarrier is really not necessary here...
-  MemoryBarrier();
+  // MemoryBarrier();
 }
 void SyncMemory() { MemoryBarrier(); }
@@ -288,11 +291,19 @@ class Win32Event : public Win32Handle<Event> {
   void Set() override { NtSetEventPointer.invoke(handle_, nullptr); }
   void Reset() override { NtClearEventPointer.invoke(handle_); }
   void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); }
+  void SetBoostPriority() override {
+    // no previous state for boostpriority
+    NtSetEventBoostPriorityPointer.invoke(handle_);
+  }
 #else
   void Set() override { SetEvent(handle_); }
   void Reset() override { ResetEvent(handle_); }
   void Pulse() override { PulseEvent(handle_); }
+  void SetBoostPriority() override {
+    // no win32 version of boostpriority
+    SetEvent(handle_);
+  }
 #endif
 };
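NtSetEventBoostPriority is an undocumented ntdll export that signals the event and applies a scheduling boost to a waiting thread, which is why the command processor below switches the write-pointer event over to it; there is no documented Win32 equivalent, hence the plain SetEvent() fallback. Typical call-site shape (the event name here is hypothetical):

#include "xenia/base/threading.h"

void PublishWorkAndWake(xe::threading::Event* work_ready) {
  // ... make the produced work visible first ...
  work_ready->SetBoostPriority();  // Set(), plus a hand-off boost on Win32
}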
diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h
index d4ded3e83..cb5a375ec 100644
--- a/src/xenia/cpu/backend/x64/x64_backend.h
+++ b/src/xenia/cpu/backend/x64/x64_backend.h
@@ -23,7 +23,7 @@
 #define XE_X64_PROFILER_AVAILABLE 1
 #endif
 
-DECLARE_int32(x64_extension_mask);
+DECLARE_int64(x64_extension_mask);
 
 namespace xe {
 class Exception;
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc
index 74515d38e..03b8b4abd 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -103,74 +103,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
         "FAQ for system requirements at https://xenia.jp");
     return;
   }
-#if 1
-  feature_flags_ = amd64::GetFeatureFlags();
-#else
-#define TEST_EMIT_FEATURE(emit, ext)                \
-  if ((cvars::x64_extension_mask & emit) == emit) { \
-    feature_flags_ |= (cpu_.has(ext) ? emit : 0);   \
-  }
-  TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
-  TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
-  TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
-  TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
-  TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
-  TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
-  TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
-  TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
-  TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
-  TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
-  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
-  TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
-  TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
-#undef TEST_EMIT_FEATURE
-  /*
-     fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
-     latest version of xbyak
-*/
-  unsigned int data[4];
-  Xbyak::util::Cpu::getCpuid(0x80000001, data);
-  unsigned amd_flags = data[2];
-  if (amd_flags & (1U << 5)) {
-    if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
-      feature_flags_ |= kX64EmitLZCNT;
-    }
-  }
-  // todo: although not reported by cpuid, zen 1 and zen+ also have fma4
-  if (amd_flags & (1U << 16)) {
-    if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
-      feature_flags_ |= kX64EmitFMA4;
-    }
-  }
-  if (amd_flags & (1U << 21)) {
-    if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
-      feature_flags_ |= kX64EmitTBM;
-    }
-  }
-  if (amd_flags & (1U << 11)) {
-    if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
-      feature_flags_ |= kX64EmitXOP;
-      XELOGCPU("Cpu support XOP!\n\n");
-    }
-  }
-  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
-    bool is_zennish = cpu_.displayFamily >= 0x17;
-    /*
-        chrispy: according to agner's tables, all amd architectures that
-        we support (ones with avx) have the same timings for
-        jrcxz/loop/loope/loopne as for other jmps
-    */
-    feature_flags_ |= kX64FastJrcx;
-    feature_flags_ |= kX64FastLoop;
-    if (is_zennish) {
-      // ik that i heard somewhere that this is the case for zen, but i need to
-      // verify. cant find my original source for that.
-      // todo: ask agner?
-      feature_flags_ |= kX64FlagsIndependentVars;
-    }
-  }
-#endif
+  feature_flags_ = amd64::GetFeatureFlags();
+
   may_use_membase32_as_zero_reg_ =
       static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
           processor()->memory()->virtual_membase())) == 0;
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h
index 69e3b80ec..91f4016c1 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_emitter.h
@@ -299,7 +299,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
   void* FindWordConstantOffset(unsigned wordvalue);
   void* FindDwordConstantOffset(unsigned bytevalue);
   void* FindQwordConstantOffset(uint64_t bytevalue);
-  bool IsFeatureEnabled(uint32_t feature_flag) const {
+  bool IsFeatureEnabled(uint64_t feature_flag) const {
     return (feature_flags_ & feature_flag) == feature_flag;
   }
@@ -395,7 +395,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
   XbyakAllocator* allocator_ = nullptr;
   XexModule* guest_module_ = nullptr;
   Xbyak::util::Cpu cpu_;
-  uint32_t feature_flags_ = 0;
+  uint64_t feature_flags_ = 0;
   uint32_t current_guest_function_ = 0;
   Xbyak::Label* epilog_label_ = nullptr;
diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc
index 06a37ab91..28b33fd76 100644
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@@ -39,7 +39,7 @@
 #include "xenia/cpu/backend/x64/x64_stack_layout.h"
 #include "xenia/cpu/hir/hir_builder.h"
 #include "xenia/cpu/processor.h"
-
+XE_MSVC_OPTIMIZE_SMALL()
 DEFINE_bool(use_fast_dot_product, false,
             "Experimental optimization, much shorter sequence on dot products, "
             "treating inf as overflow instead of using mcxsr"
diff --git a/src/xenia/cpu/entry_table.cc b/src/xenia/cpu/entry_table.cc
index 1d82f0538..840706171 100644
--- a/src/xenia/cpu/entry_table.cc
+++ b/src/xenia/cpu/entry_table.cc
@@ -19,16 +19,19 @@ EntryTable::EntryTable() = default;
 EntryTable::~EntryTable() {
   auto global_lock = global_critical_region_.Acquire();
-  for (auto it : map_) {
-    Entry* entry = it.second;
+  for (auto it : map_.Values()) {
+    Entry* entry = it;
     delete entry;
   }
 }
 Entry* EntryTable::Get(uint32_t address) {
   auto global_lock = global_critical_region_.Acquire();
-  const auto& it = map_.find(address);
-  Entry* entry = it != map_.end() ? it->second : nullptr;
+  uint32_t idx = map_.IndexForKey(address);
+  if (idx == map_.size() || *map_.KeyAt(idx) != address) {
+    return nullptr;
+  }
+  Entry* entry = *map_.ValueAt(idx);
   if (entry) {
     // TODO(benvanik): wait if needed?
     if (entry->status != Entry::STATUS_READY) {
@@ -43,8 +46,12 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
   // https://github.com/facebook/folly/blob/master/folly/AtomicHashMap.h
   auto global_lock = global_critical_region_.Acquire();
-  const auto& it = map_.find(address);
-  Entry* entry = it != map_.end() ? it->second : nullptr;
+
+  uint32_t idx = map_.IndexForKey(address);
+
+  Entry* entry = idx != map_.size() && *map_.KeyAt(idx) == address
+                     ? *map_.ValueAt(idx)
+                     : nullptr;
   Entry::Status status;
   if (entry) {
     // If we aren't ready yet spin and wait.
@@ -65,7 +72,8 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
     entry->end_address = 0;
     entry->status = Entry::STATUS_COMPILING;
     entry->function = 0;
-    map_[address] = entry;
+    map_.InsertAt(address, entry, idx);
+    // map_[address] = entry;
     status = Entry::STATUS_NEW;
   }
   global_lock.unlock();
@@ -75,18 +83,18 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
 void EntryTable::Delete(uint32_t address) {
   auto global_lock = global_critical_region_.Acquire();
-  const auto itr = map_.find(address);
-
-  if (itr != map_.cend()) {
-    map_.erase(itr);
+  // doesnt this leak memory by not deleting the entry?
+  uint32_t idx = map_.IndexForKey(address);
+  if (idx != map_.size() && *map_.KeyAt(idx) == address) {
+    map_.EraseAt(idx);
   }
 }
 std::vector<Function*> EntryTable::FindWithAddress(uint32_t address) {
   auto global_lock = global_critical_region_.Acquire();
   std::vector<Function*> fns;
-  for (auto& it : map_) {
-    Entry* entry = it.second;
+  for (auto& it : map_.Values()) {
+    Entry* entry = it;
     if (address >= entry->address && address <= entry->end_address) {
       if (entry->status == Entry::STATUS_READY) {
         fns.push_back(entry->function);
@@ -95,6 +103,5 @@ std::vector<Function*> EntryTable::FindWithAddress(uint32_t address) {
   }
   return fns;
 }
-
 }  // namespace cpu
 }  // namespace xe
diff --git a/src/xenia/cpu/entry_table.h b/src/xenia/cpu/entry_table.h
index 14a3e6c82..2ca2133c2 100644
--- a/src/xenia/cpu/entry_table.h
+++ b/src/xenia/cpu/entry_table.h
@@ -14,7 +14,7 @@
 #include <unordered_map>
 
 #include "xenia/base/mutex.h"
-
+#include "xenia/base/split_map.h"
 namespace xe {
 namespace cpu {
@@ -48,7 +48,8 @@ class EntryTable {
  private:
   xe::global_critical_region global_critical_region_;
   // TODO(benvanik): replace with a better data structure.
-  std::unordered_map<uint32_t, Entry*> map_;
+  xe::split_map<uint32_t, Entry*> map_;
+  // std::unordered_map<uint32_t, Entry*> map_;
 };
 }  // namespace cpu
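EntryTable now probes through split_map's index-based interface instead of unordered_map iterators. The call sites above imply roughly the following shape for xe::split_map; this is a sketch of the assumed interface, not the actual contents of xenia/base/split_map.h. The key point is that IndexForKey returns a candidate slot that must be validated against size() and the stored key before use.

// Assumed interface, inferred from the EntryTable call sites above.
template <typename TKey, typename TValue>
class split_map_sketch {
 public:
  uint32_t size() const;           // number of live entries
  uint32_t IndexForKey(TKey key);  // candidate slot; may equal size() on miss
  TKey* KeyAt(uint32_t index);     // key stored at a slot
  TValue* ValueAt(uint32_t index); // value stored at a slot
  void InsertAt(TKey key, TValue value, uint32_t index);
  void EraseAt(uint32_t index);
};

// Lookup pattern used by EntryTable::Get:
//   idx = map.IndexForKey(k);
//   if (idx == map.size() || *map.KeyAt(idx) != k)  -> miss
//   else                                            -> value is *map.ValueAt(idx)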
diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc
index ab54438d7..66da46546 100644
--- a/src/xenia/gpu/command_processor.cc
+++ b/src/xenia/gpu/command_processor.cc
@@ -334,7 +334,7 @@ void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr,
 void CommandProcessor::UpdateWritePointer(uint32_t value) {
   write_ptr_index_ = value;
-  write_ptr_index_event_->Set();
+  write_ptr_index_event_->SetBoostPriority();
 }
 void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
                                                   uint32_t value) {
@@ -665,6 +665,11 @@ uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index,
   reader_.set_read_offset(read_index * sizeof(uint32_t));
   reader_.set_write_offset(write_index * sizeof(uint32_t));
+  // prefetch the wraparound range
+  // it likely is already in L3 cache, but on a zen system it may be in another
+  // chiplet's L3
+  reader_.BeginPrefetchedRead(GetCurrentRingReadCount());
   do {
     if (!ExecutePacket()) {
       // This probably should be fatal - but we're going to continue anyways.
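BeginPrefetchedRead on the ring reader amounts to issuing software prefetches over the bytes ExecutePacket is about to walk; on multi-chiplet Zen parts a ring written by another thread can be resident in a different CCD's L3, where the first demand miss is expensive. A generic sketch of the idea, not the actual RingBuffer implementation:

#include <xmmintrin.h>

#include <cstddef>

// Touch each 64-byte line of [base, base + bytes) with a T1 hint so the range
// is pulled toward this core's L2/L3 before the packet parse loop runs.
static void PrefetchRange(const void* base, size_t bytes) {
  const char* p = static_cast<const char*>(base);
  for (size_t i = 0; i < bytes; i += 64) {
    _mm_prefetch(p + i, _MM_HINT_T1);
  }
}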
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index 4e7ee919c..a24d468ae 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -380,7 +380,8 @@ ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature(
   root_signatures_bindful_.emplace(index, root_signature);
   return root_signature;
 }
-
+XE_NOINLINE
+XE_COLD
 uint32_t D3D12CommandProcessor::GetRootBindfulExtraParameterIndices(
     const DxbcShader* vertex_shader, const DxbcShader* pixel_shader,
     RootBindfulExtraParameterIndices& indices_out) {
@@ -2484,7 +2485,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
     return false;
   }
   pipeline_cache_->AnalyzeShaderUcode(*vertex_shader);
-  bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
+  const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
   // Pixel shader analysis.
   bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);
@@ -2512,9 +2513,10 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
       return true;
     }
   }
-  bool memexport_used_pixel =
+
+  const bool memexport_used_pixel =
       pixel_shader && pixel_shader->is_valid_memexport_used();
-  bool memexport_used = memexport_used_vertex || memexport_used_pixel;
+  const bool memexport_used = memexport_used_vertex || memexport_used_pixel;
 
   if (!BeginSubmission(true)) {
     return false;
@@ -2639,6 +2641,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
     previous_viewport_info_args_ = gviargs;
     previous_viewport_info_ = viewport_info;
   }
+  // todo: use SIMD for getscissor + scaling here, should reduce code size more
   draw_util::Scissor scissor;
   draw_util::GetScissor(regs, scissor);
   scissor.offset[0] *= draw_resolution_scale_x;
@@ -2711,102 +2714,13 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   // Gather memexport ranges and ensure the heaps for them are resident, and
   // also load the data surrounding the export and to fill the regions that
   // won't be modified by the shaders.
-  struct MemExportRange {
-    uint32_t base_address_dwords;
-    uint32_t size_dwords;
-  };
-  MemExportRange memexport_ranges[512];
-  uint32_t memexport_range_count = 0;
-  if (memexport_used_vertex) {
-    for (uint32_t constant_index :
-         vertex_shader->memexport_stream_constants()) {
-      const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
-          XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4);
-      if (memexport_stream.index_count == 0) {
-        continue;
-      }
-      uint32_t memexport_format_size =
-          GetSupportedMemExportFormatSize(memexport_stream.format);
-      if (memexport_format_size == 0) {
-        XELOGE("Unsupported memexport format {}",
-               FormatInfo::GetName(
-                   xenos::TextureFormat(uint32_t(memexport_stream.format))));
-        return false;
-      }
-      uint32_t memexport_size_dwords =
-          memexport_stream.index_count * memexport_format_size;
-      // Try to reduce the number of shared memory operations when writing
-      // different elements into the same buffer through different exports
-      // (happens in 4D5307E6).
-      bool memexport_range_reused = false;
-      for (uint32_t i = 0; i < memexport_range_count; ++i) {
-        MemExportRange& memexport_range = memexport_ranges[i];
-        if (memexport_range.base_address_dwords ==
-            memexport_stream.base_address) {
-          memexport_range.size_dwords =
-              std::max(memexport_range.size_dwords, memexport_size_dwords);
-          memexport_range_reused = true;
-          break;
-        }
-      }
-      // Add a new range if haven't expanded an existing one.
-      if (!memexport_range_reused) {
-        MemExportRange& memexport_range =
-            memexport_ranges[memexport_range_count++];
-        memexport_range.base_address_dwords = memexport_stream.base_address;
-        memexport_range.size_dwords = memexport_size_dwords;
-      }
-    }
-  }
-  if (memexport_used_pixel) {
-    for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) {
-      const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
-          XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4);
-      if (memexport_stream.index_count == 0) {
-        continue;
-      }
-      uint32_t memexport_format_size =
-          GetSupportedMemExportFormatSize(memexport_stream.format);
-      if (memexport_format_size == 0) {
-        XELOGE("Unsupported memexport format {}",
-               FormatInfo::GetName(
-                   xenos::TextureFormat(uint32_t(memexport_stream.format))));
-        return false;
-      }
-      uint32_t memexport_size_dwords =
-          memexport_stream.index_count * memexport_format_size;
-      bool memexport_range_reused = false;
-      for (uint32_t i = 0; i < memexport_range_count; ++i) {
-        MemExportRange& memexport_range = memexport_ranges[i];
-        if (memexport_range.base_address_dwords ==
-            memexport_stream.base_address) {
-          memexport_range.size_dwords =
-              std::max(memexport_range.size_dwords, memexport_size_dwords);
-          memexport_range_reused = true;
-          break;
-        }
-      }
-      if (!memexport_range_reused) {
-        MemExportRange& memexport_range =
-            memexport_ranges[memexport_range_count++];
-        memexport_range.base_address_dwords = memexport_stream.base_address;
-        memexport_range.size_dwords = memexport_size_dwords;
-      }
-    }
-  }
-  for (uint32_t i = 0; i < memexport_range_count; ++i) {
-    const MemExportRange& memexport_range = memexport_ranges[i];
-    if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2,
-                                      memexport_range.size_dwords << 2)) {
-      XELOGE(
-          "Failed to request memexport stream at 0x{:08X} (size {}) in the "
-          "shared memory",
-          memexport_range.base_address_dwords << 2,
-          memexport_range.size_dwords << 2);
-      return false;
-    }
-  }
+  memexport_range_count_ = 0;
+  if (memexport_used_vertex || memexport_used_pixel) {
+    bool retflag;
+    bool retval = GatherMemexportRangesAndMakeResident(retflag);
+    if (retflag) return retval;
+  }
   // Primitive topology.
   D3D_PRIMITIVE_TOPOLOGY primitive_topology;
   if (primitive_processing_result.IsTessellated()) {
@@ -2876,10 +2790,11 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   // Draw.
   if (primitive_processing_result.index_buffer_type ==
       PrimitiveProcessor::ProcessedIndexBufferType::kNone) {
-    if (memexport_used) {
-      shared_memory_->UseForWriting();
-    } else {
+    if (!memexport_used) {
       shared_memory_->UseForReading();
+
+    } else {
+      shared_memory_->UseForWriting();
     }
     SubmitBarriers();
     deferred_command_list_.D3DDrawInstanced(
@@ -2903,22 +2818,11 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
       // If the shared memory is a UAV, it can't be used as an index buffer
       // (UAV is a read/write state, index buffer is a read-only state).
       // Need to copy the indices to a buffer in the index buffer state.
-      scratch_index_buffer = RequestScratchGPUBuffer(
-          index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST);
-      if (scratch_index_buffer == nullptr) {
-        return false;
-      }
-      shared_memory_->UseAsCopySource();
-      SubmitBarriers();
-      deferred_command_list_.D3DCopyBufferRegion(
-          scratch_index_buffer, 0, shared_memory_->GetBuffer(),
-          primitive_processing_result.guest_index_base,
-          index_buffer_view.SizeInBytes);
-      PushTransitionBarrier(scratch_index_buffer,
-                            D3D12_RESOURCE_STATE_COPY_DEST,
-                            D3D12_RESOURCE_STATE_INDEX_BUFFER);
-      index_buffer_view.BufferLocation =
-          scratch_index_buffer->GetGPUVirtualAddress();
+      bool retflag;
+      bool retval = HandleMemexportGuestDMA(
+          scratch_index_buffer, index_buffer_view,
+          primitive_processing_result.guest_index_base, retflag);
+      if (retflag) return retval;
     } else {
       index_buffer_view.BufferLocation =
           shared_memory_->GetGPUAddress() +
@@ -2956,66 +2860,199 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   }
 
   if (memexport_used) {
-    // Make sure this memexporting draw is ordered with other work using shared
-    // memory as a UAV.
-    // TODO(Triang3l): Find some PM4 command that can be used for indication of
-    // when memexports should be awaited?
-    shared_memory_->MarkUAVWritesCommitNeeded();
-    // Invalidate textures in memexported memory and watch for changes.
-    for (uint32_t i = 0; i < memexport_range_count; ++i) {
-      const MemExportRange& memexport_range = memexport_ranges[i];
-      shared_memory_->RangeWrittenByGpu(
-          memexport_range.base_address_dwords << 2,
-          memexport_range.size_dwords << 2, false);
-    }
-    if (cvars::d3d12_readback_memexport) {
-      // Read the exported data on the CPU.
-      uint32_t memexport_total_size = 0;
-      for (uint32_t i = 0; i < memexport_range_count; ++i) {
-        memexport_total_size += memexport_ranges[i].size_dwords << 2;
+    HandleMemexportDrawOrdering_AndReadback();
+  }
+
+  return true;
+}
+XE_COLD
+XE_NOINLINE
+bool D3D12CommandProcessor::HandleMemexportGuestDMA(
+    ID3D12Resource*& scratch_index_buffer,
+    D3D12_INDEX_BUFFER_VIEW& index_buffer_view, uint32_t guest_index_base,
+    // xe::gpu::PrimitiveProcessor::ProcessingResult&
+    //     primitive_processing_result,
+    bool& retflag) {
+  retflag = true;
+  scratch_index_buffer = RequestScratchGPUBuffer(
+      index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST);
+  if (scratch_index_buffer == nullptr) {
+    return false;
+  }
+  shared_memory_->UseAsCopySource();
+  SubmitBarriers();
+  deferred_command_list_.D3DCopyBufferRegion(
+      scratch_index_buffer, 0, shared_memory_->GetBuffer(), guest_index_base,
+      index_buffer_view.SizeInBytes);
+  PushTransitionBarrier(scratch_index_buffer, D3D12_RESOURCE_STATE_COPY_DEST,
+                        D3D12_RESOURCE_STATE_INDEX_BUFFER);
+  index_buffer_view.BufferLocation =
+      scratch_index_buffer->GetGPUVirtualAddress();
+  retflag = false;
+  return {};
+}
+XE_NOINLINE
+XE_COLD
+bool D3D12CommandProcessor::GatherMemexportRangesAndMakeResident(
+    bool& retflag) {
+  auto vertex_shader = static_cast<D3D12Shader*>(active_vertex_shader());
+  auto pixel_shader = static_cast<D3D12Shader*>(active_pixel_shader());
+  const xe::gpu::RegisterFile& regs = *register_file_;
+  const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
+  const bool memexport_used_pixel =
+      pixel_shader && pixel_shader->is_valid_memexport_used();
+  retflag = true;
+  if (memexport_used_vertex) {
+    for (uint32_t constant_index :
+         vertex_shader->memexport_stream_constants()) {
+      const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
+          XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4);
+      if (memexport_stream.index_count == 0) {
+        continue;
       }
-      if (memexport_total_size != 0) {
-        ID3D12Resource* readback_buffer =
-            RequestReadbackBuffer(memexport_total_size);
-        if (readback_buffer != nullptr) {
-          shared_memory_->UseAsCopySource();
-          SubmitBarriers();
-          ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
-          uint32_t readback_buffer_offset = 0;
-          for (uint32_t i = 0; i < memexport_range_count; ++i) {
-            const MemExportRange& memexport_range = memexport_ranges[i];
-            uint32_t memexport_range_size = memexport_range.size_dwords << 2;
-            deferred_command_list_.D3DCopyBufferRegion(
-                readback_buffer, readback_buffer_offset, shared_memory_buffer,
-                memexport_range.base_address_dwords << 2, memexport_range_size);
-            readback_buffer_offset += memexport_range_size;
-          }
-          if (AwaitAllQueueOperationsCompletion()) {
-            D3D12_RANGE readback_range;
-            readback_range.Begin = 0;
-            readback_range.End = memexport_total_size;
-            void* readback_mapping;
-            if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
-                                               &readback_mapping))) {
-              const uint32_t* readback_dwords =
-                  reinterpret_cast<const uint32_t*>(readback_mapping);
-              for (uint32_t i = 0; i < memexport_range_count; ++i) {
-                const MemExportRange& memexport_range = memexport_ranges[i];
-                std::memcpy(memory_->TranslatePhysical(
                                memexport_range.base_address_dwords << 2),
-                            readback_dwords, memexport_range.size_dwords << 2);
-                readback_dwords += memexport_range.size_dwords;
-              }
-              D3D12_RANGE readback_write_range = {};
-              readback_buffer->Unmap(0, &readback_write_range);
+      uint32_t memexport_format_size =
+          GetSupportedMemExportFormatSize(memexport_stream.format);
+      if (memexport_format_size == 0) {
format {}", + FormatInfo::GetName( + xenos::TextureFormat(uint32_t(memexport_stream.format)))); + return false; + } + uint32_t memexport_size_dwords = + memexport_stream.index_count * memexport_format_size; + // Try to reduce the number of shared memory operations when writing + // different elements into the same buffer through different exports + // (happens in 4D5307E6). + bool memexport_range_reused = false; + for (uint32_t i = 0; i < memexport_range_count_; ++i) { + MemExportRange& memexport_range = memexport_ranges_[i]; + if (memexport_range.base_address_dwords == + memexport_stream.base_address) { + memexport_range.size_dwords = + std::max(memexport_range.size_dwords, memexport_size_dwords); + memexport_range_reused = true; + break; + } + } + // Add a new range if haven't expanded an existing one. + if (!memexport_range_reused) { + MemExportRange& memexport_range = + memexport_ranges_[memexport_range_count_++]; + memexport_range.base_address_dwords = memexport_stream.base_address; + memexport_range.size_dwords = memexport_size_dwords; + } + } + } + if (memexport_used_pixel) { + for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) { + const auto& memexport_stream = regs.Get( + XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4); + if (memexport_stream.index_count == 0) { + continue; + } + uint32_t memexport_format_size = + GetSupportedMemExportFormatSize(memexport_stream.format); + if (memexport_format_size == 0) { + XELOGE("Unsupported memexport format {}", + FormatInfo::GetName( + xenos::TextureFormat(uint32_t(memexport_stream.format)))); + return false; + } + uint32_t memexport_size_dwords = + memexport_stream.index_count * memexport_format_size; + bool memexport_range_reused = false; + for (uint32_t i = 0; i < memexport_range_count_; ++i) { + MemExportRange& memexport_range = memexport_ranges_[i]; + if (memexport_range.base_address_dwords == + memexport_stream.base_address) { + memexport_range.size_dwords = + std::max(memexport_range.size_dwords, memexport_size_dwords); + memexport_range_reused = true; + break; + } + } + if (!memexport_range_reused) { + MemExportRange& memexport_range = + memexport_ranges_[memexport_range_count_++]; + memexport_range.base_address_dwords = memexport_stream.base_address; + memexport_range.size_dwords = memexport_size_dwords; + } + } + } + for (uint32_t i = 0; i < memexport_range_count_; ++i) { + const MemExportRange& memexport_range = memexport_ranges_[i]; + if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2, + memexport_range.size_dwords << 2)) { + XELOGE( + "Failed to request memexport stream at 0x{:08X} (size {}) in the " + "shared memory", + memexport_range.base_address_dwords << 2, + memexport_range.size_dwords << 2); + return false; + } + } + retflag = false; + return {}; +} +XE_NOINLINE +XE_COLD +void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() { + // Make sure this memexporting draw is ordered with other work using shared + // memory as a UAV. + // TODO(Triang3l): Find some PM4 command that can be used for indication of + // when memexports should be awaited? + shared_memory_->MarkUAVWritesCommitNeeded(); + // Invalidate textures in memexported memory and watch for changes. 
+XE_NOINLINE
+XE_COLD
+void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() {
+  // Make sure this memexporting draw is ordered with other work using shared
+  // memory as a UAV.
+  // TODO(Triang3l): Find some PM4 command that can be used for indication of
+  // when memexports should be awaited?
+  shared_memory_->MarkUAVWritesCommitNeeded();
+  // Invalidate textures in memexported memory and watch for changes.
+  for (uint32_t i = 0; i < memexport_range_count_; ++i) {
+    const MemExportRange& memexport_range = memexport_ranges_[i];
+    shared_memory_->RangeWrittenByGpu(memexport_range.base_address_dwords << 2,
+                                      memexport_range.size_dwords << 2, false);
+  }
+  if (cvars::d3d12_readback_memexport) {
+    // Read the exported data on the CPU.
+    uint32_t memexport_total_size = 0;
+    for (uint32_t i = 0; i < memexport_range_count_; ++i) {
+      memexport_total_size += memexport_ranges_[i].size_dwords << 2;
+    }
+    if (memexport_total_size != 0) {
+      ID3D12Resource* readback_buffer =
+          RequestReadbackBuffer(memexport_total_size);
+      if (readback_buffer != nullptr) {
+        shared_memory_->UseAsCopySource();
+        SubmitBarriers();
+        ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
+        uint32_t readback_buffer_offset = 0;
+        for (uint32_t i = 0; i < memexport_range_count_; ++i) {
+          const MemExportRange& memexport_range = memexport_ranges_[i];
+          uint32_t memexport_range_size = memexport_range.size_dwords << 2;
+          deferred_command_list_.D3DCopyBufferRegion(
+              readback_buffer, readback_buffer_offset, shared_memory_buffer,
+              memexport_range.base_address_dwords << 2, memexport_range_size);
+          readback_buffer_offset += memexport_range_size;
+        }
+        if (AwaitAllQueueOperationsCompletion()) {
+          D3D12_RANGE readback_range;
+          readback_range.Begin = 0;
+          readback_range.End = memexport_total_size;
+          void* readback_mapping;
+          if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
+                                             &readback_mapping))) {
+            const uint32_t* readback_dwords =
+                reinterpret_cast<const uint32_t*>(readback_mapping);
+            for (uint32_t i = 0; i < memexport_range_count_; ++i) {
+              const MemExportRange& memexport_range = memexport_ranges_[i];
+              std::memcpy(memory_->TranslatePhysical(
+                              memexport_range.base_address_dwords << 2),
+                          readback_dwords, memexport_range.size_dwords << 2);
+              readback_dwords += memexport_range.size_dwords;
+            }
+            D3D12_RANGE readback_write_range = {};
+            readback_buffer->Unmap(0, &readback_write_range);
+          }
+        }
+      }
+    }
+  }
-
-  return true;
 }
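Both readback paths in this file (memexport above, resolve below) use the same D3D12 idiom: copy into a READBACK-heap buffer, wait for the queue, Map with an explicit read range, memcpy out, then Unmap with an empty written range so the runtime knows the CPU modified nothing. Reduced to its core (error handling elided; a sketch, not a project API):

#include <cstring>

#include <d3d12.h>

static void CopyOutReadback(ID3D12Resource* readback_buffer, void* dst,
                            SIZE_T size) {
  D3D12_RANGE read_range;
  read_range.Begin = 0;
  read_range.End = size;  // the CPU will read [0, size)
  void* mapping = nullptr;
  if (SUCCEEDED(readback_buffer->Map(0, &read_range, &mapping))) {
    std::memcpy(dst, mapping, size);
    D3D12_RANGE written_range = {};  // empty range: nothing written back
    readback_buffer->Unmap(0, &written_range);
  }
}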
 
 void D3D12CommandProcessor::InitializeTrace() {
@@ -3065,23 +3102,33 @@ bool D3D12CommandProcessor::IssueCopy() {
   if (!BeginSubmission(true)) {
     return false;
   }
-  uint32_t written_address, written_length;
-  if (!render_target_cache_->Resolve(*memory_, *shared_memory_, *texture_cache_,
-                                     written_address, written_length)) {
-    return false;
+
+  if (!cvars::d3d12_readback_resolve) {
+    uint32_t written_address, written_length;
+    return render_target_cache_->Resolve(*memory_, *shared_memory_,
+                                         *texture_cache_, written_address,
+                                         written_length);
+  } else {
+    return IssueCopy_ReadbackResolvePath();
   }
-  if (cvars::d3d12_readback_resolve &&
-      !texture_cache_->IsDrawResolutionScaled() && written_length) {
-    // Read the resolved data on the CPU.
-    ID3D12Resource* readback_buffer = RequestReadbackBuffer(written_length);
-    if (readback_buffer != nullptr) {
-      shared_memory_->UseAsCopySource();
-      SubmitBarriers();
-      ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
-      deferred_command_list_.D3DCopyBufferRegion(
-          readback_buffer, 0, shared_memory_buffer, written_address,
-          written_length);
-      if (AwaitAllQueueOperationsCompletion()) {
+  return true;
+}
+XE_NOINLINE
+bool D3D12CommandProcessor::IssueCopy_ReadbackResolvePath() {
+  uint32_t written_address, written_length;
+  if (render_target_cache_->Resolve(*memory_, *shared_memory_, *texture_cache_,
+                                    written_address, written_length)) {
+    if (!texture_cache_->IsDrawResolutionScaled() && written_length) {
+      // Read the resolved data on the CPU.
+      ID3D12Resource* readback_buffer = RequestReadbackBuffer(written_length);
+      if (readback_buffer != nullptr) {
+        shared_memory_->UseAsCopySource();
+        SubmitBarriers();
+        ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
+        deferred_command_list_.D3DCopyBufferRegion(
+            readback_buffer, 0, shared_memory_buffer, written_address,
+            written_length);
+        if (AwaitAllQueueOperationsCompletion()) {
 #if 1
           D3D12_RANGE readback_range;
           readback_range.Begin = 0;
@@ -3099,23 +3146,25 @@ bool D3D12CommandProcessor::IssueCopy() {
           }
 #else
-        dma::XeDMAJob job{};
-        job.destination = memory_->TranslatePhysical(written_address);
-        job.size = written_length;
-        job.source = nullptr;
-        job.userdata1 = (void*)readback_buffer;
-        job.precall = DmaPrefunc;
-        job.postcall = DmaPostfunc;
+          dma::XeDMAJob job{};
+          job.destination = memory_->TranslatePhysical(written_address);
+          job.size = written_length;
+          job.source = nullptr;
+          job.userdata1 = (void*)readback_buffer;
+          job.precall = DmaPrefunc;
+          job.postcall = DmaPostfunc;
 
-        readback_available_ = GetDMAC()->PushDMAJob(&job);
+          readback_available_ = GetDMAC()->PushDMAJob(&job);
 #endif
+        }
       }
     }
+  } else {
+    return false;
   }
   return true;
 }
-
 void D3D12CommandProcessor::CheckSubmissionFence(uint64_t await_submission) {
   if (await_submission >= submission_current_) {
     if (submission_open_) {
@@ -4707,195 +4756,11 @@ bool D3D12CommandProcessor::UpdateBindings(
           ~(1u << kRootParameter_Bindless_DescriptorIndicesPixel);
     }
   } else {
-    //
-    // Bindful descriptors path.
-    //
-
-    // See what descriptors need to be updated.
-    // Samplers have already been checked.
-    bool write_textures_vertex =
-        texture_count_vertex &&
-        (!bindful_textures_written_vertex_ ||
-         current_texture_layout_uid_vertex_ != texture_layout_uid_vertex ||
-         !texture_cache_->AreActiveTextureSRVKeysUpToDate(
-             current_texture_srv_keys_vertex_.data(), textures_vertex.data(),
-             texture_count_vertex));
-    bool write_textures_pixel =
-        texture_count_pixel &&
-        (!bindful_textures_written_pixel_ ||
-         current_texture_layout_uid_pixel_ != texture_layout_uid_pixel ||
-         !texture_cache_->AreActiveTextureSRVKeysUpToDate(
-             current_texture_srv_keys_pixel_.data(), textures_pixel->data(),
-             texture_count_pixel));
-    bool write_samplers_vertex =
-        sampler_count_vertex && !bindful_samplers_written_vertex_;
-    bool write_samplers_pixel =
-        sampler_count_pixel && !bindful_samplers_written_pixel_;
-    bool edram_rov_used = render_target_cache_->GetPath() ==
-                          RenderTargetCache::Path::kPixelShaderInterlock;
-
-    // Allocate the descriptors.
-    size_t view_count_partial_update = 0;
-    if (write_textures_vertex) {
-      view_count_partial_update += texture_count_vertex;
-    }
-    if (write_textures_pixel) {
-      view_count_partial_update += texture_count_pixel;
-    }
-    // All the constants + shared memory SRV and UAV + textures.
-    size_t view_count_full_update =
-        2 + texture_count_vertex + texture_count_pixel;
-    if (edram_rov_used) {
-      // + EDRAM UAV.
-      ++view_count_full_update;
-    }
-    D3D12_CPU_DESCRIPTOR_HANDLE view_cpu_handle;
-    D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle;
-    uint32_t descriptor_size_view = provider.GetViewDescriptorSize();
-    uint64_t view_heap_index = RequestViewBindfulDescriptors(
-        draw_view_bindful_heap_index_, uint32_t(view_count_partial_update),
-        uint32_t(view_count_full_update), view_cpu_handle, view_gpu_handle);
-    if (view_heap_index ==
-        ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) {
-      XELOGE("Failed to allocate view descriptors");
-      return false;
-    }
-    size_t sampler_count_partial_update = 0;
-    if (write_samplers_vertex) {
-      sampler_count_partial_update += sampler_count_vertex;
-    }
-    if (write_samplers_pixel) {
-      sampler_count_partial_update += sampler_count_pixel;
-    }
-    D3D12_CPU_DESCRIPTOR_HANDLE sampler_cpu_handle = {};
-    D3D12_GPU_DESCRIPTOR_HANDLE sampler_gpu_handle = {};
-    uint32_t descriptor_size_sampler = provider.GetSamplerDescriptorSize();
-    uint64_t sampler_heap_index =
-        ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid;
-    if (sampler_count_vertex != 0 || sampler_count_pixel != 0) {
-      sampler_heap_index = RequestSamplerBindfulDescriptors(
-          draw_sampler_bindful_heap_index_,
-          uint32_t(sampler_count_partial_update),
-          uint32_t(sampler_count_vertex + sampler_count_pixel),
-          sampler_cpu_handle, sampler_gpu_handle);
-      if (sampler_heap_index ==
-          ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) {
-        XELOGE("Failed to allocate sampler descriptors");
-        return false;
-      }
-    }
-    if (draw_view_bindful_heap_index_ != view_heap_index) {
-      // Need to update all view descriptors.
-      write_textures_vertex = texture_count_vertex != 0;
-      write_textures_pixel = texture_count_pixel != 0;
-      bindful_textures_written_vertex_ = false;
-      bindful_textures_written_pixel_ = false;
-      // If updating fully, write the shared memory SRV and UAV descriptors and,
-      // if needed, the EDRAM descriptor.
-      gpu_handle_shared_memory_and_edram_ = view_gpu_handle;
-      shared_memory_->WriteRawSRVDescriptor(view_cpu_handle);
-      view_cpu_handle.ptr += descriptor_size_view;
-      view_gpu_handle.ptr += descriptor_size_view;
-      shared_memory_->WriteRawUAVDescriptor(view_cpu_handle);
-      view_cpu_handle.ptr += descriptor_size_view;
-      view_gpu_handle.ptr += descriptor_size_view;
-      if (edram_rov_used) {
-        render_target_cache_->WriteEdramUintPow2UAVDescriptor(view_cpu_handle,
-                                                              2);
-        view_cpu_handle.ptr += descriptor_size_view;
-        view_gpu_handle.ptr += descriptor_size_view;
-      }
-      current_graphics_root_up_to_date_ &=
-          ~(1u << kRootParameter_Bindful_SharedMemoryAndEdram);
-    }
-    if (sampler_heap_index !=
-            ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid &&
-        draw_sampler_bindful_heap_index_ != sampler_heap_index) {
-      write_samplers_vertex = sampler_count_vertex != 0;
-      write_samplers_pixel = sampler_count_pixel != 0;
-      bindful_samplers_written_vertex_ = false;
-      bindful_samplers_written_pixel_ = false;
-    }
-
-    // Write the descriptors.
-    if (write_textures_vertex) {
-      assert_true(current_graphics_root_bindful_extras_.textures_vertex !=
                  RootBindfulExtraParameterIndices::kUnavailable);
-      gpu_handle_textures_vertex_ = view_gpu_handle;
-      for (size_t i = 0; i < texture_count_vertex; ++i) {
-        texture_cache_->WriteActiveTextureBindfulSRV(textures_vertex[i],
-                                                     view_cpu_handle);
-        view_cpu_handle.ptr += descriptor_size_view;
-        view_gpu_handle.ptr += descriptor_size_view;
-      }
-      current_texture_layout_uid_vertex_ = texture_layout_uid_vertex;
-      current_texture_srv_keys_vertex_.resize(
-          std::max(current_texture_srv_keys_vertex_.size(),
-                   size_t(texture_count_vertex)));
-      texture_cache_->WriteActiveTextureSRVKeys(
-          current_texture_srv_keys_vertex_.data(), textures_vertex.data(),
-          texture_count_vertex);
-      bindful_textures_written_vertex_ = true;
-      current_graphics_root_up_to_date_ &=
-          ~(1u << current_graphics_root_bindful_extras_.textures_vertex);
-    }
-    if (write_textures_pixel) {
-      assert_true(current_graphics_root_bindful_extras_.textures_pixel !=
-                  RootBindfulExtraParameterIndices::kUnavailable);
-      gpu_handle_textures_pixel_ = view_gpu_handle;
-      for (size_t i = 0; i < texture_count_pixel; ++i) {
-        texture_cache_->WriteActiveTextureBindfulSRV((*textures_pixel)[i],
-                                                     view_cpu_handle);
-        view_cpu_handle.ptr += descriptor_size_view;
-        view_gpu_handle.ptr += descriptor_size_view;
-      }
-      current_texture_layout_uid_pixel_ = texture_layout_uid_pixel;
-      current_texture_srv_keys_pixel_.resize(std::max(
-          current_texture_srv_keys_pixel_.size(), size_t(texture_count_pixel)));
-      texture_cache_->WriteActiveTextureSRVKeys(
-          current_texture_srv_keys_pixel_.data(), textures_pixel->data(),
-          texture_count_pixel);
-      bindful_textures_written_pixel_ = true;
-      current_graphics_root_up_to_date_ &=
-          ~(1u << current_graphics_root_bindful_extras_.textures_pixel);
-    }
-    if (write_samplers_vertex) {
-      assert_true(current_graphics_root_bindful_extras_.samplers_vertex !=
-                  RootBindfulExtraParameterIndices::kUnavailable);
-      gpu_handle_samplers_vertex_ = sampler_gpu_handle;
-      for (size_t i = 0; i < sampler_count_vertex; ++i) {
-        texture_cache_->WriteSampler(current_samplers_vertex_[i],
-                                     sampler_cpu_handle);
-        sampler_cpu_handle.ptr += descriptor_size_sampler;
-        sampler_gpu_handle.ptr += descriptor_size_sampler;
-      }
-      // Current samplers have already been updated.
-      bindful_samplers_written_vertex_ = true;
-      current_graphics_root_up_to_date_ &=
-          ~(1u << current_graphics_root_bindful_extras_.samplers_vertex);
-    }
-    if (write_samplers_pixel) {
-      assert_true(current_graphics_root_bindful_extras_.samplers_pixel !=
-                  RootBindfulExtraParameterIndices::kUnavailable);
-      gpu_handle_samplers_pixel_ = sampler_gpu_handle;
-      for (size_t i = 0; i < sampler_count_pixel; ++i) {
-        texture_cache_->WriteSampler(current_samplers_pixel_[i],
-                                     sampler_cpu_handle);
-        sampler_cpu_handle.ptr += descriptor_size_sampler;
-        sampler_gpu_handle.ptr += descriptor_size_sampler;
-      }
-      // Current samplers have already been updated.
-      bindful_samplers_written_pixel_ = true;
-      current_graphics_root_up_to_date_ &=
-          ~(1u << current_graphics_root_bindful_extras_.samplers_pixel);
-    }
-
-    // Wrote new descriptors on the current page.
-    draw_view_bindful_heap_index_ = view_heap_index;
-    if (sampler_heap_index !=
-        ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) {
-      draw_sampler_bindful_heap_index_ = sampler_heap_index;
-    }
+    bool retflag;
+    bool retval = UpdateBindings_BindfulPath(
+        texture_layout_uid_vertex, textures_vertex, texture_layout_uid_pixel,
+        textures_pixel, sampler_count_vertex, sampler_count_pixel, retflag);
+    if (retflag) return retval;
   }
 
   // Update the root parameters.
@@ -4967,47 +4832,255 @@ bool D3D12CommandProcessor::UpdateBindings(
                                        << kRootParameter_Bindless_ViewHeap;
     }
   } else {
-    if (!(current_graphics_root_up_to_date_ &
-          (1u << kRootParameter_Bindful_SharedMemoryAndEdram))) {
-      deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
-          kRootParameter_Bindful_SharedMemoryAndEdram,
-          gpu_handle_shared_memory_and_edram_);
-      current_graphics_root_up_to_date_ |=
-          1u << kRootParameter_Bindful_SharedMemoryAndEdram;
-    }
-    uint32_t extra_index;
-    extra_index = current_graphics_root_bindful_extras_.textures_pixel;
-    if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
-        !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
-      deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
-          extra_index, gpu_handle_textures_pixel_);
-      current_graphics_root_up_to_date_ |= 1u << extra_index;
-    }
-    extra_index = current_graphics_root_bindful_extras_.samplers_pixel;
-    if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
-        !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
-      deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
-          extra_index, gpu_handle_samplers_pixel_);
-      current_graphics_root_up_to_date_ |= 1u << extra_index;
-    }
-    extra_index = current_graphics_root_bindful_extras_.textures_vertex;
-    if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
-        !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
-      deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
-          extra_index, gpu_handle_textures_vertex_);
-      current_graphics_root_up_to_date_ |= 1u << extra_index;
-    }
-    extra_index = current_graphics_root_bindful_extras_.samplers_vertex;
-    if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
-        !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
-      deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
-          extra_index, gpu_handle_samplers_vertex_);
-      current_graphics_root_up_to_date_ |= 1u << extra_index;
-    }
+    UpdateBindings_UpdateRootBindful();
   }
 
   return true;
 }
+XE_COLD
+XE_NOINLINE
+void D3D12CommandProcessor::UpdateBindings_UpdateRootBindful() {
+  if (!(current_graphics_root_up_to_date_ &
+        (1u << kRootParameter_Bindful_SharedMemoryAndEdram))) {
+    deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
+        kRootParameter_Bindful_SharedMemoryAndEdram,
+        gpu_handle_shared_memory_and_edram_);
+    current_graphics_root_up_to_date_ |=
+        1u << kRootParameter_Bindful_SharedMemoryAndEdram;
+  }
+  uint32_t extra_index;
+  extra_index = current_graphics_root_bindful_extras_.textures_pixel;
+  if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
+      !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
+    deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
+        extra_index, gpu_handle_textures_pixel_);
+    current_graphics_root_up_to_date_ |= 1u << extra_index;
+  }
+  extra_index = current_graphics_root_bindful_extras_.samplers_pixel;
+  if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
+      !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
+    deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
+        extra_index, gpu_handle_samplers_pixel_);
+    current_graphics_root_up_to_date_ |= 1u << extra_index;
+  }
+  extra_index = current_graphics_root_bindful_extras_.textures_vertex;
+  if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
+      !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
+    deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
+        extra_index, gpu_handle_textures_vertex_);
+    current_graphics_root_up_to_date_ |= 1u << extra_index;
+  }
+  extra_index = current_graphics_root_bindful_extras_.samplers_vertex;
+  if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
+      !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
+    deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
+        extra_index, gpu_handle_samplers_vertex_);
+    current_graphics_root_up_to_date_ |= 1u << extra_index;
+  }
+}
+XE_NOINLINE
+XE_COLD
+bool D3D12CommandProcessor::UpdateBindings_BindfulPath(
+    const size_t texture_layout_uid_vertex,
+    const std::vector<DxbcShader::TextureBinding>& textures_vertex,
+    const size_t texture_layout_uid_pixel,
+    const std::vector<DxbcShader::TextureBinding>* textures_pixel,
+    const size_t sampler_count_vertex, const size_t sampler_count_pixel,
+    bool& retflag) {
+  retflag = true;
+  auto& provider = this->GetD3D12Provider();
+  size_t texture_count_pixel = textures_pixel->size();
+  size_t texture_count_vertex = textures_vertex.size();
+  //
+  // Bindful descriptors path.
+  //
+
+  // See what descriptors need to be updated.
+  // Samplers have already been checked.
+  bool write_textures_vertex =
+      texture_count_vertex &&
+      (!bindful_textures_written_vertex_ ||
+       current_texture_layout_uid_vertex_ != texture_layout_uid_vertex ||
+       !texture_cache_->AreActiveTextureSRVKeysUpToDate(
+           current_texture_srv_keys_vertex_.data(), textures_vertex.data(),
+           texture_count_vertex));
+  bool write_textures_pixel =
+      texture_count_pixel &&
+      (!bindful_textures_written_pixel_ ||
+       current_texture_layout_uid_pixel_ != texture_layout_uid_pixel ||
+       !texture_cache_->AreActiveTextureSRVKeysUpToDate(
+           current_texture_srv_keys_pixel_.data(), textures_pixel->data(),
+           texture_count_pixel));
+  bool write_samplers_vertex =
+      sampler_count_vertex && !bindful_samplers_written_vertex_;
+  bool write_samplers_pixel =
+      sampler_count_pixel && !bindful_samplers_written_pixel_;
+  bool edram_rov_used = render_target_cache_->GetPath() ==
+                        RenderTargetCache::Path::kPixelShaderInterlock;
+
+  // Allocate the descriptors.
+  size_t view_count_partial_update = 0;
+  if (write_textures_vertex) {
+    view_count_partial_update += texture_count_vertex;
+  }
+  if (write_textures_pixel) {
+    view_count_partial_update += texture_count_pixel;
+  }
+  // All the constants + shared memory SRV and UAV + textures.
+  size_t view_count_full_update =
+      2 + texture_count_vertex + texture_count_pixel;
+  if (edram_rov_used) {
+    // + EDRAM UAV.
+ ++view_count_full_update; + } + D3D12_CPU_DESCRIPTOR_HANDLE view_cpu_handle; + D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle; + uint32_t descriptor_size_view = provider.GetViewDescriptorSize(); + uint64_t view_heap_index = RequestViewBindfulDescriptors( + draw_view_bindful_heap_index_, uint32_t(view_count_partial_update), + uint32_t(view_count_full_update), view_cpu_handle, view_gpu_handle); + if (view_heap_index == + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { + XELOGE("Failed to allocate view descriptors"); + return false; + } + size_t sampler_count_partial_update = 0; + if (write_samplers_vertex) { + sampler_count_partial_update += sampler_count_vertex; + } + if (write_samplers_pixel) { + sampler_count_partial_update += sampler_count_pixel; + } + D3D12_CPU_DESCRIPTOR_HANDLE sampler_cpu_handle = {}; + D3D12_GPU_DESCRIPTOR_HANDLE sampler_gpu_handle = {}; + uint32_t descriptor_size_sampler = provider.GetSamplerDescriptorSize(); + uint64_t sampler_heap_index = + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid; + if (sampler_count_vertex != 0 || sampler_count_pixel != 0) { + sampler_heap_index = RequestSamplerBindfulDescriptors( + draw_sampler_bindful_heap_index_, + uint32_t(sampler_count_partial_update), + uint32_t(sampler_count_vertex + sampler_count_pixel), + sampler_cpu_handle, sampler_gpu_handle); + if (sampler_heap_index == + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { + XELOGE("Failed to allocate sampler descriptors"); + return false; + } + } + if (draw_view_bindful_heap_index_ != view_heap_index) { + // Need to update all view descriptors. + write_textures_vertex = texture_count_vertex != 0; + write_textures_pixel = texture_count_pixel != 0; + bindful_textures_written_vertex_ = false; + bindful_textures_written_pixel_ = false; + // If updating fully, write the shared memory SRV and UAV descriptors and, + // if needed, the EDRAM descriptor. + gpu_handle_shared_memory_and_edram_ = view_gpu_handle; + shared_memory_->WriteRawSRVDescriptor(view_cpu_handle); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + shared_memory_->WriteRawUAVDescriptor(view_cpu_handle); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + if (edram_rov_used) { + render_target_cache_->WriteEdramUintPow2UAVDescriptor(view_cpu_handle, 2); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + } + current_graphics_root_up_to_date_ &= + ~(1u << kRootParameter_Bindful_SharedMemoryAndEdram); + } + if (sampler_heap_index != + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid && + draw_sampler_bindful_heap_index_ != sampler_heap_index) { + write_samplers_vertex = sampler_count_vertex != 0; + write_samplers_pixel = sampler_count_pixel != 0; + bindful_samplers_written_vertex_ = false; + bindful_samplers_written_pixel_ = false; + } + + // Write the descriptors. 
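The descriptor writes that follow all use one idiom: a descriptor is written through the CPU handle, then both the CPU and GPU handles advance by the driver-reported increment (descriptor_size_view / descriptor_size_sampler), so the GPU handle recorded for the root table always matches what was just written. A minimal sketch of that lockstep pattern; DescriptorWriter is a hypothetical illustration, not a type in this patch:

#include <cstdint>
#include <d3d12.h>

struct DescriptorWriter {
  D3D12_CPU_DESCRIPTOR_HANDLE cpu;  // staging location descriptors are written to
  D3D12_GPU_DESCRIPTOR_HANDLE gpu;  // address later bound to the root table
  uint32_t stride;                  // e.g. provider.GetViewDescriptorSize()
  void Advance() {
    // Both views of the heap cursor move together, one descriptor at a time.
    cpu.ptr += stride;
    gpu.ptr += stride;
  }
};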
+ if (write_textures_vertex) { + assert_true(current_graphics_root_bindful_extras_.textures_vertex != + RootBindfulExtraParameterIndices::kUnavailable); + gpu_handle_textures_vertex_ = view_gpu_handle; + for (size_t i = 0; i < texture_count_vertex; ++i) { + texture_cache_->WriteActiveTextureBindfulSRV(textures_vertex[i], + view_cpu_handle); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + } + current_texture_layout_uid_vertex_ = texture_layout_uid_vertex; + current_texture_srv_keys_vertex_.resize(std::max( + current_texture_srv_keys_vertex_.size(), size_t(texture_count_vertex))); + texture_cache_->WriteActiveTextureSRVKeys( + current_texture_srv_keys_vertex_.data(), textures_vertex.data(), + texture_count_vertex); + bindful_textures_written_vertex_ = true; + current_graphics_root_up_to_date_ &= + ~(1u << current_graphics_root_bindful_extras_.textures_vertex); + } + if (write_textures_pixel) { + assert_true(current_graphics_root_bindful_extras_.textures_pixel != + RootBindfulExtraParameterIndices::kUnavailable); + gpu_handle_textures_pixel_ = view_gpu_handle; + for (size_t i = 0; i < texture_count_pixel; ++i) { + texture_cache_->WriteActiveTextureBindfulSRV((*textures_pixel)[i], + view_cpu_handle); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + } + current_texture_layout_uid_pixel_ = texture_layout_uid_pixel; + current_texture_srv_keys_pixel_.resize(std::max( + current_texture_srv_keys_pixel_.size(), size_t(texture_count_pixel))); + texture_cache_->WriteActiveTextureSRVKeys( + current_texture_srv_keys_pixel_.data(), textures_pixel->data(), + texture_count_pixel); + bindful_textures_written_pixel_ = true; + current_graphics_root_up_to_date_ &= + ~(1u << current_graphics_root_bindful_extras_.textures_pixel); + } + if (write_samplers_vertex) { + assert_true(current_graphics_root_bindful_extras_.samplers_vertex != + RootBindfulExtraParameterIndices::kUnavailable); + gpu_handle_samplers_vertex_ = sampler_gpu_handle; + for (size_t i = 0; i < sampler_count_vertex; ++i) { + texture_cache_->WriteSampler(current_samplers_vertex_[i], + sampler_cpu_handle); + sampler_cpu_handle.ptr += descriptor_size_sampler; + sampler_gpu_handle.ptr += descriptor_size_sampler; + } + // Current samplers have already been updated. + bindful_samplers_written_vertex_ = true; + current_graphics_root_up_to_date_ &= + ~(1u << current_graphics_root_bindful_extras_.samplers_vertex); + } + if (write_samplers_pixel) { + assert_true(current_graphics_root_bindful_extras_.samplers_pixel != + RootBindfulExtraParameterIndices::kUnavailable); + gpu_handle_samplers_pixel_ = sampler_gpu_handle; + for (size_t i = 0; i < sampler_count_pixel; ++i) { + texture_cache_->WriteSampler(current_samplers_pixel_[i], + sampler_cpu_handle); + sampler_cpu_handle.ptr += descriptor_size_sampler; + sampler_gpu_handle.ptr += descriptor_size_sampler; + } + // Current samplers have already been updated. + bindful_samplers_written_pixel_ = true; + current_graphics_root_up_to_date_ &= + ~(1u << current_graphics_root_bindful_extras_.samplers_pixel); + } + + // Wrote new descriptors on the current page. 
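Each current_graphics_root_up_to_date_ &= ~(1u << ...) in the writes above is the invalidation half of a plain dirty-bit scheme over root parameters; UpdateBindings_UpdateRootBindful() is the flush half, testing and re-setting the same bits before touching the command list. The idiom in isolation, with hypothetical helper names:

#include <cstdint>

// Bit i set => root parameter i on the command list already matches our state.
inline void InvalidateRootParam(uint32_t& up_to_date, uint32_t index) {
  up_to_date &= ~(1u << index);  // forces a rebind on the next flush
}
inline bool NeedsRebind(uint32_t up_to_date, uint32_t index) {
  return (up_to_date & (1u << index)) == 0;
}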
+ draw_view_bindful_heap_index_ = view_heap_index; + if (sampler_heap_index != + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { + draw_sampler_bindful_heap_index_ = sampler_heap_index; + } + retflag = false; + return {}; +} uint32_t D3D12CommandProcessor::GetSupportedMemExportFormatSize( xenos::ColorFormat format) { @@ -5043,7 +5116,7 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { if (size == 0) { return nullptr; } -#if 0 +#if 1 if (readback_available_) { GetDMAC()->WaitJobDone(readback_available_); readback_available_ = 0; diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 37d048d29..ba2c17a82 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -45,7 +45,10 @@ namespace xe { namespace gpu { namespace d3d12 { - +struct MemExportRange { + uint32_t base_address_dwords; + uint32_t size_dwords; +}; class D3D12CommandProcessor final : public CommandProcessor { public: #include "../pm4_command_processor_declare.h" @@ -287,8 +290,21 @@ class D3D12CommandProcessor final : public CommandProcessor { bool IssueDraw(xenos::PrimitiveType primitive_type, uint32_t index_count, IndexBufferInfo* index_buffer_info, bool major_mode_explicit) override; + XE_COLD + XE_NOINLINE + bool HandleMemexportGuestDMA(ID3D12Resource*& scratch_index_buffer, + D3D12_INDEX_BUFFER_VIEW& index_buffer_view, + uint32_t guest_index_base, + bool& retflag); + XE_NOINLINE + XE_COLD + bool GatherMemexportRangesAndMakeResident(bool& retflag); + XE_NOINLINE + XE_COLD + void HandleMemexportDrawOrdering_AndReadback(); bool IssueCopy() override; - + XE_NOINLINE + bool IssueCopy_ReadbackResolvePath(); void InitializeTrace() override; private: @@ -363,6 +379,8 @@ class D3D12CommandProcessor final : public CommandProcessor { }; // Gets the indices of optional root parameters. Returns the total parameter // count. 
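UpdateBindings_BindfulPath() above illustrates the outlining pattern this patch applies in several places: a rarely-taken, register-heavy path is moved into an XE_NOINLINE/XE_COLD function, and a retflag out-parameter tells the hot caller whether the outlined code already decided the function's return value (the descriptor-allocation failures returning false) or whether the caller should fall through. A reduced sketch of the shape, with hypothetical names:

XE_NOINLINE
XE_COLD
static bool ColdPath(bool& retflag) {
  retflag = true;            // assume we produce the caller's return value
  if (/* e.g. allocation failed */ false) {
    return false;            // caller returns this immediately
  }
  retflag = false;           // caller should continue on the hot path
  return {};                 // value ignored when retflag is false
}

bool HotFunction() {
  bool retflag;
  bool retval = ColdPath(retflag);
  if (retflag) return retval;
  // ...hot path continues...
  return true;
}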
+ XE_NOINLINE + XE_COLD static uint32_t GetRootBindfulExtraParameterIndices( const DxbcShader* vertex_shader, const DxbcShader* pixel_shader, RootBindfulExtraParameterIndices& indices_out); @@ -437,6 +455,18 @@ class D3D12CommandProcessor final : public CommandProcessor { bool UpdateBindings(const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader, ID3D12RootSignature* root_signature); + XE_COLD + XE_NOINLINE + void UpdateBindings_UpdateRootBindful(); + XE_NOINLINE + XE_COLD + bool UpdateBindings_BindfulPath( + const size_t texture_layout_uid_vertex, + const std::vector& textures_vertex, + const size_t texture_layout_uid_pixel, + const std::vector* textures_pixel, + const size_t sampler_count_vertex, const size_t sampler_count_pixel, + bool& retflag); // Returns dword count for one element for a memexport format, or 0 if it's // not supported by the D3D12 command processor (if it's smaller that 1 dword, @@ -743,6 +773,9 @@ class D3D12CommandProcessor final : public CommandProcessor { draw_util::GetViewportInfoArgs previous_viewport_info_args_; draw_util::ViewportInfo previous_viewport_info_; + // scratch memexport data + MemExportRange memexport_ranges_[512]; + uint32_t memexport_range_count_ = 0; }; } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/deferred_command_list.cc b/src/xenia/gpu/d3d12/deferred_command_list.cc index c27c8b226..0d647331f 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.cc +++ b/src/xenia/gpu/d3d12/deferred_command_list.cc @@ -266,22 +266,9 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, void* DeferredCommandList::WriteCommand(Command command, size_t arguments_size_bytes) { - size_t arguments_size_elements = round_up(arguments_size_bytes, sizeof(uintmax_t), false); //(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t); #if 0 - size_t offset = command_stream_.size(); - command_stream_.resize(offset + kCommandHeaderSizeElements + - arguments_size_elements); - CommandHeader& header = - *reinterpret_cast<CommandHeader*>(command_stream_.data() + offset); - header.command = command; - header.arguments_size_elements = uint32_t(arguments_size_elements); - return command_stream_.data() + (offset + kCommandHeaderSizeElements); - #else - size_t offset = command_stream_.size(); constexpr size_t kCommandHeaderSizeBytes = kCommandHeaderSizeElements * sizeof(uintmax_t); @@ -290,9 +277,9 @@ void* DeferredCommandList::WriteCommand(Command command, CommandHeader& header = *reinterpret_cast<CommandHeader*>(command_stream_.data() + offset); header.command = command; - header.arguments_size_elements = uint32_t(arguments_size_elements) / sizeof(uintmax_t); + header.arguments_size_elements = + uint32_t(arguments_size_elements) / sizeof(uintmax_t); return command_stream_.data() + (offset + kCommandHeaderSizeBytes); - #endif } } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index d9914e566..29501b299 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -183,7 +183,7 @@ void PipelineCache::Shutdown() { // creating them. if (!creation_threads_.empty()) { { - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); creation_threads_shutdown_from_ = 0; } creation_request_cond_.notify_all(); @@ -681,7 +681,7 @@ void PipelineCache::InitializeShaderStorage( if (!creation_threads_.empty()) { // Submit the pipeline for creation to any available thread.
{ - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); creation_queue_.push_back(new_pipeline); } creation_request_cond_.notify_one(); @@ -695,7 +695,7 @@ void PipelineCache::InitializeShaderStorage( CreateQueuedPipelinesOnProcessorThread(); if (creation_threads_.size() > creation_thread_original_count) { { - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); creation_threads_shutdown_from_ = creation_thread_original_count; // Assuming the queue is empty because of // CreateQueuedPipelinesOnProcessorThread. @@ -708,7 +708,7 @@ void PipelineCache::InitializeShaderStorage( bool await_creation_completion_event; { // Cleanup so additional threads can be created later again. - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); creation_threads_shutdown_from_ = SIZE_MAX; // If the invocation is blocking, all the shader storage // initialization is expected to be done before proceeding, to avoid @@ -813,7 +813,7 @@ void PipelineCache::EndSubmission() { // Await creation of all queued pipelines. bool await_creation_completion_event; { - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); // Assuming the creation queue is already empty (because the processor // thread also worked on creating the leftover pipelines), so only check // if there are threads with pipelines currently being created. @@ -834,7 +834,7 @@ bool PipelineCache::IsCreatingPipelines() { if (creation_threads_.empty()) { return false; } - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); return !creation_queue_.empty() || creation_threads_busy_ != 0; } @@ -1076,7 +1076,7 @@ bool PipelineCache::ConfigurePipeline( if (!creation_threads_.empty()) { // Submit the pipeline for creation to any available thread. { - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); creation_queue_.push_back(new_pipeline); } creation_request_cond_.notify_one(); @@ -3314,7 +3314,7 @@ void PipelineCache::CreationThread(size_t thread_index) { // Check if need to shut down or set the completion event and dequeue the // pipeline if there is any. { - std::unique_lock<std::mutex> lock(creation_request_lock_); + std::unique_lock<xe_mutex> lock(creation_request_lock_); if (thread_index >= creation_threads_shutdown_from_ || creation_queue_.empty()) { if (creation_completion_set_event_ && creation_threads_busy_ == 0) { @@ -3345,7 +3345,7 @@ void PipelineCache::CreationThread(size_t thread_index) { // completion event if needed (at the next iteration, or in some other // thread). { - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); --creation_threads_busy_; } } @@ -3356,7 +3356,7 @@ void PipelineCache::CreateQueuedPipelinesOnProcessorThread() { while (true) { Pipeline* pipeline_to_create; { - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); if (creation_queue_.empty()) { break; } diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index 37e73cae4..43e528d35 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -403,8 +403,8 @@ class PipelineCache { // Pipeline creation threads.
void CreationThread(size_t thread_index); void CreateQueuedPipelinesOnProcessorThread(); - std::mutex creation_request_lock_; - std::condition_variable creation_request_cond_; + xe_mutex creation_request_lock_; + std::condition_variable_any creation_request_cond_; // Protected with creation_request_lock_, notify_one creation_request_cond_ // when set. std::deque<Pipeline*> creation_queue_; diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 24b1eefdc..5c62c50c3 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -650,7 +650,8 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs, } return normalized_color_mask; } - +XE_NOINLINE +XE_NOALIAS xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth) { @@ -737,7 +738,7 @@ const ResolveCopyShaderInfo {"Resolve Copy Full 64bpp", true, 2, 4, 5, 3}, {"Resolve Copy Full 128bpp", true, 2, 4, 4, 3}, }; - +XE_MSVC_OPTIMIZE_SMALL() bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, TraceWriter& trace_writer, uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y, @@ -869,7 +870,8 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, y1 = y0 + int32_t(xenos::kMaxResolveSize); } // fails in forza horizon 1 - assert_true(x0 < x1 && y0 < y1); + // x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100 + assert_true(x0 <= x1 && y0 <= y1); if (x0 >= x1 || y0 >= y1) { XELOGE("Resolve region is empty"); return false; @@ -1108,7 +1110,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32; info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32; info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32; - + #if 0 XELOGD( "Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially " "modified memory range 0x{:08X} to 0x{:08X})", @@ -1119,10 +1121,10 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, xenos::ColorRenderTargetFormat(color_edram_info.format)), FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start, copy_dest_extent_end); - + #endif return true; } - +XE_MSVC_OPTIMIZE_REVERT() ResolveCopyShaderIndex ResolveInfo::GetCopyShader( uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y, ResolveCopyShaderConstants& constants_out, uint32_t& group_count_x_out, diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index 420bafcf2..15c014520 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -475,6 +475,8 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA( // To avoid passing values that the shader won't understand (even though // Direct3D 9 shouldn't pass them anyway).
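The condition-variable change in the pipeline_cache.h hunk above is forced by the mutex change next to it: std::condition_variable::wait() only accepts std::unique_lock<std::mutex>, so once creation_request_lock_ becomes an xe_mutex, waiters must use std::condition_variable_any, which works with any BasicLockable type. A sketch of the resulting wait shape, assuming xe_mutex provides lock()/unlock(); the other names are hypothetical:

#include <condition_variable>
#include <deque>
#include <mutex>

xe_mutex queue_lock;
std::condition_variable_any queue_cond;
std::deque<int> work_queue;

void ConsumerWait() {
  std::unique_lock<xe_mutex> lock(queue_lock);
  // wait() unlocks queue_lock while sleeping and relocks before returning.
  queue_cond.wait(lock, [] { return !work_queue.empty(); });
  work_queue.pop_front();
}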
+XE_NOINLINE +XE_NOALIAS xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth); diff --git a/src/xenia/gpu/pm4_command_processor_implement.h b/src/xenia/gpu/pm4_command_processor_implement.h index 53b81b888..1c877a9ab 100644 --- a/src/xenia/gpu/pm4_command_processor_implement.h +++ b/src/xenia/gpu/pm4_command_processor_implement.h @@ -14,6 +14,11 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr, new (&reader_) RingBuffer(memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)); reader_.set_write_offset(count * sizeof(uint32_t)); + // prefetch the wraparound range + // it likely is already in L3 cache, but on a Zen system it may be in + // another chiplet's L3 + reader_.BeginPrefetchedRead( + COMMAND_PROCESSOR::GetCurrentRingReadCount()); do { if (COMMAND_PROCESSOR::ExecutePacket()) { continue; } @@ -30,11 +35,6 @@ } bool COMMAND_PROCESSOR::ExecutePacket() { - // prefetch the wraparound range - // it likely is already in L3 cache, but in a zen system it may be another - // chiplets l3 - reader_.BeginPrefetchedRead( - COMMAND_PROCESSOR::GetCurrentRingReadCount()); const uint32_t packet = reader_.ReadAndSwap<uint32_t>(); const uint32_t packet_type = packet >> 30; @@ -495,7 +495,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM( } else { xe::threading::Sleep(std::chrono::milliseconds(wait / 0x100)); } - xe::threading::SyncMemory(); + // xe::threading::SyncMemory(); ReturnFromWait(); if (!worker_running_) { @@ -599,27 +599,28 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_COND_WRITE( value = register_file_->values[poll_reg_addr].u32; } bool matched = false; + value &= mask; switch (wait_info & 0x7) { case 0x0: // Never. matched = false; break; case 0x1: // Less than reference. - matched = (value & mask) < ref; + matched = value < ref; break; case 0x2: // Less than or equal to reference. - matched = (value & mask) <= ref; + matched = value <= ref; break; case 0x3: // Equal to reference. - matched = (value & mask) == ref; + matched = value == ref; break; case 0x4: // Not equal to reference. - matched = (value & mask) != ref; + matched = value != ref; break; case 0x5: // Greater than or equal to reference. - matched = (value & mask) >= ref; + matched = value >= ref; break; case 0x6: // Greater than reference. - matched = (value & mask) > ref; + matched = value > ref; break; case 0x7: // Always matched = true; @@ -1064,7 +1065,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE( assert_true(count - 2 >= size_dwords); auto shader = COMMAND_PROCESSOR::LoadShader( shader_type, uint32_t(reader_.read_ptr()), - reinterpret_cast(reader_.read_ptr()), size_dwords); + reinterpret_cast(reader_.read_ptr()), size_dwords); switch (shader_type) { case xenos::ShaderType::kVertex: active_vertex_shader_ = shader; diff --git a/src/xenia/gpu/primitive_processor.h b/src/xenia/gpu/primitive_processor.h index aac84885d..7c2e96e3b 100644 --- a/src/xenia/gpu/primitive_processor.h +++ b/src/xenia/gpu/primitive_processor.h @@ -430,7 +430,7 @@ class PrimitiveProcessor { --count; uint32_t index = *(source++) & low_bits_mask_guest_endian; *(dest++) = index != reset_index_guest_endian - ? 
xenos::GpuSwapInline(index, HostSwap) : UINT32_MAX; } if (count >= kSimdVectorU32Elements) { @@ -442,10 +442,10 @@ class PrimitiveProcessor { __m128i host_swap_shuffle; if constexpr (HostSwap != xenos::Endian::kNone) { host_swap_shuffle = _mm_set_epi32( - int32_t(xenos::GpuSwap(uint32_t(0x0F0E0D0C), HostSwap)), - int32_t(xenos::GpuSwap(uint32_t(0x0B0A0908), HostSwap)), - int32_t(xenos::GpuSwap(uint32_t(0x07060504), HostSwap)), - int32_t(xenos::GpuSwap(uint32_t(0x03020100), HostSwap))); + int32_t(xenos::GpuSwapInline(uint32_t(0x0F0E0D0C), HostSwap)), + int32_t(xenos::GpuSwapInline(uint32_t(0x0B0A0908), HostSwap)), + int32_t(xenos::GpuSwapInline(uint32_t(0x07060504), HostSwap)), + int32_t(xenos::GpuSwapInline(uint32_t(0x03020100), HostSwap))); } #endif // XE_ARCH_AMD64 while (count >= kSimdVectorU32Elements) { @@ -490,7 +490,7 @@ class PrimitiveProcessor { while (count--) { uint32_t index = *(source++) & low_bits_mask_guest_endian; *(dest++) = index != reset_index_guest_endian - ? xenos::GpuSwap(index, HostSwap) + ? xenos::GpuSwapInline(index, HostSwap) : UINT32_MAX; } } @@ -510,19 +510,19 @@ class PrimitiveProcessor { }; struct To24Swapping8In16IndexTransform { uint32_t operator()(uint32_t index) const { - return xenos::GpuSwap(index, xenos::Endian::k8in16) & + return xenos::GpuSwapInline(index, xenos::Endian::k8in16) & xenos::kVertexIndexMask; } }; struct To24Swapping8In32IndexTransform { uint32_t operator()(uint32_t index) const { - return xenos::GpuSwap(index, xenos::Endian::k8in32) & + return xenos::GpuSwapInline(index, xenos::Endian::k8in32) & xenos::kVertexIndexMask; } }; struct To24Swapping16In32IndexTransform { uint32_t operator()(uint32_t index) const { - return xenos::GpuSwap(index, xenos::Endian::k16in32) & + return xenos::GpuSwapInline(index, xenos::Endian::k16in32) & xenos::kVertexIndexMask; } }; diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc index ffd77246e..38a8c54e9 100644 --- a/src/xenia/gpu/shared_memory.cc +++ b/src/xenia/gpu/shared_memory.cc @@ -388,6 +388,7 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length, bool any_data_resolved = false; uint32_t block_first = page_first >> 6; + swcache::PrefetchL1(&system_page_flags_[block_first]); uint32_t block_last = page_last >> 6; uint32_t range_start = UINT32_MAX; diff --git a/src/xenia/gpu/texture_util.cc b/src/xenia/gpu/texture_util.cc index b20194a78..cbe6c62bd 100644 --- a/src/xenia/gpu/texture_util.cc +++ b/src/xenia/gpu/texture_util.cc @@ -464,7 +464,8 @@ TextureGuestLayout GetGuestTextureLayout( return layout; } - +XE_NOINLINE +XE_NOALIAS int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch, uint32_t bytes_per_block_log2) { // https://github.com/gildor2/UModel/blob/de8fbd3bc922427ea056b7340202dcdcc19ccff5/Unreal/UnTexture.cpp#L489 @@ -481,7 +482,8 @@ int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch, return ((offset & ~0x1FF) << 3) + ((y & 16) << 7) + ((offset & 0x1C0) << 2) + (((((y & 8) >> 2) + (x >> 3)) & 3) << 6) + (offset & 0x3F); } - +XE_NOINLINE +XE_NOALIAS int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch, uint32_t height, uint32_t bytes_per_block_log2) { // Reconstructed from disassembly of XGRAPHICS::TileVolume. 
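XE_NOALIAS expands to __declspec(noalias) under MSVC (see the platform.h hunk earlier in this patch): it declares that the function neither reads nor writes visible global state except through its pointer parameters. For pure address arithmetic like these tiling helpers, that lets the optimizer treat calls as value computations and fold repeated ones. An illustrative caller, assuming the declaration from this hunk:

XE_NOINLINE
XE_NOALIAS
int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
                         uint32_t bytes_per_block_log2);

int32_t TwiceTheOffset(int32_t x, int32_t y, uint32_t pitch, uint32_t bpb) {
  // With noalias the second identical call may be folded into the first;
  // without it, the compiler must assume the call observes changed state.
  return GetTiledOffset2D(x, y, pitch, bpb) + GetTiledOffset2D(x, y, pitch, bpb);
}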
@@ -509,7 +511,8 @@ int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch, address += offset2 & 63; return address; } - +XE_NOINLINE +XE_NOALIAS uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom, uint32_t pitch, uint32_t bytes_per_block_log2) { @@ -538,7 +541,8 @@ uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom, } return upper_bound; } - +XE_NOINLINE +XE_NOALIAS uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom, uint32_t back, uint32_t pitch, uint32_t height, diff --git a/src/xenia/gpu/texture_util.h b/src/xenia/gpu/texture_util.h index bcc080de3..a6513a0c0 100644 --- a/src/xenia/gpu/texture_util.h +++ b/src/xenia/gpu/texture_util.h @@ -280,8 +280,12 @@ void GetTextureTotalSize(xenos::DataDimension dimension, // bytes_per_block_log2 is log2_floor according to how Direct3D 9 calculates it, // but k_32_32_32 textures are never tiled anyway likely. +XE_NOINLINE +XE_NOALIAS int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch, uint32_t bytes_per_block_log2); +XE_NOINLINE +XE_NOALIAS int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch, uint32_t height, uint32_t bytes_per_block_log2); // Because (0, 0, 0) within each 32x32x4-block tile is stored in memory first, @@ -308,9 +312,13 @@ inline uint32_t GetTiledAddressLowerBound3D(uint32_t left, uint32_t top, // Supporting the right > pitch and bottom > height (in tiles) cases also, for // estimation how far addresses can actually go even potentially beyond the // subresource stride. +XE_NOINLINE +XE_NOALIAS uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom, uint32_t pitch, uint32_t bytes_per_block_log2); +XE_NOINLINE +XE_NOALIAS uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom, uint32_t back, uint32_t pitch, uint32_t height, diff --git a/src/xenia/gpu/xenos.cc b/src/xenia/gpu/xenos.cc index f15c621cd..997e9a48a 100644 --- a/src/xenia/gpu/xenos.cc +++ b/src/xenia/gpu/xenos.cc @@ -125,8 +125,8 @@ float Float7e3To32(uint32_t f10) { // Based on CFloat24 from d3dref9.dll and the 6e4 code from: // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2). - -uint32_t Float32To20e4(float f32, bool round_to_nearest_even) { +XE_NOALIAS +uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept { if (!(f32 > 0.0f)) { // Positive only, and not -0 or NaN. return 0; @@ -150,8 +150,8 @@ uint32_t Float32To20e4(float f32, bool round_to_nearest_even) { } return (f32u32 >> 3) & 0xFFFFFF; } - -float Float20e4To32(uint32_t f24) { +XE_NOALIAS +float Float20e4To32(uint32_t f24) noexcept { f24 &= 0xFFFFFF; if (!f24) { return 0.0f; diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 8c03be479..8e9fd5c11 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -421,10 +421,12 @@ float Float7e3To32(uint32_t f10); // floating-point number. // Converts an IEEE-754 32-bit floating-point number to Xenos floating-point // depth, rounding to the nearest even or towards zero. -uint32_t Float32To20e4(float f32, bool round_to_nearest_even); +XE_NOALIAS +uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept; // Converts Xenos floating-point depth in bits 0:23 (not clamping) to an // IEEE-754 32-bit floating-point number. 
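For reference, 20e4 here means a 4-bit exponent and 20-bit mantissa packed into bits 0:23, an unsigned float format covering [0, 2). A worked packing of 1.0f, consistent with the (f32u32 >> 3) & 0xFFFFFF tail visible above and assuming the conventional rebias constant 0x38000000, i.e. (127 - 15) << 23:

// 1.0f as IEEE-754 bits:               0x3F800000 (exponent 127, mantissa 0)
// rebias from 127 to the 20e4 bias 15:  0x3F800000 - 0x38000000 = 0x07800000
// drop the 3 low mantissa bits:         0x07800000 >> 3         = 0x00F00000
// => 20e4 depth 1.0 is exponent 0xF in bits 20:23 with a zero mantissa.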
-float Float20e4To32(uint32_t f24); +XE_NOALIAS +float Float20e4To32(uint32_t f24) noexcept; // Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit // floating-point number. constexpr float UNorm24To32(uint32_t n24) { @@ -1045,9 +1047,9 @@ inline uint16_t GpuSwap(uint16_t value, Endian endianness) { return value; } } -XE_NOINLINE +XE_FORCEINLINE XE_NOALIAS -static uint32_t GpuSwap(uint32_t value, Endian endianness) { +static uint32_t GpuSwapInline(uint32_t value, Endian endianness) { switch (endianness) { default: case Endian::kNone: @@ -1065,6 +1067,11 @@ static uint32_t GpuSwap(uint32_t value, Endian endianness) { return ((value >> 16) & 0xFFFF) | (value << 16); } } +XE_NOINLINE +XE_NOALIAS +static uint32_t GpuSwap(uint32_t value, Endian endianness) { + return GpuSwapInline(value, endianness); +} inline float GpuSwap(float value, Endian endianness) { union { diff --git a/src/xenia/hid/input_system.cc b/src/xenia/hid/input_system.cc index 588faefe3..a21ce5a7b 100644 --- a/src/xenia/hid/input_system.cc +++ b/src/xenia/hid/input_system.cc @@ -137,8 +137,8 @@ X_INPUT_VIBRATION InputSystem::ModifyVibrationLevel( modified_vibration.right_motor_speed = 0; return modified_vibration; } -std::unique_lock<xe_unlikely_mutex> InputSystem::lock() { - return std::unique_lock<xe_unlikely_mutex>{lock_}; +std::unique_lock<xe_mutex> InputSystem::lock() { + return std::unique_lock<xe_mutex>{lock_}; } } // namespace hid } // namespace xe diff --git a/src/xenia/hid/input_system.h b/src/xenia/hid/input_system.h index 333116499..c294edc64 100644 --- a/src/xenia/hid/input_system.h +++ b/src/xenia/hid/input_system.h @@ -48,7 +48,7 @@ class InputSystem { void UpdateUsedSlot(uint8_t slot, bool connected); uint8_t GetConnectedSlots() const { return connected_slot; } - std::unique_lock<xe_unlikely_mutex> lock(); + std::unique_lock<xe_mutex> lock(); private: xe::ui::Window* window_ = nullptr; @@ -57,7 +57,7 @@ class InputSystem { X_INPUT_VIBRATION ModifyVibrationLevel(X_INPUT_VIBRATION* vibration); uint8_t connected_slot = 0b0001; - xe_unlikely_mutex lock_; + xe_mutex lock_; }; } // namespace hid diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc index 8e66ac683..b5bb6c57b 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc @@ -911,11 +911,17 @@ dword_result_t NtSignalAndWaitForSingleObjectEx_entry(dword_t signal_handle, DECLARE_XBOXKRNL_EXPORT3(NtSignalAndWaitForSingleObjectEx, kThreading, kImplemented, kBlocking, kHighFrequency); +static void PrefetchForCAS(const void* value) { + if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) { + swcache::PrefetchW(value); + } +} + uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) { // XELOGD( // "KfAcquireSpinLock({:08X})", // lock_ptr); - + PrefetchForCAS(lock); // Lock. while (!xe::atomic_cas(0, 1, lock)) { // Spin! @@ -956,6 +962,7 @@ DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented, void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { // Lock. auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); + PrefetchForCAS(lock); while (!xe::atomic_cas(0, 1, lock)) { #if XE_ARCH_AMD64 == 1 // todo: this is just a nop if they don't have SMT, which is not great @@ -973,6 +980,7 @@ DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading, dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { // Lock.
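PrefetchForCAS above exists because a compare-exchange must own its cache line exclusively: PREFETCHW performs the read-for-ownership up front instead of first loading the line shared and then upgrading it inside the atomic. It is gated on a feature flag because PREFETCHW has its own CPUID bit and was long absent on Intel parts. The acquisition pattern, mirroring the spinlock code in this hunk (the wrapper function name is hypothetical):

void AcquireGuestSpinlock(uint32_t* lock) {
  if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
    swcache::PrefetchW(lock);  // request the line in exclusive state
  }
  while (!xe::atomic_cas(0, 1, lock)) {
    // Spin; the RFO already happened, so successful CAS retries are cheaper.
  }
}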
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); + PrefetchForCAS(lock); if (!xe::atomic_cas(0, 1, lock)) { return 0; } diff --git a/src/xenia/kernel/xthread.cc b/src/xenia/kernel/xthread.cc index b842c2c08..084485c66 100644 --- a/src/xenia/kernel/xthread.cc +++ b/src/xenia/kernel/xthread.cc @@ -763,7 +763,8 @@ void XThread::SetActiveCpu(uint8_t cpu_index) { thread_->set_affinity_mask(uint64_t(1) << cpu_index); } } else { - XELOGW("Too few processor cores - scheduling will be wonky"); + // there's no good reason to log this... we don't perfectly emulate the 360's scheduler in any way + // XELOGW("Too few processor cores - scheduling will be wonky"); } } diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc index 16e2b8336..f29eb21dc 100644 --- a/src/xenia/memory.cc +++ b/src/xenia/memory.cc @@ -713,6 +713,8 @@ void BaseHeap::Initialize(Memory* memory, uint8_t* membase, HeapType heap_type, heap_base_ = heap_base; heap_size_ = heap_size; page_size_ = page_size; + xenia_assert(xe::is_pow2(page_size_)); + page_size_shift_ = xe::log2_floor(page_size_); host_address_offset_ = host_address_offset; page_table_.resize(heap_size / page_size); unreserved_page_count_ = uint32_t(page_table_.size()); @@ -1234,14 +1236,14 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect, // fails and returns without modifying the access protection of any pages in // the specified region." - uint32_t start_page_number = (address - heap_base_) / page_size_; + uint32_t start_page_number = (address - heap_base_) >> page_size_shift_; if (start_page_number >= page_table_.size()) { XELOGE("BaseHeap::Protect failed due to out-of-bounds base address {:08X}", address); return false; } uint32_t end_page_number = - uint32_t((uint64_t(address) + size - 1 - heap_base_) / page_size_); + uint32_t((uint64_t(address) + size - 1 - heap_base_) >> page_size_shift_); if (end_page_number >= page_table_.size()) { XELOGE( "BaseHeap::Protect failed due to out-of-bounds range ({:08X} bytes " @@ -1268,17 +1270,21 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect, return false; } } + uint32_t xe_page_size = static_cast<uint32_t>(xe::memory::page_size()); + + uint32_t page_size_mask = xe_page_size - 1; // Attempt host change (hopefully won't fail). // We can only do this if our size matches system page granularity. uint32_t page_count = end_page_number - start_page_number + 1; - if (page_size_ == xe::memory::page_size() || - (((page_count * page_size_) % xe::memory::page_size() == 0) && - ((start_page_number * page_size_) % xe::memory::page_size() == 0))) { + if (page_size_ == xe_page_size || - ((((page_count << page_size_shift_) & page_size_mask) == 0) && + (((start_page_number << page_size_shift_) & page_size_mask) == 0))) { memory::PageAccess old_protect_access; - if (!xe::memory::Protect(TranslateRelative(start_page_number * page_size_), - page_count * page_size_, ToPageAccess(protect), - old_protect ? &old_protect_access : nullptr)) { + if (!xe::memory::Protect( + TranslateRelative(start_page_number << page_size_shift_), + page_count << page_size_shift_, ToPageAccess(protect), + old_protect ? 
&old_protect_access : nullptr)) { XELOGE("BaseHeap::Protect failed due to host VirtualProtect failure"); return false; } @@ -1303,7 +1309,7 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect, bool BaseHeap::QueryRegionInfo(uint32_t base_address, HeapAllocationInfo* out_info) { - uint32_t start_page_number = (base_address - heap_base_) / page_size_; + uint32_t start_page_number = (base_address - heap_base_) >> page_size_shift_; if (start_page_number > page_table_.size()) { XELOGE("BaseHeap::QueryRegionInfo base page out of range"); return false; @@ -1321,9 +1327,10 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address, if (start_page_entry.state) { // Committed/reserved region. out_info->allocation_base = - heap_base_ + start_page_entry.base_address * page_size_; + heap_base_ + (start_page_entry.base_address << page_size_shift_); out_info->allocation_protect = start_page_entry.allocation_protect; - out_info->allocation_size = start_page_entry.region_page_count * page_size_; + out_info->allocation_size = start_page_entry.region_page_count + << page_size_shift_; out_info->state = start_page_entry.state; out_info->protect = start_page_entry.current_protect; @@ -1358,7 +1365,7 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address, } bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) { - uint32_t page_number = (address - heap_base_) / page_size_; + uint32_t page_number = (address - heap_base_) >> page_size_shift_; if (page_number > page_table_.size()) { XELOGE("BaseHeap::QuerySize base page out of range"); *out_size = 0; @@ -1366,12 +1373,12 @@ bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) { } auto global_lock = global_critical_region_.Acquire(); auto page_entry = page_table_[page_number]; - *out_size = (page_entry.region_page_count * page_size_); + *out_size = (page_entry.region_page_count << page_size_shift_); return true; } bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) { - uint32_t page_number = (*in_out_address - heap_base_) / page_size_; + uint32_t page_number = (*in_out_address - heap_base_) >> page_size_shift_; if (page_number > page_table_.size()) { XELOGE("BaseHeap::QuerySize base page out of range"); *out_size = 0; @@ -1379,13 +1386,13 @@ bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) { } auto global_lock = global_critical_region_.Acquire(); auto page_entry = page_table_[page_number]; - *in_out_address = (page_entry.base_address * page_size_); - *out_size = (page_entry.region_page_count * page_size_); + *in_out_address = (page_entry.base_address << page_size_shift_); + *out_size = (page_entry.region_page_count << page_size_shift_); return true; } bool BaseHeap::QueryProtect(uint32_t address, uint32_t* out_protect) { - uint32_t page_number = (address - heap_base_) / page_size_; + uint32_t page_number = (address - heap_base_) >> page_size_shift_; if (page_number > page_table_.size()) { XELOGE("BaseHeap::QueryProtect base page out of range"); *out_protect = 0; @@ -1403,8 +1410,8 @@ xe::memory::PageAccess BaseHeap::QueryRangeAccess(uint32_t low_address, (high_address - heap_base_) >= heap_size_) { return xe::memory::PageAccess::kNoAccess; } - uint32_t low_page_number = (low_address - heap_base_) / page_size_; - uint32_t high_page_number = (high_address - heap_base_) / page_size_; + uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_; + uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_; uint32_t protect = kMemoryProtectRead 
| kMemoryProtectWrite; { auto global_lock = global_critical_region_.Acquire(); @@ -1446,6 +1453,8 @@ void PhysicalHeap::Initialize(Memory* memory, uint8_t* membase, page_size, host_address_offset); parent_heap_ = parent_heap; system_page_size_ = uint32_t(xe::memory::page_size()); + xenia_assert(xe::is_pow2(system_page_size_)); + system_page_shift_ = xe::log2_floor(system_page_size_); system_page_count_ = (size_t(heap_size_) + host_address_offset + (system_page_size_ - 1)) / @@ -1665,10 +1674,11 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, } uint32_t system_page_first = - (heap_relative_address + host_address_offset()) / system_page_size_; + (heap_relative_address + host_address_offset()) >> system_page_shift_; + swcache::PrefetchL1(&system_page_flags_[system_page_first >> 6]); uint32_t system_page_last = - (heap_relative_address + length - 1 + host_address_offset()) / - system_page_size_; + (heap_relative_address + length - 1 + host_address_offset()) >> + system_page_shift_; system_page_last = std::min(system_page_last, system_page_count_ - 1); assert_true(system_page_first <= system_page_last); @@ -1677,10 +1687,40 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, xe::memory::PageAccess protect_access = enable_data_providers ? xe::memory::PageAccess::kNoAccess : xe::memory::PageAccess::kReadOnly; + + auto global_lock = global_critical_region_.Acquire(); + if (enable_invalidation_notifications) { + EnableAccessCallbacksInner<true>(system_page_first, system_page_last, + protect_access); + } else { + EnableAccessCallbacksInner<false>(system_page_first, system_page_last, + protect_access); + } +} + +template <bool enable_invalidation_notifications> +XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner( + const uint32_t system_page_first, const uint32_t system_page_last, + xe::memory::PageAccess protect_access) XE_RESTRICT { uint8_t* protect_base = membase_ + heap_base_; uint32_t protect_system_page_first = UINT32_MAX; - auto global_lock = global_critical_region_.Acquire(); - for (uint32_t i = system_page_first; i <= system_page_last; ++i) { + + SystemPageFlagsBlock* XE_RESTRICT sys_page_flags = system_page_flags_.data(); + PageEntry* XE_RESTRICT page_table_ptr = page_table_.data(); + + // chrispy: profiling shows quite a bit of time spent in this loop, but + // very little spent actually calling Protect; some of the work here may be + // avoidable and repetitive + uint32_t i = system_page_first; + + uint32_t first_guest_page = SystemPagenumToGuestPagenum(system_page_first); + uint32_t last_guest_page = SystemPagenumToGuestPagenum(system_page_last); + + uint32_t guest_one = + SystemPagenumToGuestPagenum(1); + + uint32_t system_one = GuestPagenumToSystemPagenum(1); + for (; i <= system_page_last; ++i) { // Check if need to enable callbacks for the page and raise its protection. // // If enabling invalidation notifications: // ... // // Enabling data providers doesn't need to be deferred - providers will be // polled for the last time without releasing the lock.
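The dispatch introduced above hoists a loop-invariant flag into a template parameter: EnableAccessCallbacks() branches once on enable_invalidation_notifications while holding the lock, and each instantiation of EnableAccessCallbacksInner compiles only its own arm of the per-page test (the if constexpr further below). The transform in miniature, with hypothetical names:

#include <cstdint>

template <bool kNotify>
void ProcessPages(uint32_t first, uint32_t last) {
  for (uint32_t i = first; i <= last; ++i) {
    if constexpr (kNotify) {
      // notification bookkeeping, compiled only into ProcessPages<true>
    } else {
      // provider bookkeeping, compiled only into ProcessPages<false>
    }
  }
}

void Process(uint32_t first, uint32_t last, bool notify) {
  // One branch outside the loop instead of one per page.
  if (notify) {
    ProcessPages<true>(first, last);
  } else {
    ProcessPages<false>(first, last);
  }
}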
- SystemPageFlagsBlock& page_flags_block = system_page_flags_[i >> 6]; + SystemPageFlagsBlock& page_flags_block = sys_page_flags[i >> 6]; + +#if XE_ARCH_AMD64 == 1 + // x86 modulus shift + uint64_t page_flags_bit = uint64_t(1) << i; +#else uint64_t page_flags_bit = uint64_t(1) << (i & 63); - uint32_t guest_page_number = - xe::sat_sub(i * system_page_size_, host_address_offset()) / page_size_; +#endif + + uint32_t guest_page_number = SystemPagenumToGuestPagenum(i); + //swcache::PrefetchL1(&page_table_ptr[guest_page_number + 8]); xe::memory::PageAccess current_page_access = - ToPageAccess(page_table_[guest_page_number].current_protect); + ToPageAccess(page_table_ptr[guest_page_number].current_protect); bool protect_system_page = false; // Don't do anything with inaccessible pages - don't protect, don't enable // callbacks - because real access violations are needed there. And don't @@ -1715,7 +1762,7 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, // reason. if (current_page_access != xe::memory::PageAccess::kNoAccess) { // TODO(Triang3l): Enable data providers. - if (enable_invalidation_notifications) { + if constexpr (enable_invalidation_notifications) { if (current_page_access != xe::memory::PageAccess::kReadOnly && (page_flags_block.notify_on_invalidation & page_flags_bit) == 0) { // TODO(Triang3l): Check if data providers are already enabled. @@ -1733,21 +1780,22 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, } else { if (protect_system_page_first != UINT32_MAX) { xe::memory::Protect( - protect_base + protect_system_page_first * system_page_size_, - (i - protect_system_page_first) * system_page_size_, + protect_base + (protect_system_page_first << system_page_shift_), + (i - protect_system_page_first) << system_page_shift_, protect_access); protect_system_page_first = UINT32_MAX; } } } + if (protect_system_page_first != UINT32_MAX) { xe::memory::Protect( - protect_base + protect_system_page_first * system_page_size_, - (system_page_last + 1 - protect_system_page_first) * system_page_size_, + protect_base + (protect_system_page_first << system_page_shift_), + (system_page_last + 1 - protect_system_page_first) + << system_page_shift_, protect_access); } } - bool PhysicalHeap::TriggerCallbacks( global_unique_lock_type global_lock_locked_once, uint32_t virtual_address, uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) { @@ -1774,10 +1822,10 @@ bool PhysicalHeap::TriggerCallbacks( } uint32_t system_page_first = - (heap_relative_address + host_address_offset()) / system_page_size_; + (heap_relative_address + host_address_offset()) >> system_page_shift_; uint32_t system_page_last = - (heap_relative_address + length - 1 + host_address_offset()) / - system_page_size_; + (heap_relative_address + length - 1 + host_address_offset()) >> + system_page_shift_; system_page_last = std::min(system_page_last, system_page_count_ - 1); assert_true(system_page_first <= system_page_last); uint32_t block_index_first = system_page_first >> 6; @@ -1810,11 +1858,11 @@ bool PhysicalHeap::TriggerCallbacks( } uint32_t physical_address_offset = GetPhysicalAddress(heap_base_); uint32_t physical_address_start = - xe::sat_sub(system_page_first * system_page_size_, + xe::sat_sub(system_page_first << system_page_shift_, host_address_offset()) + physical_address_offset; uint32_t physical_length = std::min( - xe::sat_sub(system_page_last * system_page_size_ + system_page_size_, + xe::sat_sub((system_page_last << system_page_shift_) + system_page_size_, 
host_address_offset()) + physical_address_offset - physical_address_start, heap_size_ - (physical_address_start - physical_address_offset)); @@ -1858,8 +1906,8 @@ bool PhysicalHeap::TriggerCallbacks( unwatch_first += host_address_offset(); unwatch_last += host_address_offset(); assert_true(unwatch_first <= unwatch_last); - system_page_first = unwatch_first / system_page_size_; - system_page_last = unwatch_last / system_page_size_; + system_page_first = unwatch_first >> system_page_shift_; + system_page_last = unwatch_last >> system_page_shift_; block_index_first = system_page_first >> 6; block_index_last = system_page_last >> 6; } @@ -1874,8 +1922,8 @@ bool PhysicalHeap::TriggerCallbacks( (uint64_t(1) << (i & 63))) != 0; if (unprotect_page) { uint32_t guest_page_number = - xe::sat_sub(i * system_page_size_, host_address_offset()) / - page_size_; + xe::sat_sub(i << system_page_shift_, host_address_offset()) >> + page_size_shift_; if (ToPageAccess(page_table_[guest_page_number].current_protect) != xe::memory::PageAccess::kReadWrite) { unprotect_page = false; } @@ -1888,8 +1936,9 @@ bool PhysicalHeap::TriggerCallbacks( } else { if (unprotect_system_page_first != UINT32_MAX) { xe::memory::Protect( - protect_base + unprotect_system_page_first * system_page_size_, - (i - unprotect_system_page_first) * system_page_size_, + protect_base + + (unprotect_system_page_first << system_page_shift_), + (i - unprotect_system_page_first) << system_page_shift_, xe::memory::PageAccess::kReadWrite); unprotect_system_page_first = UINT32_MAX; } } } if (unprotect_system_page_first != UINT32_MAX) { xe::memory::Protect( - protect_base + unprotect_system_page_first * system_page_size_, - (system_page_last + 1 - unprotect_system_page_first) * - system_page_size_, + protect_base + (unprotect_system_page_first << system_page_shift_), + (system_page_last + 1 - unprotect_system_page_first) + << system_page_shift_, xe::memory::PageAccess::kReadWrite); } } diff --git a/src/xenia/memory.h b/src/xenia/memory.h index 3d4cf5637..672115d5c 100644 --- a/src/xenia/memory.h +++ b/src/xenia/memory.h @@ -216,6 +216,7 @@ class BaseHeap { uint32_t heap_base_; uint32_t heap_size_; uint32_t page_size_; + uint32_t page_size_shift_; uint32_t host_address_offset_; uint32_t unreserved_page_count_; xe::global_critical_region global_critical_region_; @@ -270,18 +271,36 @@ class PhysicalHeap : public BaseHeap { void EnableAccessCallbacks(uint32_t physical_address, uint32_t length, bool enable_invalidation_notifications, bool enable_data_providers); + template <bool enable_invalidation_notifications> + XE_NOINLINE void EnableAccessCallbacksInner( + const uint32_t system_page_first, const uint32_t system_page_last, + xe::memory::PageAccess protect_access) XE_RESTRICT; + // Returns true if any page in the range was watched.
bool TriggerCallbacks(global_unique_lock_type global_lock_locked_once, - uint32_t virtual_address, uint32_t length, bool is_write, - bool unwatch_exact_range, bool unprotect = true); + uint32_t virtual_address, uint32_t length, + bool is_write, bool unwatch_exact_range, + bool unprotect = true); uint32_t GetPhysicalAddress(uint32_t address) const; + uint32_t SystemPagenumToGuestPagenum(uint32_t num) const { + return ((num << system_page_shift_) - host_address_offset()) >> page_size_shift_; + } + + uint32_t GuestPagenumToSystemPagenum(uint32_t num) { + num <<= page_size_shift_; + num += host_address_offset(); + num >>= system_page_shift_; + return num; + } protected: VirtualHeap* parent_heap_; uint32_t system_page_size_; uint32_t system_page_count_; + uint32_t system_page_shift_; + uint32_t padding1_; struct SystemPageFlagsBlock { // Whether writing to each page should result trigger invalidation @@ -458,9 +477,9 @@ class Memory { // TODO(Triang3l): Implement data providers - this is why locking depth of 1 // will be required in the future. bool TriggerPhysicalMemoryCallbacks( - global_unique_lock_type global_lock_locked_once, - uint32_t virtual_address, uint32_t length, bool is_write, - bool unwatch_exact_range, bool unprotect = true); + global_unique_lock_type global_lock_locked_once, uint32_t virtual_address, + uint32_t length, bool is_write, bool unwatch_exact_range, + bool unprotect = true); // Allocates virtual memory from the 'system' heap. // System memory is kept separate from game memory but is still accessible @@ -509,10 +528,10 @@ class Memory { const void* host_address); bool AccessViolationCallback(global_unique_lock_type global_lock_locked_once, - void* host_address, bool is_write); + void* host_address, bool is_write); static bool AccessViolationCallbackThunk( - global_unique_lock_type global_lock_locked_once, - void* context, void* host_address, bool is_write); + global_unique_lock_type global_lock_locked_once, void* context, + void* host_address, bool is_write); std::filesystem::path file_name_; uint32_t system_page_size_ = 0;
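The arithmetic change running through memory.cc replaces every division and multiplication by page_size_ or system_page_size_ with a shift by the cached page_size_shift_ / system_page_shift_, which the new xenia_assert(xe::is_pow2(...)) lines in Initialize() make valid. The equivalence, spelled out with the heap fields as explicit parameters for illustration:

#include <cstdint>

// address -> page index; identical to (address - heap_base) / page_size
// whenever page_size == (1u << page_size_shift).
inline uint32_t PageNumber(uint32_t address, uint32_t heap_base,
                           uint32_t page_size_shift) {
  return (address - heap_base) >> page_size_shift;
}

// page index -> byte offset; identical to page_number * page_size.
inline uint32_t PageOffset(uint32_t page_number, uint32_t page_size_shift) {
  return page_number << page_size_shift;
}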