atomic cas: use prefetchw if available

- remove a useless MemoryBarrier; remove the double memory barrier in the wait pm4 command
- add an int64 cvar type and use it for the x64 feature mask
- rework some functions that were frontend bound according to VTune, moving parts of them into separate noinline functions; profiling afterwards showed L1 instruction cache misses decreased and the functions got faster
- remove the long vpinsrd dependency chain in conversion.h; do a normal load + bswap instead, or movbe if available
- much faster entry table via split_map (code size could still be improved)
- GetResolveInfo was very large and hurt the icache; mark its callees noinline and wrap it in the MSVC optimize-for-size pragma
- use log2 shifts instead of integer divides in memory
- minor optimizations in PhysicalHeap::EnableAccessCallbacks; the majority of time in the function is spent looping, NOT calling Protect. Someone should rework the algorithm completely.
- remove the wonky scheduling log message; it was spammy and unhelpful
- drop the lock count for the critical-section mutex; CRITICAL_SECTION is already a recursive mutex

(brief notes, gotta run)
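A minimal sketch of the "move cold code into a separate noinline function" technique mentioned above, with made-up names; the idea is that the rarely taken path stops occupying space in the caller's instruction stream, so more of the hot path stays resident in L1I.

#include <cstdint>
#include <cstdio>

#if defined(_MSC_VER)
#define EXAMPLE_NOINLINE __declspec(noinline)
#else
#define EXAMPLE_NOINLINE __attribute__((noinline))
#endif

// Cold path: bulky, rarely executed, deliberately kept out of line.
EXAMPLE_NOINLINE static void ReportBadPacket(uint32_t packet) {
  std::fprintf(stderr, "unexpected packet %08X\n", packet);
}

// Hot path: stays small, so the frontend fetches less dead code.
static bool ProcessPacket(uint32_t packet) {
  if (packet != 0) {
    return true;  // common case
  }
  ReportBadPacket(packet);  // out-of-line call on the rare case
  return false;
}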
This commit is contained in:
parent
0fd4a2533b
commit
eb8154908c
@ -20,6 +20,8 @@ namespace apu {
|
|||
namespace conversion {
|
||||
|
||||
#if XE_ARCH_AMD64
|
||||
|
||||
#if 0
|
||||
inline void sequential_6_BE_to_interleaved_6_LE(float* output,
|
||||
const float* input,
|
||||
size_t ch_sample_count) {
|
||||
|
@ -41,7 +43,44 @@ inline void sequential_6_BE_to_interleaved_6_LE(float* output,
|
|||
out[sample * 6 + 5] = sample2;
|
||||
}
|
||||
}
|
||||
#else
|
||||
XE_NOINLINE
|
||||
static void _generic_sequential_6_BE_to_interleaved_6_LE(
|
||||
float* XE_RESTRICT output, const float* XE_RESTRICT input,
|
||||
unsigned ch_sample_count) {
|
||||
for (unsigned sample = 0; sample < ch_sample_count; sample++) {
|
||||
for (unsigned channel = 0; channel < 6; channel++) {
|
||||
unsigned int value = *reinterpret_cast<const unsigned int*>(
|
||||
&input[channel * ch_sample_count + sample]);
|
||||
|
||||
*reinterpret_cast<unsigned int*>(&output[sample * 6 + channel]) =
|
||||
xe::byte_swap(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
XE_NOINLINE
|
||||
static void _movbe_sequential_6_BE_to_interleaved_6_LE(
|
||||
float* XE_RESTRICT output, const float* XE_RESTRICT input,
|
||||
unsigned ch_sample_count) {
|
||||
for (unsigned sample = 0; sample < ch_sample_count; sample++) {
|
||||
for (unsigned channel = 0; channel < 6; channel++) {
|
||||
*reinterpret_cast<unsigned int*>(&output[sample * 6 + channel]) =
|
||||
_load_be_u32(reinterpret_cast<const unsigned int*>(
|
||||
&input[channel * ch_sample_count + sample]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline static void sequential_6_BE_to_interleaved_6_LE(
|
||||
float* output, const float* input, unsigned ch_sample_count) {
|
||||
if (amd64::GetFeatureFlags() & amd64::kX64EmitMovbe) {
|
||||
_movbe_sequential_6_BE_to_interleaved_6_LE(output, input, ch_sample_count);
|
||||
} else {
|
||||
_generic_sequential_6_BE_to_interleaved_6_LE(output, input,
|
||||
ch_sample_count);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
inline void sequential_6_BE_to_interleaved_2_LE(float* output,
|
||||
const float* input,
|
||||
size_t ch_sample_count) {
|
||||
|
|
|
@ -335,7 +335,8 @@ ICommandVar* define_cmdvar(const char* name, T* default_value,
|
|||
|
||||
#define DEFINE_uint64(name, default_value, description, category) \
|
||||
DEFINE_CVar(name, default_value, description, category, false, uint64_t)
|
||||
|
||||
#define DEFINE_int64(name, default_value, description, category) \
|
||||
DEFINE_CVar(name, default_value, description, category, false, int64_t)
|
||||
#define DEFINE_double(name, default_value, description, category) \
|
||||
DEFINE_CVar(name, default_value, description, category, false, double)
|
||||
|
||||
|
@ -383,7 +384,7 @@ ICommandVar* define_cmdvar(const char* name, T* default_value,
|
|||
#define DECLARE_uint32(name) DECLARE_CVar(name, uint32_t)
|
||||
|
||||
#define DECLARE_uint64(name) DECLARE_CVar(name, uint64_t)
|
||||
|
||||
#define DECLARE_int64(name) DECLARE_CVar(name, int64_t)
|
||||
#define DECLARE_double(name) DECLARE_CVar(name, double)
|
||||
|
||||
#define DECLARE_string(name) DECLARE_CVar(name, std::string)
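A short usage sketch for the new 64-bit cvar macros (the flag name here is hypothetical); this mirrors how x64_extension_mask is switched from DEFINE_int32 to DEFINE_int64 later in this commit.

// In a .cc file:
DEFINE_int64(example_feature_mask, -1LL,
             "Bitmask of optional features to enable.", "Example");

// In a header that needs to read it:
DECLARE_int64(example_feature_mask);

// At a use site, all 64 bits are usable without truncation:
// if (cvars::example_feature_mask & (1LL << 40)) { /* ... */ }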
|
||||
|
|
|
@ -26,7 +26,7 @@ check this and release the mutex one way to do this is by using FlsAlloc and
|
|||
PFLS_CALLBACK_FUNCTION, which gets called with the fiber local data when a
|
||||
thread exits
|
||||
*/
|
||||
thread_local unsigned global_mutex_depth = 0;
|
||||
|
||||
static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) {
|
||||
return reinterpret_cast<CRITICAL_SECTION*>(mutex);
|
||||
}
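A hedged sketch of the FlsAlloc / PFLS_CALLBACK_FUNCTION idea described in the comment above: the callback fires with the stored fiber-local value when a thread exits, which could be used to release a mutex that thread still holds. The cleanup helper named here is hypothetical.

#include <windows.h>

static DWORD g_exit_hook_slot = FLS_OUT_OF_INDEXES;

// Called by the OS with the thread's stored FLS value when the thread exits
// (only if the stored value is non-null).
static VOID WINAPI OnThreadExit(PVOID fls_data) {
  // ReleaseGlobalMutexOwnedBy(fls_data);  // hypothetical cleanup
}

static void InstallThreadExitHook() {
  g_exit_hook_slot = FlsAlloc(OnThreadExit);
  // A thread that acquires the lock would then record itself:
  // FlsSetValue(g_exit_hook_slot, per_thread_lock_state);
}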
|
||||
|
@ -38,29 +38,16 @@ xe_global_mutex::xe_global_mutex() {
|
|||
xe_global_mutex ::~xe_global_mutex() {
|
||||
DeleteCriticalSection(global_critical_section(this));
|
||||
}
|
||||
|
||||
void xe_global_mutex::lock() {
|
||||
if (global_mutex_depth) {
|
||||
} else {
|
||||
EnterCriticalSection(global_critical_section(this));
|
||||
}
|
||||
global_mutex_depth++;
|
||||
EnterCriticalSection(global_critical_section(this));
|
||||
}
|
||||
void xe_global_mutex::unlock() {
|
||||
if (--global_mutex_depth == 0) {
|
||||
LeaveCriticalSection(global_critical_section(this));
|
||||
}
|
||||
LeaveCriticalSection(global_critical_section(this));
|
||||
}
|
||||
bool xe_global_mutex::try_lock() {
|
||||
if (global_mutex_depth) {
|
||||
++global_mutex_depth;
|
||||
return true;
|
||||
} else {
|
||||
BOOL success = TryEnterCriticalSection(global_critical_section(this));
|
||||
if (success) {
|
||||
++global_mutex_depth;
|
||||
}
|
||||
return success;
|
||||
}
|
||||
BOOL success = TryEnterCriticalSection(global_critical_section(this));
|
||||
return success;
|
||||
}
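The depth counter removed above was redundant because CRITICAL_SECTION is itself recursive: the owning thread can re-enter it, and only the matching number of LeaveCriticalSection calls releases it. A standalone demonstration:

#include <windows.h>
#include <cassert>

int main() {
  CRITICAL_SECTION cs;
  InitializeCriticalSection(&cs);
  EnterCriticalSection(&cs);
  EnterCriticalSection(&cs);             // same thread: no deadlock, count = 2
  assert(TryEnterCriticalSection(&cs));  // also succeeds for the owner, count = 3
  LeaveCriticalSection(&cs);
  LeaveCriticalSection(&cs);
  LeaveCriticalSection(&cs);             // now actually released
  DeleteCriticalSection(&cs);
  return 0;
}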
|
||||
|
||||
CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) {
|
||||
|
|
|
@ -116,15 +116,15 @@
|
|||
#define XE_LIKELY(...) (!!(__VA_ARGS__))
|
||||
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
|
||||
#define XE_MSVC_ASSUME(...) __assume(__VA_ARGS__)
|
||||
#define XE_NOALIAS __declspec(noalias)
|
||||
#define XE_NOALIAS __declspec(noalias)
|
||||
#elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
|
||||
#define XE_FORCEINLINE __attribute__((always_inline))
|
||||
#define XE_NOINLINE __attribute__((noinline))
|
||||
#define XE_COLD __attribute__((cold))
|
||||
#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
|
||||
#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)
|
||||
#define XE_NOALIAS
|
||||
//cant do unevaluated assume
|
||||
#define XE_NOALIAS
|
||||
// cant do unevaluated assume
|
||||
#define XE_MSVC_ASSUME(...) static_cast<void>(0)
|
||||
#else
|
||||
#define XE_FORCEINLINE inline
|
||||
|
@ -137,7 +137,13 @@
|
|||
#define XE_MSVC_ASSUME(...) static_cast<void>(0)
|
||||
|
||||
#endif
|
||||
|
||||
#if XE_COMPILER_HAS_MSVC_EXTENSIONS == 1
|
||||
#define XE_MSVC_OPTIMIZE_SMALL() __pragma(optimize("s", on))
|
||||
#define XE_MSVC_OPTIMIZE_REVERT() __pragma(optimize("", on))
|
||||
#else
|
||||
#define XE_MSVC_OPTIMIZE_SMALL()
|
||||
#define XE_MSVC_OPTIMIZE_REVERT()
|
||||
#endif
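A sketch of how the new size-optimization wrappers might be used around a large, rarely-hot function (the function here is hypothetical; in this commit GetResolveInfo in draw_util.cc is wrapped the same way):

#include <cstddef>
#include <cstdint>
// XE_MSVC_OPTIMIZE_SMALL/REVERT come from xenia/base/platform.h as defined above.

XE_MSVC_OPTIMIZE_SMALL()
// Big, branchy, infrequently executed: prefer small code over fast code so it
// evicts less of the hot paths from the instruction cache.
static bool ParseRarelyUsedBlob(const uint8_t* data, size_t length) {
  // ... lots of validation and decoding ...
  return data != nullptr && length != 0;
}
XE_MSVC_OPTIMIZE_REVERT()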
|
||||
#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
|
||||
#define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__))
|
||||
#define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__))
|
||||
|
@ -180,7 +186,7 @@ const char kPathSeparator = '/';
|
|||
const char kGuestPathSeparator = '\\';
|
||||
|
||||
} // namespace xe
|
||||
#if XE_ARCH_AMD64==1
|
||||
#if XE_ARCH_AMD64 == 1
|
||||
#include "platform_amd64.h"
|
||||
#endif
|
||||
#endif // XENIA_BASE_PLATFORM_H_
|
||||
|
|
|
@ -7,13 +7,12 @@
|
|||
******************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
#include "xenia/base/cvar.h"
|
||||
#include "xenia/base/platform.h"
|
||||
|
||||
#include "third_party/xbyak/xbyak/xbyak.h"
|
||||
#include "third_party/xbyak/xbyak/xbyak_util.h"
|
||||
DEFINE_int32(x64_extension_mask, -1,
|
||||
DEFINE_int64(x64_extension_mask, -1LL,
|
||||
"Allow the detection and utilization of specific instruction set "
|
||||
"features.\n"
|
||||
" 0 = x86_64 + AVX1\n"
|
||||
|
@ -33,79 +32,92 @@ DEFINE_int32(x64_extension_mask, -1,
|
|||
"x64");
|
||||
namespace xe {
|
||||
namespace amd64 {
|
||||
static uint32_t g_feature_flags = 0U;
|
||||
static uint64_t g_feature_flags = 0U;
|
||||
static bool g_did_initialize_feature_flags = false;
|
||||
uint32_t GetFeatureFlags() {
|
||||
xenia_assert(g_did_initialize_feature_flags);
|
||||
return g_feature_flags;
|
||||
uint64_t GetFeatureFlags() {
|
||||
xenia_assert(g_did_initialize_feature_flags);
|
||||
return g_feature_flags;
|
||||
}
|
||||
XE_COLD
|
||||
XE_NOINLINE
|
||||
void InitFeatureFlags() {
|
||||
uint32_t feature_flags_ = 0U;
|
||||
|
||||
Xbyak::util::Cpu cpu_;
|
||||
uint64_t feature_flags_ = 0U;
|
||||
{
|
||||
Xbyak::util::Cpu cpu_;
|
||||
#define TEST_EMIT_FEATURE(emit, ext) \
|
||||
if ((cvars::x64_extension_mask & emit) == emit) { \
|
||||
feature_flags_ |= (cpu_.has(ext) ? emit : 0); \
|
||||
}
|
||||
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
|
||||
TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
|
||||
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
|
||||
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
|
||||
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
|
||||
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
|
||||
TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
|
||||
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
|
||||
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
|
||||
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
|
||||
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
|
||||
#undef TEST_EMIT_FEATURE
|
||||
/*
|
||||
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
|
||||
latest version of xbyak
|
||||
*/
|
||||
unsigned int data[4];
|
||||
Xbyak::util::Cpu::getCpuid(0x80000001, data);
|
||||
unsigned amd_flags = data[2];
|
||||
if (amd_flags & (1U << 5)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
|
||||
feature_flags_ |= kX64EmitLZCNT;
|
||||
}
|
||||
}
|
||||
// todo: although not reported by cpuid, zen 1 and zen+ also have fma4
|
||||
if (amd_flags & (1U << 16)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
|
||||
feature_flags_ |= kX64EmitFMA4;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 21)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
|
||||
feature_flags_ |= kX64EmitTBM;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 11)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
|
||||
feature_flags_ |= kX64EmitXOP;
|
||||
}
|
||||
}
|
||||
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
|
||||
bool is_zennish = cpu_.displayFamily >= 0x17;
|
||||
/*
|
||||
chrispy: according to agner's tables, all amd architectures that
|
||||
we support (ones with avx) have the same timings for
|
||||
jrcxz/loop/loope/loopne as for other jmps
|
||||
*/
|
||||
feature_flags_ |= kX64FastJrcx;
|
||||
feature_flags_ |= kX64FastLoop;
|
||||
if (is_zennish) {
|
||||
// ik that i heard somewhere that this is the case for zen, but i need to
|
||||
// verify. cant find my original source for that.
|
||||
// todo: ask agner?
|
||||
feature_flags_ |= kX64FlagsIndependentVars;
|
||||
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
|
||||
latest version of xbyak
|
||||
*/
|
||||
unsigned int data[4];
|
||||
Xbyak::util::Cpu::getCpuid(0x80000001, data);
|
||||
unsigned amd_flags = data[2];
|
||||
if (amd_flags & (1U << 5)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
|
||||
feature_flags_ |= kX64EmitLZCNT;
|
||||
}
|
||||
}
|
||||
// todo: although not reported by cpuid, zen 1 and zen+ also have fma4
|
||||
if (amd_flags & (1U << 16)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
|
||||
feature_flags_ |= kX64EmitFMA4;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 21)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
|
||||
feature_flags_ |= kX64EmitTBM;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 11)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
|
||||
feature_flags_ |= kX64EmitXOP;
|
||||
}
|
||||
}
|
||||
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
|
||||
bool is_zennish = cpu_.displayFamily >= 0x17;
|
||||
/*
|
||||
chrispy: according to agner's tables, all amd architectures
|
||||
that we support (ones with avx) have the same timings for
|
||||
jrcxz/loop/loope/loopne as for other jmps
|
||||
*/
|
||||
feature_flags_ |= kX64FastJrcx;
|
||||
feature_flags_ |= kX64FastLoop;
|
||||
if (is_zennish) {
|
||||
// ik that i heard somewhere that this is the case for zen, but i need
|
||||
// to verify. cant find my original source for that. todo: ask agner?
|
||||
feature_flags_ |= kX64FlagsIndependentVars;
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
unsigned int data[4];
|
||||
memset(data, 0, sizeof(data));
|
||||
// intel extended features
|
||||
Xbyak::util::Cpu::getCpuidEx(7, 0, data);
|
||||
if ((data[2] & (1 << 28)) &&
|
||||
(cvars::x64_extension_mask & kX64EmitMovdir64M)) {
|
||||
feature_flags_ |= kX64EmitMovdir64M;
|
||||
}
|
||||
if ((data[1] & (1 << 9)) && (cvars::x64_extension_mask & kX64FastRepMovs)) {
|
||||
feature_flags_ |= kX64FastRepMovs;
|
||||
}
|
||||
}
|
||||
g_feature_flags = feature_flags_;
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
namespace xe {
|
||||
namespace amd64 {
|
||||
enum X64FeatureFlags {
|
||||
enum X64FeatureFlags : uint64_t {
|
||||
kX64EmitAVX2 = 1 << 0,
|
||||
kX64EmitFMA = 1 << 1,
|
||||
kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount
|
||||
|
@ -44,14 +44,13 @@ enum X64FeatureFlags {
|
|||
// instructions, and FX users need the boost
|
||||
kX64EmitFMA4 = 1 << 17, // todo: also use on zen1?
|
||||
kX64EmitTBM = 1 << 18,
|
||||
// kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
|
||||
// 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
|
||||
// instructions by using the legacy sse version if we recently cleared the
|
||||
// high 128 bits of the
|
||||
kX64EmitMovdir64M = 1 << 19,
|
||||
kX64FastRepMovs = 1 << 20
|
||||
|
||||
};
|
||||
|
||||
XE_NOALIAS
|
||||
uint32_t GetFeatureFlags();
|
||||
uint64_t GetFeatureFlags();
|
||||
XE_COLD
|
||||
void InitFeatureFlags();
|
||||
|
||||
|
|
|
@ -299,6 +299,12 @@ class Event : public WaitHandle {
|
|||
// the nonsignaled state after releasing the appropriate number of waiting
|
||||
// threads.
|
||||
virtual void Pulse() = 0;
|
||||
#if XE_PLATFORM_WIN32 == 1
// SetEvent, but if there is a waiter we immediately transfer execution to it
virtual void SetBoostPriority() = 0;
#else
void SetBoostPriority() { Set(); }
#endif
|
||||
};
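A small usage sketch for the new call (the producer function and its work queue are hypothetical): setting the event with a priority boost hands the CPU straight to a waiting consumer instead of waiting for the scheduler to get around to it.

void SubmitWork(xe::threading::Event* work_available) {
  // enqueue_work(...);                // hypothetical producer work
  work_available->SetBoostPriority();  // wake the waiter and switch to it now
}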
|
||||
|
||||
// Models a Win32-like semaphore object.
|
||||
|
|
|
@ -39,6 +39,8 @@ XE_NTDLL_IMPORT(NtWaitForSingleObject, cls_NtWaitForSingleObject,
|
|||
NtWaitForSingleObjectPointer);
|
||||
|
||||
XE_NTDLL_IMPORT(NtSetEvent, cls_NtSetEvent, NtSetEventPointer);
|
||||
XE_NTDLL_IMPORT(NtSetEventBoostPriority, cls_NtSetEventBoostPriority,
|
||||
NtSetEventBoostPriorityPointer);
|
||||
// difference between NtClearEvent and NtResetEvent is that NtResetEvent returns
|
||||
// the events state prior to the call, but we dont need that. might need to
|
||||
// check whether one or the other is faster in the kernel though yeah, just
|
||||
|
@ -53,6 +55,7 @@ XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
|
|||
|
||||
XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
|
||||
NtDelayExecutionPointer);
|
||||
|
||||
namespace xe {
|
||||
namespace threading {
|
||||
|
||||
|
@ -137,7 +140,7 @@ void MaybeYield() {
|
|||
#endif
|
||||
#endif
|
||||
// memorybarrier is really not necessary here...
|
||||
MemoryBarrier();
|
||||
// MemoryBarrier();
|
||||
}
|
||||
|
||||
void SyncMemory() { MemoryBarrier(); }
|
||||
|
@ -288,11 +291,19 @@ class Win32Event : public Win32Handle<Event> {
|
|||
void Set() override { NtSetEventPointer.invoke(handle_, nullptr); }
|
||||
void Reset() override { NtClearEventPointer.invoke(handle_); }
|
||||
void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); }
|
||||
void SetBoostPriority() override {
|
||||
// no previous state for boostpriority
|
||||
NtSetEventBoostPriorityPointer.invoke(handle_);
|
||||
}
|
||||
#else
|
||||
void Set() override { SetEvent(handle_); }
|
||||
void Reset() override { ResetEvent(handle_); }
|
||||
void Pulse() override { PulseEvent(handle_); }
|
||||
|
||||
void SetBoostPriority() override {
|
||||
// no win32 version of boostpriority
|
||||
SetEvent(handle_);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
#define XE_X64_PROFILER_AVAILABLE 1
|
||||
#endif
|
||||
|
||||
DECLARE_int32(x64_extension_mask);
|
||||
DECLARE_int64(x64_extension_mask);
|
||||
|
||||
namespace xe {
|
||||
class Exception;
|
||||
|
|
|
@ -103,74 +103,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
|
|||
"FAQ for system requirements at https://xenia.jp");
|
||||
return;
|
||||
}
|
||||
#if 1
|
||||
feature_flags_ = amd64::GetFeatureFlags();
|
||||
#else
|
||||
#define TEST_EMIT_FEATURE(emit, ext) \
|
||||
if ((cvars::x64_extension_mask & emit) == emit) { \
|
||||
feature_flags_ |= (cpu_.has(ext) ? emit : 0); \
|
||||
}
|
||||
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
|
||||
TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
|
||||
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
|
||||
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
|
||||
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
|
||||
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
|
||||
#undef TEST_EMIT_FEATURE
|
||||
/*
|
||||
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
|
||||
latest version of xbyak
|
||||
*/
|
||||
unsigned int data[4];
|
||||
Xbyak::util::Cpu::getCpuid(0x80000001, data);
|
||||
unsigned amd_flags = data[2];
|
||||
if (amd_flags & (1U << 5)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
|
||||
feature_flags_ |= kX64EmitLZCNT;
|
||||
}
|
||||
}
|
||||
// todo: although not reported by cpuid, zen 1 and zen+ also have fma4
|
||||
if (amd_flags & (1U << 16)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
|
||||
feature_flags_ |= kX64EmitFMA4;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 21)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
|
||||
feature_flags_ |= kX64EmitTBM;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 11)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
|
||||
feature_flags_ |= kX64EmitXOP;
|
||||
XELOGCPU("Cpu support XOP!\n\n");
|
||||
}
|
||||
}
|
||||
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
|
||||
bool is_zennish = cpu_.displayFamily >= 0x17;
|
||||
/*
|
||||
chrispy: according to agner's tables, all amd architectures that
|
||||
we support (ones with avx) have the same timings for
|
||||
jrcxz/loop/loope/loopne as for other jmps
|
||||
*/
|
||||
feature_flags_ |= kX64FastJrcx;
|
||||
feature_flags_ |= kX64FastLoop;
|
||||
if (is_zennish) {
|
||||
// ik that i heard somewhere that this is the case for zen, but i need to
|
||||
// verify. cant find my original source for that.
|
||||
// todo: ask agner?
|
||||
feature_flags_ |= kX64FlagsIndependentVars;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
feature_flags_ = amd64::GetFeatureFlags();
|
||||
|
||||
may_use_membase32_as_zero_reg_ =
|
||||
static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
|
||||
processor()->memory()->virtual_membase())) == 0;
|
||||
|
|
|
@ -299,7 +299,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
void* FindWordConstantOffset(unsigned wordvalue);
|
||||
void* FindDwordConstantOffset(unsigned bytevalue);
|
||||
void* FindQwordConstantOffset(uint64_t bytevalue);
|
||||
bool IsFeatureEnabled(uint32_t feature_flag) const {
|
||||
bool IsFeatureEnabled(uint64_t feature_flag) const {
|
||||
return (feature_flags_ & feature_flag) == feature_flag;
|
||||
}
|
||||
|
||||
|
@ -395,7 +395,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
XbyakAllocator* allocator_ = nullptr;
|
||||
XexModule* guest_module_ = nullptr;
|
||||
Xbyak::util::Cpu cpu_;
|
||||
uint32_t feature_flags_ = 0;
|
||||
uint64_t feature_flags_ = 0;
|
||||
uint32_t current_guest_function_ = 0;
|
||||
Xbyak::Label* epilog_label_ = nullptr;
|
||||
|
||||
|
|
|
@ -39,7 +39,7 @@
|
|||
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
|
||||
#include "xenia/cpu/hir/hir_builder.h"
|
||||
#include "xenia/cpu/processor.h"
|
||||
|
||||
XE_MSVC_OPTIMIZE_SMALL()
|
||||
DEFINE_bool(use_fast_dot_product, false,
|
||||
"Experimental optimization, much shorter sequence on dot products, "
|
||||
"treating inf as overflow instead of using mcxsr"
|
||||
|
|
|
@ -19,16 +19,19 @@ EntryTable::EntryTable() = default;
|
|||
|
||||
EntryTable::~EntryTable() {
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
for (auto it : map_) {
|
||||
Entry* entry = it.second;
|
||||
for (auto it : map_.Values()) {
|
||||
Entry* entry = it;
|
||||
delete entry;
|
||||
}
|
||||
}
|
||||
|
||||
Entry* EntryTable::Get(uint32_t address) {
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
const auto& it = map_.find(address);
|
||||
Entry* entry = it != map_.end() ? it->second : nullptr;
|
||||
uint32_t idx = map_.IndexForKey(address);
|
||||
if (idx == map_.size() || *map_.KeyAt(idx) != address) {
|
||||
return nullptr;
|
||||
}
|
||||
Entry* entry = *map_.ValueAt(idx);
|
||||
if (entry) {
|
||||
// TODO(benvanik): wait if needed?
|
||||
if (entry->status != Entry::STATUS_READY) {
|
||||
|
@ -43,8 +46,12 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
|
|||
// https://github.com/facebook/folly/blob/master/folly/AtomicHashMap.h
|
||||
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
const auto& it = map_.find(address);
|
||||
Entry* entry = it != map_.end() ? it->second : nullptr;
|
||||
|
||||
uint32_t idx = map_.IndexForKey(address);
|
||||
|
||||
Entry* entry = idx != map_.size() && *map_.KeyAt(idx) == address
|
||||
? *map_.ValueAt(idx)
|
||||
: nullptr;
|
||||
Entry::Status status;
|
||||
if (entry) {
|
||||
// If we aren't ready yet spin and wait.
|
||||
|
@ -65,7 +72,8 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
|
|||
entry->end_address = 0;
|
||||
entry->status = Entry::STATUS_COMPILING;
|
||||
entry->function = 0;
|
||||
map_[address] = entry;
|
||||
map_.InsertAt(address, entry, idx);
|
||||
// map_[address] = entry;
|
||||
status = Entry::STATUS_NEW;
|
||||
}
|
||||
global_lock.unlock();
|
||||
|
@ -75,18 +83,18 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
|
|||
|
||||
void EntryTable::Delete(uint32_t address) {
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
const auto itr = map_.find(address);
|
||||
|
||||
if (itr != map_.cend()) {
|
||||
map_.erase(itr);
|
||||
// doesn't this leak memory by not deleting the entry?
|
||||
uint32_t idx = map_.IndexForKey(address);
|
||||
if (idx != map_.size() && *map_.KeyAt(idx) == address) {
|
||||
map_.EraseAt(idx);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<Function*> EntryTable::FindWithAddress(uint32_t address) {
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
std::vector<Function*> fns;
|
||||
for (auto& it : map_) {
|
||||
Entry* entry = it.second;
|
||||
for (auto& it : map_.Values()) {
|
||||
Entry* entry = it;
|
||||
if (address >= entry->address && address <= entry->end_address) {
|
||||
if (entry->status == Entry::STATUS_READY) {
|
||||
fns.push_back(entry->function);
|
||||
|
@ -95,6 +103,5 @@ std::vector<Function*> EntryTable::FindWithAddress(uint32_t address) {
|
|||
}
|
||||
return fns;
|
||||
}
|
||||
|
||||
} // namespace cpu
|
||||
} // namespace xe
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
#include <vector>
|
||||
|
||||
#include "xenia/base/mutex.h"
|
||||
|
||||
#include "xenia/base/split_map.h"
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
|
||||
|
@ -48,7 +48,8 @@ class EntryTable {
|
|||
private:
|
||||
xe::global_critical_region global_critical_region_;
|
||||
// TODO(benvanik): replace with a better data structure.
|
||||
std::unordered_map<uint32_t, Entry*> map_;
|
||||
xe::split_map<uint32_t, Entry*> map_;
|
||||
//std::unordered_map<uint32_t, Entry*> map_;
|
||||
};
|
||||
|
||||
} // namespace cpu
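A rough sketch of the split_map idea the entry table now uses: keys and values live in separate sorted contiguous arrays and lookups are binary searches over the key array, which touches far less memory per lookup than a node-based unordered_map. This is only an illustration of the interface used above, not Xenia's actual xe::split_map.

#include <algorithm>
#include <cstdint>
#include <vector>

template <typename K, typename V>
class SplitMapSketch {
 public:
  // Index of the first key >= key; equals size() if key is past the end.
  uint32_t IndexForKey(const K& key) const {
    return uint32_t(std::lower_bound(keys_.begin(), keys_.end(), key) -
                    keys_.begin());
  }
  uint32_t size() const { return uint32_t(keys_.size()); }
  const K* KeyAt(uint32_t i) const { return &keys_[i]; }
  V* ValueAt(uint32_t i) { return &values_[i]; }
  void InsertAt(const K& key, const V& value, uint32_t i) {
    keys_.insert(keys_.begin() + i, key);
    values_.insert(values_.begin() + i, value);
  }
  void EraseAt(uint32_t i) {
    keys_.erase(keys_.begin() + i);
    values_.erase(values_.begin() + i);
  }
  std::vector<V>& Values() { return values_; }

 private:
  std::vector<K> keys_;    // sorted
  std::vector<V> values_;  // parallel to keys_
};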
|
||||
|
|
|
@ -334,7 +334,7 @@ void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr,
|
|||
|
||||
void CommandProcessor::UpdateWritePointer(uint32_t value) {
|
||||
write_ptr_index_ = value;
|
||||
write_ptr_index_event_->Set();
|
||||
write_ptr_index_event_->SetBoostPriority();
|
||||
}
|
||||
void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
|
||||
uint32_t value) {
|
||||
|
@ -665,6 +665,11 @@ uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index,
|
|||
|
||||
reader_.set_read_offset(read_index * sizeof(uint32_t));
|
||||
reader_.set_write_offset(write_index * sizeof(uint32_t));
|
||||
// prefetch the wraparound range
|
||||
// it likely is already in L3 cache, but in a zen system it may be another
|
||||
// chiplet's L3
|
||||
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
|
||||
GetCurrentRingReadCount());
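For reference, a hedged sketch of what a ranged prefetch like the BeginPrefetchedRead call above boils down to on x86; the hint is advisory only, so correctness never depends on it.

#include <immintrin.h>
#include <cstddef>

// Touch every cache line of [data, data + length) with a T1 hint
// (roughly: bring into L2 and outer levels).
static void PrefetchRangeToL2(const void* data, size_t length) {
  const char* p = static_cast<const char*>(data);
  for (size_t offset = 0; offset < length; offset += 64) {  // 64-byte lines
    _mm_prefetch(p + offset, _MM_HINT_T1);
  }
}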
|
||||
do {
|
||||
if (!ExecutePacket()) {
|
||||
// This probably should be fatal - but we're going to continue anyways.
|
||||
|
|
File diff suppressed because it is too large
@ -45,7 +45,10 @@
|
|||
namespace xe {
|
||||
namespace gpu {
|
||||
namespace d3d12 {
|
||||
|
||||
struct MemExportRange {
|
||||
uint32_t base_address_dwords;
|
||||
uint32_t size_dwords;
|
||||
};
|
||||
class D3D12CommandProcessor final : public CommandProcessor {
|
||||
public:
|
||||
#include "../pm4_command_processor_declare.h"
|
||||
|
@ -287,8 +290,21 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
bool IssueDraw(xenos::PrimitiveType primitive_type, uint32_t index_count,
|
||||
IndexBufferInfo* index_buffer_info,
|
||||
bool major_mode_explicit) override;
|
||||
XE_COLD
|
||||
XE_NOINLINE
|
||||
bool HandleMemexportGuestDMA(ID3D12Resource*& scratch_index_buffer,
|
||||
D3D12_INDEX_BUFFER_VIEW& index_buffer_view,
|
||||
uint32_t guest_index_base,
|
||||
bool& retflag);
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
bool GatherMemexportRangesAndMakeResident(bool& retflag);
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
void HandleMemexportDrawOrdering_AndReadback();
|
||||
bool IssueCopy() override;
|
||||
|
||||
XE_NOINLINE
|
||||
bool IssueCopy_ReadbackResolvePath();
|
||||
void InitializeTrace() override;
|
||||
|
||||
private:
|
||||
|
@ -363,6 +379,8 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
};
|
||||
// Gets the indices of optional root parameters. Returns the total parameter
|
||||
// count.
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
static uint32_t GetRootBindfulExtraParameterIndices(
|
||||
const DxbcShader* vertex_shader, const DxbcShader* pixel_shader,
|
||||
RootBindfulExtraParameterIndices& indices_out);
|
||||
|
@ -437,6 +455,18 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
bool UpdateBindings(const D3D12Shader* vertex_shader,
|
||||
const D3D12Shader* pixel_shader,
|
||||
ID3D12RootSignature* root_signature);
|
||||
XE_COLD
|
||||
XE_NOINLINE
|
||||
void UpdateBindings_UpdateRootBindful();
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
bool UpdateBindings_BindfulPath(
|
||||
const size_t texture_layout_uid_vertex,
|
||||
const std::vector<xe::gpu::DxbcShader::TextureBinding>& textures_vertex,
|
||||
const size_t texture_layout_uid_pixel,
|
||||
const std::vector<xe::gpu::DxbcShader::TextureBinding>* textures_pixel,
|
||||
const size_t sampler_count_vertex, const size_t sampler_count_pixel,
|
||||
bool& retflag);
|
||||
|
||||
// Returns dword count for one element for a memexport format, or 0 if it's
|
||||
// not supported by the D3D12 command processor (if it's smaller that 1 dword,
|
||||
|
@ -743,6 +773,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
|
||||
draw_util::GetViewportInfoArgs previous_viewport_info_args_;
|
||||
draw_util::ViewportInfo previous_viewport_info_;
|
||||
// scratch memexport data
|
||||
MemExportRange memexport_ranges_[512];
|
||||
uint32_t memexport_range_count_ = 0;
|
||||
};
|
||||
|
||||
} // namespace d3d12
|
||||
|
|
|
@ -266,22 +266,9 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
|
|||
|
||||
void* DeferredCommandList::WriteCommand(Command command,
|
||||
size_t arguments_size_bytes) {
|
||||
|
||||
size_t arguments_size_elements =
|
||||
round_up(arguments_size_bytes, sizeof(uintmax_t), false);
|
||||
|
||||
//(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);
|
||||
#if 0
|
||||
size_t offset = command_stream_.size();
|
||||
command_stream_.resize(offset + kCommandHeaderSizeElements +
|
||||
arguments_size_elements);
|
||||
CommandHeader& header =
|
||||
*reinterpret_cast<CommandHeader*>(command_stream_.data() + offset);
|
||||
header.command = command;
|
||||
header.arguments_size_elements = uint32_t(arguments_size_elements);
|
||||
return command_stream_.data() + (offset + kCommandHeaderSizeElements);
|
||||
#else
|
||||
|
||||
size_t offset = command_stream_.size();
|
||||
constexpr size_t kCommandHeaderSizeBytes =
|
||||
kCommandHeaderSizeElements * sizeof(uintmax_t);
|
||||
|
@ -290,9 +277,9 @@ void* DeferredCommandList::WriteCommand(Command command,
|
|||
CommandHeader& header =
|
||||
*reinterpret_cast<CommandHeader*>(command_stream_.data() + offset);
|
||||
header.command = command;
|
||||
header.arguments_size_elements = uint32_t(arguments_size_elements) / sizeof(uintmax_t);
|
||||
header.arguments_size_elements =
|
||||
uint32_t(arguments_size_elements) / sizeof(uintmax_t);
|
||||
return command_stream_.data() + (offset + kCommandHeaderSizeBytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace d3d12
|
||||
|
|
|
@ -183,7 +183,7 @@ void PipelineCache::Shutdown() {
|
|||
// creating them.
|
||||
if (!creation_threads_.empty()) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_threads_shutdown_from_ = 0;
|
||||
}
|
||||
creation_request_cond_.notify_all();
|
||||
|
@ -681,7 +681,7 @@ void PipelineCache::InitializeShaderStorage(
|
|||
if (!creation_threads_.empty()) {
|
||||
// Submit the pipeline for creation to any available thread.
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_queue_.push_back(new_pipeline);
|
||||
}
|
||||
creation_request_cond_.notify_one();
|
||||
|
@ -695,7 +695,7 @@ void PipelineCache::InitializeShaderStorage(
|
|||
CreateQueuedPipelinesOnProcessorThread();
|
||||
if (creation_threads_.size() > creation_thread_original_count) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_threads_shutdown_from_ = creation_thread_original_count;
|
||||
// Assuming the queue is empty because of
|
||||
// CreateQueuedPipelinesOnProcessorThread.
|
||||
|
@ -708,7 +708,7 @@ void PipelineCache::InitializeShaderStorage(
|
|||
bool await_creation_completion_event;
|
||||
{
|
||||
// Cleanup so additional threads can be created later again.
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_threads_shutdown_from_ = SIZE_MAX;
|
||||
// If the invocation is blocking, all the shader storage
|
||||
// initialization is expected to be done before proceeding, to avoid
|
||||
|
@ -813,7 +813,7 @@ void PipelineCache::EndSubmission() {
|
|||
// Await creation of all queued pipelines.
|
||||
bool await_creation_completion_event;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
// Assuming the creation queue is already empty (because the processor
|
||||
// thread also worked on creating the leftover pipelines), so only check
|
||||
// if there are threads with pipelines currently being created.
|
||||
|
@ -834,7 +834,7 @@ bool PipelineCache::IsCreatingPipelines() {
|
|||
if (creation_threads_.empty()) {
|
||||
return false;
|
||||
}
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
return !creation_queue_.empty() || creation_threads_busy_ != 0;
|
||||
}
|
||||
|
||||
|
@ -1076,7 +1076,7 @@ bool PipelineCache::ConfigurePipeline(
|
|||
if (!creation_threads_.empty()) {
|
||||
// Submit the pipeline for creation to any available thread.
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_queue_.push_back(new_pipeline);
|
||||
}
|
||||
creation_request_cond_.notify_one();
|
||||
|
@ -3314,7 +3314,7 @@ void PipelineCache::CreationThread(size_t thread_index) {
|
|||
// Check if need to shut down or set the completion event and dequeue the
|
||||
// pipeline if there is any.
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(creation_request_lock_);
|
||||
std::unique_lock<xe_mutex> lock(creation_request_lock_);
|
||||
if (thread_index >= creation_threads_shutdown_from_ ||
|
||||
creation_queue_.empty()) {
|
||||
if (creation_completion_set_event_ && creation_threads_busy_ == 0) {
|
||||
|
@ -3345,7 +3345,7 @@ void PipelineCache::CreationThread(size_t thread_index) {
|
|||
// completion event if needed (at the next iteration, or in some other
|
||||
// thread).
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
--creation_threads_busy_;
|
||||
}
|
||||
}
|
||||
|
@ -3356,7 +3356,7 @@ void PipelineCache::CreateQueuedPipelinesOnProcessorThread() {
|
|||
while (true) {
|
||||
Pipeline* pipeline_to_create;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
if (creation_queue_.empty()) {
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -403,8 +403,8 @@ class PipelineCache {
|
|||
// Pipeline creation threads.
|
||||
void CreationThread(size_t thread_index);
|
||||
void CreateQueuedPipelinesOnProcessorThread();
|
||||
std::mutex creation_request_lock_;
|
||||
std::condition_variable creation_request_cond_;
|
||||
xe_mutex creation_request_lock_;
|
||||
std::condition_variable_any creation_request_cond_;
|
||||
// Protected with creation_request_lock_, notify_one creation_request_cond_
|
||||
// when set.
|
||||
std::deque<Pipeline*> creation_queue_;
|
||||
|
|
|
@ -650,7 +650,8 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs,
|
|||
}
|
||||
return normalized_color_mask;
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
xenos::CopySampleSelect SanitizeCopySampleSelect(
|
||||
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
|
||||
bool is_depth) {
|
||||
|
@ -737,7 +738,7 @@ const ResolveCopyShaderInfo
|
|||
{"Resolve Copy Full 64bpp", true, 2, 4, 5, 3},
|
||||
{"Resolve Copy Full 128bpp", true, 2, 4, 4, 3},
|
||||
};
|
||||
|
||||
XE_MSVC_OPTIMIZE_SMALL()
|
||||
bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
||||
TraceWriter& trace_writer, uint32_t draw_resolution_scale_x,
|
||||
uint32_t draw_resolution_scale_y,
|
||||
|
@ -869,7 +870,8 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
|||
y1 = y0 + int32_t(xenos::kMaxResolveSize);
|
||||
}
|
||||
// fails in forza horizon 1
|
||||
assert_true(x0 < x1 && y0 < y1);
|
||||
//x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100
|
||||
assert_true(x0 <= x1 && y0 <= y1);
|
||||
if (x0 >= x1 || y0 >= y1) {
|
||||
XELOGE("Resolve region is empty");
|
||||
return false;
|
||||
|
@ -1108,7 +1110,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
|||
info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32;
|
||||
info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32;
|
||||
info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32;
|
||||
|
||||
#if 0
|
||||
XELOGD(
|
||||
"Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially "
|
||||
"modified memory range 0x{:08X} to 0x{:08X})",
|
||||
|
@ -1119,10 +1121,10 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
|||
xenos::ColorRenderTargetFormat(color_edram_info.format)),
|
||||
FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start,
|
||||
copy_dest_extent_end);
|
||||
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
XE_MSVC_OPTIMIZE_REVERT()
|
||||
ResolveCopyShaderIndex ResolveInfo::GetCopyShader(
|
||||
uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y,
|
||||
ResolveCopyShaderConstants& constants_out, uint32_t& group_count_x_out,
|
||||
|
|
|
@ -475,6 +475,8 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA(
|
|||
|
||||
// To avoid passing values that the shader won't understand (even though
|
||||
// Direct3D 9 shouldn't pass them anyway).
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
xenos::CopySampleSelect SanitizeCopySampleSelect(
|
||||
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
|
||||
bool is_depth);
|
||||
|
|
|
@ -14,6 +14,11 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
|
|||
new (&reader_)
|
||||
RingBuffer(memory_->TranslatePhysical(ptr), count * sizeof(uint32_t));
|
||||
reader_.set_write_offset(count * sizeof(uint32_t));
|
||||
// prefetch the wraparound range
|
||||
// it likely is already in L3 cache, but in a zen system it may be another
|
||||
// chiplet's L3
|
||||
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
|
||||
COMMAND_PROCESSOR::GetCurrentRingReadCount());
|
||||
do {
|
||||
if (COMMAND_PROCESSOR::ExecutePacket()) {
|
||||
continue;
|
||||
|
@ -30,11 +35,6 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
|
|||
}
|
||||
|
||||
bool COMMAND_PROCESSOR::ExecutePacket() {
|
||||
// prefetch the wraparound range
|
||||
// it likely is already in L3 cache, but in a zen system it may be another
|
||||
// chiplet's L3
|
||||
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
|
||||
COMMAND_PROCESSOR::GetCurrentRingReadCount());
|
||||
const uint32_t packet = reader_.ReadAndSwap<uint32_t>();
|
||||
const uint32_t packet_type = packet >> 30;
|
||||
|
||||
|
@ -495,7 +495,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
|
|||
} else {
|
||||
xe::threading::Sleep(std::chrono::milliseconds(wait / 0x100));
|
||||
}
|
||||
xe::threading::SyncMemory();
|
||||
// xe::threading::SyncMemory();
|
||||
ReturnFromWait();
|
||||
|
||||
if (!worker_running_) {
|
||||
|
@ -599,27 +599,28 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_COND_WRITE(
|
|||
value = register_file_->values[poll_reg_addr].u32;
|
||||
}
|
||||
bool matched = false;
|
||||
value &= mask;
|
||||
switch (wait_info & 0x7) {
|
||||
case 0x0: // Never.
|
||||
matched = false;
|
||||
break;
|
||||
case 0x1: // Less than reference.
|
||||
matched = (value & mask) < ref;
|
||||
matched = value < ref;
|
||||
break;
|
||||
case 0x2: // Less than or equal to reference.
|
||||
matched = (value & mask) <= ref;
|
||||
matched = value <= ref;
|
||||
break;
|
||||
case 0x3: // Equal to reference.
|
||||
matched = (value & mask) == ref;
|
||||
matched = value == ref;
|
||||
break;
|
||||
case 0x4: // Not equal to reference.
|
||||
matched = (value & mask) != ref;
|
||||
matched = value != ref;
|
||||
break;
|
||||
case 0x5: // Greater than or equal to reference.
|
||||
matched = (value & mask) >= ref;
|
||||
matched = value >= ref;
|
||||
break;
|
||||
case 0x6: // Greater than reference.
|
||||
matched = (value & mask) > ref;
|
||||
matched = value > ref;
|
||||
break;
|
||||
case 0x7: // Always
|
||||
matched = true;
|
||||
|
@ -1064,7 +1065,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE(
|
|||
assert_true(count - 2 >= size_dwords);
|
||||
auto shader = COMMAND_PROCESSOR::LoadShader(
|
||||
shader_type, uint32_t(reader_.read_ptr()),
|
||||
reinterpret_cast<uint32_t*>(reader_.read_ptr()), size_dwords);
|
||||
reinterpret_cast<uint32_t*>(reader_.read_ptr()), size_dwords);
|
||||
switch (shader_type) {
|
||||
case xenos::ShaderType::kVertex:
|
||||
active_vertex_shader_ = shader;
|
||||
|
|
|
@ -430,7 +430,7 @@ class PrimitiveProcessor {
|
|||
--count;
|
||||
uint32_t index = *(source++) & low_bits_mask_guest_endian;
|
||||
*(dest++) = index != reset_index_guest_endian
|
||||
? xenos::GpuSwap(index, HostSwap)
|
||||
? xenos::GpuSwapInline(index, HostSwap)
|
||||
: UINT32_MAX;
|
||||
}
|
||||
if (count >= kSimdVectorU32Elements) {
|
||||
|
@ -442,10 +442,10 @@ class PrimitiveProcessor {
|
|||
__m128i host_swap_shuffle;
|
||||
if constexpr (HostSwap != xenos::Endian::kNone) {
|
||||
host_swap_shuffle = _mm_set_epi32(
|
||||
int32_t(xenos::GpuSwap(uint32_t(0x0F0E0D0C), HostSwap)),
|
||||
int32_t(xenos::GpuSwap(uint32_t(0x0B0A0908), HostSwap)),
|
||||
int32_t(xenos::GpuSwap(uint32_t(0x07060504), HostSwap)),
|
||||
int32_t(xenos::GpuSwap(uint32_t(0x03020100), HostSwap)));
|
||||
int32_t(xenos::GpuSwapInline(uint32_t(0x0F0E0D0C), HostSwap)),
|
||||
int32_t(xenos::GpuSwapInline(uint32_t(0x0B0A0908), HostSwap)),
|
||||
int32_t(xenos::GpuSwapInline(uint32_t(0x07060504), HostSwap)),
|
||||
int32_t(xenos::GpuSwapInline(uint32_t(0x03020100), HostSwap)));
|
||||
}
|
||||
#endif // XE_ARCH_AMD64
|
||||
while (count >= kSimdVectorU32Elements) {
|
||||
|
@ -490,7 +490,7 @@ class PrimitiveProcessor {
|
|||
while (count--) {
|
||||
uint32_t index = *(source++) & low_bits_mask_guest_endian;
|
||||
*(dest++) = index != reset_index_guest_endian
|
||||
? xenos::GpuSwap(index, HostSwap)
|
||||
? xenos::GpuSwapInline(index, HostSwap)
|
||||
: UINT32_MAX;
|
||||
}
|
||||
}
|
||||
|
@ -510,19 +510,19 @@ class PrimitiveProcessor {
|
|||
};
|
||||
struct To24Swapping8In16IndexTransform {
|
||||
uint32_t operator()(uint32_t index) const {
|
||||
return xenos::GpuSwap(index, xenos::Endian::k8in16) &
|
||||
return xenos::GpuSwapInline(index, xenos::Endian::k8in16) &
|
||||
xenos::kVertexIndexMask;
|
||||
}
|
||||
};
|
||||
struct To24Swapping8In32IndexTransform {
|
||||
uint32_t operator()(uint32_t index) const {
|
||||
return xenos::GpuSwap(index, xenos::Endian::k8in32) &
|
||||
return xenos::GpuSwapInline(index, xenos::Endian::k8in32) &
|
||||
xenos::kVertexIndexMask;
|
||||
}
|
||||
};
|
||||
struct To24Swapping16In32IndexTransform {
|
||||
uint32_t operator()(uint32_t index) const {
|
||||
return xenos::GpuSwap(index, xenos::Endian::k16in32) &
|
||||
return xenos::GpuSwapInline(index, xenos::Endian::k16in32) &
|
||||
xenos::kVertexIndexMask;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -388,6 +388,7 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
|
|||
|
||||
bool any_data_resolved = false;
|
||||
uint32_t block_first = page_first >> 6;
|
||||
swcache::PrefetchL1(&system_page_flags_[block_first]);
|
||||
uint32_t block_last = page_last >> 6;
|
||||
uint32_t range_start = UINT32_MAX;
|
||||
|
||||
|
|
|
@ -464,7 +464,8 @@ TextureGuestLayout GetGuestTextureLayout(
|
|||
|
||||
return layout;
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
|
||||
uint32_t bytes_per_block_log2) {
|
||||
// https://github.com/gildor2/UModel/blob/de8fbd3bc922427ea056b7340202dcdcc19ccff5/Unreal/UnTexture.cpp#L489
|
||||
|
@ -481,7 +482,8 @@ int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
|
|||
return ((offset & ~0x1FF) << 3) + ((y & 16) << 7) + ((offset & 0x1C0) << 2) +
|
||||
(((((y & 8) >> 2) + (x >> 3)) & 3) << 6) + (offset & 0x3F);
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch,
|
||||
uint32_t height, uint32_t bytes_per_block_log2) {
|
||||
// Reconstructed from disassembly of XGRAPHICS::TileVolume.
|
||||
|
@ -509,7 +511,8 @@ int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch,
|
|||
address += offset2 & 63;
|
||||
return address;
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom,
|
||||
uint32_t pitch,
|
||||
uint32_t bytes_per_block_log2) {
|
||||
|
@ -538,7 +541,8 @@ uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom,
|
|||
}
|
||||
return upper_bound;
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom,
|
||||
uint32_t back, uint32_t pitch,
|
||||
uint32_t height,
|
||||
|
|
|
@ -280,8 +280,12 @@ void GetTextureTotalSize(xenos::DataDimension dimension,
|
|||
// bytes_per_block_log2 is log2_floor according to how Direct3D 9 calculates it,
|
||||
// but k_32_32_32 textures are never tiled anyway likely.
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
|
||||
uint32_t bytes_per_block_log2);
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch,
|
||||
uint32_t height, uint32_t bytes_per_block_log2);
|
||||
// Because (0, 0, 0) within each 32x32x4-block tile is stored in memory first,
|
||||
|
@ -308,9 +312,13 @@ inline uint32_t GetTiledAddressLowerBound3D(uint32_t left, uint32_t top,
|
|||
// Supporting the right > pitch and bottom > height (in tiles) cases also, for
|
||||
// estimation how far addresses can actually go even potentially beyond the
|
||||
// subresource stride.
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom,
|
||||
uint32_t pitch,
|
||||
uint32_t bytes_per_block_log2);
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom,
|
||||
uint32_t back, uint32_t pitch,
|
||||
uint32_t height,
|
||||
|
|
|
@ -125,8 +125,8 @@ float Float7e3To32(uint32_t f10) {
|
|||
// Based on CFloat24 from d3dref9.dll and the 6e4 code from:
|
||||
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
|
||||
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
|
||||
|
||||
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
|
||||
XE_NOALIAS
|
||||
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept {
|
||||
if (!(f32 > 0.0f)) {
|
||||
// Positive only, and not -0 or NaN.
|
||||
return 0;
|
||||
|
@ -150,8 +150,8 @@ uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
|
|||
}
|
||||
return (f32u32 >> 3) & 0xFFFFFF;
|
||||
}
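For reference, the 20e4 depth layout these converters handle is 4 exponent bits over 20 mantissa bits with an exponent bias of 15, which is consistent with the [0, 2) range noted in the comment above. A hedged sketch of the normalized-case decode only (the real code also handles the exponent == 0 denormal case and rounding):

#include <cmath>
#include <cstdint>

// Normalized 20e4 value = (1 + mantissa / 2^20) * 2^(exponent - 15).
static float Decode20e4NormalizedOnly(uint32_t f24) {
  uint32_t mantissa = f24 & 0xFFFFF;      // low 20 bits
  uint32_t exponent = (f24 >> 20) & 0xF;  // high 4 bits, assumed nonzero here
  return (1.0f + mantissa * (1.0f / 1048576.0f)) *
         std::ldexp(1.0f, int(exponent) - 15);
}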
|
||||
|
||||
float Float20e4To32(uint32_t f24) {
|
||||
XE_NOALIAS
|
||||
float Float20e4To32(uint32_t f24) noexcept {
|
||||
f24 &= 0xFFFFFF;
|
||||
if (!f24) {
|
||||
return 0.0f;
|
||||
|
|
|
@ -421,10 +421,12 @@ float Float7e3To32(uint32_t f10);
|
|||
// floating-point number.
|
||||
// Converts an IEEE-754 32-bit floating-point number to Xenos floating-point
|
||||
// depth, rounding to the nearest even or towards zero.
|
||||
uint32_t Float32To20e4(float f32, bool round_to_nearest_even);
|
||||
XE_NOALIAS
|
||||
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept;
|
||||
// Converts Xenos floating-point depth in bits 0:23 (not clamping) to an
|
||||
// IEEE-754 32-bit floating-point number.
|
||||
float Float20e4To32(uint32_t f24);
|
||||
XE_NOALIAS
|
||||
float Float20e4To32(uint32_t f24) noexcept;
|
||||
// Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit
|
||||
// floating-point number.
|
||||
constexpr float UNorm24To32(uint32_t n24) {
|
||||
|
@ -1045,9 +1047,9 @@ inline uint16_t GpuSwap(uint16_t value, Endian endianness) {
|
|||
return value;
|
||||
}
|
||||
}
|
||||
XE_NOINLINE
|
||||
XE_FORCEINLINE
|
||||
XE_NOALIAS
|
||||
static uint32_t GpuSwap(uint32_t value, Endian endianness) {
|
||||
static uint32_t GpuSwapInline(uint32_t value, Endian endianness) {
|
||||
switch (endianness) {
|
||||
default:
|
||||
case Endian::kNone:
|
||||
|
@ -1065,6 +1067,11 @@ static uint32_t GpuSwap(uint32_t value, Endian endianness) {
|
|||
return ((value >> 16) & 0xFFFF) | (value << 16);
|
||||
}
|
||||
}
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
static uint32_t GpuSwap(uint32_t value, Endian endianness) {
|
||||
return GpuSwapInline(value, endianness);
|
||||
}
|
||||
|
||||
inline float GpuSwap(float value, Endian endianness) {
|
||||
union {
|
||||
|
|
|
@ -137,8 +137,8 @@ X_INPUT_VIBRATION InputSystem::ModifyVibrationLevel(
|
|||
modified_vibration.right_motor_speed = 0;
|
||||
return modified_vibration;
|
||||
}
|
||||
std::unique_lock<xe_unlikely_mutex> InputSystem::lock() {
|
||||
return std::unique_lock<xe_unlikely_mutex>{lock_};
|
||||
std::unique_lock<xe_mutex> InputSystem::lock() {
|
||||
return std::unique_lock<xe_mutex>{lock_};
|
||||
}
|
||||
} // namespace hid
|
||||
} // namespace xe
|
||||
|
|
|
@ -48,7 +48,7 @@ class InputSystem {
|
|||
void UpdateUsedSlot(uint8_t slot, bool connected);
|
||||
uint8_t GetConnectedSlots() const { return connected_slot; }
|
||||
|
||||
std::unique_lock<xe_unlikely_mutex> lock();
|
||||
std::unique_lock<xe_mutex> lock();
|
||||
|
||||
private:
|
||||
xe::ui::Window* window_ = nullptr;
|
||||
|
@ -57,7 +57,7 @@ class InputSystem {
|
|||
|
||||
X_INPUT_VIBRATION ModifyVibrationLevel(X_INPUT_VIBRATION* vibration);
|
||||
uint8_t connected_slot = 0b0001;
|
||||
xe_unlikely_mutex lock_;
|
||||
xe_mutex lock_;
|
||||
};
|
||||
|
||||
} // namespace hid
|
||||
|
|
|
@ -911,11 +911,17 @@ dword_result_t NtSignalAndWaitForSingleObjectEx_entry(dword_t signal_handle,
|
|||
DECLARE_XBOXKRNL_EXPORT3(NtSignalAndWaitForSingleObjectEx, kThreading,
|
||||
kImplemented, kBlocking, kHighFrequency);
|
||||
|
||||
static void PrefetchForCAS(const void* value) {
|
||||
if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
|
||||
swcache::PrefetchW(value);
|
||||
}
|
||||
}
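The swcache::PrefetchW call above presumably maps to the PREFETCHW instruction (prefetch with intent to write), pulling the lock's cache line in exclusive state before the CAS so the later locked cmpxchg doesn't pay an extra ownership transition. A hedged standalone equivalent using compiler builtins/intrinsics:

#if defined(_MSC_VER)
#include <intrin.h>
#endif

static void PrefetchForWrite(const void* p) {
#if defined(__GNUC__) || defined(__clang__)
  __builtin_prefetch(p, 1 /* write */, 3 /* high temporal locality */);
#elif defined(_MSC_VER)
  _m_prefetchw(p);  // emits prefetchw where supported
#endif
}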
|
||||
|
||||
uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) {
|
||||
// XELOGD(
|
||||
// "KfAcquireSpinLock({:08X})",
|
||||
// lock_ptr);
|
||||
|
||||
PrefetchForCAS(lock);
|
||||
// Lock.
|
||||
while (!xe::atomic_cas(0, 1, lock)) {
|
||||
// Spin!
|
||||
|
@ -956,6 +962,7 @@ DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented,
|
|||
void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
|
||||
// Lock.
|
||||
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
||||
PrefetchForCAS(lock);
|
||||
while (!xe::atomic_cas(0, 1, lock)) {
|
||||
#if XE_ARCH_AMD64 == 1
|
||||
// todo: this is just a nop if they don't have SMT, which is not great
|
||||
|
@ -973,6 +980,7 @@ DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading,
|
|||
dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
|
||||
// Lock.
|
||||
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
||||
PrefetchForCAS(lock);
|
||||
if (!xe::atomic_cas(0, 1, lock)) {
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -763,7 +763,8 @@ void XThread::SetActiveCpu(uint8_t cpu_index) {
|
|||
thread_->set_affinity_mask(uint64_t(1) << cpu_index);
|
||||
}
|
||||
} else {
|
||||
XELOGW("Too few processor cores - scheduling will be wonky");
|
||||
// there's no good reason to log this; we don't perfectly emulate the 360's scheduler in any way
|
||||
// XELOGW("Too few processor cores - scheduling will be wonky");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -713,6 +713,8 @@ void BaseHeap::Initialize(Memory* memory, uint8_t* membase, HeapType heap_type,
|
|||
heap_base_ = heap_base;
|
||||
heap_size_ = heap_size;
|
||||
page_size_ = page_size;
|
||||
xenia_assert(xe::is_pow2(page_size_));
|
||||
page_size_shift_ = xe::log2_floor(page_size_);
|
||||
host_address_offset_ = host_address_offset;
|
||||
page_table_.resize(heap_size / page_size);
|
||||
unreserved_page_count_ = uint32_t(page_table_.size());
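The heap changes below replace divisions and multiplications by page_size_ with shifts by page_size_shift_, which is derived under the power-of-two assertion added above; for a power-of-two page size the shift is exact and avoids integer division on these hot paths. Minimal illustration:

#include <cassert>
#include <cstdint>

// For a power-of-two page size: offset / size == offset >> shift,
// and offset % size == offset & (size - 1).
static uint32_t PageNumber(uint32_t offset, uint32_t page_size_shift) {
  return offset >> page_size_shift;
}

// e.g. 4 KiB pages -> shift of 12:
// assert(PageNumber(0x12345, 12) == 0x12345 / 4096);  // both equal 0x12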
|
||||
|
@ -1234,14 +1236,14 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
|
|||
// fails and returns without modifying the access protection of any pages in
|
||||
// the specified region."
|
||||
|
||||
uint32_t start_page_number = (address - heap_base_) / page_size_;
|
||||
uint32_t start_page_number = (address - heap_base_) >> page_size_shift_;
|
||||
if (start_page_number >= page_table_.size()) {
|
||||
XELOGE("BaseHeap::Protect failed due to out-of-bounds base address {:08X}",
|
||||
address);
|
||||
return false;
|
||||
}
|
||||
uint32_t end_page_number =
|
||||
uint32_t((uint64_t(address) + size - 1 - heap_base_) / page_size_);
|
||||
uint32_t((uint64_t(address) + size - 1 - heap_base_) >> page_size_shift_);
|
||||
if (end_page_number >= page_table_.size()) {
|
||||
XELOGE(
|
||||
"BaseHeap::Protect failed due to out-of-bounds range ({:08X} bytes "
|
||||
|
@ -1268,17 +1270,21 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
|
|||
return false;
|
||||
}
|
||||
}
|
||||
uint32_t xe_page_size = static_cast<uint32_t>(xe::memory::page_size());
|
||||
|
||||
uint32_t page_size_mask = xe_page_size - 1;
|
||||
|
||||
// Attempt host change (hopefully won't fail).
|
||||
// We can only do this if our size matches system page granularity.
|
||||
uint32_t page_count = end_page_number - start_page_number + 1;
|
||||
if (page_size_ == xe::memory::page_size() ||
|
||||
(((page_count * page_size_) % xe::memory::page_size() == 0) &&
|
||||
((start_page_number * page_size_) % xe::memory::page_size() == 0))) {
|
||||
if (page_size_ == xe_page_size ||
|
||||
((((page_count << page_size_shift_) & page_size_mask) == 0) &&
|
||||
(((start_page_number << page_size_shift_) & page_size_mask) == 0))) {
|
||||
memory::PageAccess old_protect_access;
|
||||
if (!xe::memory::Protect(TranslateRelative(start_page_number * page_size_),
|
||||
page_count * page_size_, ToPageAccess(protect),
|
||||
old_protect ? &old_protect_access : nullptr)) {
|
||||
if (!xe::memory::Protect(
|
||||
TranslateRelative(start_page_number << page_size_shift_),
|
||||
page_count << page_size_shift_, ToPageAccess(protect),
|
||||
old_protect ? &old_protect_access : nullptr)) {
|
||||
XELOGE("BaseHeap::Protect failed due to host VirtualProtect failure");
|
||||
return false;
|
||||
}
|
||||
|
@ -1303,7 +1309,7 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
|
|||
|
||||
bool BaseHeap::QueryRegionInfo(uint32_t base_address,
|
||||
HeapAllocationInfo* out_info) {
|
||||
uint32_t start_page_number = (base_address - heap_base_) / page_size_;
|
||||
uint32_t start_page_number = (base_address - heap_base_) >> page_size_shift_;
|
||||
if (start_page_number > page_table_.size()) {
|
||||
XELOGE("BaseHeap::QueryRegionInfo base page out of range");
|
||||
return false;
|
||||
|
@@ -1321,9 +1327,10 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address,
   if (start_page_entry.state) {
     // Committed/reserved region.
     out_info->allocation_base =
-        heap_base_ + start_page_entry.base_address * page_size_;
+        heap_base_ + (start_page_entry.base_address << page_size_shift_);
     out_info->allocation_protect = start_page_entry.allocation_protect;
-    out_info->allocation_size = start_page_entry.region_page_count * page_size_;
+    out_info->allocation_size = start_page_entry.region_page_count
+                                << page_size_shift_;
     out_info->state = start_page_entry.state;
     out_info->protect = start_page_entry.current_protect;
@@ -1358,7 +1365,7 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address,
 }
 
 bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) {
-  uint32_t page_number = (address - heap_base_) / page_size_;
+  uint32_t page_number = (address - heap_base_) >> page_size_shift_;
   if (page_number > page_table_.size()) {
     XELOGE("BaseHeap::QuerySize base page out of range");
     *out_size = 0;
@@ -1366,12 +1373,12 @@ bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) {
   }
   auto global_lock = global_critical_region_.Acquire();
   auto page_entry = page_table_[page_number];
-  *out_size = (page_entry.region_page_count * page_size_);
+  *out_size = (page_entry.region_page_count << page_size_shift_);
   return true;
 }
 
 bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) {
-  uint32_t page_number = (*in_out_address - heap_base_) / page_size_;
+  uint32_t page_number = (*in_out_address - heap_base_) >> page_size_shift_;
   if (page_number > page_table_.size()) {
     XELOGE("BaseHeap::QuerySize base page out of range");
     *out_size = 0;
@@ -1379,13 +1386,13 @@ bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) {
   }
   auto global_lock = global_critical_region_.Acquire();
   auto page_entry = page_table_[page_number];
-  *in_out_address = (page_entry.base_address * page_size_);
-  *out_size = (page_entry.region_page_count * page_size_);
+  *in_out_address = (page_entry.base_address << page_size_shift_);
+  *out_size = (page_entry.region_page_count << page_size_shift_);
   return true;
 }
 
 bool BaseHeap::QueryProtect(uint32_t address, uint32_t* out_protect) {
-  uint32_t page_number = (address - heap_base_) / page_size_;
+  uint32_t page_number = (address - heap_base_) >> page_size_shift_;
   if (page_number > page_table_.size()) {
     XELOGE("BaseHeap::QueryProtect base page out of range");
     *out_protect = 0;
@@ -1403,8 +1410,8 @@ xe::memory::PageAccess BaseHeap::QueryRangeAccess(uint32_t low_address,
       (high_address - heap_base_) >= heap_size_) {
     return xe::memory::PageAccess::kNoAccess;
   }
-  uint32_t low_page_number = (low_address - heap_base_) / page_size_;
-  uint32_t high_page_number = (high_address - heap_base_) / page_size_;
+  uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_;
+  uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_;
   uint32_t protect = kMemoryProtectRead | kMemoryProtectWrite;
   {
     auto global_lock = global_critical_region_.Acquire();
@@ -1446,6 +1453,8 @@ void PhysicalHeap::Initialize(Memory* memory, uint8_t* membase,
                        page_size, host_address_offset);
   parent_heap_ = parent_heap;
   system_page_size_ = uint32_t(xe::memory::page_size());
+  xenia_assert(xe::is_pow2(system_page_size_));
+  system_page_shift_ = xe::log2_floor(system_page_size_);
 
   system_page_count_ =
       (size_t(heap_size_) + host_address_offset + (system_page_size_ - 1)) /
@@ -1665,10 +1674,11 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
   }
 
   uint32_t system_page_first =
-      (heap_relative_address + host_address_offset()) / system_page_size_;
+      (heap_relative_address + host_address_offset()) >> system_page_shift_;
+  swcache::PrefetchL1(&system_page_flags_[system_page_first >> 6]);
   uint32_t system_page_last =
-      (heap_relative_address + length - 1 + host_address_offset()) /
-      system_page_size_;
+      (heap_relative_address + length - 1 + host_address_offset()) >>
+      system_page_shift_;
   system_page_last = std::min(system_page_last, system_page_count_ - 1);
   assert_true(system_page_first <= system_page_last);
 
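The new swcache::PrefetchL1 call hints the first system_page_flags_ block the loop below will read into cache before the global lock is taken. A rough sketch of the same hint using the raw SSE intrinsic (assuming swcache::PrefetchL1 wraps something along these lines):

#include <xmmintrin.h>  // _mm_prefetch
#include <cstdint>
#include <vector>

// Hint the cache line holding the bitmask word we are about to scan.
// Purely a performance hint: correctness does not depend on it.
void WarmFlagsBlock(const std::vector<uint64_t>& flags, uint32_t first_page) {
  const uint64_t* block = &flags[first_page >> 6];  // 64 pages per bitmask word
  _mm_prefetch(reinterpret_cast<const char*>(block), _MM_HINT_T0);
}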
@@ -1677,10 +1687,40 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
   xe::memory::PageAccess protect_access =
       enable_data_providers ? xe::memory::PageAccess::kNoAccess
                             : xe::memory::PageAccess::kReadOnly;
 
+  auto global_lock = global_critical_region_.Acquire();
+  if (enable_invalidation_notifications) {
+    EnableAccessCallbacksInner<true>(system_page_first, system_page_last,
+                                     protect_access);
+  } else {
+    EnableAccessCallbacksInner<false>(system_page_first, system_page_last,
+                                      protect_access);
+  }
+}
+
+template <bool enable_invalidation_notifications>
+XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner(
+    const uint32_t system_page_first, const uint32_t system_page_last,
+    xe::memory::PageAccess protect_access) XE_RESTRICT {
   uint8_t* protect_base = membase_ + heap_base_;
   uint32_t protect_system_page_first = UINT32_MAX;
-  auto global_lock = global_critical_region_.Acquire();
-  for (uint32_t i = system_page_first; i <= system_page_last; ++i) {
+
+  SystemPageFlagsBlock* XE_RESTRICT sys_page_flags = system_page_flags_.data();
+  PageEntry* XE_RESTRICT page_table_ptr = page_table_.data();
+
+  // chrispy: a lot of time is spent in this loop, and I think some of the work
+  // may be avoidable and repetitive. Profiling shows quite a bit of time spent
+  // in this loop, but very little spent actually calling Protect.
+  uint32_t i = system_page_first;
+
+  uint32_t first_guest_page = SystemPagenumToGuestPagenum(system_page_first);
+  uint32_t last_guest_page = SystemPagenumToGuestPagenum(system_page_last);
+
+  uint32_t guest_one =
+      SystemPagenumToGuestPagenum(1);
+
+  uint32_t system_one = GuestPagenumToSystemPagenum(1);
+  for (; i <= system_page_last; ++i) {
     // Check if need to enable callbacks for the page and raise its protection.
     //
     // If enabling invalidation notifications:
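EnableAccessCallbacks is now a thin dispatcher: the runtime flag is tested once and the hot loop lives in an XE_NOINLINE function templated on enable_invalidation_notifications, so each instantiation compiles with the flag as a constant and the `if constexpr` branches below are resolved at compile time. A standalone sketch of that dispatch pattern (illustrative names only, not the heap code):

#include <cstdint>
#include <cstdio>

#if defined(_MSC_VER)
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE __attribute__((noinline))
#endif

// The flag is a compile-time constant inside each instantiation, so the
// compiler drops the untaken branch and keeps the loop body small.
template <bool kNotifyOnWrite>
NOINLINE void ProcessPagesInner(uint32_t first, uint32_t last) {
  for (uint32_t i = first; i <= last; ++i) {
    if constexpr (kNotifyOnWrite) {
      std::printf("page %u: arm write notification\n", i);
    } else {
      std::printf("page %u: plain pass\n", i);
    }
  }
}

// Thin front end: one runtime branch, then two specialized copies.
void ProcessPages(uint32_t first, uint32_t last, bool notify_on_write) {
  if (notify_on_write) {
    ProcessPagesInner<true>(first, last);
  } else {
    ProcessPagesInner<false>(first, last);
  }
}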
@@ -1702,12 +1742,19 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
     //
     // Enabling data providers doesn't need to be deferred - providers will be
     // polled for the last time without releasing the lock.
-    SystemPageFlagsBlock& page_flags_block = system_page_flags_[i >> 6];
+    SystemPageFlagsBlock& page_flags_block = sys_page_flags[i >> 6];
+
+#if XE_ARCH_AMD64 == 1
+    // x86 masks 64-bit shift counts to 6 bits, so the (i & 63) is implicit.
+    uint64_t page_flags_bit = uint64_t(1) << i;
+#else
     uint64_t page_flags_bit = uint64_t(1) << (i & 63);
-    uint32_t guest_page_number =
-        xe::sat_sub(i * system_page_size_, host_address_offset()) / page_size_;
+#endif
+
+    uint32_t guest_page_number = SystemPagenumToGuestPagenum(i);
+    // swcache::PrefetchL1(&page_table_ptr[guest_page_number + 8]);
     xe::memory::PageAccess current_page_access =
-        ToPageAccess(page_table_[guest_page_number].current_protect);
+        ToPageAccess(page_table_ptr[guest_page_number].current_protect);
     bool protect_system_page = false;
     // Don't do anything with inaccessible pages - don't protect, don't enable
     // callbacks - because real access violations are needed there. And don't
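The AMD64-only branch drops the `& 63` because x86-64 shift instructions mask a 64-bit shift count to its low six bits anyway; note that shifting by 64 or more is still formally undefined in C++, so this relies on the compiler emitting a plain shift on that target. Side by side, the two forms the #if selects between (a sketch of the assumption, not a guarantee):

#include <cstdint>

// Well-defined on every target: the mask is written out explicitly.
inline uint64_t bit_masked(uint32_t i) { return uint64_t(1) << (i & 63); }

// Relies on x86-64 SHL/SHR using only the low 6 bits of the count register;
// only sound where the compiler emits a plain shift, which is the assumption
// the AMD64-only path above makes.
inline uint64_t bit_unmasked(uint32_t i) { return uint64_t(1) << i; }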
@@ -1715,7 +1762,7 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
     // reason.
     if (current_page_access != xe::memory::PageAccess::kNoAccess) {
       // TODO(Triang3l): Enable data providers.
-      if (enable_invalidation_notifications) {
+      if constexpr (enable_invalidation_notifications) {
         if (current_page_access != xe::memory::PageAccess::kReadOnly &&
             (page_flags_block.notify_on_invalidation & page_flags_bit) == 0) {
           // TODO(Triang3l): Check if data providers are already enabled.
@@ -1733,21 +1780,22 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
     } else {
       if (protect_system_page_first != UINT32_MAX) {
         xe::memory::Protect(
-            protect_base + protect_system_page_first * system_page_size_,
-            (i - protect_system_page_first) * system_page_size_,
+            protect_base + (protect_system_page_first << system_page_shift_),
+            (i - protect_system_page_first) << system_page_shift_,
             protect_access);
         protect_system_page_first = UINT32_MAX;
       }
     }
   }
 
   if (protect_system_page_first != UINT32_MAX) {
     xe::memory::Protect(
-        protect_base + protect_system_page_first * system_page_size_,
-        (system_page_last + 1 - protect_system_page_first) * system_page_size_,
+        protect_base + (protect_system_page_first << system_page_shift_),
+        (system_page_last + 1 - protect_system_page_first)
+            << system_page_shift_,
         protect_access);
   }
 }
 
 bool PhysicalHeap::TriggerCallbacks(
     global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
     uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {
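The protect_system_page_first bookkeeping above coalesces runs of consecutive pages into a single xe::memory::Protect call rather than one call per page, which lines up with the profiling note that the loop, not Protect, dominates. A standalone sketch of that run-coalescing pattern (hypothetical callbacks, not the heap code itself):

#include <cstdint>
#include <functional>

// Walk [first, last], coalescing consecutive pages that need work into a
// single range callback instead of invoking it once per page.
void ForEachRun(uint32_t first, uint32_t last,
                const std::function<bool(uint32_t)>& needs_work,
                const std::function<void(uint32_t, uint32_t)>& apply_range) {
  uint32_t run_first = UINT32_MAX;  // UINT32_MAX == "no open run"
  for (uint32_t i = first; i <= last; ++i) {
    if (needs_work(i)) {
      if (run_first == UINT32_MAX) {
        run_first = i;  // open a new run
      }
    } else if (run_first != UINT32_MAX) {
      apply_range(run_first, i - 1);  // flush the run that just ended
      run_first = UINT32_MAX;
    }
  }
  if (run_first != UINT32_MAX) {
    apply_range(run_first, last);  // flush a run that reaches the end
  }
}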
@@ -1774,10 +1822,10 @@ bool PhysicalHeap::TriggerCallbacks(
   }
 
   uint32_t system_page_first =
-      (heap_relative_address + host_address_offset()) / system_page_size_;
+      (heap_relative_address + host_address_offset()) >> system_page_shift_;
   uint32_t system_page_last =
-      (heap_relative_address + length - 1 + host_address_offset()) /
-      system_page_size_;
+      (heap_relative_address + length - 1 + host_address_offset()) >>
+      system_page_shift_;
   system_page_last = std::min(system_page_last, system_page_count_ - 1);
   assert_true(system_page_first <= system_page_last);
   uint32_t block_index_first = system_page_first >> 6;
@@ -1810,11 +1858,11 @@ bool PhysicalHeap::TriggerCallbacks(
   }
   uint32_t physical_address_offset = GetPhysicalAddress(heap_base_);
   uint32_t physical_address_start =
-      xe::sat_sub(system_page_first * system_page_size_,
+      xe::sat_sub(system_page_first << system_page_shift_,
                   host_address_offset()) +
       physical_address_offset;
   uint32_t physical_length = std::min(
-      xe::sat_sub(system_page_last * system_page_size_ + system_page_size_,
+      xe::sat_sub((system_page_last << system_page_shift_) + system_page_size_,
                   host_address_offset()) +
           physical_address_offset - physical_address_start,
       heap_size_ - (physical_address_start - physical_address_offset));
@@ -1858,8 +1906,8 @@ bool PhysicalHeap::TriggerCallbacks(
     unwatch_first += host_address_offset();
     unwatch_last += host_address_offset();
     assert_true(unwatch_first <= unwatch_last);
-    system_page_first = unwatch_first / system_page_size_;
-    system_page_last = unwatch_last / system_page_size_;
+    system_page_first = unwatch_first >> system_page_shift_;
+    system_page_last = unwatch_last >> system_page_shift_;
     block_index_first = system_page_first >> 6;
     block_index_last = system_page_last >> 6;
   }
@@ -1874,8 +1922,8 @@ bool PhysicalHeap::TriggerCallbacks(
                          (uint64_t(1) << (i & 63))) != 0;
       if (unprotect_page) {
         uint32_t guest_page_number =
-            xe::sat_sub(i * system_page_size_, host_address_offset()) /
-            page_size_;
+            xe::sat_sub(i << system_page_shift_, host_address_offset()) >>
+            page_size_shift_;
         if (ToPageAccess(page_table_[guest_page_number].current_protect) !=
             xe::memory::PageAccess::kReadWrite) {
           unprotect_page = false;
@@ -1888,8 +1936,9 @@ bool PhysicalHeap::TriggerCallbacks(
       } else {
         if (unprotect_system_page_first != UINT32_MAX) {
           xe::memory::Protect(
-              protect_base + unprotect_system_page_first * system_page_size_,
-              (i - unprotect_system_page_first) * system_page_size_,
+              protect_base +
+                  (unprotect_system_page_first << system_page_shift_),
+              (i - unprotect_system_page_first) << system_page_shift_,
               xe::memory::PageAccess::kReadWrite);
           unprotect_system_page_first = UINT32_MAX;
         }
@@ -1897,9 +1946,9 @@ bool PhysicalHeap::TriggerCallbacks(
     }
     if (unprotect_system_page_first != UINT32_MAX) {
       xe::memory::Protect(
-          protect_base + unprotect_system_page_first * system_page_size_,
-          (system_page_last + 1 - unprotect_system_page_first) *
-              system_page_size_,
+          protect_base + (unprotect_system_page_first << system_page_shift_),
+          (system_page_last + 1 - unprotect_system_page_first)
+              << system_page_shift_,
           xe::memory::PageAccess::kReadWrite);
     }
   }
@@ -216,6 +216,7 @@ class BaseHeap {
   uint32_t heap_base_;
   uint32_t heap_size_;
   uint32_t page_size_;
+  uint32_t page_size_shift_;
   uint32_t host_address_offset_;
   uint32_t unreserved_page_count_;
   xe::global_critical_region global_critical_region_;
@@ -270,18 +271,36 @@ class PhysicalHeap : public BaseHeap {
   void EnableAccessCallbacks(uint32_t physical_address, uint32_t length,
                              bool enable_invalidation_notifications,
                              bool enable_data_providers);
+  template <bool enable_invalidation_notifications>
+  XE_NOINLINE void EnableAccessCallbacksInner(
+      const uint32_t system_page_first, const uint32_t system_page_last,
+      xe::memory::PageAccess protect_access) XE_RESTRICT;
+
   // Returns true if any page in the range was watched.
   bool TriggerCallbacks(global_unique_lock_type global_lock_locked_once,
-                        uint32_t virtual_address, uint32_t length, bool is_write,
-                        bool unwatch_exact_range, bool unprotect = true);
+                        uint32_t virtual_address, uint32_t length,
+                        bool is_write, bool unwatch_exact_range,
+                        bool unprotect = true);
 
   uint32_t GetPhysicalAddress(uint32_t address) const;
 
+  uint32_t SystemPagenumToGuestPagenum(uint32_t num) const {
+    return ((num << system_page_shift_) - host_address_offset()) >>
+           page_size_shift_;
+  }
+
+  uint32_t GuestPagenumToSystemPagenum(uint32_t num) {
+    num <<= page_size_shift_;
+    num += host_address_offset();
+    num >>= system_page_shift_;
+    return num;
+  }
+
  protected:
   VirtualHeap* parent_heap_;
 
   uint32_t system_page_size_;
   uint32_t system_page_count_;
+  uint32_t system_page_shift_;
+  uint32_t padding1_;
 
   struct SystemPageFlagsBlock {
     // Whether writing to each page should result trigger invalidation
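SystemPagenumToGuestPagenum and GuestPagenumToSystemPagenum convert between host (system) page indices and guest page indices by going through byte offsets: shift up by one page-size log, adjust by host_address_offset(), shift down by the other. A worked example with assumed sizes (4 KiB host pages, 64 KiB guest pages, zero offset; purely illustrative):

#include <cassert>
#include <cstdint>

int main() {
  // Illustrative sizes only: 4 KiB host (system) pages, 64 KiB guest pages,
  // and no host address offset.
  const uint32_t system_page_shift = 12;  // 4096  == 1 << 12
  const uint32_t page_size_shift = 16;    // 65536 == 1 << 16
  const uint32_t host_address_offset = 0;

  auto system_to_guest = [&](uint32_t num) {
    return ((num << system_page_shift) - host_address_offset) >> page_size_shift;
  };
  auto guest_to_system = [&](uint32_t num) {
    return ((num << page_size_shift) + host_address_offset) >> system_page_shift;
  };

  // Host page 32 starts at byte 131072, which falls in guest page 2.
  assert(system_to_guest(32) == 2);
  // Guest page 2 starts at byte 131072, which is host page 32.
  assert(guest_to_system(2) == 32);
  return 0;
}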
@@ -458,9 +477,9 @@ class Memory {
   // TODO(Triang3l): Implement data providers - this is why locking depth of 1
   // will be required in the future.
   bool TriggerPhysicalMemoryCallbacks(
-      global_unique_lock_type global_lock_locked_once,
-      uint32_t virtual_address, uint32_t length, bool is_write,
-      bool unwatch_exact_range, bool unprotect = true);
+      global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
+      uint32_t length, bool is_write, bool unwatch_exact_range,
+      bool unprotect = true);
 
   // Allocates virtual memory from the 'system' heap.
   // System memory is kept separate from game memory but is still accessible
@@ -509,10 +528,10 @@ class Memory {
                                  const void* host_address);
 
   bool AccessViolationCallback(global_unique_lock_type global_lock_locked_once,
-                              void* host_address, bool is_write);
+                               void* host_address, bool is_write);
   static bool AccessViolationCallbackThunk(
-      global_unique_lock_type global_lock_locked_once,
-      void* context, void* host_address, bool is_write);
+      global_unique_lock_type global_lock_locked_once, void* context,
+      void* host_address, bool is_write);
 
   std::filesystem::path file_name_;
   uint32_t system_page_size_ = 0;