Merge pull request #75 from chrisps/canary_experimental
misc stuff again
This commit is contained in:
commit a29a7436e0
@ -20,6 +20,8 @@ namespace apu {
namespace conversion {

#if XE_ARCH_AMD64

#if 0
inline void sequential_6_BE_to_interleaved_6_LE(float* output,
                                                const float* input,
                                                size_t ch_sample_count) {

@ -41,7 +43,44 @@ inline void sequential_6_BE_to_interleaved_6_LE(float* output,
    out[sample * 6 + 5] = sample2;
  }
}
#else
XE_NOINLINE
static void _generic_sequential_6_BE_to_interleaved_6_LE(
    float* XE_RESTRICT output, const float* XE_RESTRICT input,
    unsigned ch_sample_count) {
  for (unsigned sample = 0; sample < ch_sample_count; sample++) {
    for (unsigned channel = 0; channel < 6; channel++) {
      unsigned int value = *reinterpret_cast<const unsigned int*>(
          &input[channel * ch_sample_count + sample]);

      *reinterpret_cast<unsigned int*>(&output[sample * 6 + channel]) =
          xe::byte_swap(value);
    }
  }
}
XE_NOINLINE
static void _movbe_sequential_6_BE_to_interleaved_6_LE(
    float* XE_RESTRICT output, const float* XE_RESTRICT input,
    unsigned ch_sample_count) {
  for (unsigned sample = 0; sample < ch_sample_count; sample++) {
    for (unsigned channel = 0; channel < 6; channel++) {
      *reinterpret_cast<unsigned int*>(&output[sample * 6 + channel]) =
          _load_be_u32(reinterpret_cast<const unsigned int*>(
              &input[channel * ch_sample_count + sample]));
    }
  }
}

inline static void sequential_6_BE_to_interleaved_6_LE(
    float* output, const float* input, unsigned ch_sample_count) {
  if (amd64::GetFeatureFlags() & amd64::kX64EmitMovbe) {
    _movbe_sequential_6_BE_to_interleaved_6_LE(output, input, ch_sample_count);
  } else {
    _generic_sequential_6_BE_to_interleaved_6_LE(output, input,
                                                 ch_sample_count);
  }
}
#endif
inline void sequential_6_BE_to_interleaved_2_LE(float* output,
                                                const float* input,
                                                size_t ch_sample_count) {
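The AMD64 path above now picks at runtime, via amd64::GetFeatureFlags(), between a plain byte-swap loop and a loop built on a big-endian load intrinsic when the CPU reports MOVBE. For reference, a minimal portable sketch of the same 6-channel big-endian-planar to little-endian-interleaved transform, using standard byte-swap builtins in place of the project's _load_be_u32 / XE_RESTRICT / xe::byte_swap wrappers (the function name and layout assumptions here are illustrative only):

#include <cstdint>
#include <cstring>
#if defined(_MSC_VER)
#include <cstdlib>  // _byteswap_ulong
#endif

static inline uint32_t bswap32_portable(uint32_t v) {
#if defined(_MSC_VER)
  return _byteswap_ulong(v);
#else
  return __builtin_bswap32(v);
#endif
}

// input: 6 planes of ch_sample_count big-endian floats, one plane per channel.
// output: ch_sample_count interleaved frames of 6 little-endian floats.
void sequential_6_BE_to_interleaved_6_LE_sketch(float* output,
                                                const float* input,
                                                unsigned ch_sample_count) {
  for (unsigned sample = 0; sample < ch_sample_count; ++sample) {
    for (unsigned channel = 0; channel < 6; ++channel) {
      uint32_t bits;
      // memcpy sidesteps strict-aliasing concerns; it compiles to one load.
      std::memcpy(&bits, &input[channel * ch_sample_count + sample],
                  sizeof(bits));
      bits = bswap32_portable(bits);
      std::memcpy(&output[sample * 6 + channel], &bits, sizeof(bits));
    }
  }
}

On MOVBE-capable CPUs a compiler may fuse the load and the swap into a single movbe on its own; the committed code instead makes that choice explicitly with the feature-flag dispatch shown above.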
@ -335,7 +335,8 @@ ICommandVar* define_cmdvar(const char* name, T* default_value,
|
|||
|
||||
#define DEFINE_uint64(name, default_value, description, category) \
|
||||
DEFINE_CVar(name, default_value, description, category, false, uint64_t)
|
||||
|
||||
#define DEFINE_int64(name, default_value, description, category) \
|
||||
DEFINE_CVar(name, default_value, description, category, false, int64_t)
|
||||
#define DEFINE_double(name, default_value, description, category) \
|
||||
DEFINE_CVar(name, default_value, description, category, false, double)
|
||||
|
||||
|
@ -383,7 +384,7 @@ ICommandVar* define_cmdvar(const char* name, T* default_value,
|
|||
#define DECLARE_uint32(name) DECLARE_CVar(name, uint32_t)
|
||||
|
||||
#define DECLARE_uint64(name) DECLARE_CVar(name, uint64_t)
|
||||
|
||||
#define DECLARE_int64(name) DECLARE_CVar(name, int64_t)
|
||||
#define DECLARE_double(name) DECLARE_CVar(name, double)
|
||||
|
||||
#define DECLARE_string(name) DECLARE_CVar(name, std::string)
|
||||
|
|
|
@ -26,7 +26,7 @@ check this and release the mutex one way to do this is by using FlsAlloc and
PFLS_CALLBACK_FUNCTION, which gets called with the fiber local data when a
thread exits
*/
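The comment sketches an idea rather than shipped code: use fiber-local storage so a thread that exits while still holding the global mutex can be detected and the critical section released. A rough sketch of that approach using only the Win32 FLS API; g_fls_index, OnGlobalMutexThreadExit and the bookkeeping helpers are hypothetical names, not part of this commit:

#include <windows.h>

static DWORD g_fls_index = FLS_OUT_OF_INDEXES;
static CRITICAL_SECTION g_global_cs;  // assumed initialized at startup

// The FLS callback runs when a thread (or fiber) that set a non-null value
// for the slot exits, letting us release a lock it still owns.
static VOID WINAPI OnGlobalMutexThreadExit(PVOID fls_data) {
  if (fls_data) {
    LeaveCriticalSection(&g_global_cs);
  }
}

static void InitGlobalMutexCleanup() {
  g_fls_index = FlsAlloc(OnGlobalMutexThreadExit);
}

// Record whether the calling thread currently owns the mutex.
static void NoteLockAcquired() {
  FlsSetValue(g_fls_index, reinterpret_cast<PVOID>(1));
}
static void NoteLockReleased() { FlsSetValue(g_fls_index, nullptr); }

lock()/unlock() below would then call NoteLockAcquired()/NoteLockReleased() alongside the recursion-depth bookkeeping.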
thread_local unsigned global_mutex_depth = 0;

static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) {
  return reinterpret_cast<CRITICAL_SECTION*>(mutex);
}

@ -38,29 +38,16 @@ xe_global_mutex::xe_global_mutex() {
xe_global_mutex ::~xe_global_mutex() {
  DeleteCriticalSection(global_critical_section(this));
}

void xe_global_mutex::lock() {
  if (global_mutex_depth) {
  } else {
    EnterCriticalSection(global_critical_section(this));
  }
  global_mutex_depth++;
  EnterCriticalSection(global_critical_section(this));
}
void xe_global_mutex::unlock() {
  if (--global_mutex_depth == 0) {
    LeaveCriticalSection(global_critical_section(this));
  }
  LeaveCriticalSection(global_critical_section(this));
}
bool xe_global_mutex::try_lock() {
  if (global_mutex_depth) {
    ++global_mutex_depth;
    return true;
  } else {
    BOOL success = TryEnterCriticalSection(global_critical_section(this));
    if (success) {
      ++global_mutex_depth;
    }
    return success;
  }
  BOOL success = TryEnterCriticalSection(global_critical_section(this));
  return success;
}

CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) {
@ -116,15 +116,15 @@
|
|||
#define XE_LIKELY(...) (!!(__VA_ARGS__))
|
||||
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
|
||||
#define XE_MSVC_ASSUME(...) __assume(__VA_ARGS__)
|
||||
#define XE_NOALIAS __declspec(noalias)
|
||||
#define XE_NOALIAS __declspec(noalias)
|
||||
#elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
|
||||
#define XE_FORCEINLINE __attribute__((always_inline))
|
||||
#define XE_NOINLINE __attribute__((noinline))
|
||||
#define XE_COLD __attribute__((cold))
|
||||
#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
|
||||
#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)
|
||||
#define XE_NOALIAS
|
||||
//cant do unevaluated assume
|
||||
#define XE_NOALIAS
|
||||
// cant do unevaluated assume
|
||||
#define XE_MSVC_ASSUME(...) static_cast<void>(0)
|
||||
#else
|
||||
#define XE_FORCEINLINE inline
|
||||
|
@ -137,7 +137,13 @@
|
|||
#define XE_MSVC_ASSUME(...) static_cast<void>(0)
|
||||
|
||||
#endif
|
||||
|
||||
#if XE_COMPILER_HAS_MSVC_EXTENSIONS == 1
|
||||
#define XE_MSVC_OPTIMIZE_SMALL() __pragma(optimize("s", on))
|
||||
#define XE_MSVC_OPTIMIZE_REVERT() __pragma(optimize("", on))
|
||||
#else
|
||||
#define XE_MSVC_OPTIMIZE_SMALL()
|
||||
#define XE_MSVC_OPTIMIZE_REVERT()
|
||||
#endif
|
||||
#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
|
||||
#define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__))
|
||||
#define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__))
|
||||
|
@ -180,7 +186,7 @@ const char kPathSeparator = '/';
|
|||
const char kGuestPathSeparator = '\\';
|
||||
|
||||
} // namespace xe
|
||||
#if XE_ARCH_AMD64==1
|
||||
#if XE_ARCH_AMD64 == 1
|
||||
#include "platform_amd64.h"
|
||||
#endif
|
||||
#endif // XENIA_BASE_PLATFORM_H_
|
||||
|
|
|
@ -7,13 +7,12 @@
|
|||
******************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
#include "xenia/base/cvar.h"
|
||||
#include "xenia/base/platform.h"
|
||||
|
||||
#include "third_party/xbyak/xbyak/xbyak.h"
|
||||
#include "third_party/xbyak/xbyak/xbyak_util.h"
|
||||
DEFINE_int32(x64_extension_mask, -1,
|
||||
DEFINE_int64(x64_extension_mask, -1LL,
|
||||
"Allow the detection and utilization of specific instruction set "
|
||||
"features.\n"
|
||||
" 0 = x86_64 + AVX1\n"
|
||||
|
@ -33,79 +32,92 @@ DEFINE_int32(x64_extension_mask, -1,
|
|||
"x64");
|
||||
namespace xe {
|
||||
namespace amd64 {
|
||||
static uint32_t g_feature_flags = 0U;
|
||||
static uint64_t g_feature_flags = 0U;
|
||||
static bool g_did_initialize_feature_flags = false;
|
||||
uint32_t GetFeatureFlags() {
|
||||
xenia_assert(g_did_initialize_feature_flags);
|
||||
return g_feature_flags;
|
||||
uint64_t GetFeatureFlags() {
|
||||
xenia_assert(g_did_initialize_feature_flags);
|
||||
return g_feature_flags;
|
||||
}
|
||||
XE_COLD
|
||||
XE_NOINLINE
|
||||
void InitFeatureFlags() {
|
||||
uint32_t feature_flags_ = 0U;
|
||||
|
||||
Xbyak::util::Cpu cpu_;
|
||||
uint64_t feature_flags_ = 0U;
|
||||
{
|
||||
Xbyak::util::Cpu cpu_;
|
||||
#define TEST_EMIT_FEATURE(emit, ext) \
|
||||
if ((cvars::x64_extension_mask & emit) == emit) { \
|
||||
feature_flags_ |= (cpu_.has(ext) ? emit : 0); \
|
||||
}
|
||||
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
|
||||
TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
|
||||
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
|
||||
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
|
||||
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
|
||||
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
|
||||
TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
|
||||
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
|
||||
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
|
||||
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
|
||||
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
|
||||
#undef TEST_EMIT_FEATURE
|
||||
/*
|
||||
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
|
||||
latest version of xbyak
|
||||
*/
|
||||
unsigned int data[4];
|
||||
Xbyak::util::Cpu::getCpuid(0x80000001, data);
|
||||
unsigned amd_flags = data[2];
|
||||
if (amd_flags & (1U << 5)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
|
||||
feature_flags_ |= kX64EmitLZCNT;
|
||||
}
|
||||
}
|
||||
// todo: although not reported by cpuid, zen 1 and zen+ also have fma4
|
||||
if (amd_flags & (1U << 16)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
|
||||
feature_flags_ |= kX64EmitFMA4;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 21)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
|
||||
feature_flags_ |= kX64EmitTBM;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 11)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
|
||||
feature_flags_ |= kX64EmitXOP;
|
||||
}
|
||||
}
|
||||
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
|
||||
bool is_zennish = cpu_.displayFamily >= 0x17;
|
||||
/*
|
||||
chrispy: according to agner's tables, all amd architectures that
|
||||
we support (ones with avx) have the same timings for
|
||||
jrcxz/loop/loope/loopne as for other jmps
|
||||
*/
|
||||
feature_flags_ |= kX64FastJrcx;
|
||||
feature_flags_ |= kX64FastLoop;
|
||||
if (is_zennish) {
|
||||
// ik that i heard somewhere that this is the case for zen, but i need to
|
||||
// verify. cant find my original source for that.
|
||||
// todo: ask agner?
|
||||
feature_flags_ |= kX64FlagsIndependentVars;
|
||||
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
|
||||
latest version of xbyak
|
||||
*/
|
||||
unsigned int data[4];
|
||||
Xbyak::util::Cpu::getCpuid(0x80000001, data);
|
||||
unsigned amd_flags = data[2];
|
||||
if (amd_flags & (1U << 5)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
|
||||
feature_flags_ |= kX64EmitLZCNT;
|
||||
}
|
||||
}
|
||||
// todo: although not reported by cpuid, zen 1 and zen+ also have fma4
|
||||
if (amd_flags & (1U << 16)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
|
||||
feature_flags_ |= kX64EmitFMA4;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 21)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
|
||||
feature_flags_ |= kX64EmitTBM;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 11)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
|
||||
feature_flags_ |= kX64EmitXOP;
|
||||
}
|
||||
}
|
||||
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
|
||||
bool is_zennish = cpu_.displayFamily >= 0x17;
|
||||
/*
|
||||
chrispy: according to agner's tables, all amd architectures
|
||||
that we support (ones with avx) have the same timings for
|
||||
jrcxz/loop/loope/loopne as for other jmps
|
||||
*/
|
||||
feature_flags_ |= kX64FastJrcx;
|
||||
feature_flags_ |= kX64FastLoop;
|
||||
if (is_zennish) {
|
||||
// ik that i heard somewhere that this is the case for zen, but i need
|
||||
// to verify. cant find my original source for that. todo: ask agner?
|
||||
feature_flags_ |= kX64FlagsIndependentVars;
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
unsigned int data[4];
|
||||
memset(data, 0, sizeof(data));
|
||||
// intel extended features
|
||||
Xbyak::util::Cpu::getCpuidEx(7, 0, data);
|
||||
if ((data[2] & (1 << 28)) &&
|
||||
(cvars::x64_extension_mask & kX64EmitMovdir64M)) {
|
||||
feature_flags_ |= kX64EmitMovdir64M;
|
||||
}
|
||||
if ((data[1] & (1 << 9)) && (cvars::x64_extension_mask & kX64FastRepMovs)) {
|
||||
feature_flags_ |= kX64FastRepMovs;
|
||||
}
|
||||
}
|
||||
g_feature_flags = feature_flags_;
|
||||
|
|
|
@ -13,7 +13,7 @@

namespace xe {
namespace amd64 {
enum X64FeatureFlags {
enum X64FeatureFlags : uint64_t {
  kX64EmitAVX2 = 1 << 0,
  kX64EmitFMA = 1 << 1,
  kX64EmitLZCNT = 1 << 2,  // this is actually ABM and includes popcount

@ -44,14 +44,13 @@ enum X64FeatureFlags {
  // instructions, and FX users need the boost
  kX64EmitFMA4 = 1 << 17,  // todo: also use on zen1?
  kX64EmitTBM = 1 << 18,
  // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
  // 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
  // instructions by using the legacy sse version if we recently cleared the
  // high 128 bits of the
  kX64EmitMovdir64M = 1 << 19,
  kX64FastRepMovs = 1 << 20

};

XE_NOALIAS
uint32_t GetFeatureFlags();
uint64_t GetFeatureFlags();
XE_COLD
void InitFeatureFlags();
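Widening the enum's underlying type to uint64_t (and GetFeatureFlags() to match) leaves headroom for feature bits past bit 31. A small illustrative reminder of the shift rule that matters once that headroom is used; the names here are examples, not flags from this commit:

#include <cstdint>

enum ExampleFeatureFlags : uint64_t {
  kExampleOldStyle = 1 << 20,    // fine: still fits in a signed int shift
  kExampleHighBit = 1ull << 40,  // needs an unsigned 64-bit literal, otherwise
                                 // the shift happens in 32-bit int and overflows
};

static_assert(sizeof(ExampleFeatureFlags) == sizeof(uint64_t),
              "the flag set is carried in a 64-bit mask");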
@ -299,6 +299,12 @@ class Event : public WaitHandle {
  // the nonsignaled state after releasing the appropriate number of waiting
  // threads.
  virtual void Pulse() = 0;
#if XE_PLATFORM_WIN32 == 1
  // SetEvent, but if there is a waiter we immediately transfer execution to it
  virtual void SetBoostPriority() = 0;
#else
  void SetBoostPriority() { Set(); }
#endif
};

// Models a Win32-like semaphore object.
@ -39,6 +39,8 @@ XE_NTDLL_IMPORT(NtWaitForSingleObject, cls_NtWaitForSingleObject,
|
|||
NtWaitForSingleObjectPointer);
|
||||
|
||||
XE_NTDLL_IMPORT(NtSetEvent, cls_NtSetEvent, NtSetEventPointer);
|
||||
XE_NTDLL_IMPORT(NtSetEventBoostPriority, cls_NtSetEventBoostPriority,
|
||||
NtSetEventBoostPriorityPointer);
|
||||
// difference between NtClearEvent and NtResetEvent is that NtResetEvent returns
|
||||
// the events state prior to the call, but we dont need that. might need to
|
||||
// check whether one or the other is faster in the kernel though yeah, just
|
||||
|
@ -53,6 +55,7 @@ XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
|
|||
|
||||
XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
|
||||
NtDelayExecutionPointer);
|
||||
|
||||
namespace xe {
|
||||
namespace threading {
|
||||
|
||||
|
@ -137,7 +140,7 @@ void MaybeYield() {
|
|||
#endif
|
||||
#endif
|
||||
// memorybarrier is really not necessary here...
|
||||
MemoryBarrier();
|
||||
// MemoryBarrier();
|
||||
}
|
||||
|
||||
void SyncMemory() { MemoryBarrier(); }
|
||||
|
@ -288,11 +291,19 @@ class Win32Event : public Win32Handle<Event> {
|
|||
void Set() override { NtSetEventPointer.invoke(handle_, nullptr); }
|
||||
void Reset() override { NtClearEventPointer.invoke(handle_); }
|
||||
void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); }
|
||||
void SetBoostPriority() override {
|
||||
// no previous state for boostpriority
|
||||
NtSetEventBoostPriorityPointer.invoke(handle_);
|
||||
}
|
||||
#else
|
||||
void Set() override { SetEvent(handle_); }
|
||||
void Reset() override { ResetEvent(handle_); }
|
||||
void Pulse() override { PulseEvent(handle_); }
|
||||
|
||||
void SetBoostPriority() override {
|
||||
// no win32 version of boostpriority
|
||||
SetEvent(handle_);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
#define XE_X64_PROFILER_AVAILABLE 1
|
||||
#endif
|
||||
|
||||
DECLARE_int32(x64_extension_mask);
|
||||
DECLARE_int64(x64_extension_mask);
|
||||
|
||||
namespace xe {
|
||||
class Exception;
|
||||
|
|
|
@ -103,74 +103,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
|
|||
"FAQ for system requirements at https://xenia.jp");
|
||||
return;
|
||||
}
|
||||
#if 1
|
||||
feature_flags_ = amd64::GetFeatureFlags();
|
||||
#else
|
||||
#define TEST_EMIT_FEATURE(emit, ext) \
|
||||
if ((cvars::x64_extension_mask & emit) == emit) { \
|
||||
feature_flags_ |= (cpu_.has(ext) ? emit : 0); \
|
||||
}
|
||||
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
|
||||
TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
|
||||
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
|
||||
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
|
||||
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
|
||||
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
|
||||
#undef TEST_EMIT_FEATURE
|
||||
/*
|
||||
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
|
||||
latest version of xbyak
|
||||
*/
|
||||
unsigned int data[4];
|
||||
Xbyak::util::Cpu::getCpuid(0x80000001, data);
|
||||
unsigned amd_flags = data[2];
|
||||
if (amd_flags & (1U << 5)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
|
||||
feature_flags_ |= kX64EmitLZCNT;
|
||||
}
|
||||
}
|
||||
// todo: although not reported by cpuid, zen 1 and zen+ also have fma4
|
||||
if (amd_flags & (1U << 16)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
|
||||
feature_flags_ |= kX64EmitFMA4;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 21)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
|
||||
feature_flags_ |= kX64EmitTBM;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 11)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
|
||||
feature_flags_ |= kX64EmitXOP;
|
||||
XELOGCPU("Cpu support XOP!\n\n");
|
||||
}
|
||||
}
|
||||
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
|
||||
bool is_zennish = cpu_.displayFamily >= 0x17;
|
||||
/*
|
||||
chrispy: according to agner's tables, all amd architectures that
|
||||
we support (ones with avx) have the same timings for
|
||||
jrcxz/loop/loope/loopne as for other jmps
|
||||
*/
|
||||
feature_flags_ |= kX64FastJrcx;
|
||||
feature_flags_ |= kX64FastLoop;
|
||||
if (is_zennish) {
|
||||
// ik that i heard somewhere that this is the case for zen, but i need to
|
||||
// verify. cant find my original source for that.
|
||||
// todo: ask agner?
|
||||
feature_flags_ |= kX64FlagsIndependentVars;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
feature_flags_ = amd64::GetFeatureFlags();
|
||||
|
||||
may_use_membase32_as_zero_reg_ =
|
||||
static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
|
||||
processor()->memory()->virtual_membase())) == 0;
|
||||
|
|
|
@ -299,7 +299,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
void* FindWordConstantOffset(unsigned wordvalue);
|
||||
void* FindDwordConstantOffset(unsigned bytevalue);
|
||||
void* FindQwordConstantOffset(uint64_t bytevalue);
|
||||
bool IsFeatureEnabled(uint32_t feature_flag) const {
|
||||
bool IsFeatureEnabled(uint64_t feature_flag) const {
|
||||
return (feature_flags_ & feature_flag) == feature_flag;
|
||||
}
|
||||
|
||||
|
@ -395,7 +395,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
XbyakAllocator* allocator_ = nullptr;
|
||||
XexModule* guest_module_ = nullptr;
|
||||
Xbyak::util::Cpu cpu_;
|
||||
uint32_t feature_flags_ = 0;
|
||||
uint64_t feature_flags_ = 0;
|
||||
uint32_t current_guest_function_ = 0;
|
||||
Xbyak::Label* epilog_label_ = nullptr;
|
||||
|
||||
|
|
|
@ -39,7 +39,7 @@
|
|||
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
|
||||
#include "xenia/cpu/hir/hir_builder.h"
|
||||
#include "xenia/cpu/processor.h"
|
||||
|
||||
XE_MSVC_OPTIMIZE_SMALL()
|
||||
DEFINE_bool(use_fast_dot_product, false,
|
||||
"Experimental optimization, much shorter sequence on dot products, "
|
||||
"treating inf as overflow instead of using mcxsr"
|
||||
|
|
|
@ -19,16 +19,19 @@ EntryTable::EntryTable() = default;
|
|||
|
||||
EntryTable::~EntryTable() {
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
for (auto it : map_) {
|
||||
Entry* entry = it.second;
|
||||
for (auto it : map_.Values()) {
|
||||
Entry* entry = it;
|
||||
delete entry;
|
||||
}
|
||||
}
|
||||
|
||||
Entry* EntryTable::Get(uint32_t address) {
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
const auto& it = map_.find(address);
|
||||
Entry* entry = it != map_.end() ? it->second : nullptr;
|
||||
uint32_t idx = map_.IndexForKey(address);
|
||||
if (idx == map_.size() || *map_.KeyAt(idx) != address) {
|
||||
return nullptr;
|
||||
}
|
||||
Entry* entry = *map_.ValueAt(idx);
|
||||
if (entry) {
|
||||
// TODO(benvanik): wait if needed?
|
||||
if (entry->status != Entry::STATUS_READY) {
|
||||
|
@ -43,8 +46,12 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
|
|||
// https://github.com/facebook/folly/blob/master/folly/AtomicHashMap.h
|
||||
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
const auto& it = map_.find(address);
|
||||
Entry* entry = it != map_.end() ? it->second : nullptr;
|
||||
|
||||
uint32_t idx = map_.IndexForKey(address);
|
||||
|
||||
Entry* entry = idx != map_.size() && *map_.KeyAt(idx) == address
|
||||
? *map_.ValueAt(idx)
|
||||
: nullptr;
|
||||
Entry::Status status;
|
||||
if (entry) {
|
||||
// If we aren't ready yet spin and wait.
|
||||
|
@ -65,7 +72,8 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
|
|||
entry->end_address = 0;
|
||||
entry->status = Entry::STATUS_COMPILING;
|
||||
entry->function = 0;
|
||||
map_[address] = entry;
|
||||
map_.InsertAt(address, entry, idx);
|
||||
// map_[address] = entry;
|
||||
status = Entry::STATUS_NEW;
|
||||
}
|
||||
global_lock.unlock();
|
||||
|
@ -75,18 +83,18 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
|
|||
|
||||
void EntryTable::Delete(uint32_t address) {
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
const auto itr = map_.find(address);
|
||||
|
||||
if (itr != map_.cend()) {
|
||||
map_.erase(itr);
|
||||
// doesnt this leak memory by not deleting the entry?
|
||||
uint32_t idx = map_.IndexForKey(address);
|
||||
if (idx != map_.size() && *map_.KeyAt(idx) == address) {
|
||||
map_.EraseAt(idx);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<Function*> EntryTable::FindWithAddress(uint32_t address) {
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
std::vector<Function*> fns;
|
||||
for (auto& it : map_) {
|
||||
Entry* entry = it.second;
|
||||
for (auto& it : map_.Values()) {
|
||||
Entry* entry = it;
|
||||
if (address >= entry->address && address <= entry->end_address) {
|
||||
if (entry->status == Entry::STATUS_READY) {
|
||||
fns.push_back(entry->function);
|
||||
|
@ -95,6 +103,5 @@ std::vector<Function*> EntryTable::FindWithAddress(uint32_t address) {
|
|||
}
|
||||
return fns;
|
||||
}
|
||||
|
||||
} // namespace cpu
|
||||
} // namespace xe
|
||||
|
|
|
@ -14,7 +14,7 @@
#include <vector>

#include "xenia/base/mutex.h"

#include "xenia/base/split_map.h"
namespace xe {
namespace cpu {

@ -48,7 +48,8 @@ class EntryTable {
 private:
  xe::global_critical_region global_critical_region_;
  // TODO(benvanik): replace with a better data structure.
  std::unordered_map<uint32_t, Entry*> map_;
  xe::split_map<uint32_t, Entry*> map_;
  // std::unordered_map<uint32_t, Entry*> map_;
};

}  // namespace cpu
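EntryTable now stores its entries in xe::split_map and looks them up through IndexForKey / KeyAt / ValueAt / InsertAt / EraseAt, which suggests keys and values kept in two parallel, key-sorted arrays searched by binary search. The actual container is defined in xenia/base/split_map.h; the sketch below only illustrates that assumed shape, not the project's implementation:

#include <algorithm>
#include <cstdint>
#include <vector>

template <typename K, typename V>
class split_map_sketch {
 public:
  uint32_t size() const { return uint32_t(keys_.size()); }
  // First index whose key is >= key; may equal size(). Callers verify the key
  // before touching the value, as the EntryTable diff does.
  uint32_t IndexForKey(const K& key) const {
    return uint32_t(std::lower_bound(keys_.begin(), keys_.end(), key) -
                    keys_.begin());
  }
  const K* KeyAt(uint32_t idx) const { return &keys_[idx]; }
  V* ValueAt(uint32_t idx) { return &values_[idx]; }
  void InsertAt(const K& key, const V& value, uint32_t idx) {
    keys_.insert(keys_.begin() + idx, key);
    values_.insert(values_.begin() + idx, value);
  }
  void EraseAt(uint32_t idx) {
    keys_.erase(keys_.begin() + idx);
    values_.erase(values_.begin() + idx);
  }
  std::vector<V>& Values() { return values_; }

 private:
  std::vector<K> keys_;    // sorted
  std::vector<V> values_;  // values_[i] belongs to keys_[i]
};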
@ -334,7 +334,7 @@ void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr,
|
|||
|
||||
void CommandProcessor::UpdateWritePointer(uint32_t value) {
|
||||
write_ptr_index_ = value;
|
||||
write_ptr_index_event_->Set();
|
||||
write_ptr_index_event_->SetBoostPriority();
|
||||
}
|
||||
void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
|
||||
uint32_t value) {
|
||||
|
@ -665,6 +665,11 @@ uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index,
|
|||
|
||||
reader_.set_read_offset(read_index * sizeof(uint32_t));
|
||||
reader_.set_write_offset(write_index * sizeof(uint32_t));
|
||||
// prefetch the wraparound range
|
||||
// it likely is already in L3 cache, but in a zen system it may be another
|
||||
// chiplets l3
|
||||
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
|
||||
GetCurrentRingReadCount());
|
||||
do {
|
||||
if (!ExecutePacket()) {
|
||||
// This probably should be fatal - but we're going to continue anyways.
|
||||
|
|
File diff suppressed because it is too large
@ -45,7 +45,10 @@
|
|||
namespace xe {
|
||||
namespace gpu {
|
||||
namespace d3d12 {
|
||||
|
||||
struct MemExportRange {
|
||||
uint32_t base_address_dwords;
|
||||
uint32_t size_dwords;
|
||||
};
|
||||
class D3D12CommandProcessor final : public CommandProcessor {
|
||||
public:
|
||||
#include "../pm4_command_processor_declare.h"
|
||||
|
@ -287,8 +290,21 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
bool IssueDraw(xenos::PrimitiveType primitive_type, uint32_t index_count,
|
||||
IndexBufferInfo* index_buffer_info,
|
||||
bool major_mode_explicit) override;
|
||||
XE_COLD
|
||||
XE_NOINLINE
|
||||
bool HandleMemexportGuestDMA(ID3D12Resource*& scratch_index_buffer,
|
||||
D3D12_INDEX_BUFFER_VIEW& index_buffer_view,
|
||||
uint32_t guest_index_base,
|
||||
bool& retflag);
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
bool GatherMemexportRangesAndMakeResident(bool& retflag);
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
void HandleMemexportDrawOrdering_AndReadback();
|
||||
bool IssueCopy() override;
|
||||
|
||||
XE_NOINLINE
|
||||
bool IssueCopy_ReadbackResolvePath();
|
||||
void InitializeTrace() override;
|
||||
|
||||
private:
|
||||
|
@ -363,6 +379,8 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
};
|
||||
// Gets the indices of optional root parameters. Returns the total parameter
|
||||
// count.
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
static uint32_t GetRootBindfulExtraParameterIndices(
|
||||
const DxbcShader* vertex_shader, const DxbcShader* pixel_shader,
|
||||
RootBindfulExtraParameterIndices& indices_out);
|
||||
|
@ -437,6 +455,18 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
bool UpdateBindings(const D3D12Shader* vertex_shader,
|
||||
const D3D12Shader* pixel_shader,
|
||||
ID3D12RootSignature* root_signature);
|
||||
XE_COLD
|
||||
XE_NOINLINE
|
||||
void UpdateBindings_UpdateRootBindful();
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
bool UpdateBindings_BindfulPath(
|
||||
const size_t texture_layout_uid_vertex,
|
||||
const std::vector<xe::gpu::DxbcShader::TextureBinding>& textures_vertex,
|
||||
const size_t texture_layout_uid_pixel,
|
||||
const std::vector<xe::gpu::DxbcShader::TextureBinding>* textures_pixel,
|
||||
const size_t sampler_count_vertex, const size_t sampler_count_pixel,
|
||||
bool& retflag);
|
||||
|
||||
// Returns dword count for one element for a memexport format, or 0 if it's
|
||||
// not supported by the D3D12 command processor (if it's smaller that 1 dword,
|
||||
|
@ -743,6 +773,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
|
||||
draw_util::GetViewportInfoArgs previous_viewport_info_args_;
|
||||
draw_util::ViewportInfo previous_viewport_info_;
|
||||
// scratch memexport data
|
||||
MemExportRange memexport_ranges_[512];
|
||||
uint32_t memexport_range_count_ = 0;
|
||||
};
|
||||
|
||||
} // namespace d3d12
|
||||
|
|
|
@ -266,22 +266,9 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
|
|||
|
||||
void* DeferredCommandList::WriteCommand(Command command,
|
||||
size_t arguments_size_bytes) {
|
||||
|
||||
size_t arguments_size_elements =
|
||||
round_up(arguments_size_bytes, sizeof(uintmax_t), false);
|
||||
|
||||
//(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);
|
||||
#if 0
|
||||
size_t offset = command_stream_.size();
|
||||
command_stream_.resize(offset + kCommandHeaderSizeElements +
|
||||
arguments_size_elements);
|
||||
CommandHeader& header =
|
||||
*reinterpret_cast<CommandHeader*>(command_stream_.data() + offset);
|
||||
header.command = command;
|
||||
header.arguments_size_elements = uint32_t(arguments_size_elements);
|
||||
return command_stream_.data() + (offset + kCommandHeaderSizeElements);
|
||||
#else
|
||||
|
||||
size_t offset = command_stream_.size();
|
||||
constexpr size_t kCommandHeaderSizeBytes =
|
||||
kCommandHeaderSizeElements * sizeof(uintmax_t);
|
||||
|
@ -290,9 +277,9 @@ void* DeferredCommandList::WriteCommand(Command command,
|
|||
CommandHeader& header =
|
||||
*reinterpret_cast<CommandHeader*>(command_stream_.data() + offset);
|
||||
header.command = command;
|
||||
header.arguments_size_elements = uint32_t(arguments_size_elements) / sizeof(uintmax_t);
|
||||
header.arguments_size_elements =
|
||||
uint32_t(arguments_size_elements) / sizeof(uintmax_t);
|
||||
return command_stream_.data() + (offset + kCommandHeaderSizeBytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace d3d12
|
||||
|
|
|
@ -183,7 +183,7 @@ void PipelineCache::Shutdown() {
|
|||
// creating them.
|
||||
if (!creation_threads_.empty()) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_threads_shutdown_from_ = 0;
|
||||
}
|
||||
creation_request_cond_.notify_all();
|
||||
|
@ -681,7 +681,7 @@ void PipelineCache::InitializeShaderStorage(
|
|||
if (!creation_threads_.empty()) {
|
||||
// Submit the pipeline for creation to any available thread.
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_queue_.push_back(new_pipeline);
|
||||
}
|
||||
creation_request_cond_.notify_one();
|
||||
|
@ -695,7 +695,7 @@ void PipelineCache::InitializeShaderStorage(
|
|||
CreateQueuedPipelinesOnProcessorThread();
|
||||
if (creation_threads_.size() > creation_thread_original_count) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_threads_shutdown_from_ = creation_thread_original_count;
|
||||
// Assuming the queue is empty because of
|
||||
// CreateQueuedPipelinesOnProcessorThread.
|
||||
|
@ -708,7 +708,7 @@ void PipelineCache::InitializeShaderStorage(
|
|||
bool await_creation_completion_event;
|
||||
{
|
||||
// Cleanup so additional threads can be created later again.
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_threads_shutdown_from_ = SIZE_MAX;
|
||||
// If the invocation is blocking, all the shader storage
|
||||
// initialization is expected to be done before proceeding, to avoid
|
||||
|
@ -813,7 +813,7 @@ void PipelineCache::EndSubmission() {
|
|||
// Await creation of all queued pipelines.
|
||||
bool await_creation_completion_event;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
// Assuming the creation queue is already empty (because the processor
|
||||
// thread also worked on creating the leftover pipelines), so only check
|
||||
// if there are threads with pipelines currently being created.
|
||||
|
@ -834,7 +834,7 @@ bool PipelineCache::IsCreatingPipelines() {
|
|||
if (creation_threads_.empty()) {
|
||||
return false;
|
||||
}
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
return !creation_queue_.empty() || creation_threads_busy_ != 0;
|
||||
}
|
||||
|
||||
|
@ -1076,7 +1076,7 @@ bool PipelineCache::ConfigurePipeline(
|
|||
if (!creation_threads_.empty()) {
|
||||
// Submit the pipeline for creation to any available thread.
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_queue_.push_back(new_pipeline);
|
||||
}
|
||||
creation_request_cond_.notify_one();
|
||||
|
@ -3314,7 +3314,7 @@ void PipelineCache::CreationThread(size_t thread_index) {
|
|||
// Check if need to shut down or set the completion event and dequeue the
|
||||
// pipeline if there is any.
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(creation_request_lock_);
|
||||
std::unique_lock<xe_mutex> lock(creation_request_lock_);
|
||||
if (thread_index >= creation_threads_shutdown_from_ ||
|
||||
creation_queue_.empty()) {
|
||||
if (creation_completion_set_event_ && creation_threads_busy_ == 0) {
|
||||
|
@ -3345,7 +3345,7 @@ void PipelineCache::CreationThread(size_t thread_index) {
|
|||
// completion event if needed (at the next iteration, or in some other
|
||||
// thread).
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
--creation_threads_busy_;
|
||||
}
|
||||
}
|
||||
|
@ -3356,7 +3356,7 @@ void PipelineCache::CreateQueuedPipelinesOnProcessorThread() {
|
|||
while (true) {
|
||||
Pipeline* pipeline_to_create;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
if (creation_queue_.empty()) {
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -403,8 +403,8 @@ class PipelineCache {
  // Pipeline creation threads.
  void CreationThread(size_t thread_index);
  void CreateQueuedPipelinesOnProcessorThread();
  std::mutex creation_request_lock_;
  std::condition_variable creation_request_cond_;
  xe_mutex creation_request_lock_;
  std::condition_variable_any creation_request_cond_;
  // Protected with creation_request_lock_, notify_one creation_request_cond_
  // when set.
  std::deque<Pipeline*> creation_queue_;
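Replacing std::mutex with xe_mutex is what forces creation_request_cond_ to become std::condition_variable_any: the plain std::condition_variable only accepts std::unique_lock<std::mutex>, while the _any variant works with any BasicLockable type. A minimal sketch of the pairing, with a stand-in mutex type in place of the project's xe_mutex:

#include <condition_variable>
#include <deque>
#include <mutex>

struct CustomMutex {  // stand-in for xe_mutex: any lock()/unlock() type works
  void lock() { inner.lock(); }
  void unlock() { inner.unlock(); }
  std::mutex inner;
};

static CustomMutex queue_lock;
static std::condition_variable_any queue_cond;
static std::deque<int> work_queue;

void Push(int item) {
  {
    std::lock_guard<CustomMutex> lock(queue_lock);
    work_queue.push_back(item);
  }
  queue_cond.notify_one();  // notify outside the lock, as the pipeline cache does
}

int PopBlocking() {
  std::unique_lock<CustomMutex> lock(queue_lock);
  queue_cond.wait(lock, [] { return !work_queue.empty(); });
  int item = work_queue.front();
  work_queue.pop_front();
  return item;
}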
@ -650,7 +650,8 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs,
|
|||
}
|
||||
return normalized_color_mask;
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
xenos::CopySampleSelect SanitizeCopySampleSelect(
|
||||
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
|
||||
bool is_depth) {
|
||||
|
@ -737,7 +738,7 @@ const ResolveCopyShaderInfo
|
|||
{"Resolve Copy Full 64bpp", true, 2, 4, 5, 3},
|
||||
{"Resolve Copy Full 128bpp", true, 2, 4, 4, 3},
|
||||
};
|
||||
|
||||
XE_MSVC_OPTIMIZE_SMALL()
|
||||
bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
||||
TraceWriter& trace_writer, uint32_t draw_resolution_scale_x,
|
||||
uint32_t draw_resolution_scale_y,
|
||||
|
@ -869,7 +870,8 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
|||
y1 = y0 + int32_t(xenos::kMaxResolveSize);
|
||||
}
|
||||
// fails in forza horizon 1
|
||||
assert_true(x0 < x1 && y0 < y1);
|
||||
//x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100
|
||||
assert_true(x0 <= x1 && y0 <= y1);
|
||||
if (x0 >= x1 || y0 >= y1) {
|
||||
XELOGE("Resolve region is empty");
|
||||
return false;
|
||||
|
@ -1108,7 +1110,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
|||
info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32;
|
||||
info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32;
|
||||
info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32;
|
||||
|
||||
#if 0
|
||||
XELOGD(
|
||||
"Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially "
|
||||
"modified memory range 0x{:08X} to 0x{:08X})",
|
||||
|
@ -1119,10 +1121,10 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
|||
xenos::ColorRenderTargetFormat(color_edram_info.format)),
|
||||
FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start,
|
||||
copy_dest_extent_end);
|
||||
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
XE_MSVC_OPTIMIZE_REVERT()
|
||||
ResolveCopyShaderIndex ResolveInfo::GetCopyShader(
|
||||
uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y,
|
||||
ResolveCopyShaderConstants& constants_out, uint32_t& group_count_x_out,
|
||||
|
|
|
@ -475,6 +475,8 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA(
|
|||
|
||||
// To avoid passing values that the shader won't understand (even though
|
||||
// Direct3D 9 shouldn't pass them anyway).
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
xenos::CopySampleSelect SanitizeCopySampleSelect(
|
||||
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
|
||||
bool is_depth);
|
||||
|
|
|
@ -14,6 +14,11 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
  new (&reader_)
      RingBuffer(memory_->TranslatePhysical(ptr), count * sizeof(uint32_t));
  reader_.set_write_offset(count * sizeof(uint32_t));
  // prefetch the wraparound range
  // it likely is already in L3 cache, but in a zen system it may be another
  // chiplets l3
  reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
      COMMAND_PROCESSOR::GetCurrentRingReadCount());
  do {
    if (COMMAND_PROCESSOR::ExecutePacket()) {
      continue;
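The comment states the intent: the readable span of the ring can wrap past the end of the buffer, and on multi-chiplet Zen parts the wrapped portion may only be resident in another CCD's L3, so it is prefetched up front. RingBuffer::BeginPrefetchedRead and the swcache tags are project wrappers; a generic sketch of prefetching a possibly-wrapped byte range looks roughly like this:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <xmmintrin.h>

constexpr size_t kCacheLine = 64;

// Prefetch a contiguous span one cache line at a time into an L2-ish level.
static void PrefetchSpan(const uint8_t* base, size_t length) {
  for (size_t offset = 0; offset < length; offset += kCacheLine) {
    _mm_prefetch(reinterpret_cast<const char*>(base + offset), _MM_HINT_T1);
  }
}

void PrefetchRingRange(const uint8_t* buffer, size_t capacity,
                       size_t read_offset, size_t readable_bytes) {
  size_t first = std::min(readable_bytes, capacity - read_offset);
  PrefetchSpan(buffer + read_offset, first);
  if (readable_bytes > first) {
    // The wrapped part starts back at the beginning of the ring.
    PrefetchSpan(buffer, readable_bytes - first);
  }
}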
@ -30,11 +35,6 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
|
|||
}
|
||||
|
||||
bool COMMAND_PROCESSOR::ExecutePacket() {
|
||||
// prefetch the wraparound range
|
||||
// it likely is already in L3 cache, but in a zen system it may be another
|
||||
// chiplets l3
|
||||
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
|
||||
COMMAND_PROCESSOR::GetCurrentRingReadCount());
|
||||
const uint32_t packet = reader_.ReadAndSwap<uint32_t>();
|
||||
const uint32_t packet_type = packet >> 30;
|
||||
|
||||
|
@ -495,7 +495,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
|
|||
} else {
|
||||
xe::threading::Sleep(std::chrono::milliseconds(wait / 0x100));
|
||||
}
|
||||
xe::threading::SyncMemory();
|
||||
// xe::threading::SyncMemory();
|
||||
ReturnFromWait();
|
||||
|
||||
if (!worker_running_) {
|
||||
|
@ -599,27 +599,28 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_COND_WRITE(
    value = register_file_->values[poll_reg_addr].u32;
  }
  bool matched = false;
  value &= mask;
  switch (wait_info & 0x7) {
    case 0x0:  // Never.
      matched = false;
      break;
    case 0x1:  // Less than reference.
      matched = (value & mask) < ref;
      matched = value < ref;
      break;
    case 0x2:  // Less than or equal to reference.
      matched = (value & mask) <= ref;
      matched = value <= ref;
      break;
    case 0x3:  // Equal to reference.
      matched = (value & mask) == ref;
      matched = value == ref;
      break;
    case 0x4:  // Not equal to reference.
      matched = (value & mask) != ref;
      matched = value != ref;
      break;
    case 0x5:  // Greater than or equal to reference.
      matched = (value & mask) >= ref;
      matched = value >= ref;
      break;
    case 0x6:  // Greater than reference.
      matched = (value & mask) > ref;
      matched = value > ref;
      break;
    case 0x7:  // Always
      matched = true;
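The rewrite hoists the masking out of the switch: applying value &= mask once up front is equivalent to masking inside every arm, because value is never used unmasked afterwards. A quick illustrative check of that equivalence over all eight comparator encodings used above:

#include <cassert>
#include <cstdint>

static bool CompareMaskedEach(uint32_t value, uint32_t mask, uint32_t ref,
                              uint32_t op) {
  switch (op & 0x7) {
    case 0x0: return false;
    case 0x1: return (value & mask) < ref;
    case 0x2: return (value & mask) <= ref;
    case 0x3: return (value & mask) == ref;
    case 0x4: return (value & mask) != ref;
    case 0x5: return (value & mask) >= ref;
    case 0x6: return (value & mask) > ref;
    default:  return true;
  }
}

static bool CompareMaskedOnce(uint32_t value, uint32_t mask, uint32_t ref,
                              uint32_t op) {
  value &= mask;  // mask applied exactly once, as in the new code
  switch (op & 0x7) {
    case 0x0: return false;
    case 0x1: return value < ref;
    case 0x2: return value <= ref;
    case 0x3: return value == ref;
    case 0x4: return value != ref;
    case 0x5: return value >= ref;
    case 0x6: return value > ref;
    default:  return true;
  }
}

void CheckCondWriteEquivalence() {
  for (uint32_t op = 0; op < 8; ++op) {
    assert(CompareMaskedEach(0xDEADBEEF, 0x00FF00FF, 0x00AD00EF, op) ==
           CompareMaskedOnce(0xDEADBEEF, 0x00FF00FF, 0x00AD00EF, op));
  }
}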
@ -1064,7 +1065,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE(
|
|||
assert_true(count - 2 >= size_dwords);
|
||||
auto shader = COMMAND_PROCESSOR::LoadShader(
|
||||
shader_type, uint32_t(reader_.read_ptr()),
|
||||
reinterpret_cast<uint32_t*>(reader_.read_ptr()), size_dwords);
|
||||
reinterpret_cast<uint32_t*>(reader_.read_ptr()), size_dwords);
|
||||
switch (shader_type) {
|
||||
case xenos::ShaderType::kVertex:
|
||||
active_vertex_shader_ = shader;
|
||||
|
|
|
@ -430,7 +430,7 @@ class PrimitiveProcessor {
|
|||
--count;
|
||||
uint32_t index = *(source++) & low_bits_mask_guest_endian;
|
||||
*(dest++) = index != reset_index_guest_endian
|
||||
? xenos::GpuSwap(index, HostSwap)
|
||||
? xenos::GpuSwapInline(index, HostSwap)
|
||||
: UINT32_MAX;
|
||||
}
|
||||
if (count >= kSimdVectorU32Elements) {
|
||||
|
@ -442,10 +442,10 @@ class PrimitiveProcessor {
|
|||
__m128i host_swap_shuffle;
|
||||
if constexpr (HostSwap != xenos::Endian::kNone) {
|
||||
host_swap_shuffle = _mm_set_epi32(
|
||||
int32_t(xenos::GpuSwap(uint32_t(0x0F0E0D0C), HostSwap)),
|
||||
int32_t(xenos::GpuSwap(uint32_t(0x0B0A0908), HostSwap)),
|
||||
int32_t(xenos::GpuSwap(uint32_t(0x07060504), HostSwap)),
|
||||
int32_t(xenos::GpuSwap(uint32_t(0x03020100), HostSwap)));
|
||||
int32_t(xenos::GpuSwapInline(uint32_t(0x0F0E0D0C), HostSwap)),
|
||||
int32_t(xenos::GpuSwapInline(uint32_t(0x0B0A0908), HostSwap)),
|
||||
int32_t(xenos::GpuSwapInline(uint32_t(0x07060504), HostSwap)),
|
||||
int32_t(xenos::GpuSwapInline(uint32_t(0x03020100), HostSwap)));
|
||||
}
|
||||
#endif // XE_ARCH_AMD64
|
||||
while (count >= kSimdVectorU32Elements) {
|
||||
|
@ -490,7 +490,7 @@ class PrimitiveProcessor {
|
|||
while (count--) {
|
||||
uint32_t index = *(source++) & low_bits_mask_guest_endian;
|
||||
*(dest++) = index != reset_index_guest_endian
|
||||
? xenos::GpuSwap(index, HostSwap)
|
||||
? xenos::GpuSwapInline(index, HostSwap)
|
||||
: UINT32_MAX;
|
||||
}
|
||||
}
|
||||
|
@ -510,19 +510,19 @@ class PrimitiveProcessor {
|
|||
};
|
||||
struct To24Swapping8In16IndexTransform {
|
||||
uint32_t operator()(uint32_t index) const {
|
||||
return xenos::GpuSwap(index, xenos::Endian::k8in16) &
|
||||
return xenos::GpuSwapInline(index, xenos::Endian::k8in16) &
|
||||
xenos::kVertexIndexMask;
|
||||
}
|
||||
};
|
||||
struct To24Swapping8In32IndexTransform {
|
||||
uint32_t operator()(uint32_t index) const {
|
||||
return xenos::GpuSwap(index, xenos::Endian::k8in32) &
|
||||
return xenos::GpuSwapInline(index, xenos::Endian::k8in32) &
|
||||
xenos::kVertexIndexMask;
|
||||
}
|
||||
};
|
||||
struct To24Swapping16In32IndexTransform {
|
||||
uint32_t operator()(uint32_t index) const {
|
||||
return xenos::GpuSwap(index, xenos::Endian::k16in32) &
|
||||
return xenos::GpuSwapInline(index, xenos::Endian::k16in32) &
|
||||
xenos::kVertexIndexMask;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -388,6 +388,7 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
|
|||
|
||||
bool any_data_resolved = false;
|
||||
uint32_t block_first = page_first >> 6;
|
||||
swcache::PrefetchL1(&system_page_flags_[block_first]);
|
||||
uint32_t block_last = page_last >> 6;
|
||||
uint32_t range_start = UINT32_MAX;
|
||||
|
||||
|
|
|
@ -464,7 +464,8 @@ TextureGuestLayout GetGuestTextureLayout(
|
|||
|
||||
return layout;
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
|
||||
uint32_t bytes_per_block_log2) {
|
||||
// https://github.com/gildor2/UModel/blob/de8fbd3bc922427ea056b7340202dcdcc19ccff5/Unreal/UnTexture.cpp#L489
|
||||
|
@ -481,7 +482,8 @@ int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
|
|||
return ((offset & ~0x1FF) << 3) + ((y & 16) << 7) + ((offset & 0x1C0) << 2) +
|
||||
(((((y & 8) >> 2) + (x >> 3)) & 3) << 6) + (offset & 0x3F);
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch,
|
||||
uint32_t height, uint32_t bytes_per_block_log2) {
|
||||
// Reconstructed from disassembly of XGRAPHICS::TileVolume.
|
||||
|
@ -509,7 +511,8 @@ int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch,
|
|||
address += offset2 & 63;
|
||||
return address;
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom,
|
||||
uint32_t pitch,
|
||||
uint32_t bytes_per_block_log2) {
|
||||
|
@ -538,7 +541,8 @@ uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom,
|
|||
}
|
||||
return upper_bound;
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom,
|
||||
uint32_t back, uint32_t pitch,
|
||||
uint32_t height,
|
||||
|
|
|
@ -280,8 +280,12 @@ void GetTextureTotalSize(xenos::DataDimension dimension,
|
|||
// bytes_per_block_log2 is log2_floor according to how Direct3D 9 calculates it,
|
||||
// but k_32_32_32 textures are never tiled anyway likely.
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
|
||||
uint32_t bytes_per_block_log2);
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch,
|
||||
uint32_t height, uint32_t bytes_per_block_log2);
|
||||
// Because (0, 0, 0) within each 32x32x4-block tile is stored in memory first,
|
||||
|
@ -308,9 +312,13 @@ inline uint32_t GetTiledAddressLowerBound3D(uint32_t left, uint32_t top,
|
|||
// Supporting the right > pitch and bottom > height (in tiles) cases also, for
|
||||
// estimation how far addresses can actually go even potentially beyond the
|
||||
// subresource stride.
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom,
|
||||
uint32_t pitch,
|
||||
uint32_t bytes_per_block_log2);
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom,
|
||||
uint32_t back, uint32_t pitch,
|
||||
uint32_t height,
|
||||
|
|
|
@ -125,8 +125,8 @@ float Float7e3To32(uint32_t f10) {
|
|||
// Based on CFloat24 from d3dref9.dll and the 6e4 code from:
|
||||
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
|
||||
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
|
||||
|
||||
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
|
||||
XE_NOALIAS
|
||||
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept {
|
||||
if (!(f32 > 0.0f)) {
|
||||
// Positive only, and not -0 or NaN.
|
||||
return 0;
|
||||
|
@ -150,8 +150,8 @@ uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
|
|||
}
|
||||
return (f32u32 >> 3) & 0xFFFFFF;
|
||||
}
|
||||
|
||||
float Float20e4To32(uint32_t f24) {
|
||||
XE_NOALIAS
|
||||
float Float20e4To32(uint32_t f24) noexcept {
|
||||
f24 &= 0xFFFFFF;
|
||||
if (!f24) {
|
||||
return 0.0f;
|
||||
|
|
|
@ -421,10 +421,12 @@ float Float7e3To32(uint32_t f10);
|
|||
// floating-point number.
|
||||
// Converts an IEEE-754 32-bit floating-point number to Xenos floating-point
|
||||
// depth, rounding to the nearest even or towards zero.
|
||||
uint32_t Float32To20e4(float f32, bool round_to_nearest_even);
|
||||
XE_NOALIAS
|
||||
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept;
|
||||
// Converts Xenos floating-point depth in bits 0:23 (not clamping) to an
|
||||
// IEEE-754 32-bit floating-point number.
|
||||
float Float20e4To32(uint32_t f24);
|
||||
XE_NOALIAS
|
||||
float Float20e4To32(uint32_t f24) noexcept;
|
||||
// Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit
|
||||
// floating-point number.
|
||||
constexpr float UNorm24To32(uint32_t n24) {
|
||||
|
@ -1045,9 +1047,9 @@ inline uint16_t GpuSwap(uint16_t value, Endian endianness) {
      return value;
  }
}
XE_NOINLINE
XE_FORCEINLINE
XE_NOALIAS
static uint32_t GpuSwap(uint32_t value, Endian endianness) {
static uint32_t GpuSwapInline(uint32_t value, Endian endianness) {
  switch (endianness) {
    default:
    case Endian::kNone:

@ -1065,6 +1067,11 @@ static uint32_t GpuSwap(uint32_t value, Endian endianness) {
      return ((value >> 16) & 0xFFFF) | (value << 16);
  }
}
XE_NOINLINE
XE_NOALIAS
static uint32_t GpuSwap(uint32_t value, Endian endianness) {
  return GpuSwapInline(value, endianness);
}

inline float GpuSwap(float value, Endian endianness) {
  union {
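The change keeps a force-inlined body (GpuSwapInline) for hot callers such as the index-buffer converters in the primitive processor, while the original out-of-line GpuSwap name becomes a thin XE_NOINLINE wrapper so infrequent callers do not pay code size for the full switch. The same idiom in generic form, with portable attribute spellings standing in for the XE_* macros:

#include <cstdint>

#if defined(_MSC_VER)
#define FORCEINLINE_SKETCH __forceinline
#define NOINLINE_SKETCH __declspec(noinline)
#else
#define FORCEINLINE_SKETCH inline __attribute__((always_inline))
#define NOINLINE_SKETCH __attribute__((noinline))
#endif

// Hot path: the body is inlined into tight loops.
FORCEINLINE_SKETCH uint32_t SwapWordsInline(uint32_t value) {
  return (value >> 16) | (value << 16);
}

// Cold path: a single out-of-line copy for infrequent call sites, keeping
// their code size and instruction-cache footprint small.
NOINLINE_SKETCH uint32_t SwapWords(uint32_t value) {
  return SwapWordsInline(value);
}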
@ -137,8 +137,8 @@ X_INPUT_VIBRATION InputSystem::ModifyVibrationLevel(
|
|||
modified_vibration.right_motor_speed = 0;
|
||||
return modified_vibration;
|
||||
}
|
||||
std::unique_lock<xe_unlikely_mutex> InputSystem::lock() {
|
||||
return std::unique_lock<xe_unlikely_mutex>{lock_};
|
||||
std::unique_lock<xe_mutex> InputSystem::lock() {
|
||||
return std::unique_lock<xe_mutex>{lock_};
|
||||
}
|
||||
} // namespace hid
|
||||
} // namespace xe
|
||||
|
|
|
@ -48,7 +48,7 @@ class InputSystem {
|
|||
void UpdateUsedSlot(uint8_t slot, bool connected);
|
||||
uint8_t GetConnectedSlots() const { return connected_slot; }
|
||||
|
||||
std::unique_lock<xe_unlikely_mutex> lock();
|
||||
std::unique_lock<xe_mutex> lock();
|
||||
|
||||
private:
|
||||
xe::ui::Window* window_ = nullptr;
|
||||
|
@ -57,7 +57,7 @@ class InputSystem {
|
|||
|
||||
X_INPUT_VIBRATION ModifyVibrationLevel(X_INPUT_VIBRATION* vibration);
|
||||
uint8_t connected_slot = 0b0001;
|
||||
xe_unlikely_mutex lock_;
|
||||
xe_mutex lock_;
|
||||
};
|
||||
|
||||
} // namespace hid
|
||||
|
|
|
@ -911,11 +911,17 @@ dword_result_t NtSignalAndWaitForSingleObjectEx_entry(dword_t signal_handle,
DECLARE_XBOXKRNL_EXPORT3(NtSignalAndWaitForSingleObjectEx, kThreading,
                         kImplemented, kBlocking, kHighFrequency);

static void PrefetchForCAS(const void* value) {
  if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
    swcache::PrefetchW(value);
  }
}

uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) {
  // XELOGD(
  //     "KfAcquireSpinLock({:08X})",
  //     lock_ptr);

  PrefetchForCAS(lock);
  // Lock.
  while (!xe::atomic_cas(0, 1, lock)) {
    // Spin!
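PrefetchForCAS issues a write-intent prefetch (PREFETCHW) only when the CPU advertises it, so the lock's cache line arrives in an exclusive state and the following compare-and-swap avoids a separate read-for-ownership transition. swcache::PrefetchW and xe::atomic_cas are project wrappers; a generic sketch of the same pattern:

#include <atomic>
#include <cstdint>

void SpinAcquire(std::atomic<uint32_t>* lock) {
#if defined(__GNUC__) || defined(__clang__)
  // rw=1 asks for the line with intent to write (PREFETCHW where available).
  __builtin_prefetch(lock, /*rw=*/1, /*locality=*/3);
#endif
  uint32_t expected = 0;
  while (!lock->compare_exchange_weak(expected, 1, std::memory_order_acquire,
                                      std::memory_order_relaxed)) {
    expected = 0;  // compare_exchange_weak rewrites expected on failure
  }
}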
@ -956,6 +962,7 @@ DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented,
|
|||
void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
|
||||
// Lock.
|
||||
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
||||
PrefetchForCAS(lock);
|
||||
while (!xe::atomic_cas(0, 1, lock)) {
|
||||
#if XE_ARCH_AMD64 == 1
|
||||
// todo: this is just a nop if they don't have SMT, which is not great
|
||||
|
@ -973,6 +980,7 @@ DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading,
|
|||
dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
|
||||
// Lock.
|
||||
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
||||
PrefetchForCAS(lock);
|
||||
if (!xe::atomic_cas(0, 1, lock)) {
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -763,7 +763,8 @@ void XThread::SetActiveCpu(uint8_t cpu_index) {
|
|||
thread_->set_affinity_mask(uint64_t(1) << cpu_index);
|
||||
}
|
||||
} else {
|
||||
XELOGW("Too few processor cores - scheduling will be wonky");
|
||||
//there no good reason why we need to log this... we don't perfectly emulate the 360's scheduler in any way
|
||||
// XELOGW("Too few processor cores - scheduling will be wonky");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -713,6 +713,8 @@ void BaseHeap::Initialize(Memory* memory, uint8_t* membase, HeapType heap_type,
|
|||
heap_base_ = heap_base;
|
||||
heap_size_ = heap_size;
|
||||
page_size_ = page_size;
|
||||
xenia_assert(xe::is_pow2(page_size_));
|
||||
page_size_shift_ = xe::log2_floor(page_size_);
|
||||
host_address_offset_ = host_address_offset;
|
||||
page_table_.resize(heap_size / page_size);
|
||||
unreserved_page_count_ = uint32_t(page_table_.size());
|
||||
|
@ -1234,14 +1236,14 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
|
|||
// fails and returns without modifying the access protection of any pages in
|
||||
// the specified region."
|
||||
|
||||
uint32_t start_page_number = (address - heap_base_) / page_size_;
|
||||
uint32_t start_page_number = (address - heap_base_) >> page_size_shift_;
|
||||
if (start_page_number >= page_table_.size()) {
|
||||
XELOGE("BaseHeap::Protect failed due to out-of-bounds base address {:08X}",
|
||||
address);
|
||||
return false;
|
||||
}
|
||||
uint32_t end_page_number =
|
||||
uint32_t((uint64_t(address) + size - 1 - heap_base_) / page_size_);
|
||||
uint32_t((uint64_t(address) + size - 1 - heap_base_) >> page_size_shift_);
|
||||
if (end_page_number >= page_table_.size()) {
|
||||
XELOGE(
|
||||
"BaseHeap::Protect failed due to out-of-bounds range ({:08X} bytes "
|
||||
|
@ -1268,17 +1270,21 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
      return false;
    }
  }
  uint32_t xe_page_size = static_cast<uint32_t>(xe::memory::page_size());

  uint32_t page_size_mask = xe_page_size - 1;

  // Attempt host change (hopefully won't fail).
  // We can only do this if our size matches system page granularity.
  uint32_t page_count = end_page_number - start_page_number + 1;
  if (page_size_ == xe::memory::page_size() ||
      (((page_count * page_size_) % xe::memory::page_size() == 0) &&
       ((start_page_number * page_size_) % xe::memory::page_size() == 0))) {
  if (page_size_ == xe_page_size ||
      ((((page_count << page_size_shift_) & page_size_mask) == 0) &&
       (((start_page_number << page_size_shift_) & page_size_mask) == 0))) {
    memory::PageAccess old_protect_access;
    if (!xe::memory::Protect(TranslateRelative(start_page_number * page_size_),
                             page_count * page_size_, ToPageAccess(protect),
                             old_protect ? &old_protect_access : nullptr)) {
    if (!xe::memory::Protect(
            TranslateRelative(start_page_number << page_size_shift_),
            page_count << page_size_shift_, ToPageAccess(protect),
            old_protect ? &old_protect_access : nullptr)) {
      XELOGE("BaseHeap::Protect failed due to host VirtualProtect failure");
      return false;
    }
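These hunks replace multiplies, divides and modulo by the page size with shifts and masks derived once via log2_floor, which is valid because the heap now asserts that page_size_ is a power of two (the xe::is_pow2 check added in Initialize). The identities being relied on, as a small self-checking sketch:

#include <cassert>
#include <cstdint>

// For a power-of-two size: x / size == x >> shift, x * size == x << shift,
// and x % size == x & (size - 1), where shift == log2(size).
void PageMathEquivalence(uint32_t x, uint32_t page_size) {
  assert(page_size && (page_size & (page_size - 1)) == 0);  // power of two
  uint32_t shift = 0;
  while ((1u << shift) < page_size) ++shift;  // log2 of a power-of-two value
  uint32_t mask = page_size - 1;

  assert(x / page_size == x >> shift);
  assert(x % page_size == (x & mask));
  assert(uint32_t(x * page_size) == uint32_t(x << shift));
}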
@ -1303,7 +1309,7 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
|
|||
|
||||
bool BaseHeap::QueryRegionInfo(uint32_t base_address,
|
||||
HeapAllocationInfo* out_info) {
|
||||
uint32_t start_page_number = (base_address - heap_base_) / page_size_;
|
||||
uint32_t start_page_number = (base_address - heap_base_) >> page_size_shift_;
|
||||
if (start_page_number > page_table_.size()) {
|
||||
XELOGE("BaseHeap::QueryRegionInfo base page out of range");
|
||||
return false;
|
||||
|
@ -1321,9 +1327,10 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address,
|
|||
if (start_page_entry.state) {
|
||||
// Committed/reserved region.
|
||||
out_info->allocation_base =
|
||||
heap_base_ + start_page_entry.base_address * page_size_;
|
||||
heap_base_ + (start_page_entry.base_address << page_size_shift_);
|
||||
out_info->allocation_protect = start_page_entry.allocation_protect;
|
||||
out_info->allocation_size = start_page_entry.region_page_count * page_size_;
|
||||
out_info->allocation_size = start_page_entry.region_page_count
|
||||
<< page_size_shift_;
|
||||
out_info->state = start_page_entry.state;
|
||||
out_info->protect = start_page_entry.current_protect;
|
||||
|
||||
|
@ -1358,7 +1365,7 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address,
|
|||
}
|
||||
|
||||
bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) {
|
||||
uint32_t page_number = (address - heap_base_) / page_size_;
|
||||
uint32_t page_number = (address - heap_base_) >> page_size_shift_;
|
||||
if (page_number > page_table_.size()) {
|
||||
XELOGE("BaseHeap::QuerySize base page out of range");
|
||||
*out_size = 0;
|
||||
|
@ -1366,12 +1373,12 @@ bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) {
|
|||
}
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
auto page_entry = page_table_[page_number];
|
||||
*out_size = (page_entry.region_page_count * page_size_);
|
||||
*out_size = (page_entry.region_page_count << page_size_shift_);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) {
|
||||
uint32_t page_number = (*in_out_address - heap_base_) / page_size_;
|
||||
uint32_t page_number = (*in_out_address - heap_base_) >> page_size_shift_;
|
||||
if (page_number > page_table_.size()) {
|
||||
XELOGE("BaseHeap::QuerySize base page out of range");
|
||||
*out_size = 0;
|
||||
|
@ -1379,13 +1386,13 @@ bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) {
|
|||
}
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
auto page_entry = page_table_[page_number];
|
||||
*in_out_address = (page_entry.base_address * page_size_);
|
||||
*out_size = (page_entry.region_page_count * page_size_);
|
||||
*in_out_address = (page_entry.base_address << page_size_shift_);
|
||||
*out_size = (page_entry.region_page_count << page_size_shift_);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool BaseHeap::QueryProtect(uint32_t address, uint32_t* out_protect) {
|
||||
uint32_t page_number = (address - heap_base_) / page_size_;
|
||||
uint32_t page_number = (address - heap_base_) >> page_size_shift_;
|
||||
if (page_number > page_table_.size()) {
|
||||
XELOGE("BaseHeap::QueryProtect base page out of range");
|
||||
*out_protect = 0;
|
||||
|
@ -1403,8 +1410,8 @@ xe::memory::PageAccess BaseHeap::QueryRangeAccess(uint32_t low_address,
|
|||
(high_address - heap_base_) >= heap_size_) {
|
||||
return xe::memory::PageAccess::kNoAccess;
|
||||
}
|
||||
uint32_t low_page_number = (low_address - heap_base_) / page_size_;
|
||||
uint32_t high_page_number = (high_address - heap_base_) / page_size_;
|
||||
uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_;
|
||||
uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_;
|
||||
uint32_t protect = kMemoryProtectRead | kMemoryProtectWrite;
|
||||
{
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
|
@ -1446,6 +1453,8 @@ void PhysicalHeap::Initialize(Memory* memory, uint8_t* membase,
|
|||
page_size, host_address_offset);
|
||||
parent_heap_ = parent_heap;
|
||||
system_page_size_ = uint32_t(xe::memory::page_size());
|
||||
xenia_assert(xe::is_pow2(system_page_size_));
|
||||
system_page_shift_ = xe::log2_floor(system_page_size_);
|
||||
|
||||
system_page_count_ =
|
||||
(size_t(heap_size_) + host_address_offset + (system_page_size_ - 1)) /
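The Initialize change asserts that the host page size is a power of two and derives system_page_shift_ from it once, so later per-page math can use shifts instead of divisions. A short sketch of deriving such a shift, using std::countr_zero as a stand-in for xe::log2_floor (the size below is assumed):

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  uint32_t system_page_size = 0x1000;  // assumed host page size (4 KiB)
  assert(std::has_single_bit(system_page_size));  // analogue of xe::is_pow2
  uint32_t system_page_shift = std::countr_zero(system_page_size);  // 12
  assert((uint32_t(1) << system_page_shift) == system_page_size);
  return 0;
}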

@@ -1665,10 +1674,11 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
  }

  uint32_t system_page_first =
      (heap_relative_address + host_address_offset()) / system_page_size_;
      (heap_relative_address + host_address_offset()) >> system_page_shift_;
  swcache::PrefetchL1(&system_page_flags_[system_page_first >> 6]);
  uint32_t system_page_last =
      (heap_relative_address + length - 1 + host_address_offset()) /
      system_page_size_;
      (heap_relative_address + length - 1 + host_address_offset()) >>
      system_page_shift_;
  system_page_last = std::min(system_page_last, system_page_count_ - 1);
  assert_true(system_page_first <= system_page_last);

@@ -1677,10 +1687,40 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
  xe::memory::PageAccess protect_access =
      enable_data_providers ? xe::memory::PageAccess::kNoAccess
                            : xe::memory::PageAccess::kReadOnly;

  auto global_lock = global_critical_region_.Acquire();
  if (enable_invalidation_notifications) {
    EnableAccessCallbacksInner<true>(system_page_first, system_page_last,
                                     protect_access);
  } else {
    EnableAccessCallbacksInner<false>(system_page_first, system_page_last,
                                      protect_access);
  }
}
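The dispatch above converts the runtime enable_invalidation_notifications flag into a template parameter once, outside the hot loop, so the specialized loop body can test it with if constexpr and the untaken branch disappears from that specialization. A minimal sketch of the pattern, with hypothetical names and made-up per-page work:

#include <cstdint>

template <bool kNotify>
uint32_t ProcessPagesInner(uint32_t first, uint32_t last) {
  uint32_t notified = 0;
  for (uint32_t i = first; i <= last; ++i) {
    if constexpr (kNotify) {
      ++notified;  // stand-in for the notification bookkeeping
    }
    // ... per-page work shared by both specializations ...
  }
  return notified;
}

uint32_t ProcessPages(uint32_t first, uint32_t last, bool notify) {
  // One runtime branch here instead of one per iteration inside the loop.
  return notify ? ProcessPagesInner<true>(first, last)
                : ProcessPagesInner<false>(first, last);
}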

template <bool enable_invalidation_notifications>
XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner(
    const uint32_t system_page_first, const uint32_t system_page_last,
    xe::memory::PageAccess protect_access) XE_RESTRICT {
  uint8_t* protect_base = membase_ + heap_base_;
  uint32_t protect_system_page_first = UINT32_MAX;
  auto global_lock = global_critical_region_.Acquire();
  for (uint32_t i = system_page_first; i <= system_page_last; ++i) {

  SystemPageFlagsBlock* XE_RESTRICT sys_page_flags = system_page_flags_.data();
  PageEntry* XE_RESTRICT page_table_ptr = page_table_.data();

  // chrispy: a lot of time is spent in this loop, and i think some of the work
  // may be avoidable and repetitive profiling shows quite a bit of time spent
  // in this loop, but very little spent actually calling Protect
  uint32_t i = system_page_first;

  uint32_t first_guest_page = SystemPagenumToGuestPagenum(system_page_first);
  uint32_t last_guest_page = SystemPagenumToGuestPagenum(system_page_last);

  uint32_t guest_one = SystemPagenumToGuestPagenum(1);

  uint32_t system_one = GuestPagenumToSystemPagenum(1);
  for (; i <= system_page_last; ++i) {
    // Check if need to enable callbacks for the page and raise its protection.
    //
    // If enabling invalidation notifications:

@@ -1702,12 +1742,19 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
    //
    // Enabling data providers doesn't need to be deferred - providers will be
    // polled for the last time without releasing the lock.
    SystemPageFlagsBlock& page_flags_block = system_page_flags_[i >> 6];
    SystemPageFlagsBlock& page_flags_block = sys_page_flags[i >> 6];

#if XE_ARCH_AMD64 == 1
    // x86 modulus shift
    uint64_t page_flags_bit = uint64_t(1) << i;
#else
    uint64_t page_flags_bit = uint64_t(1) << (i & 63);
    uint32_t guest_page_number =
        xe::sat_sub(i * system_page_size_, host_address_offset()) / page_size_;
#endif

    uint32_t guest_page_number = SystemPagenumToGuestPagenum(i);
    //swcache::PrefetchL1(&page_table_ptr[guest_page_number + 8]);
    xe::memory::PageAccess current_page_access =
        ToPageAccess(page_table_[guest_page_number].current_protect);
        ToPageAccess(page_table_ptr[guest_page_number].current_protect);
    bool protect_system_page = false;
    // Don't do anything with inaccessible pages - don't protect, don't enable
    // callbacks - because real access violations are needed there. And don't

@@ -1715,7 +1762,7 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
    // reason.
    if (current_page_access != xe::memory::PageAccess::kNoAccess) {
      // TODO(Triang3l): Enable data providers.
      if (enable_invalidation_notifications) {
      if constexpr (enable_invalidation_notifications) {
        if (current_page_access != xe::memory::PageAccess::kReadOnly &&
            (page_flags_block.notify_on_invalidation & page_flags_bit) == 0) {
          // TODO(Triang3l): Check if data providers are already enabled.

@@ -1733,21 +1780,22 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
      } else {
        if (protect_system_page_first != UINT32_MAX) {
          xe::memory::Protect(
              protect_base + protect_system_page_first * system_page_size_,
              (i - protect_system_page_first) * system_page_size_,
              protect_base + (protect_system_page_first << system_page_shift_),
              (i - protect_system_page_first) << system_page_shift_,
              protect_access);
          protect_system_page_first = UINT32_MAX;
        }
      }
    }

  if (protect_system_page_first != UINT32_MAX) {
    xe::memory::Protect(
        protect_base + protect_system_page_first * system_page_size_,
        (system_page_last + 1 - protect_system_page_first) * system_page_size_,
        protect_base + (protect_system_page_first << system_page_shift_),
        (system_page_last + 1 - protect_system_page_first)
            << system_page_shift_,
        protect_access);
  }
}

bool PhysicalHeap::TriggerCallbacks(
    global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
    uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {

@@ -1774,10 +1822,10 @@ bool PhysicalHeap::TriggerCallbacks(
  }

  uint32_t system_page_first =
      (heap_relative_address + host_address_offset()) / system_page_size_;
      (heap_relative_address + host_address_offset()) >> system_page_shift_;
  uint32_t system_page_last =
      (heap_relative_address + length - 1 + host_address_offset()) /
      system_page_size_;
      (heap_relative_address + length - 1 + host_address_offset()) >>
      system_page_shift_;
  system_page_last = std::min(system_page_last, system_page_count_ - 1);
  assert_true(system_page_first <= system_page_last);
  uint32_t block_index_first = system_page_first >> 6;
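The watch bookkeeping keeps one bit per system page, packed into 64-bit blocks, which is where the recurring i >> 6 block index and i & 63 bit index (and the block_index_first/last values above) come from; the AMD64-only branch earlier omits the & 63 on the assumption that the hardware shift already masks the count to six bits. A self-contained sketch of that addressing, with hypothetical names:

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical per-page flag storage in 64-bit blocks, mirroring the
// "page >> 6" block index and "page & 63" bit index used by the watch code.
struct PageBitmap {
  std::vector<uint64_t> blocks;
  explicit PageBitmap(uint32_t page_count) : blocks((page_count + 63) / 64) {}
  void set(uint32_t page) { blocks[page >> 6] |= uint64_t(1) << (page & 63); }
  bool test(uint32_t page) const {
    return (blocks[page >> 6] & (uint64_t(1) << (page & 63))) != 0;
  }
};

int main() {
  PageBitmap watched(1024);
  watched.set(70);  // block 1, bit 6
  assert(watched.test(70) && !watched.test(71));
  return 0;
}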

@@ -1810,11 +1858,11 @@ bool PhysicalHeap::TriggerCallbacks(
  }
  uint32_t physical_address_offset = GetPhysicalAddress(heap_base_);
  uint32_t physical_address_start =
      xe::sat_sub(system_page_first * system_page_size_,
      xe::sat_sub(system_page_first << system_page_shift_,
                  host_address_offset()) +
      physical_address_offset;
  uint32_t physical_length = std::min(
      xe::sat_sub(system_page_last * system_page_size_ + system_page_size_,
      xe::sat_sub((system_page_last << system_page_shift_) + system_page_size_,
                  host_address_offset()) +
          physical_address_offset - physical_address_start,
      heap_size_ - (physical_address_start - physical_address_offset));

@@ -1858,8 +1906,8 @@ bool PhysicalHeap::TriggerCallbacks(
    unwatch_first += host_address_offset();
    unwatch_last += host_address_offset();
    assert_true(unwatch_first <= unwatch_last);
    system_page_first = unwatch_first / system_page_size_;
    system_page_last = unwatch_last / system_page_size_;
    system_page_first = unwatch_first >> system_page_shift_;
    system_page_last = unwatch_last >> system_page_shift_;
    block_index_first = system_page_first >> 6;
    block_index_last = system_page_last >> 6;
  }

@@ -1874,8 +1922,8 @@ bool PhysicalHeap::TriggerCallbacks(
                          (uint64_t(1) << (i & 63))) != 0;
    if (unprotect_page) {
      uint32_t guest_page_number =
          xe::sat_sub(i * system_page_size_, host_address_offset()) /
          page_size_;
          xe::sat_sub(i << system_page_shift_, host_address_offset()) >>
          page_size_shift_;
      if (ToPageAccess(page_table_[guest_page_number].current_protect) !=
          xe::memory::PageAccess::kReadWrite) {
        unprotect_page = false;

@@ -1888,8 +1936,9 @@ bool PhysicalHeap::TriggerCallbacks(
    } else {
      if (unprotect_system_page_first != UINT32_MAX) {
        xe::memory::Protect(
            protect_base + unprotect_system_page_first * system_page_size_,
            (i - unprotect_system_page_first) * system_page_size_,
            protect_base +
                (unprotect_system_page_first << system_page_shift_),
            (i - unprotect_system_page_first) << system_page_shift_,
            xe::memory::PageAccess::kReadWrite);
        unprotect_system_page_first = UINT32_MAX;
      }

@@ -1897,9 +1946,9 @@ bool PhysicalHeap::TriggerCallbacks(
  }
  if (unprotect_system_page_first != UINT32_MAX) {
    xe::memory::Protect(
        protect_base + unprotect_system_page_first * system_page_size_,
        (system_page_last + 1 - unprotect_system_page_first) *
            system_page_size_,
        protect_base + (unprotect_system_page_first << system_page_shift_),
        (system_page_last + 1 - unprotect_system_page_first)
            << system_page_shift_,
        xe::memory::PageAccess::kReadWrite);
  }
}
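Both protection loops above coalesce consecutive pages that need the same change into runs, tracking the first page of the open run and using UINT32_MAX as the no-open-run sentinel, so one host protection call covers a whole run instead of a single page. A simplified sketch of that pattern, with ApplyProtect standing in for xe::memory::Protect:

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for the host protection call; in the heap code this is
// xe::memory::Protect on a byte range derived from the page numbers.
void ApplyProtect(uint32_t first_page, uint32_t page_count) {
  std::printf("protect pages [%u, %u)\n", first_page, first_page + page_count);
}

void ProtectMarkedPages(const std::vector<bool>& needs_protect) {
  const uint32_t kNoRun = UINT32_MAX;
  const uint32_t page_count = uint32_t(needs_protect.size());
  uint32_t run_first = kNoRun;
  for (uint32_t i = 0; i < page_count; ++i) {
    if (needs_protect[i]) {
      if (run_first == kNoRun) run_first = i;  // open a new run
    } else if (run_first != kNoRun) {
      ApplyProtect(run_first, i - run_first);  // flush the finished run
      run_first = kNoRun;
    }
  }
  if (run_first != kNoRun) {
    ApplyProtect(run_first, page_count - run_first);  // flush the trailing run
  }
}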

@@ -216,6 +216,7 @@ class BaseHeap {
  uint32_t heap_base_;
  uint32_t heap_size_;
  uint32_t page_size_;
  uint32_t page_size_shift_;
  uint32_t host_address_offset_;
  uint32_t unreserved_page_count_;
  xe::global_critical_region global_critical_region_;

@@ -270,18 +271,36 @@ class PhysicalHeap : public BaseHeap {
  void EnableAccessCallbacks(uint32_t physical_address, uint32_t length,
                             bool enable_invalidation_notifications,
                             bool enable_data_providers);
  template <bool enable_invalidation_notifications>
  XE_NOINLINE void EnableAccessCallbacksInner(
      const uint32_t system_page_first, const uint32_t system_page_last,
      xe::memory::PageAccess protect_access) XE_RESTRICT;

  // Returns true if any page in the range was watched.
  bool TriggerCallbacks(global_unique_lock_type global_lock_locked_once,
                        uint32_t virtual_address, uint32_t length, bool is_write,
                        bool unwatch_exact_range, bool unprotect = true);
                        uint32_t virtual_address, uint32_t length,
                        bool is_write, bool unwatch_exact_range,
                        bool unprotect = true);

  uint32_t GetPhysicalAddress(uint32_t address) const;

  uint32_t SystemPagenumToGuestPagenum(uint32_t num) const {
    return ((num << system_page_shift_) - host_address_offset()) >>
           page_size_shift_;
  }

  uint32_t GuestPagenumToSystemPagenum(uint32_t num) {
    num <<= page_size_shift_;
    num += host_address_offset();
    num >>= system_page_shift_;
    return num;
  }
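The two helpers translate between host (system) page numbers and guest page numbers by going through byte offsets: shift up by one page shift, adjust by host_address_offset(), then shift down by the other. A standalone restatement with assumed sizes (4 KiB host pages, 64 KiB guest pages, zero offset), not the emulator's own types:

#include <cassert>
#include <cstdint>

constexpr uint32_t kSystemPageShift = 12;   // assumed 4 KiB host pages
constexpr uint32_t kGuestPageShift = 16;    // assumed 64 KiB guest pages
constexpr uint32_t kHostAddressOffset = 0;  // assumed zero offset

constexpr uint32_t SystemToGuestPage(uint32_t sys_page) {
  return ((sys_page << kSystemPageShift) - kHostAddressOffset) >>
         kGuestPageShift;
}
constexpr uint32_t GuestToSystemPage(uint32_t guest_page) {
  return ((guest_page << kGuestPageShift) + kHostAddressOffset) >>
         kSystemPageShift;
}

int main() {
  // Sixteen host pages per guest page: host pages 16..31 map to guest page 1.
  static_assert(SystemToGuestPage(16) == 1 && SystemToGuestPage(31) == 1);
  static_assert(GuestToSystemPage(1) == 16);
  assert(SystemToGuestPage(GuestToSystemPage(7)) == 7);
  return 0;
}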
 protected:
  VirtualHeap* parent_heap_;

  uint32_t system_page_size_;
  uint32_t system_page_count_;
  uint32_t system_page_shift_;
  uint32_t padding1_;

  struct SystemPageFlagsBlock {
    // Whether writing to each page should result trigger invalidation

@@ -458,9 +477,9 @@ class Memory {
  // TODO(Triang3l): Implement data providers - this is why locking depth of 1
  // will be required in the future.
  bool TriggerPhysicalMemoryCallbacks(
      global_unique_lock_type global_lock_locked_once,
      uint32_t virtual_address, uint32_t length, bool is_write,
      bool unwatch_exact_range, bool unprotect = true);
      global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
      uint32_t length, bool is_write, bool unwatch_exact_range,
      bool unprotect = true);

  // Allocates virtual memory from the 'system' heap.
  // System memory is kept separate from game memory but is still accessible

@@ -509,10 +528,10 @@ class Memory {
                               const void* host_address);

  bool AccessViolationCallback(global_unique_lock_type global_lock_locked_once,
                               void* host_address, bool is_write);
                               void* host_address, bool is_write);
  static bool AccessViolationCallbackThunk(
      global_unique_lock_type global_lock_locked_once,
      void* context, void* host_address, bool is_write);
      global_unique_lock_type global_lock_locked_once, void* context,
      void* host_address, bool is_write);

  std::filesystem::path file_name_;
  uint32_t system_page_size_ = 0;