diff --git a/src/xenia/apu/conversion.h b/src/xenia/apu/conversion.h
index 211243348..0f807d67b 100644
--- a/src/xenia/apu/conversion.h
+++ b/src/xenia/apu/conversion.h
@@ -20,6 +20,8 @@ namespace apu {
 namespace conversion {
 #if XE_ARCH_AMD64
+
+#if 0
 inline void sequential_6_BE_to_interleaved_6_LE(float* output,
                                                 const float* input,
                                                 size_t ch_sample_count) {
@@ -41,7 +43,44 @@ inline void sequential_6_BE_to_interleaved_6_LE(float* output,
     out[sample * 6 + 5] = sample2;
   }
 }
+#else
+XE_NOINLINE
+static void _generic_sequential_6_BE_to_interleaved_6_LE(
+    float* XE_RESTRICT output, const float* XE_RESTRICT input,
+    unsigned ch_sample_count) {
+  for (unsigned sample = 0; sample < ch_sample_count; sample++) {
+    for (unsigned channel = 0; channel < 6; channel++) {
+      unsigned int value = *reinterpret_cast<const unsigned int*>(
+          &input[channel * ch_sample_count + sample]);
+      *reinterpret_cast<unsigned int*>(&output[sample * 6 + channel]) =
+          xe::byte_swap(value);
+    }
+  }
+}
+XE_NOINLINE
+static void _movbe_sequential_6_BE_to_interleaved_6_LE(
+    float* XE_RESTRICT output, const float* XE_RESTRICT input,
+    unsigned ch_sample_count) {
+  for (unsigned sample = 0; sample < ch_sample_count; sample++) {
+    for (unsigned channel = 0; channel < 6; channel++) {
+      *reinterpret_cast<unsigned int*>(&output[sample * 6 + channel]) =
+          _load_be_u32(reinterpret_cast<const unsigned int*>(
+              &input[channel * ch_sample_count + sample]));
+    }
+  }
+}
+
+inline static void sequential_6_BE_to_interleaved_6_LE(
+    float* output, const float* input, unsigned ch_sample_count) {
+  if (amd64::GetFeatureFlags() & amd64::kX64EmitMovbe) {
+    _movbe_sequential_6_BE_to_interleaved_6_LE(output, input, ch_sample_count);
+  } else {
+    _generic_sequential_6_BE_to_interleaved_6_LE(output, input,
+                                                 ch_sample_count);
+  }
+}
+#endif
 inline void sequential_6_BE_to_interleaved_2_LE(float* output,
                                                 const float* input,
                                                 size_t ch_sample_count) {
diff --git a/src/xenia/base/cvar.h b/src/xenia/base/cvar.h
index 61b8faf11..144703665 100644
--- a/src/xenia/base/cvar.h
+++ b/src/xenia/base/cvar.h
@@ -335,7 +335,8 @@ ICommandVar* define_cmdvar(const char* name, T* default_value,
 #define DEFINE_uint64(name, default_value, description, category) \
   DEFINE_CVar(name, default_value, description, category, false, uint64_t)
-
+#define DEFINE_int64(name, default_value, description, category) \
+  DEFINE_CVar(name, default_value, description, category, false, int64_t)
 #define DEFINE_double(name, default_value, description, category) \
   DEFINE_CVar(name, default_value, description, category, false, double)
@@ -383,7 +384,7 @@ ICommandVar* define_cmdvar(const char* name, T* default_value,
 #define DECLARE_uint32(name) DECLARE_CVar(name, uint32_t)
 #define DECLARE_uint64(name) DECLARE_CVar(name, uint64_t)
-
+#define DECLARE_int64(name) DECLARE_CVar(name, int64_t)
 #define DECLARE_double(name) DECLARE_CVar(name, double)
 #define DECLARE_string(name) DECLARE_CVar(name, std::string)
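Note on the new cvar macros above: DEFINE_int64/DECLARE_int64 just instantiate the existing DEFINE_CVar/DECLARE_CVar machinery for int64_t, so usage mirrors the other scalar cvars. A minimal sketch follows; the flag name example_mask is hypothetical, and the real first user is the widened x64_extension_mask later in this diff.

// defining translation unit (hypothetical flag, sketch only)
DEFINE_int64(example_mask, -1LL,
             "Hypothetical signed 64-bit mask; -1 enables everything.",
             "Other");

// consuming translation unit
DECLARE_int64(example_mask);

static bool ExampleBitEnabled(int64_t bit) {
  // cvars::example_mask is generated by the macro machinery above
  return (cvars::example_mask & bit) == bit;
}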
diff --git a/src/xenia/base/mutex.cc b/src/xenia/base/mutex.cc
index 027cd7882..b975e4bc3 100644
--- a/src/xenia/base/mutex.cc
+++ b/src/xenia/base/mutex.cc
@@ -26,7 +26,7 @@ check this and release the mutex
 one way to do this is by using FlsAlloc and PFLS_CALLBACK_FUNCTION, which gets
 called with the fiber local data when a thread exits
 */
-thread_local unsigned global_mutex_depth = 0;
+
 static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) {
   return reinterpret_cast<CRITICAL_SECTION*>(mutex);
 }
@@ -38,29 +38,16 @@ xe_global_mutex::xe_global_mutex() {
 xe_global_mutex ::~xe_global_mutex() {
   DeleteCriticalSection(global_critical_section(this));
 }
+
 void xe_global_mutex::lock() {
-  if (global_mutex_depth) {
-  } else {
-    EnterCriticalSection(global_critical_section(this));
-  }
-  global_mutex_depth++;
+  EnterCriticalSection(global_critical_section(this));
 }
 void xe_global_mutex::unlock() {
-  if (--global_mutex_depth == 0) {
-    LeaveCriticalSection(global_critical_section(this));
-  }
+  LeaveCriticalSection(global_critical_section(this));
 }
 bool xe_global_mutex::try_lock() {
-  if (global_mutex_depth) {
-    ++global_mutex_depth;
-    return true;
-  } else {
-    BOOL success = TryEnterCriticalSection(global_critical_section(this));
-    if (success) {
-      ++global_mutex_depth;
-    }
-    return success;
-  }
+  BOOL success = TryEnterCriticalSection(global_critical_section(this));
+  return success;
 }
 CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) {
diff --git a/src/xenia/base/platform.h b/src/xenia/base/platform.h
index e99e8b83d..61749e4c7 100644
--- a/src/xenia/base/platform.h
+++ b/src/xenia/base/platform.h
@@ -116,15 +116,15 @@
 #define XE_LIKELY(...) (!!(__VA_ARGS__))
 #define XE_UNLIKELY(...) (!!(__VA_ARGS__))
 #define XE_MSVC_ASSUME(...) __assume(__VA_ARGS__)
-#define XE_NOALIAS __declspec(noalias) 
+#define XE_NOALIAS __declspec(noalias)
 #elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
 #define XE_FORCEINLINE __attribute__((always_inline))
 #define XE_NOINLINE __attribute__((noinline))
 #define XE_COLD __attribute__((cold))
 #define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
 #define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)
-#define XE_NOALIAS 
-//cant do unevaluated assume
+#define XE_NOALIAS
+// can't do an unevaluated assume
 #define XE_MSVC_ASSUME(...) static_cast<void>(0)
 #else
 #define XE_FORCEINLINE inline
@@ -137,7 +137,13 @@
 #define XE_MSVC_ASSUME(...) static_cast<void>(0)
 #endif
-
+#if XE_COMPILER_HAS_MSVC_EXTENSIONS == 1
+#define XE_MSVC_OPTIMIZE_SMALL() __pragma(optimize("s", on))
+#define XE_MSVC_OPTIMIZE_REVERT() __pragma(optimize("", on))
+#else
+#define XE_MSVC_OPTIMIZE_SMALL()
+#define XE_MSVC_OPTIMIZE_REVERT()
+#endif
 #if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
 #define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__))
 #define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__))
@@ -180,7 +186,7 @@
 const char kPathSeparator = '/';
 const char kGuestPathSeparator = '\\';
 }  // namespace xe
-#if XE_ARCH_AMD64==1
+#if XE_ARCH_AMD64 == 1
 #include "platform_amd64.h"
 #endif
 #endif  // XENIA_BASE_PLATFORM_H_
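The XE_MSVC_OPTIMIZE_SMALL()/XE_MSVC_OPTIMIZE_REVERT() pair added above wraps MSVC's #pragma optimize in macros that compile to nothing on other toolchains; x64_sequences.cc later in this diff applies it at file scope. A sketch of the intended bracketing, with a hypothetical function name:

#include "xenia/base/platform.h"

XE_MSVC_OPTIMIZE_SMALL()  // on MSVC: optimize("s", on), favor size from here
static void RarelyTakenSlowPath() {
  // cold code where icache footprint matters more than straight-line speed
}
XE_MSVC_OPTIMIZE_REVERT()  // optimize("", on): back to command-line settings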
diff --git a/src/xenia/base/platform_amd64.cc b/src/xenia/base/platform_amd64.cc
index 31df3c497..7005420e5 100644
--- a/src/xenia/base/platform_amd64.cc
+++ b/src/xenia/base/platform_amd64.cc
@@ -7,13 +7,12 @@
  ******************************************************************************
  */
-
 #include "xenia/base/cvar.h"
 #include "xenia/base/platform.h"
 #include "third_party/xbyak/xbyak/xbyak.h"
 #include "third_party/xbyak/xbyak/xbyak_util.h"
-DEFINE_int32(x64_extension_mask, -1,
+DEFINE_int64(x64_extension_mask, -1LL,
              "Allow the detection and utilization of specific instruction set "
              "features.\n"
              " 0 = x86_64 + AVX1\n"
@@ -33,79 +32,92 @@ DEFINE_int32(x64_extension_mask, -1,
              "x64");
 namespace xe {
 namespace amd64 {
-static uint32_t g_feature_flags = 0U;
+static uint64_t g_feature_flags = 0U;
 static bool g_did_initialize_feature_flags = false;
-uint32_t GetFeatureFlags() {
-  xenia_assert(g_did_initialize_feature_flags);
-  return g_feature_flags;
+uint64_t GetFeatureFlags() {
+  xenia_assert(g_did_initialize_feature_flags);
+  return g_feature_flags;
 }
 XE_COLD
 XE_NOINLINE
 void InitFeatureFlags() {
-  uint32_t feature_flags_ = 0U;
-
-  Xbyak::util::Cpu cpu_;
+  uint64_t feature_flags_ = 0U;
+  {
+    Xbyak::util::Cpu cpu_;
 #define TEST_EMIT_FEATURE(emit, ext)                  \
   if ((cvars::x64_extension_mask & emit) == emit) {   \
     feature_flags_ |= (cpu_.has(ext) ? emit : 0);     \
   }
-  TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
-  TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
-  TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
-  TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
-  TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
-  TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
-  TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
-  TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
-  TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
-  TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
-  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
-  TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
-  TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
+    TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
+    TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
+    TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
+    TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
+    TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
+    TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
+    TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
+    TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
+    TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
+    TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
+    TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
+    TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
+    TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
 #undef TEST_EMIT_FEATURE
-  /*
-     fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
-     latest version of xbyak
-*/
-  unsigned int data[4];
-  Xbyak::util::Cpu::getCpuid(0x80000001, data);
-  unsigned amd_flags = data[2];
-  if (amd_flags & (1U << 5)) {
-    if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
-      feature_flags_ |= kX64EmitLZCNT;
-    }
-  }
-  // todo: although not reported by cpuid, zen 1 and zen+ also have fma4
-  if (amd_flags & (1U << 16)) {
-    if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
-      feature_flags_ |= kX64EmitFMA4;
-    }
-  }
-  if (amd_flags & (1U << 21)) {
-    if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
-      feature_flags_ |= kX64EmitTBM;
-    }
-  }
-  if (amd_flags & (1U << 11)) {
-    if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
-      feature_flags_ |= kX64EmitXOP;
-    }
-  }
-  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
-    bool is_zennish = cpu_.displayFamily >= 0x17; /*
-        chrispy: according to agner's tables, all amd architectures that
-        we support (ones with avx) have the same timings for
-        jrcxz/loop/loope/loopne as for other jmps
-     */
-    feature_flags_ |= kX64FastJrcx;
-    feature_flags_ |= kX64FastLoop;
-    if (is_zennish) {
-      // ik that i heard somewhere that this is the case for zen, but i need to
-      // verify. cant find my original source for that.
-      // todo: ask agner?
+    /*
+      fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
+      latest version of xbyak
+    */
+    unsigned int data[4];
+    Xbyak::util::Cpu::getCpuid(0x80000001, data);
+    unsigned amd_flags = data[2];
+    if (amd_flags & (1U << 5)) {
+      if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
+        feature_flags_ |= kX64EmitLZCNT;
+      }
+    }
+    // todo: although not reported by cpuid, zen 1 and zen+ also have fma4
+    if (amd_flags & (1U << 16)) {
+      if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
+        feature_flags_ |= kX64EmitFMA4;
+      }
+    }
+    if (amd_flags & (1U << 21)) {
+      if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
+        feature_flags_ |= kX64EmitTBM;
+      }
+    }
+    if (amd_flags & (1U << 11)) {
+      if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
+        feature_flags_ |= kX64EmitXOP;
+      }
+    }
+    if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
+      bool is_zennish = cpu_.displayFamily >= 0x17;
+      /*
+        chrispy: according to agner's tables, all amd architectures
+        that we support (ones with avx) have the same timings for
+        jrcxz/loop/loope/loopne as for other jmps
+      */
+      feature_flags_ |= kX64FastJrcx;
+      feature_flags_ |= kX64FastLoop;
+      if (is_zennish) {
+        // ik that i heard somewhere that this is the case for zen, but i need
+        // to verify. cant find my original source for that. todo: ask agner?
+        feature_flags_ |= kX64FlagsIndependentVars;
+      }
+    }
+  }
+  {
+    unsigned int data[4];
+    memset(data, 0, sizeof(data));
+    // intel extended features
+    Xbyak::util::Cpu::getCpuidEx(7, 0, data);
+    if ((data[2] & (1 << 28)) &&
+        (cvars::x64_extension_mask & kX64EmitMovdir64M)) {
+      feature_flags_ |= kX64EmitMovdir64M;
+    }
+    if ((data[1] & (1 << 9)) && (cvars::x64_extension_mask & kX64FastRepMovs)) {
+      feature_flags_ |= kX64FastRepMovs;
+    }
+  }
   g_feature_flags = feature_flags_;
diff --git a/src/xenia/base/platform_amd64.h b/src/xenia/base/platform_amd64.h
index 326b69139..e5c20c670 100644
--- a/src/xenia/base/platform_amd64.h
+++ b/src/xenia/base/platform_amd64.h
@@ -13,7 +13,7 @@
 namespace xe {
 namespace amd64 {
-enum X64FeatureFlags {
+enum X64FeatureFlags : uint64_t {
   kX64EmitAVX2 = 1 << 0,
   kX64EmitFMA = 1 << 1,
   kX64EmitLZCNT = 1 << 2,  // this is actually ABM and includes popcount
@@ -44,14 +44,13 @@ enum X64FeatureFlags {
   // instructions, and FX users need the boost
   kX64EmitFMA4 = 1 << 17,  // todo: also use on zen1?
   kX64EmitTBM = 1 << 18,
-  // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
-  // 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
-  // instructions by using the legacy sse version if we recently cleared the
-  // high 128 bits of the
+  kX64EmitMovdir64M = 1 << 19,
+  kX64FastRepMovs = 1 << 20
+
 };
 XE_NOALIAS
-uint32_t GetFeatureFlags();
+uint64_t GetFeatureFlags();
 XE_COLD
 void InitFeatureFlags();
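For reference on the two new bits: CPUID leaf 7 ECX bit 28 is MOVDIR64B and EBX bit 9 is ERMS (enhanced REP MOVSB/STOSB), which is what kX64FastRepMovs latches. Callers keep the same query shape with the widened 64-bit mask; a minimal sketch of a consumer (the helper name is hypothetical):

#include "xenia/base/platform_amd64.h"

// GetFeatureFlags() asserts that InitFeatureFlags() already ran, so this is
// just a cheap load-and-test on the hot path.
inline bool CpuHasFastRepMovs() {
  return (xe::amd64::GetFeatureFlags() & xe::amd64::kX64FastRepMovs) != 0;
}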
diff --git a/src/xenia/base/threading.h b/src/xenia/base/threading.h
index 67297716b..604819950 100644
--- a/src/xenia/base/threading.h
+++ b/src/xenia/base/threading.h
@@ -299,6 +299,12 @@ class Event : public WaitHandle {
   // the nonsignaled state after releasing the appropriate number of waiting
   // threads.
   virtual void Pulse() = 0;
+#if XE_PLATFORM_WIN32 == 1
+  // SetEvent, but if there is a waiter we immediately transfer execution to it
+  virtual void SetBoostPriority() = 0;
+#else
+  void SetBoostPriority() { Set(); }
+#endif
 };
 
 // Models a Win32-like semaphore object.
diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc
index 32ddf7487..01a4eb9be 100644
--- a/src/xenia/base/threading_win.cc
+++ b/src/xenia/base/threading_win.cc
@@ -39,6 +39,8 @@ XE_NTDLL_IMPORT(NtWaitForSingleObject, cls_NtWaitForSingleObject,
                 NtWaitForSingleObjectPointer);
 XE_NTDLL_IMPORT(NtSetEvent, cls_NtSetEvent, NtSetEventPointer);
+XE_NTDLL_IMPORT(NtSetEventBoostPriority, cls_NtSetEventBoostPriority,
+                NtSetEventBoostPriorityPointer);
 // difference between NtClearEvent and NtResetEvent is that NtResetEvent returns
 // the events state prior to the call, but we dont need that. might need to
 // check whether one or the other is faster in the kernel though yeah, just
@@ -53,6 +55,7 @@ XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
 XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
                 NtDelayExecutionPointer);
+
 namespace xe {
 namespace threading {
@@ -137,7 +140,7 @@ void MaybeYield() {
 #endif
 #endif
   // memorybarrier is really not necessary here...
-  MemoryBarrier();
+  // MemoryBarrier();
 }
 void SyncMemory() { MemoryBarrier(); }
@@ -288,11 +291,19 @@ class Win32Event : public Win32Handle<Event> {
   void Set() override { NtSetEventPointer.invoke(handle_, nullptr); }
   void Reset() override { NtClearEventPointer.invoke(handle_); }
   void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); }
+  void SetBoostPriority() override {
+    // no previous state for boostpriority
+    NtSetEventBoostPriorityPointer.invoke(handle_);
+  }
 #else
   void Set() override { SetEvent(handle_); }
   void Reset() override { ResetEvent(handle_); }
   void Pulse() override { PulseEvent(handle_); }
+  void SetBoostPriority() override {
+    // no win32 version of boostpriority
+    SetEvent(handle_);
+  }
 #endif
 };
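NtSetEventBoostPriority is an undocumented ntdll export that signals the event and applies a scheduling boost to a waiting thread, which is why the command processor below switches the write-pointer event over to it; there is no documented Win32 equivalent, hence the plain SetEvent() fallback. Typical call-site shape (the event name here is hypothetical):

#include "xenia/base/threading.h"

void PublishWorkAndWake(xe::threading::Event* work_ready) {
  // ... make the produced work visible first ...
  work_ready->SetBoostPriority();  // Set(), plus a hand-off boost on Win32
}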
diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h
index d4ded3e83..cb5a375ec 100644
--- a/src/xenia/cpu/backend/x64/x64_backend.h
+++ b/src/xenia/cpu/backend/x64/x64_backend.h
@@ -23,7 +23,7 @@
 #define XE_X64_PROFILER_AVAILABLE 1
 #endif
 
-DECLARE_int32(x64_extension_mask);
+DECLARE_int64(x64_extension_mask);
 
 namespace xe {
 class Exception;
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc
index 74515d38e..03b8b4abd 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -103,74 +103,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
         "FAQ for system requirements at https://xenia.jp");
     return;
   }
-#if 1
-  feature_flags_ = amd64::GetFeatureFlags();
-#else
-#define TEST_EMIT_FEATURE(emit, ext)                \
-  if ((cvars::x64_extension_mask & emit) == emit) { \
-    feature_flags_ |= (cpu_.has(ext) ? emit : 0);   \
-  }
-  TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
-  TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
-  TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
-  TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
-  TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
-  TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
-  TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
-  TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
-  TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
-  TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
-  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
-  TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
-  TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
-#undef TEST_EMIT_FEATURE
-  /*
-     fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
-     latest version of xbyak
-*/
-  unsigned int data[4];
-  Xbyak::util::Cpu::getCpuid(0x80000001, data);
-  unsigned amd_flags = data[2];
-  if (amd_flags & (1U << 5)) {
-    if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
-      feature_flags_ |= kX64EmitLZCNT;
-    }
-  }
-  // todo: although not reported by cpuid, zen 1 and zen+ also have fma4
-  if (amd_flags & (1U << 16)) {
-    if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
-      feature_flags_ |= kX64EmitFMA4;
-    }
-  }
-  if (amd_flags & (1U << 21)) {
-    if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
-      feature_flags_ |= kX64EmitTBM;
-    }
-  }
-  if (amd_flags & (1U << 11)) {
-    if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
-      feature_flags_ |= kX64EmitXOP;
-      XELOGCPU("Cpu support XOP!\n\n");
-    }
-  }
-  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
-    bool is_zennish = cpu_.displayFamily >= 0x17;
-    /*
-        chrispy: according to agner's tables, all amd architectures that
-        we support (ones with avx) have the same timings for
-        jrcxz/loop/loope/loopne as for other jmps
-    */
-    feature_flags_ |= kX64FastJrcx;
-    feature_flags_ |= kX64FastLoop;
-    if (is_zennish) {
-      // ik that i heard somewhere that this is the case for zen, but i need to
-      // verify. cant find my original source for that.
-      // todo: ask agner?
-      feature_flags_ |= kX64FlagsIndependentVars;
-    }
-  }
-#endif
+  feature_flags_ = amd64::GetFeatureFlags();
+
   may_use_membase32_as_zero_reg_ =
       static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
           processor()->memory()->virtual_membase())) == 0;
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h
index 69e3b80ec..91f4016c1 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_emitter.h
@@ -299,7 +299,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
   void* FindWordConstantOffset(unsigned wordvalue);
   void* FindDwordConstantOffset(unsigned bytevalue);
   void* FindQwordConstantOffset(uint64_t bytevalue);
-  bool IsFeatureEnabled(uint32_t feature_flag) const {
+  bool IsFeatureEnabled(uint64_t feature_flag) const {
     return (feature_flags_ & feature_flag) == feature_flag;
   }
@@ -395,7 +395,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
   XbyakAllocator* allocator_ = nullptr;
   XexModule* guest_module_ = nullptr;
   Xbyak::util::Cpu cpu_;
-  uint32_t feature_flags_ = 0;
+  uint64_t feature_flags_ = 0;
   uint32_t current_guest_function_ = 0;
   Xbyak::Label* epilog_label_ = nullptr;
diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc
index 06a37ab91..28b33fd76 100644
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@@ -39,7 +39,7 @@
 #include "xenia/cpu/backend/x64/x64_stack_layout.h"
 #include "xenia/cpu/hir/hir_builder.h"
 #include "xenia/cpu/processor.h"
-
+XE_MSVC_OPTIMIZE_SMALL()
 DEFINE_bool(use_fast_dot_product, false,
             "Experimental optimization, much shorter sequence on dot products, "
             "treating inf as overflow instead of using mcxsr"
diff --git a/src/xenia/cpu/entry_table.cc b/src/xenia/cpu/entry_table.cc
index 1d82f0538..840706171 100644
--- a/src/xenia/cpu/entry_table.cc
+++ b/src/xenia/cpu/entry_table.cc
@@ -19,16 +19,19 @@ EntryTable::EntryTable() = default;
 EntryTable::~EntryTable() {
   auto global_lock = global_critical_region_.Acquire();
-  for (auto it : map_) {
-    Entry* entry = it.second;
+  for (auto it : map_.Values()) {
+    Entry* entry = it;
     delete entry;
   }
 }
 Entry* EntryTable::Get(uint32_t address) {
   auto global_lock = global_critical_region_.Acquire();
-  const auto& it = map_.find(address);
-  Entry* entry = it != map_.end() ? it->second : nullptr;
+  uint32_t idx = map_.IndexForKey(address);
+  if (idx == map_.size() || *map_.KeyAt(idx) != address) {
+    return nullptr;
+  }
+  Entry* entry = *map_.ValueAt(idx);
   if (entry) {
     // TODO(benvanik): wait if needed?
     if (entry->status != Entry::STATUS_READY) {
@@ -43,8 +46,12 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
   // https://github.com/facebook/folly/blob/master/folly/AtomicHashMap.h
   auto global_lock = global_critical_region_.Acquire();
-  const auto& it = map_.find(address);
-  Entry* entry = it != map_.end() ? it->second : nullptr;
+
+  uint32_t idx = map_.IndexForKey(address);
+
+  Entry* entry = idx != map_.size() && *map_.KeyAt(idx) == address
+                     ? *map_.ValueAt(idx)
+                     : nullptr;
   Entry::Status status;
   if (entry) {
     // If we aren't ready yet spin and wait.
@@ -65,7 +72,8 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
     entry->end_address = 0;
     entry->status = Entry::STATUS_COMPILING;
     entry->function = 0;
-    map_[address] = entry;
+    map_.InsertAt(address, entry, idx);
+    // map_[address] = entry;
     status = Entry::STATUS_NEW;
   }
   global_lock.unlock();
@@ -75,18 +83,18 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
 void EntryTable::Delete(uint32_t address) {
   auto global_lock = global_critical_region_.Acquire();
-  const auto itr = map_.find(address);
-
-  if (itr != map_.cend()) {
-    map_.erase(itr);
+  // doesnt this leak memory by not deleting the entry?
+  uint32_t idx = map_.IndexForKey(address);
+  if (idx != map_.size() && *map_.KeyAt(idx) == address) {
+    map_.EraseAt(idx);
   }
 }
 std::vector<Function*> EntryTable::FindWithAddress(uint32_t address) {
   auto global_lock = global_critical_region_.Acquire();
   std::vector<Function*> fns;
-  for (auto& it : map_) {
-    Entry* entry = it.second;
+  for (auto& it : map_.Values()) {
+    Entry* entry = it;
     if (address >= entry->address && address <= entry->end_address) {
       if (entry->status == Entry::STATUS_READY) {
         fns.push_back(entry->function);
@@ -95,6 +103,5 @@ std::vector<Function*> EntryTable::FindWithAddress(uint32_t address) {
   }
   return fns;
 }
-
 }  // namespace cpu
 }  // namespace xe
diff --git a/src/xenia/cpu/entry_table.h b/src/xenia/cpu/entry_table.h
index 14a3e6c82..2ca2133c2 100644
--- a/src/xenia/cpu/entry_table.h
+++ b/src/xenia/cpu/entry_table.h
@@ -14,7 +14,7 @@
 #include <unordered_map>
 
 #include "xenia/base/mutex.h"
-
+#include "xenia/base/split_map.h"
 namespace xe {
 namespace cpu {
@@ -48,7 +48,8 @@ class EntryTable {
  private:
   xe::global_critical_region global_critical_region_;
   // TODO(benvanik): replace with a better data structure.
-  std::unordered_map<uint32_t, Entry*> map_;
+  xe::split_map<uint32_t, Entry*> map_;
+  // std::unordered_map<uint32_t, Entry*> map_;
 };
 }  // namespace cpu
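EntryTable now probes through split_map's index-based interface instead of unordered_map iterators. The call sites above imply roughly the following shape for xe::split_map; this is a sketch of the assumed interface, not the actual contents of xenia/base/split_map.h. The key point is that IndexForKey returns a candidate slot that must be validated against size() and the stored key before use.

// Assumed interface, inferred from the EntryTable call sites above.
template <typename TKey, typename TValue>
class split_map_sketch {
 public:
  uint32_t size() const;           // number of live entries
  uint32_t IndexForKey(TKey key);  // candidate slot; may equal size() on miss
  TKey* KeyAt(uint32_t index);     // key stored at a slot
  TValue* ValueAt(uint32_t index); // value stored at a slot
  void InsertAt(TKey key, TValue value, uint32_t index);
  void EraseAt(uint32_t index);
};

// Lookup pattern used by EntryTable::Get:
//   idx = map.IndexForKey(k);
//   if (idx == map.size() || *map.KeyAt(idx) != k)  -> miss
//   else                                            -> value is *map.ValueAt(idx)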
diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc
index ab54438d7..66da46546 100644
--- a/src/xenia/gpu/command_processor.cc
+++ b/src/xenia/gpu/command_processor.cc
@@ -334,7 +334,7 @@ void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr,
 void CommandProcessor::UpdateWritePointer(uint32_t value) {
   write_ptr_index_ = value;
-  write_ptr_index_event_->Set();
+  write_ptr_index_event_->SetBoostPriority();
 }
 void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
                                                   uint32_t value) {
@@ -665,6 +665,11 @@ uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index,
   reader_.set_read_offset(read_index * sizeof(uint32_t));
   reader_.set_write_offset(write_index * sizeof(uint32_t));
+  // prefetch the wraparound range
+  // it likely is already in L3 cache, but on a zen system it may be in another
+  // chiplet's L3
+  reader_.BeginPrefetchedRead(GetCurrentRingReadCount());
   do {
     if (!ExecutePacket()) {
       // This probably should be fatal - but we're going to continue anyways.
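BeginPrefetchedRead on the ring reader amounts to issuing software prefetches over the bytes ExecutePacket is about to walk; on multi-chiplet Zen parts a ring written by another thread can be resident in a different CCD's L3, where the first demand miss is expensive. A generic sketch of the idea, not the actual RingBuffer implementation:

#include <xmmintrin.h>

#include <cstddef>

// Touch each 64-byte line of [base, base + bytes) with a T1 hint so the range
// is pulled toward this core's L2/L3 before the packet parse loop runs.
static void PrefetchRange(const void* base, size_t bytes) {
  const char* p = static_cast<const char*>(base);
  for (size_t i = 0; i < bytes; i += 64) {
    _mm_prefetch(p + i, _MM_HINT_T1);
  }
}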
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index 4e7ee919c..a24d468ae 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -380,7 +380,8 @@ ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature(
   root_signatures_bindful_.emplace(index, root_signature);
   return root_signature;
 }
-
+XE_NOINLINE
+XE_COLD
 uint32_t D3D12CommandProcessor::GetRootBindfulExtraParameterIndices(
     const DxbcShader* vertex_shader, const DxbcShader* pixel_shader,
     RootBindfulExtraParameterIndices& indices_out) {
@@ -2484,7 +2485,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
     return false;
   }
   pipeline_cache_->AnalyzeShaderUcode(*vertex_shader);
-  bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
+  const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
   // Pixel shader analysis.
   bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);
@@ -2512,9 +2513,10 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
       return true;
     }
   }
-  bool memexport_used_pixel =
+
+  const bool memexport_used_pixel =
       pixel_shader && pixel_shader->is_valid_memexport_used();
-  bool memexport_used = memexport_used_vertex || memexport_used_pixel;
+  const bool memexport_used = memexport_used_vertex || memexport_used_pixel;
 
   if (!BeginSubmission(true)) {
     return false;
@@ -2639,6 +2641,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
     previous_viewport_info_args_ = gviargs;
     previous_viewport_info_ = viewport_info;
   }
+  // todo: use SIMD for getscissor + scaling here, should reduce code size more
   draw_util::Scissor scissor;
   draw_util::GetScissor(regs, scissor);
   scissor.offset[0] *= draw_resolution_scale_x;
@@ -2711,102 +2714,13 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   // Gather memexport ranges and ensure the heaps for them are resident, and
   // also load the data surrounding the export and to fill the regions that
   // won't be modified by the shaders.
-  struct MemExportRange {
-    uint32_t base_address_dwords;
-    uint32_t size_dwords;
-  };
-  MemExportRange memexport_ranges[512];
-  uint32_t memexport_range_count = 0;
-  if (memexport_used_vertex) {
-    for (uint32_t constant_index :
-         vertex_shader->memexport_stream_constants()) {
-      const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
-          XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4);
-      if (memexport_stream.index_count == 0) {
-        continue;
-      }
-      uint32_t memexport_format_size =
-          GetSupportedMemExportFormatSize(memexport_stream.format);
-      if (memexport_format_size == 0) {
-        XELOGE("Unsupported memexport format {}",
-               FormatInfo::GetName(
-                   xenos::TextureFormat(uint32_t(memexport_stream.format))));
-        return false;
-      }
-      uint32_t memexport_size_dwords =
-          memexport_stream.index_count * memexport_format_size;
-      // Try to reduce the number of shared memory operations when writing
-      // different elements into the same buffer through different exports
-      // (happens in 4D5307E6).
-      bool memexport_range_reused = false;
-      for (uint32_t i = 0; i < memexport_range_count; ++i) {
-        MemExportRange& memexport_range = memexport_ranges[i];
-        if (memexport_range.base_address_dwords ==
-            memexport_stream.base_address) {
-          memexport_range.size_dwords =
-              std::max(memexport_range.size_dwords, memexport_size_dwords);
-          memexport_range_reused = true;
-          break;
-        }
-      }
-      // Add a new range if haven't expanded an existing one.
-      if (!memexport_range_reused) {
-        MemExportRange& memexport_range =
-            memexport_ranges[memexport_range_count++];
-        memexport_range.base_address_dwords = memexport_stream.base_address;
-        memexport_range.size_dwords = memexport_size_dwords;
-      }
-    }
-  }
-  if (memexport_used_pixel) {
-    for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) {
-      const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
-          XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4);
-      if (memexport_stream.index_count == 0) {
-        continue;
-      }
-      uint32_t memexport_format_size =
-          GetSupportedMemExportFormatSize(memexport_stream.format);
-      if (memexport_format_size == 0) {
-        XELOGE("Unsupported memexport format {}",
-               FormatInfo::GetName(
-                   xenos::TextureFormat(uint32_t(memexport_stream.format))));
-        return false;
-      }
-      uint32_t memexport_size_dwords =
-          memexport_stream.index_count * memexport_format_size;
-      bool memexport_range_reused = false;
-      for (uint32_t i = 0; i < memexport_range_count; ++i) {
-        MemExportRange& memexport_range = memexport_ranges[i];
-        if (memexport_range.base_address_dwords ==
-            memexport_stream.base_address) {
-          memexport_range.size_dwords =
-              std::max(memexport_range.size_dwords, memexport_size_dwords);
-          memexport_range_reused = true;
-          break;
-        }
-      }
-      if (!memexport_range_reused) {
-        MemExportRange& memexport_range =
-            memexport_ranges[memexport_range_count++];
-        memexport_range.base_address_dwords = memexport_stream.base_address;
-        memexport_range.size_dwords = memexport_size_dwords;
-      }
-    }
-  }
-  for (uint32_t i = 0; i < memexport_range_count; ++i) {
-    const MemExportRange& memexport_range = memexport_ranges[i];
-    if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2,
-                                      memexport_range.size_dwords << 2)) {
-      XELOGE(
-          "Failed to request memexport stream at 0x{:08X} (size {}) in the "
-          "shared memory",
-          memexport_range.base_address_dwords << 2,
-          memexport_range.size_dwords << 2);
-      return false;
-    }
-  }
+  memexport_range_count_ = 0;
+  if (memexport_used_vertex || memexport_used_pixel) {
+    bool retflag;
+    bool retval = GatherMemexportRangesAndMakeResident(retflag);
+    if (retflag) return retval;
+  }
   // Primitive topology.
   D3D_PRIMITIVE_TOPOLOGY primitive_topology;
   if (primitive_processing_result.IsTessellated()) {
@@ -2876,10 +2790,11 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   // Draw.
   if (primitive_processing_result.index_buffer_type ==
       PrimitiveProcessor::ProcessedIndexBufferType::kNone) {
-    if (memexport_used) {
-      shared_memory_->UseForWriting();
-    } else {
+    if (!memexport_used) {
       shared_memory_->UseForReading();
+
+    } else {
+      shared_memory_->UseForWriting();
     }
     SubmitBarriers();
     deferred_command_list_.D3DDrawInstanced(
@@ -2903,22 +2818,11 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
       // If the shared memory is a UAV, it can't be used as an index buffer
       // (UAV is a read/write state, index buffer is a read-only state).
       // Need to copy the indices to a buffer in the index buffer state.
-      scratch_index_buffer = RequestScratchGPUBuffer(
-          index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST);
-      if (scratch_index_buffer == nullptr) {
-        return false;
-      }
-      shared_memory_->UseAsCopySource();
-      SubmitBarriers();
-      deferred_command_list_.D3DCopyBufferRegion(
-          scratch_index_buffer, 0, shared_memory_->GetBuffer(),
-          primitive_processing_result.guest_index_base,
-          index_buffer_view.SizeInBytes);
-      PushTransitionBarrier(scratch_index_buffer,
-                            D3D12_RESOURCE_STATE_COPY_DEST,
-                            D3D12_RESOURCE_STATE_INDEX_BUFFER);
-      index_buffer_view.BufferLocation =
-          scratch_index_buffer->GetGPUVirtualAddress();
+      bool retflag;
+      bool retval = HandleMemexportGuestDMA(
+          scratch_index_buffer, index_buffer_view,
+          primitive_processing_result.guest_index_base, retflag);
+      if (retflag) return retval;
     } else {
       index_buffer_view.BufferLocation =
           shared_memory_->GetGPUAddress() +
@@ -2956,66 +2860,199 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   }
 
   if (memexport_used) {
-    // Make sure this memexporting draw is ordered with other work using shared
-    // memory as a UAV.
-    // TODO(Triang3l): Find some PM4 command that can be used for indication of
-    // when memexports should be awaited?
-    shared_memory_->MarkUAVWritesCommitNeeded();
-    // Invalidate textures in memexported memory and watch for changes.
-    for (uint32_t i = 0; i < memexport_range_count; ++i) {
-      const MemExportRange& memexport_range = memexport_ranges[i];
-      shared_memory_->RangeWrittenByGpu(
-          memexport_range.base_address_dwords << 2,
-          memexport_range.size_dwords << 2, false);
-    }
-    if (cvars::d3d12_readback_memexport) {
-      // Read the exported data on the CPU.
-      uint32_t memexport_total_size = 0;
-      for (uint32_t i = 0; i < memexport_range_count; ++i) {
-        memexport_total_size += memexport_ranges[i].size_dwords << 2;
+    HandleMemexportDrawOrdering_AndReadback();
+  }
+
+  return true;
+}
+XE_COLD
+XE_NOINLINE
+bool D3D12CommandProcessor::HandleMemexportGuestDMA(
+    ID3D12Resource*& scratch_index_buffer,
+    D3D12_INDEX_BUFFER_VIEW& index_buffer_view, uint32_t guest_index_base,
+    // xe::gpu::PrimitiveProcessor::ProcessingResult&
+    //     primitive_processing_result,
+    bool& retflag) {
+  retflag = true;
+  scratch_index_buffer = RequestScratchGPUBuffer(
+      index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST);
+  if (scratch_index_buffer == nullptr) {
+    return false;
+  }
+  shared_memory_->UseAsCopySource();
+  SubmitBarriers();
+  deferred_command_list_.D3DCopyBufferRegion(
+      scratch_index_buffer, 0, shared_memory_->GetBuffer(), guest_index_base,
+      index_buffer_view.SizeInBytes);
+  PushTransitionBarrier(scratch_index_buffer, D3D12_RESOURCE_STATE_COPY_DEST,
+                        D3D12_RESOURCE_STATE_INDEX_BUFFER);
+  index_buffer_view.BufferLocation =
+      scratch_index_buffer->GetGPUVirtualAddress();
+  retflag = false;
+  return {};
+}
+XE_NOINLINE
+XE_COLD
+bool D3D12CommandProcessor::GatherMemexportRangesAndMakeResident(
+    bool& retflag) {
+  auto vertex_shader = static_cast<D3D12Shader*>(active_vertex_shader());
+  auto pixel_shader = static_cast<D3D12Shader*>(active_pixel_shader());
+  const xe::gpu::RegisterFile& regs = *register_file_;
+  const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
+  const bool memexport_used_pixel =
+      pixel_shader && pixel_shader->is_valid_memexport_used();
+  retflag = true;
+  if (memexport_used_vertex) {
+    for (uint32_t constant_index :
+         vertex_shader->memexport_stream_constants()) {
+      const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
+          XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4);
+      if (memexport_stream.index_count == 0) {
+        continue;
       }
-      if (memexport_total_size != 0) {
-        ID3D12Resource* readback_buffer =
-            RequestReadbackBuffer(memexport_total_size);
-        if (readback_buffer != nullptr) {
-          shared_memory_->UseAsCopySource();
-          SubmitBarriers();
-          ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
-          uint32_t readback_buffer_offset = 0;
-          for (uint32_t i = 0; i < memexport_range_count; ++i) {
-            const MemExportRange& memexport_range = memexport_ranges[i];
-            uint32_t memexport_range_size = memexport_range.size_dwords << 2;
-            deferred_command_list_.D3DCopyBufferRegion(
-                readback_buffer, readback_buffer_offset, shared_memory_buffer,
-                memexport_range.base_address_dwords << 2, memexport_range_size);
-            readback_buffer_offset += memexport_range_size;
-          }
-          if (AwaitAllQueueOperationsCompletion()) {
-            D3D12_RANGE readback_range;
-            readback_range.Begin = 0;
-            readback_range.End = memexport_total_size;
-            void* readback_mapping;
-            if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
-                                               &readback_mapping))) {
-              const uint32_t* readback_dwords =
-                  reinterpret_cast<const uint32_t*>(readback_mapping);
-              for (uint32_t i = 0; i < memexport_range_count; ++i) {
-                const MemExportRange& memexport_range = memexport_ranges[i];
-                std::memcpy(memory_->TranslatePhysical(
                                memexport_range.base_address_dwords << 2),
-                            readback_dwords, memexport_range.size_dwords << 2);
-                readback_dwords += memexport_range.size_dwords;
-              }
-              D3D12_RANGE readback_write_range = {};
-              readback_buffer->Unmap(0, &readback_write_range);
+      uint32_t memexport_format_size =
+          GetSupportedMemExportFormatSize(memexport_stream.format);
+      if (memexport_format_size == 0) {
format {}", + FormatInfo::GetName( + xenos::TextureFormat(uint32_t(memexport_stream.format)))); + return false; + } + uint32_t memexport_size_dwords = + memexport_stream.index_count * memexport_format_size; + // Try to reduce the number of shared memory operations when writing + // different elements into the same buffer through different exports + // (happens in 4D5307E6). + bool memexport_range_reused = false; + for (uint32_t i = 0; i < memexport_range_count_; ++i) { + MemExportRange& memexport_range = memexport_ranges_[i]; + if (memexport_range.base_address_dwords == + memexport_stream.base_address) { + memexport_range.size_dwords = + std::max(memexport_range.size_dwords, memexport_size_dwords); + memexport_range_reused = true; + break; + } + } + // Add a new range if haven't expanded an existing one. + if (!memexport_range_reused) { + MemExportRange& memexport_range = + memexport_ranges_[memexport_range_count_++]; + memexport_range.base_address_dwords = memexport_stream.base_address; + memexport_range.size_dwords = memexport_size_dwords; + } + } + } + if (memexport_used_pixel) { + for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) { + const auto& memexport_stream = regs.Get( + XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4); + if (memexport_stream.index_count == 0) { + continue; + } + uint32_t memexport_format_size = + GetSupportedMemExportFormatSize(memexport_stream.format); + if (memexport_format_size == 0) { + XELOGE("Unsupported memexport format {}", + FormatInfo::GetName( + xenos::TextureFormat(uint32_t(memexport_stream.format)))); + return false; + } + uint32_t memexport_size_dwords = + memexport_stream.index_count * memexport_format_size; + bool memexport_range_reused = false; + for (uint32_t i = 0; i < memexport_range_count_; ++i) { + MemExportRange& memexport_range = memexport_ranges_[i]; + if (memexport_range.base_address_dwords == + memexport_stream.base_address) { + memexport_range.size_dwords = + std::max(memexport_range.size_dwords, memexport_size_dwords); + memexport_range_reused = true; + break; + } + } + if (!memexport_range_reused) { + MemExportRange& memexport_range = + memexport_ranges_[memexport_range_count_++]; + memexport_range.base_address_dwords = memexport_stream.base_address; + memexport_range.size_dwords = memexport_size_dwords; + } + } + } + for (uint32_t i = 0; i < memexport_range_count_; ++i) { + const MemExportRange& memexport_range = memexport_ranges_[i]; + if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2, + memexport_range.size_dwords << 2)) { + XELOGE( + "Failed to request memexport stream at 0x{:08X} (size {}) in the " + "shared memory", + memexport_range.base_address_dwords << 2, + memexport_range.size_dwords << 2); + return false; + } + } + retflag = false; + return {}; +} +XE_NOINLINE +XE_COLD +void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() { + // Make sure this memexporting draw is ordered with other work using shared + // memory as a UAV. + // TODO(Triang3l): Find some PM4 command that can be used for indication of + // when memexports should be awaited? + shared_memory_->MarkUAVWritesCommitNeeded(); + // Invalidate textures in memexported memory and watch for changes. 
+XE_NOINLINE
+XE_COLD
+void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() {
+  // Make sure this memexporting draw is ordered with other work using shared
+  // memory as a UAV.
+  // TODO(Triang3l): Find some PM4 command that can be used for indication of
+  // when memexports should be awaited?
+  shared_memory_->MarkUAVWritesCommitNeeded();
+  // Invalidate textures in memexported memory and watch for changes.
+  for (uint32_t i = 0; i < memexport_range_count_; ++i) {
+    const MemExportRange& memexport_range = memexport_ranges_[i];
+    shared_memory_->RangeWrittenByGpu(memexport_range.base_address_dwords << 2,
+                                      memexport_range.size_dwords << 2, false);
+  }
+  if (cvars::d3d12_readback_memexport) {
+    // Read the exported data on the CPU.
+    uint32_t memexport_total_size = 0;
+    for (uint32_t i = 0; i < memexport_range_count_; ++i) {
+      memexport_total_size += memexport_ranges_[i].size_dwords << 2;
+    }
+    if (memexport_total_size != 0) {
+      ID3D12Resource* readback_buffer =
+          RequestReadbackBuffer(memexport_total_size);
+      if (readback_buffer != nullptr) {
+        shared_memory_->UseAsCopySource();
+        SubmitBarriers();
+        ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
+        uint32_t readback_buffer_offset = 0;
+        for (uint32_t i = 0; i < memexport_range_count_; ++i) {
+          const MemExportRange& memexport_range = memexport_ranges_[i];
+          uint32_t memexport_range_size = memexport_range.size_dwords << 2;
+          deferred_command_list_.D3DCopyBufferRegion(
+              readback_buffer, readback_buffer_offset, shared_memory_buffer,
+              memexport_range.base_address_dwords << 2, memexport_range_size);
+          readback_buffer_offset += memexport_range_size;
+        }
+        if (AwaitAllQueueOperationsCompletion()) {
+          D3D12_RANGE readback_range;
+          readback_range.Begin = 0;
+          readback_range.End = memexport_total_size;
+          void* readback_mapping;
+          if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
+                                             &readback_mapping))) {
+            const uint32_t* readback_dwords =
+                reinterpret_cast<const uint32_t*>(readback_mapping);
+            for (uint32_t i = 0; i < memexport_range_count_; ++i) {
+              const MemExportRange& memexport_range = memexport_ranges_[i];
+              std::memcpy(memory_->TranslatePhysical(
+                              memexport_range.base_address_dwords << 2),
+                          readback_dwords, memexport_range.size_dwords << 2);
+              readback_dwords += memexport_range.size_dwords;
+            }
+            D3D12_RANGE readback_write_range = {};
+            readback_buffer->Unmap(0, &readback_write_range);
+          }
+        }
+      }
+    }
+  }
-
-  return true;
 }
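Both readback paths in this file (memexport above, resolve below) use the same D3D12 idiom: copy into a READBACK-heap buffer, wait for the queue, Map with an explicit read range, memcpy out, then Unmap with an empty written range so the runtime knows the CPU modified nothing. Reduced to its core (error handling elided; a sketch, not a project API):

#include <cstring>

#include <d3d12.h>

static void CopyOutReadback(ID3D12Resource* readback_buffer, void* dst,
                            SIZE_T size) {
  D3D12_RANGE read_range;
  read_range.Begin = 0;
  read_range.End = size;  // the CPU will read [0, size)
  void* mapping = nullptr;
  if (SUCCEEDED(readback_buffer->Map(0, &read_range, &mapping))) {
    std::memcpy(dst, mapping, size);
    D3D12_RANGE written_range = {};  // empty range: nothing written back
    readback_buffer->Unmap(0, &written_range);
  }
}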
 
 void D3D12CommandProcessor::InitializeTrace() {
@@ -3065,23 +3102,33 @@ bool D3D12CommandProcessor::IssueCopy() {
   if (!BeginSubmission(true)) {
     return false;
   }
-  uint32_t written_address, written_length;
-  if (!render_target_cache_->Resolve(*memory_, *shared_memory_, *texture_cache_,
-                                     written_address, written_length)) {
-    return false;
+
+  if (!cvars::d3d12_readback_resolve) {
+    uint32_t written_address, written_length;
+    return render_target_cache_->Resolve(*memory_, *shared_memory_,
+                                         *texture_cache_, written_address,
+                                         written_length);
+  } else {
+    return IssueCopy_ReadbackResolvePath();
   }
-  if (cvars::d3d12_readback_resolve &&
-      !texture_cache_->IsDrawResolutionScaled() && written_length) {
-    // Read the resolved data on the CPU.
-    ID3D12Resource* readback_buffer = RequestReadbackBuffer(written_length);
-    if (readback_buffer != nullptr) {
-      shared_memory_->UseAsCopySource();
-      SubmitBarriers();
-      ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
-      deferred_command_list_.D3DCopyBufferRegion(
-          readback_buffer, 0, shared_memory_buffer, written_address,
-          written_length);
-      if (AwaitAllQueueOperationsCompletion()) {
+  return true;
+}
+XE_NOINLINE
+bool D3D12CommandProcessor::IssueCopy_ReadbackResolvePath() {
+  uint32_t written_address, written_length;
+  if (render_target_cache_->Resolve(*memory_, *shared_memory_, *texture_cache_,
+                                    written_address, written_length)) {
+    if (!texture_cache_->IsDrawResolutionScaled() && written_length) {
+      // Read the resolved data on the CPU.
+      ID3D12Resource* readback_buffer = RequestReadbackBuffer(written_length);
+      if (readback_buffer != nullptr) {
+        shared_memory_->UseAsCopySource();
+        SubmitBarriers();
+        ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
+        deferred_command_list_.D3DCopyBufferRegion(
+            readback_buffer, 0, shared_memory_buffer, written_address,
+            written_length);
+        if (AwaitAllQueueOperationsCompletion()) {
 #if 1
           D3D12_RANGE readback_range;
           readback_range.Begin = 0;
@@ -3099,23 +3146,25 @@ bool D3D12CommandProcessor::IssueCopy() {
           }
 #else
-        dma::XeDMAJob job{};
-        job.destination = memory_->TranslatePhysical(written_address);
-        job.size = written_length;
-        job.source = nullptr;
-        job.userdata1 = (void*)readback_buffer;
-        job.precall = DmaPrefunc;
-        job.postcall = DmaPostfunc;
+          dma::XeDMAJob job{};
+          job.destination = memory_->TranslatePhysical(written_address);
+          job.size = written_length;
+          job.source = nullptr;
+          job.userdata1 = (void*)readback_buffer;
+          job.precall = DmaPrefunc;
+          job.postcall = DmaPostfunc;
 
-        readback_available_ = GetDMAC()->PushDMAJob(&job);
+          readback_available_ = GetDMAC()->PushDMAJob(&job);
 #endif
+        }
       }
     }
+  } else {
+    return false;
   }
   return true;
 }
-
 void D3D12CommandProcessor::CheckSubmissionFence(uint64_t await_submission) {
   if (await_submission >= submission_current_) {
     if (submission_open_) {
@@ -4707,195 +4756,11 @@ bool D3D12CommandProcessor::UpdateBindings(
           ~(1u << kRootParameter_Bindless_DescriptorIndicesPixel);
     }
   } else {
-    //
-    // Bindful descriptors path.
-    //
-
-    // See what descriptors need to be updated.
-    // Samplers have already been checked.
-    bool write_textures_vertex =
-        texture_count_vertex &&
-        (!bindful_textures_written_vertex_ ||
-         current_texture_layout_uid_vertex_ != texture_layout_uid_vertex ||
-         !texture_cache_->AreActiveTextureSRVKeysUpToDate(
-             current_texture_srv_keys_vertex_.data(), textures_vertex.data(),
-             texture_count_vertex));
-    bool write_textures_pixel =
-        texture_count_pixel &&
-        (!bindful_textures_written_pixel_ ||
-         current_texture_layout_uid_pixel_ != texture_layout_uid_pixel ||
-         !texture_cache_->AreActiveTextureSRVKeysUpToDate(
-             current_texture_srv_keys_pixel_.data(), textures_pixel->data(),
-             texture_count_pixel));
-    bool write_samplers_vertex =
-        sampler_count_vertex && !bindful_samplers_written_vertex_;
-    bool write_samplers_pixel =
-        sampler_count_pixel && !bindful_samplers_written_pixel_;
-    bool edram_rov_used = render_target_cache_->GetPath() ==
-                          RenderTargetCache::Path::kPixelShaderInterlock;
-
-    // Allocate the descriptors.
-    size_t view_count_partial_update = 0;
-    if (write_textures_vertex) {
-      view_count_partial_update += texture_count_vertex;
-    }
-    if (write_textures_pixel) {
-      view_count_partial_update += texture_count_pixel;
-    }
-    // All the constants + shared memory SRV and UAV + textures.
-    size_t view_count_full_update =
-        2 + texture_count_vertex + texture_count_pixel;
-    if (edram_rov_used) {
-      // + EDRAM UAV.
-      ++view_count_full_update;
-    }
-    D3D12_CPU_DESCRIPTOR_HANDLE view_cpu_handle;
-    D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle;
-    uint32_t descriptor_size_view = provider.GetViewDescriptorSize();
-    uint64_t view_heap_index = RequestViewBindfulDescriptors(
-        draw_view_bindful_heap_index_, uint32_t(view_count_partial_update),
-        uint32_t(view_count_full_update), view_cpu_handle, view_gpu_handle);
-    if (view_heap_index ==
-        ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) {
-      XELOGE("Failed to allocate view descriptors");
-      return false;
-    }
-    size_t sampler_count_partial_update = 0;
-    if (write_samplers_vertex) {
-      sampler_count_partial_update += sampler_count_vertex;
-    }
-    if (write_samplers_pixel) {
-      sampler_count_partial_update += sampler_count_pixel;
-    }
-    D3D12_CPU_DESCRIPTOR_HANDLE sampler_cpu_handle = {};
-    D3D12_GPU_DESCRIPTOR_HANDLE sampler_gpu_handle = {};
-    uint32_t descriptor_size_sampler = provider.GetSamplerDescriptorSize();
-    uint64_t sampler_heap_index =
-        ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid;
-    if (sampler_count_vertex != 0 || sampler_count_pixel != 0) {
-      sampler_heap_index = RequestSamplerBindfulDescriptors(
-          draw_sampler_bindful_heap_index_,
-          uint32_t(sampler_count_partial_update),
-          uint32_t(sampler_count_vertex + sampler_count_pixel),
-          sampler_cpu_handle, sampler_gpu_handle);
-      if (sampler_heap_index ==
-          ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) {
-        XELOGE("Failed to allocate sampler descriptors");
-        return false;
-      }
-    }
-    if (draw_view_bindful_heap_index_ != view_heap_index) {
-      // Need to update all view descriptors.
-      write_textures_vertex = texture_count_vertex != 0;
-      write_textures_pixel = texture_count_pixel != 0;
-      bindful_textures_written_vertex_ = false;
-      bindful_textures_written_pixel_ = false;
-      // If updating fully, write the shared memory SRV and UAV descriptors and,
-      // if needed, the EDRAM descriptor.
-      gpu_handle_shared_memory_and_edram_ = view_gpu_handle;
-      shared_memory_->WriteRawSRVDescriptor(view_cpu_handle);
-      view_cpu_handle.ptr += descriptor_size_view;
-      view_gpu_handle.ptr += descriptor_size_view;
-      shared_memory_->WriteRawUAVDescriptor(view_cpu_handle);
-      view_cpu_handle.ptr += descriptor_size_view;
-      view_gpu_handle.ptr += descriptor_size_view;
-      if (edram_rov_used) {
-        render_target_cache_->WriteEdramUintPow2UAVDescriptor(view_cpu_handle,
-                                                              2);
-        view_cpu_handle.ptr += descriptor_size_view;
-        view_gpu_handle.ptr += descriptor_size_view;
-      }
-      current_graphics_root_up_to_date_ &=
-          ~(1u << kRootParameter_Bindful_SharedMemoryAndEdram);
-    }
-    if (sampler_heap_index !=
-            ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid &&
-        draw_sampler_bindful_heap_index_ != sampler_heap_index) {
-      write_samplers_vertex = sampler_count_vertex != 0;
-      write_samplers_pixel = sampler_count_pixel != 0;
-      bindful_samplers_written_vertex_ = false;
-      bindful_samplers_written_pixel_ = false;
-    }
-
-    // Write the descriptors.
-    if (write_textures_vertex) {
-      assert_true(current_graphics_root_bindful_extras_.textures_vertex !=
                  RootBindfulExtraParameterIndices::kUnavailable);
-      gpu_handle_textures_vertex_ = view_gpu_handle;
-      for (size_t i = 0; i < texture_count_vertex; ++i) {
-        texture_cache_->WriteActiveTextureBindfulSRV(textures_vertex[i],
-                                                     view_cpu_handle);
-        view_cpu_handle.ptr += descriptor_size_view;
-        view_gpu_handle.ptr += descriptor_size_view;
-      }
-      current_texture_layout_uid_vertex_ = texture_layout_uid_vertex;
-      current_texture_srv_keys_vertex_.resize(
-          std::max(current_texture_srv_keys_vertex_.size(),
-                   size_t(texture_count_vertex)));
-      texture_cache_->WriteActiveTextureSRVKeys(
-          current_texture_srv_keys_vertex_.data(), textures_vertex.data(),
-          texture_count_vertex);
-      bindful_textures_written_vertex_ = true;
-      current_graphics_root_up_to_date_ &=
-          ~(1u << current_graphics_root_bindful_extras_.textures_vertex);
-    }
-    if (write_textures_pixel) {
-      assert_true(current_graphics_root_bindful_extras_.textures_pixel !=
-                  RootBindfulExtraParameterIndices::kUnavailable);
-      gpu_handle_textures_pixel_ = view_gpu_handle;
-      for (size_t i = 0; i < texture_count_pixel; ++i) {
-        texture_cache_->WriteActiveTextureBindfulSRV((*textures_pixel)[i],
-                                                     view_cpu_handle);
-        view_cpu_handle.ptr += descriptor_size_view;
-        view_gpu_handle.ptr += descriptor_size_view;
-      }
-      current_texture_layout_uid_pixel_ = texture_layout_uid_pixel;
-      current_texture_srv_keys_pixel_.resize(std::max(
-          current_texture_srv_keys_pixel_.size(), size_t(texture_count_pixel)));
-      texture_cache_->WriteActiveTextureSRVKeys(
-          current_texture_srv_keys_pixel_.data(), textures_pixel->data(),
-          texture_count_pixel);
-      bindful_textures_written_pixel_ = true;
-      current_graphics_root_up_to_date_ &=
-          ~(1u << current_graphics_root_bindful_extras_.textures_pixel);
-    }
-    if (write_samplers_vertex) {
-      assert_true(current_graphics_root_bindful_extras_.samplers_vertex !=
-                  RootBindfulExtraParameterIndices::kUnavailable);
-      gpu_handle_samplers_vertex_ = sampler_gpu_handle;
-      for (size_t i = 0; i < sampler_count_vertex; ++i) {
-        texture_cache_->WriteSampler(current_samplers_vertex_[i],
-                                     sampler_cpu_handle);
-        sampler_cpu_handle.ptr += descriptor_size_sampler;
-        sampler_gpu_handle.ptr += descriptor_size_sampler;
-      }
-      // Current samplers have already been updated.
-      bindful_samplers_written_vertex_ = true;
-      current_graphics_root_up_to_date_ &=
-          ~(1u << current_graphics_root_bindful_extras_.samplers_vertex);
-    }
-    if (write_samplers_pixel) {
-      assert_true(current_graphics_root_bindful_extras_.samplers_pixel !=
-                  RootBindfulExtraParameterIndices::kUnavailable);
-      gpu_handle_samplers_pixel_ = sampler_gpu_handle;
-      for (size_t i = 0; i < sampler_count_pixel; ++i) {
-        texture_cache_->WriteSampler(current_samplers_pixel_[i],
-                                     sampler_cpu_handle);
-        sampler_cpu_handle.ptr += descriptor_size_sampler;
-        sampler_gpu_handle.ptr += descriptor_size_sampler;
-      }
-      // Current samplers have already been updated.
-      bindful_samplers_written_pixel_ = true;
-      current_graphics_root_up_to_date_ &=
-          ~(1u << current_graphics_root_bindful_extras_.samplers_pixel);
-    }
-
-    // Wrote new descriptors on the current page.
-    draw_view_bindful_heap_index_ = view_heap_index;
-    if (sampler_heap_index !=
-        ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) {
-      draw_sampler_bindful_heap_index_ = sampler_heap_index;
-    }
+    bool retflag;
+    bool retval = UpdateBindings_BindfulPath(
+        texture_layout_uid_vertex, textures_vertex, texture_layout_uid_pixel,
+        textures_pixel, sampler_count_vertex, sampler_count_pixel, retflag);
+    if (retflag) return retval;
   }
 
   // Update the root parameters.
@@ -4967,47 +4832,255 @@ bool D3D12CommandProcessor::UpdateBindings(
                                        << kRootParameter_Bindless_ViewHeap;
     }
   } else {
-    if (!(current_graphics_root_up_to_date_ &
-          (1u << kRootParameter_Bindful_SharedMemoryAndEdram))) {
-      deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
-          kRootParameter_Bindful_SharedMemoryAndEdram,
-          gpu_handle_shared_memory_and_edram_);
-      current_graphics_root_up_to_date_ |=
-          1u << kRootParameter_Bindful_SharedMemoryAndEdram;
-    }
-    uint32_t extra_index;
-    extra_index = current_graphics_root_bindful_extras_.textures_pixel;
-    if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
-        !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
-      deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
-          extra_index, gpu_handle_textures_pixel_);
-      current_graphics_root_up_to_date_ |= 1u << extra_index;
-    }
-    extra_index = current_graphics_root_bindful_extras_.samplers_pixel;
-    if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
-        !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
-      deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
-          extra_index, gpu_handle_samplers_pixel_);
-      current_graphics_root_up_to_date_ |= 1u << extra_index;
-    }
-    extra_index = current_graphics_root_bindful_extras_.textures_vertex;
-    if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
-        !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
-      deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
-          extra_index, gpu_handle_textures_vertex_);
-      current_graphics_root_up_to_date_ |= 1u << extra_index;
-    }
-    extra_index = current_graphics_root_bindful_extras_.samplers_vertex;
-    if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
-        !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
-      deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
-          extra_index, gpu_handle_samplers_vertex_);
-      current_graphics_root_up_to_date_ |= 1u << extra_index;
-    }
+    UpdateBindings_UpdateRootBindful();
   }
 
   return true;
 }
+XE_COLD
+XE_NOINLINE
+void D3D12CommandProcessor::UpdateBindings_UpdateRootBindful() {
+  if (!(current_graphics_root_up_to_date_ &
+        (1u << kRootParameter_Bindful_SharedMemoryAndEdram))) {
+    deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
+        kRootParameter_Bindful_SharedMemoryAndEdram,
+        gpu_handle_shared_memory_and_edram_);
+    current_graphics_root_up_to_date_ |=
+        1u << kRootParameter_Bindful_SharedMemoryAndEdram;
+  }
+  uint32_t extra_index;
+  extra_index = current_graphics_root_bindful_extras_.textures_pixel;
+  if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
+      !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
+    deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
+        extra_index, gpu_handle_textures_pixel_);
+    current_graphics_root_up_to_date_ |= 1u << extra_index;
+  }
+  extra_index = current_graphics_root_bindful_extras_.samplers_pixel;
+  if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
+      !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
+    deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
+        extra_index, gpu_handle_samplers_pixel_);
+    current_graphics_root_up_to_date_ |= 1u << extra_index;
+  }
+  extra_index = current_graphics_root_bindful_extras_.textures_vertex;
+  if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
+      !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
+    deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
+        extra_index, gpu_handle_textures_vertex_);
+    current_graphics_root_up_to_date_ |= 1u << extra_index;
+  }
+  extra_index = current_graphics_root_bindful_extras_.samplers_vertex;
+  if (extra_index != RootBindfulExtraParameterIndices::kUnavailable &&
+      !(current_graphics_root_up_to_date_ & (1u << extra_index))) {
+    deferred_command_list_.D3DSetGraphicsRootDescriptorTable(
+        extra_index, gpu_handle_samplers_vertex_);
+    current_graphics_root_up_to_date_ |= 1u << extra_index;
+  }
+}
+XE_NOINLINE
+XE_COLD
+bool D3D12CommandProcessor::UpdateBindings_BindfulPath(
+    const size_t texture_layout_uid_vertex,
+    const std::vector<DxbcShader::TextureBinding>& textures_vertex,
+    const size_t texture_layout_uid_pixel,
+    const std::vector<DxbcShader::TextureBinding>* textures_pixel,
+    const size_t sampler_count_vertex, const size_t sampler_count_pixel,
+    bool& retflag) {
+  retflag = true;
+  auto& provider = this->GetD3D12Provider();
+  size_t texture_count_pixel = textures_pixel->size();
+  size_t texture_count_vertex = textures_vertex.size();
+  //
+  // Bindful descriptors path.
+  //
+
+  // See what descriptors need to be updated.
+  // Samplers have already been checked.
+  bool write_textures_vertex =
+      texture_count_vertex &&
+      (!bindful_textures_written_vertex_ ||
+       current_texture_layout_uid_vertex_ != texture_layout_uid_vertex ||
+       !texture_cache_->AreActiveTextureSRVKeysUpToDate(
+           current_texture_srv_keys_vertex_.data(), textures_vertex.data(),
+           texture_count_vertex));
+  bool write_textures_pixel =
+      texture_count_pixel &&
+      (!bindful_textures_written_pixel_ ||
+       current_texture_layout_uid_pixel_ != texture_layout_uid_pixel ||
+       !texture_cache_->AreActiveTextureSRVKeysUpToDate(
+           current_texture_srv_keys_pixel_.data(), textures_pixel->data(),
+           texture_count_pixel));
+  bool write_samplers_vertex =
+      sampler_count_vertex && !bindful_samplers_written_vertex_;
+  bool write_samplers_pixel =
+      sampler_count_pixel && !bindful_samplers_written_pixel_;
+  bool edram_rov_used = render_target_cache_->GetPath() ==
+                        RenderTargetCache::Path::kPixelShaderInterlock;
+
+  // Allocate the descriptors.
+  size_t view_count_partial_update = 0;
+  if (write_textures_vertex) {
+    view_count_partial_update += texture_count_vertex;
+  }
+  if (write_textures_pixel) {
+    view_count_partial_update += texture_count_pixel;
+  }
+  // All the constants + shared memory SRV and UAV + textures.
+  size_t view_count_full_update =
+      2 + texture_count_vertex + texture_count_pixel;
+  if (edram_rov_used) {
+    // + EDRAM UAV.
+ ++view_count_full_update; + } + D3D12_CPU_DESCRIPTOR_HANDLE view_cpu_handle; + D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle; + uint32_t descriptor_size_view = provider.GetViewDescriptorSize(); + uint64_t view_heap_index = RequestViewBindfulDescriptors( + draw_view_bindful_heap_index_, uint32_t(view_count_partial_update), + uint32_t(view_count_full_update), view_cpu_handle, view_gpu_handle); + if (view_heap_index == + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { + XELOGE("Failed to allocate view descriptors"); + return false; + } + size_t sampler_count_partial_update = 0; + if (write_samplers_vertex) { + sampler_count_partial_update += sampler_count_vertex; + } + if (write_samplers_pixel) { + sampler_count_partial_update += sampler_count_pixel; + } + D3D12_CPU_DESCRIPTOR_HANDLE sampler_cpu_handle = {}; + D3D12_GPU_DESCRIPTOR_HANDLE sampler_gpu_handle = {}; + uint32_t descriptor_size_sampler = provider.GetSamplerDescriptorSize(); + uint64_t sampler_heap_index = + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid; + if (sampler_count_vertex != 0 || sampler_count_pixel != 0) { + sampler_heap_index = RequestSamplerBindfulDescriptors( + draw_sampler_bindful_heap_index_, + uint32_t(sampler_count_partial_update), + uint32_t(sampler_count_vertex + sampler_count_pixel), + sampler_cpu_handle, sampler_gpu_handle); + if (sampler_heap_index == + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { + XELOGE("Failed to allocate sampler descriptors"); + return false; + } + } + if (draw_view_bindful_heap_index_ != view_heap_index) { + // Need to update all view descriptors. + write_textures_vertex = texture_count_vertex != 0; + write_textures_pixel = texture_count_pixel != 0; + bindful_textures_written_vertex_ = false; + bindful_textures_written_pixel_ = false; + // If updating fully, write the shared memory SRV and UAV descriptors and, + // if needed, the EDRAM descriptor. + gpu_handle_shared_memory_and_edram_ = view_gpu_handle; + shared_memory_->WriteRawSRVDescriptor(view_cpu_handle); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + shared_memory_->WriteRawUAVDescriptor(view_cpu_handle); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + if (edram_rov_used) { + render_target_cache_->WriteEdramUintPow2UAVDescriptor(view_cpu_handle, 2); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + } + current_graphics_root_up_to_date_ &= + ~(1u << kRootParameter_Bindful_SharedMemoryAndEdram); + } + if (sampler_heap_index != + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid && + draw_sampler_bindful_heap_index_ != sampler_heap_index) { + write_samplers_vertex = sampler_count_vertex != 0; + write_samplers_pixel = sampler_count_pixel != 0; + bindful_samplers_written_vertex_ = false; + bindful_samplers_written_pixel_ = false; + } + + // Write the descriptors. 
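The descriptor writes that follow all use one idiom: a descriptor is written through the CPU handle, then both the CPU and GPU handles advance by the driver-reported increment (descriptor_size_view / descriptor_size_sampler), so the GPU handle recorded for the root table always matches what was just written. A minimal sketch of that lockstep pattern; DescriptorWriter is a hypothetical illustration, not a type in this patch:

#include <cstdint>
#include <d3d12.h>

struct DescriptorWriter {
  D3D12_CPU_DESCRIPTOR_HANDLE cpu;  // staging location descriptors are written to
  D3D12_GPU_DESCRIPTOR_HANDLE gpu;  // address later bound to the root table
  uint32_t stride;                  // e.g. provider.GetViewDescriptorSize()
  void Advance() {
    // Both views of the heap cursor move together, one descriptor at a time.
    cpu.ptr += stride;
    gpu.ptr += stride;
  }
};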
+ if (write_textures_vertex) { + assert_true(current_graphics_root_bindful_extras_.textures_vertex != + RootBindfulExtraParameterIndices::kUnavailable); + gpu_handle_textures_vertex_ = view_gpu_handle; + for (size_t i = 0; i < texture_count_vertex; ++i) { + texture_cache_->WriteActiveTextureBindfulSRV(textures_vertex[i], + view_cpu_handle); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + } + current_texture_layout_uid_vertex_ = texture_layout_uid_vertex; + current_texture_srv_keys_vertex_.resize(std::max( + current_texture_srv_keys_vertex_.size(), size_t(texture_count_vertex))); + texture_cache_->WriteActiveTextureSRVKeys( + current_texture_srv_keys_vertex_.data(), textures_vertex.data(), + texture_count_vertex); + bindful_textures_written_vertex_ = true; + current_graphics_root_up_to_date_ &= + ~(1u << current_graphics_root_bindful_extras_.textures_vertex); + } + if (write_textures_pixel) { + assert_true(current_graphics_root_bindful_extras_.textures_pixel != + RootBindfulExtraParameterIndices::kUnavailable); + gpu_handle_textures_pixel_ = view_gpu_handle; + for (size_t i = 0; i < texture_count_pixel; ++i) { + texture_cache_->WriteActiveTextureBindfulSRV((*textures_pixel)[i], + view_cpu_handle); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + } + current_texture_layout_uid_pixel_ = texture_layout_uid_pixel; + current_texture_srv_keys_pixel_.resize(std::max( + current_texture_srv_keys_pixel_.size(), size_t(texture_count_pixel))); + texture_cache_->WriteActiveTextureSRVKeys( + current_texture_srv_keys_pixel_.data(), textures_pixel->data(), + texture_count_pixel); + bindful_textures_written_pixel_ = true; + current_graphics_root_up_to_date_ &= + ~(1u << current_graphics_root_bindful_extras_.textures_pixel); + } + if (write_samplers_vertex) { + assert_true(current_graphics_root_bindful_extras_.samplers_vertex != + RootBindfulExtraParameterIndices::kUnavailable); + gpu_handle_samplers_vertex_ = sampler_gpu_handle; + for (size_t i = 0; i < sampler_count_vertex; ++i) { + texture_cache_->WriteSampler(current_samplers_vertex_[i], + sampler_cpu_handle); + sampler_cpu_handle.ptr += descriptor_size_sampler; + sampler_gpu_handle.ptr += descriptor_size_sampler; + } + // Current samplers have already been updated. + bindful_samplers_written_vertex_ = true; + current_graphics_root_up_to_date_ &= + ~(1u << current_graphics_root_bindful_extras_.samplers_vertex); + } + if (write_samplers_pixel) { + assert_true(current_graphics_root_bindful_extras_.samplers_pixel != + RootBindfulExtraParameterIndices::kUnavailable); + gpu_handle_samplers_pixel_ = sampler_gpu_handle; + for (size_t i = 0; i < sampler_count_pixel; ++i) { + texture_cache_->WriteSampler(current_samplers_pixel_[i], + sampler_cpu_handle); + sampler_cpu_handle.ptr += descriptor_size_sampler; + sampler_gpu_handle.ptr += descriptor_size_sampler; + } + // Current samplers have already been updated. + bindful_samplers_written_pixel_ = true; + current_graphics_root_up_to_date_ &= + ~(1u << current_graphics_root_bindful_extras_.samplers_pixel); + } + + // Wrote new descriptors on the current page. 
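Each current_graphics_root_up_to_date_ &= ~(1u << ...) in the writes above is the invalidation half of a plain dirty-bit scheme over root parameters; UpdateBindings_UpdateRootBindful() is the flush half, testing and re-setting the same bits before touching the command list. The idiom in isolation, with hypothetical helper names:

#include <cstdint>

// Bit i set => root parameter i on the command list already matches our state.
inline void InvalidateRootParam(uint32_t& up_to_date, uint32_t index) {
  up_to_date &= ~(1u << index);  // forces a rebind on the next flush
}
inline bool NeedsRebind(uint32_t up_to_date, uint32_t index) {
  return (up_to_date & (1u << index)) == 0;
}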
+ draw_view_bindful_heap_index_ = view_heap_index; + if (sampler_heap_index != + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { + draw_sampler_bindful_heap_index_ = sampler_heap_index; + } + retflag = false; + return {}; +} uint32_t D3D12CommandProcessor::GetSupportedMemExportFormatSize( xenos::ColorFormat format) { @@ -5043,7 +5116,7 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { if (size == 0) { return nullptr; } -#if 0 +#if 1 if (readback_available_) { GetDMAC()->WaitJobDone(readback_available_); readback_available_ = 0; diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 37d048d29..ba2c17a82 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -45,7 +45,10 @@ namespace xe { namespace gpu { namespace d3d12 { - +struct MemExportRange { + uint32_t base_address_dwords; + uint32_t size_dwords; +}; class D3D12CommandProcessor final : public CommandProcessor { public: #include "../pm4_command_processor_declare.h" @@ -287,8 +290,21 @@ class D3D12CommandProcessor final : public CommandProcessor { bool IssueDraw(xenos::PrimitiveType primitive_type, uint32_t index_count, IndexBufferInfo* index_buffer_info, bool major_mode_explicit) override; + XE_COLD + XE_NOINLINE + bool HandleMemexportGuestDMA(ID3D12Resource*& scratch_index_buffer, + D3D12_INDEX_BUFFER_VIEW& index_buffer_view, + uint32_t guest_index_base, + bool& retflag); + XE_NOINLINE + XE_COLD + bool GatherMemexportRangesAndMakeResident(bool& retflag); + XE_NOINLINE + XE_COLD + void HandleMemexportDrawOrdering_AndReadback(); bool IssueCopy() override; - + XE_NOINLINE + bool IssueCopy_ReadbackResolvePath(); void InitializeTrace() override; private: @@ -363,6 +379,8 @@ class D3D12CommandProcessor final : public CommandProcessor { }; // Gets the indices of optional root parameters. Returns the total parameter // count. 
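UpdateBindings_BindfulPath() above illustrates the outlining pattern this patch applies in several places: a rarely-taken, register-heavy path is moved into an XE_NOINLINE/XE_COLD function, and a retflag out-parameter tells the hot caller whether the outlined code already decided the function's return value (the descriptor-allocation failures returning false) or whether the caller should fall through. A reduced sketch of the shape, with hypothetical names:

XE_NOINLINE
XE_COLD
static bool ColdPath(bool& retflag) {
  retflag = true;            // assume we produce the caller's return value
  if (/* e.g. allocation failed */ false) {
    return false;            // caller returns this immediately
  }
  retflag = false;           // caller should continue on the hot path
  return {};                 // value ignored when retflag is false
}

bool HotFunction() {
  bool retflag;
  bool retval = ColdPath(retflag);
  if (retflag) return retval;
  // ...hot path continues...
  return true;
}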
+ XE_NOINLINE + XE_COLD static uint32_t GetRootBindfulExtraParameterIndices( const DxbcShader* vertex_shader, const DxbcShader* pixel_shader, RootBindfulExtraParameterIndices& indices_out); @@ -437,6 +455,18 @@ class D3D12CommandProcessor final : public CommandProcessor { bool UpdateBindings(const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader, ID3D12RootSignature* root_signature); + XE_COLD + XE_NOINLINE + void UpdateBindings_UpdateRootBindful(); + XE_NOINLINE + XE_COLD + bool UpdateBindings_BindfulPath( + const size_t texture_layout_uid_vertex, + const std::vector& textures_vertex, + const size_t texture_layout_uid_pixel, + const std::vector* textures_pixel, + const size_t sampler_count_vertex, const size_t sampler_count_pixel, + bool& retflag); // Returns dword count for one element for a memexport format, or 0 if it's // not supported by the D3D12 command processor (if it's smaller that 1 dword, @@ -743,6 +773,9 @@ class D3D12CommandProcessor final : public CommandProcessor { draw_util::GetViewportInfoArgs previous_viewport_info_args_; draw_util::ViewportInfo previous_viewport_info_; + // scratch memexport data + MemExportRange memexport_ranges_[512]; + uint32_t memexport_range_count_ = 0; }; } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/deferred_command_list.cc b/src/xenia/gpu/d3d12/deferred_command_list.cc index c27c8b226..0d647331f 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.cc +++ b/src/xenia/gpu/d3d12/deferred_command_list.cc @@ -266,22 +266,9 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, void* DeferredCommandList::WriteCommand(Command command, size_t arguments_size_bytes) { - size_t arguments_size_elements = round_up(arguments_size_bytes, sizeof(uintmax_t), false); //(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t); #if 0 - size_t offset = command_stream_.size(); - command_stream_.resize(offset + kCommandHeaderSizeElements + - arguments_size_elements); - CommandHeader& header = - *reinterpret_cast<CommandHeader*>(command_stream_.data() + offset); - header.command = command; - header.arguments_size_elements = uint32_t(arguments_size_elements); - return command_stream_.data() + (offset + kCommandHeaderSizeElements); - #else - size_t offset = command_stream_.size(); constexpr size_t kCommandHeaderSizeBytes = kCommandHeaderSizeElements * sizeof(uintmax_t); @@ -290,9 +277,9 @@ void* DeferredCommandList::WriteCommand(Command command, CommandHeader& header = *reinterpret_cast<CommandHeader*>(command_stream_.data() + offset); header.command = command; - header.arguments_size_elements = uint32_t(arguments_size_elements) / sizeof(uintmax_t); + header.arguments_size_elements = + uint32_t(arguments_size_elements) / sizeof(uintmax_t); return command_stream_.data() + (offset + kCommandHeaderSizeBytes); - #endif } } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index d9914e566..29501b299 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -183,7 +183,7 @@ void PipelineCache::Shutdown() { // creating them. if (!creation_threads_.empty()) { { - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); creation_threads_shutdown_from_ = 0; } creation_request_cond_.notify_all(); @@ -681,7 +681,7 @@ void PipelineCache::InitializeShaderStorage( if (!creation_threads_.empty()) { // Submit the pipeline for creation to any available thread.
{ - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); creation_queue_.push_back(new_pipeline); } creation_request_cond_.notify_one(); @@ -695,7 +695,7 @@ void PipelineCache::InitializeShaderStorage( CreateQueuedPipelinesOnProcessorThread(); if (creation_threads_.size() > creation_thread_original_count) { { - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); creation_threads_shutdown_from_ = creation_thread_original_count; // Assuming the queue is empty because of // CreateQueuedPipelinesOnProcessorThread. @@ -708,7 +708,7 @@ void PipelineCache::InitializeShaderStorage( bool await_creation_completion_event; { // Cleanup so additional threads can be created later again. - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); creation_threads_shutdown_from_ = SIZE_MAX; // If the invocation is blocking, all the shader storage // initialization is expected to be done before proceeding, to avoid @@ -813,7 +813,7 @@ void PipelineCache::EndSubmission() { // Await creation of all queued pipelines. bool await_creation_completion_event; { - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); // Assuming the creation queue is already empty (because the processor // thread also worked on creating the leftover pipelines), so only check // if there are threads with pipelines currently being created. @@ -834,7 +834,7 @@ bool PipelineCache::IsCreatingPipelines() { if (creation_threads_.empty()) { return false; } - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); return !creation_queue_.empty() || creation_threads_busy_ != 0; } @@ -1076,7 +1076,7 @@ bool PipelineCache::ConfigurePipeline( if (!creation_threads_.empty()) { // Submit the pipeline for creation to any available thread. { - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); creation_queue_.push_back(new_pipeline); } creation_request_cond_.notify_one(); @@ -3314,7 +3314,7 @@ void PipelineCache::CreationThread(size_t thread_index) { // Check if need to shut down or set the completion event and dequeue the // pipeline if there is any. { - std::unique_lock<std::mutex> lock(creation_request_lock_); + std::unique_lock<xe_mutex> lock(creation_request_lock_); if (thread_index >= creation_threads_shutdown_from_ || creation_queue_.empty()) { if (creation_completion_set_event_ && creation_threads_busy_ == 0) { @@ -3345,7 +3345,7 @@ void PipelineCache::CreationThread(size_t thread_index) { // completion event if needed (at the next iteration, or in some other // thread). { - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); --creation_threads_busy_; } } @@ -3356,7 +3356,7 @@ void PipelineCache::CreateQueuedPipelinesOnProcessorThread() { while (true) { Pipeline* pipeline_to_create; { - std::lock_guard<std::mutex> lock(creation_request_lock_); + std::lock_guard<xe_mutex> lock(creation_request_lock_); if (creation_queue_.empty()) { break; } diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index 37e73cae4..43e528d35 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -403,8 +403,8 @@ class PipelineCache { // Pipeline creation threads.
void CreationThread(size_t thread_index); void CreateQueuedPipelinesOnProcessorThread(); - std::mutex creation_request_lock_; - std::condition_variable creation_request_cond_; + xe_mutex creation_request_lock_; + std::condition_variable_any creation_request_cond_; // Protected with creation_request_lock_, notify_one creation_request_cond_ // when set. std::deque<Pipeline*> creation_queue_; diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 24b1eefdc..5c62c50c3 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -650,7 +650,8 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs, } return normalized_color_mask; } - +XE_NOINLINE +XE_NOALIAS xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth) { @@ -737,7 +738,7 @@ const ResolveCopyShaderInfo {"Resolve Copy Full 64bpp", true, 2, 4, 5, 3}, {"Resolve Copy Full 128bpp", true, 2, 4, 4, 3}, }; - +XE_MSVC_OPTIMIZE_SMALL() bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, TraceWriter& trace_writer, uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y, @@ -869,7 +870,8 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, y1 = y0 + int32_t(xenos::kMaxResolveSize); } // fails in forza horizon 1 - assert_true(x0 < x1 && y0 < y1); + // x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100 + assert_true(x0 <= x1 && y0 <= y1); if (x0 >= x1 || y0 >= y1) { XELOGE("Resolve region is empty"); return false; @@ -1108,7 +1110,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32; info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32; info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32; - + #if 0 XELOGD( "Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially " "modified memory range 0x{:08X} to 0x{:08X})", @@ -1119,10 +1121,10 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, xenos::ColorRenderTargetFormat(color_edram_info.format)), FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start, copy_dest_extent_end); - + #endif return true; } - +XE_MSVC_OPTIMIZE_REVERT() ResolveCopyShaderIndex ResolveInfo::GetCopyShader( uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y, ResolveCopyShaderConstants& constants_out, uint32_t& group_count_x_out, diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index 420bafcf2..15c014520 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -475,6 +475,8 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA( // To avoid passing values that the shader won't understand (even though // Direct3D 9 shouldn't pass them anyway).
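The condition-variable change in the pipeline_cache.h hunk above is forced by the mutex change next to it: std::condition_variable::wait() only accepts std::unique_lock<std::mutex>, so once creation_request_lock_ becomes an xe_mutex, waiters must use std::condition_variable_any, which works with any BasicLockable type. A sketch of the resulting wait shape, assuming xe_mutex provides lock()/unlock(); the other names are hypothetical:

#include <condition_variable>
#include <deque>
#include <mutex>

xe_mutex queue_lock;
std::condition_variable_any queue_cond;
std::deque<int> work_queue;

void ConsumerWait() {
  std::unique_lock<xe_mutex> lock(queue_lock);
  // wait() unlocks queue_lock while sleeping and relocks before returning.
  queue_cond.wait(lock, [] { return !work_queue.empty(); });
  work_queue.pop_front();
}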
+XE_NOINLINE +XE_NOALIAS xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth); diff --git a/src/xenia/gpu/pm4_command_processor_implement.h b/src/xenia/gpu/pm4_command_processor_implement.h index 53b81b888..1c877a9ab 100644 --- a/src/xenia/gpu/pm4_command_processor_implement.h +++ b/src/xenia/gpu/pm4_command_processor_implement.h @@ -14,6 +14,11 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr, new (&reader_) RingBuffer(memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)); reader_.set_write_offset(count * sizeof(uint32_t)); + // prefetch the wraparound range + // it likely is already in L3 cache, but on a Zen system it may be in + // another chiplet's L3 + reader_.BeginPrefetchedRead( + COMMAND_PROCESSOR::GetCurrentRingReadCount()); do { if (COMMAND_PROCESSOR::ExecutePacket()) { continue; } @@ -30,11 +35,6 @@ } bool COMMAND_PROCESSOR::ExecutePacket() { - // prefetch the wraparound range - // it likely is already in L3 cache, but in a zen system it may be another - // chiplets l3 - reader_.BeginPrefetchedRead( - COMMAND_PROCESSOR::GetCurrentRingReadCount()); const uint32_t packet = reader_.ReadAndSwap<uint32_t>(); const uint32_t packet_type = packet >> 30; @@ -495,7 +495,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM( } else { xe::threading::Sleep(std::chrono::milliseconds(wait / 0x100)); } - xe::threading::SyncMemory(); + // xe::threading::SyncMemory(); ReturnFromWait(); if (!worker_running_) { @@ -599,27 +599,28 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_COND_WRITE( value = register_file_->values[poll_reg_addr].u32; } bool matched = false; + value &= mask; switch (wait_info & 0x7) { case 0x0: // Never. matched = false; break; case 0x1: // Less than reference. - matched = (value & mask) < ref; + matched = value < ref; break; case 0x2: // Less than or equal to reference. - matched = (value & mask) <= ref; + matched = value <= ref; break; case 0x3: // Equal to reference. - matched = (value & mask) == ref; + matched = value == ref; break; case 0x4: // Not equal to reference. - matched = (value & mask) != ref; + matched = value != ref; break; case 0x5: // Greater than or equal to reference. - matched = (value & mask) >= ref; + matched = value >= ref; break; case 0x6: // Greater than reference. - matched = (value & mask) > ref; + matched = value > ref; break; case 0x7: // Always matched = true; @@ -1064,7 +1065,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE( assert_true(count - 2 >= size_dwords); auto shader = COMMAND_PROCESSOR::LoadShader( shader_type, uint32_t(reader_.read_ptr()), - reinterpret_cast(reader_.read_ptr()), size_dwords); + reinterpret_cast(reader_.read_ptr()), size_dwords); switch (shader_type) { case xenos::ShaderType::kVertex: active_vertex_shader_ = shader; diff --git a/src/xenia/gpu/primitive_processor.h b/src/xenia/gpu/primitive_processor.h index aac84885d..7c2e96e3b 100644 --- a/src/xenia/gpu/primitive_processor.h +++ b/src/xenia/gpu/primitive_processor.h @@ -430,7 +430,7 @@ class PrimitiveProcessor { --count; uint32_t index = *(source++) & low_bits_mask_guest_endian; *(dest++) = index != reset_index_guest_endian - ? 
xenos::GpuSwapInline(index, HostSwap) : UINT32_MAX; } if (count >= kSimdVectorU32Elements) { @@ -442,10 +442,10 @@ class PrimitiveProcessor { __m128i host_swap_shuffle; if constexpr (HostSwap != xenos::Endian::kNone) { host_swap_shuffle = _mm_set_epi32( - int32_t(xenos::GpuSwap(uint32_t(0x0F0E0D0C), HostSwap)), - int32_t(xenos::GpuSwap(uint32_t(0x0B0A0908), HostSwap)), - int32_t(xenos::GpuSwap(uint32_t(0x07060504), HostSwap)), - int32_t(xenos::GpuSwap(uint32_t(0x03020100), HostSwap))); + int32_t(xenos::GpuSwapInline(uint32_t(0x0F0E0D0C), HostSwap)), + int32_t(xenos::GpuSwapInline(uint32_t(0x0B0A0908), HostSwap)), + int32_t(xenos::GpuSwapInline(uint32_t(0x07060504), HostSwap)), + int32_t(xenos::GpuSwapInline(uint32_t(0x03020100), HostSwap))); } #endif // XE_ARCH_AMD64 while (count >= kSimdVectorU32Elements) { @@ -490,7 +490,7 @@ class PrimitiveProcessor { while (count--) { uint32_t index = *(source++) & low_bits_mask_guest_endian; *(dest++) = index != reset_index_guest_endian - ? xenos::GpuSwap(index, HostSwap) + ? xenos::GpuSwapInline(index, HostSwap) : UINT32_MAX; } } @@ -510,19 +510,19 @@ class PrimitiveProcessor { }; struct To24Swapping8In16IndexTransform { uint32_t operator()(uint32_t index) const { - return xenos::GpuSwap(index, xenos::Endian::k8in16) & + return xenos::GpuSwapInline(index, xenos::Endian::k8in16) & xenos::kVertexIndexMask; } }; struct To24Swapping8In32IndexTransform { uint32_t operator()(uint32_t index) const { - return xenos::GpuSwap(index, xenos::Endian::k8in32) & + return xenos::GpuSwapInline(index, xenos::Endian::k8in32) & xenos::kVertexIndexMask; } }; struct To24Swapping16In32IndexTransform { uint32_t operator()(uint32_t index) const { - return xenos::GpuSwap(index, xenos::Endian::k16in32) & + return xenos::GpuSwapInline(index, xenos::Endian::k16in32) & xenos::kVertexIndexMask; } }; diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc index ffd77246e..38a8c54e9 100644 --- a/src/xenia/gpu/shared_memory.cc +++ b/src/xenia/gpu/shared_memory.cc @@ -388,6 +388,7 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length, bool any_data_resolved = false; uint32_t block_first = page_first >> 6; + swcache::PrefetchL1(&system_page_flags_[block_first]); uint32_t block_last = page_last >> 6; uint32_t range_start = UINT32_MAX; diff --git a/src/xenia/gpu/texture_util.cc b/src/xenia/gpu/texture_util.cc index b20194a78..cbe6c62bd 100644 --- a/src/xenia/gpu/texture_util.cc +++ b/src/xenia/gpu/texture_util.cc @@ -464,7 +464,8 @@ TextureGuestLayout GetGuestTextureLayout( return layout; } - +XE_NOINLINE +XE_NOALIAS int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch, uint32_t bytes_per_block_log2) { // https://github.com/gildor2/UModel/blob/de8fbd3bc922427ea056b7340202dcdcc19ccff5/Unreal/UnTexture.cpp#L489 @@ -481,7 +482,8 @@ int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch, return ((offset & ~0x1FF) << 3) + ((y & 16) << 7) + ((offset & 0x1C0) << 2) + (((((y & 8) >> 2) + (x >> 3)) & 3) << 6) + (offset & 0x3F); } - +XE_NOINLINE +XE_NOALIAS int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch, uint32_t height, uint32_t bytes_per_block_log2) { // Reconstructed from disassembly of XGRAPHICS::TileVolume. 
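XE_NOALIAS expands to __declspec(noalias) under MSVC (see the platform.h hunk earlier in this patch): it declares that the function neither reads nor writes visible global state except through its pointer parameters. For pure address arithmetic like these tiling helpers, that lets the optimizer treat calls as value computations and fold repeated ones. An illustrative caller, assuming the declaration from this hunk:

XE_NOINLINE
XE_NOALIAS
int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
                         uint32_t bytes_per_block_log2);

int32_t TwiceTheOffset(int32_t x, int32_t y, uint32_t pitch, uint32_t bpb) {
  // With noalias the second identical call may be folded into the first;
  // without it, the compiler must assume the call observes changed state.
  return GetTiledOffset2D(x, y, pitch, bpb) + GetTiledOffset2D(x, y, pitch, bpb);
}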
@@ -509,7 +511,8 @@ int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch, address += offset2 & 63; return address; } - +XE_NOINLINE +XE_NOALIAS uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom, uint32_t pitch, uint32_t bytes_per_block_log2) { @@ -538,7 +541,8 @@ uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom, } return upper_bound; } - +XE_NOINLINE +XE_NOALIAS uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom, uint32_t back, uint32_t pitch, uint32_t height, diff --git a/src/xenia/gpu/texture_util.h b/src/xenia/gpu/texture_util.h index bcc080de3..a6513a0c0 100644 --- a/src/xenia/gpu/texture_util.h +++ b/src/xenia/gpu/texture_util.h @@ -280,8 +280,12 @@ void GetTextureTotalSize(xenos::DataDimension dimension, // bytes_per_block_log2 is log2_floor according to how Direct3D 9 calculates it, // but k_32_32_32 textures are never tiled anyway likely. +XE_NOINLINE +XE_NOALIAS int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch, uint32_t bytes_per_block_log2); +XE_NOINLINE +XE_NOALIAS int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch, uint32_t height, uint32_t bytes_per_block_log2); // Because (0, 0, 0) within each 32x32x4-block tile is stored in memory first, @@ -308,9 +312,13 @@ inline uint32_t GetTiledAddressLowerBound3D(uint32_t left, uint32_t top, // Supporting the right > pitch and bottom > height (in tiles) cases also, for // estimation how far addresses can actually go even potentially beyond the // subresource stride. +XE_NOINLINE +XE_NOALIAS uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom, uint32_t pitch, uint32_t bytes_per_block_log2); +XE_NOINLINE +XE_NOALIAS uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom, uint32_t back, uint32_t pitch, uint32_t height, diff --git a/src/xenia/gpu/xenos.cc b/src/xenia/gpu/xenos.cc index f15c621cd..997e9a48a 100644 --- a/src/xenia/gpu/xenos.cc +++ b/src/xenia/gpu/xenos.cc @@ -125,8 +125,8 @@ float Float7e3To32(uint32_t f10) { // Based on CFloat24 from d3dref9.dll and the 6e4 code from: // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2). - -uint32_t Float32To20e4(float f32, bool round_to_nearest_even) { +XE_NOALIAS +uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept { if (!(f32 > 0.0f)) { // Positive only, and not -0 or NaN. return 0; @@ -150,8 +150,8 @@ uint32_t Float32To20e4(float f32, bool round_to_nearest_even) { } return (f32u32 >> 3) & 0xFFFFFF; } - -float Float20e4To32(uint32_t f24) { +XE_NOALIAS +float Float20e4To32(uint32_t f24) noexcept { f24 &= 0xFFFFFF; if (!f24) { return 0.0f; diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 8c03be479..8e9fd5c11 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -421,10 +421,12 @@ float Float7e3To32(uint32_t f10); // floating-point number. // Converts an IEEE-754 32-bit floating-point number to Xenos floating-point // depth, rounding to the nearest even or towards zero. -uint32_t Float32To20e4(float f32, bool round_to_nearest_even); +XE_NOALIAS +uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept; // Converts Xenos floating-point depth in bits 0:23 (not clamping) to an // IEEE-754 32-bit floating-point number. 
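For reference, 20e4 here means a 4-bit exponent and 20-bit mantissa packed into bits 0:23, an unsigned float format covering [0, 2). A worked packing of 1.0f, consistent with the (f32u32 >> 3) & 0xFFFFFF tail visible above and assuming the conventional rebias constant 0x38000000, i.e. (127 - 15) << 23:

// 1.0f as IEEE-754 bits:               0x3F800000 (exponent 127, mantissa 0)
// rebias from 127 to the 20e4 bias 15:  0x3F800000 - 0x38000000 = 0x07800000
// drop the 3 low mantissa bits:         0x07800000 >> 3         = 0x00F00000
// => 20e4 depth 1.0 is exponent 0xF in bits 20:23 with a zero mantissa.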
-float Float20e4To32(uint32_t f24); +XE_NOALIAS +float Float20e4To32(uint32_t f24) noexcept; // Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit // floating-point number. constexpr float UNorm24To32(uint32_t n24) { @@ -1045,9 +1047,9 @@ inline uint16_t GpuSwap(uint16_t value, Endian endianness) { return value; } } -XE_NOINLINE +XE_FORCEINLINE XE_NOALIAS -static uint32_t GpuSwap(uint32_t value, Endian endianness) { +static uint32_t GpuSwapInline(uint32_t value, Endian endianness) { switch (endianness) { default: case Endian::kNone: @@ -1065,6 +1067,11 @@ static uint32_t GpuSwap(uint32_t value, Endian endianness) { return ((value >> 16) & 0xFFFF) | (value << 16); } } +XE_NOINLINE +XE_NOALIAS +static uint32_t GpuSwap(uint32_t value, Endian endianness) { + return GpuSwapInline(value, endianness); +} inline float GpuSwap(float value, Endian endianness) { union { diff --git a/src/xenia/hid/input_system.cc b/src/xenia/hid/input_system.cc index 588faefe3..a21ce5a7b 100644 --- a/src/xenia/hid/input_system.cc +++ b/src/xenia/hid/input_system.cc @@ -137,8 +137,8 @@ X_INPUT_VIBRATION InputSystem::ModifyVibrationLevel( modified_vibration.right_motor_speed = 0; return modified_vibration; } -std::unique_lock<xe_unlikely_mutex> InputSystem::lock() { - return std::unique_lock<xe_unlikely_mutex>{lock_}; +std::unique_lock<xe_mutex> InputSystem::lock() { + return std::unique_lock<xe_mutex>{lock_}; } } // namespace hid } // namespace xe diff --git a/src/xenia/hid/input_system.h b/src/xenia/hid/input_system.h index 333116499..c294edc64 100644 --- a/src/xenia/hid/input_system.h +++ b/src/xenia/hid/input_system.h @@ -48,7 +48,7 @@ class InputSystem { void UpdateUsedSlot(uint8_t slot, bool connected); uint8_t GetConnectedSlots() const { return connected_slot; } - std::unique_lock<xe_unlikely_mutex> lock(); + std::unique_lock<xe_mutex> lock(); private: xe::ui::Window* window_ = nullptr; @@ -57,7 +57,7 @@ class InputSystem { X_INPUT_VIBRATION ModifyVibrationLevel(X_INPUT_VIBRATION* vibration); uint8_t connected_slot = 0b0001; - xe_unlikely_mutex lock_; + xe_mutex lock_; }; } // namespace hid diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc index 8e66ac683..b5bb6c57b 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc @@ -911,11 +911,17 @@ dword_result_t NtSignalAndWaitForSingleObjectEx_entry(dword_t signal_handle, DECLARE_XBOXKRNL_EXPORT3(NtSignalAndWaitForSingleObjectEx, kThreading, kImplemented, kBlocking, kHighFrequency); +static void PrefetchForCAS(const void* value) { + if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) { + swcache::PrefetchW(value); + } +} + uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) { // XELOGD( // "KfAcquireSpinLock({:08X})", // lock_ptr); - + PrefetchForCAS(lock); // Lock. while (!xe::atomic_cas(0, 1, lock)) { // Spin! @@ -956,6 +962,7 @@ DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented, void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { // Lock. auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); + PrefetchForCAS(lock); while (!xe::atomic_cas(0, 1, lock)) { #if XE_ARCH_AMD64 == 1 // todo: this is just a nop if they don't have SMT, which is not great @@ -973,6 +980,7 @@ DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading, dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { // Lock.
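PrefetchForCAS above exists because a compare-exchange must own its cache line exclusively: PREFETCHW performs the read-for-ownership up front instead of first loading the line shared and then upgrading it inside the atomic. It is gated on a feature flag because PREFETCHW has its own CPUID bit and was long absent on Intel parts. The acquisition pattern, mirroring the spinlock code in this hunk (the wrapper function name is hypothetical):

void AcquireGuestSpinlock(uint32_t* lock) {
  if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
    swcache::PrefetchW(lock);  // request the line in exclusive state
  }
  while (!xe::atomic_cas(0, 1, lock)) {
    // Spin; the RFO already happened, so successful CAS retries are cheaper.
  }
}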
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); + PrefetchForCAS(lock); if (!xe::atomic_cas(0, 1, lock)) { return 0; } diff --git a/src/xenia/kernel/xthread.cc b/src/xenia/kernel/xthread.cc index b842c2c08..084485c66 100644 --- a/src/xenia/kernel/xthread.cc +++ b/src/xenia/kernel/xthread.cc @@ -763,7 +763,8 @@ void XThread::SetActiveCpu(uint8_t cpu_index) { thread_->set_affinity_mask(uint64_t(1) << cpu_index); } } else { - XELOGW("Too few processor cores - scheduling will be wonky"); + // there's no good reason to log this... we don't perfectly emulate the 360's scheduler in any way + // XELOGW("Too few processor cores - scheduling will be wonky"); } } diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc index 16e2b8336..f29eb21dc 100644 --- a/src/xenia/memory.cc +++ b/src/xenia/memory.cc @@ -713,6 +713,8 @@ void BaseHeap::Initialize(Memory* memory, uint8_t* membase, HeapType heap_type, heap_base_ = heap_base; heap_size_ = heap_size; page_size_ = page_size; + xenia_assert(xe::is_pow2(page_size_)); + page_size_shift_ = xe::log2_floor(page_size_); host_address_offset_ = host_address_offset; page_table_.resize(heap_size / page_size); unreserved_page_count_ = uint32_t(page_table_.size()); @@ -1234,14 +1236,14 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect, // fails and returns without modifying the access protection of any pages in // the specified region." - uint32_t start_page_number = (address - heap_base_) / page_size_; + uint32_t start_page_number = (address - heap_base_) >> page_size_shift_; if (start_page_number >= page_table_.size()) { XELOGE("BaseHeap::Protect failed due to out-of-bounds base address {:08X}", address); return false; } uint32_t end_page_number = - uint32_t((uint64_t(address) + size - 1 - heap_base_) / page_size_); + uint32_t((uint64_t(address) + size - 1 - heap_base_) >> page_size_shift_); if (end_page_number >= page_table_.size()) { XELOGE( "BaseHeap::Protect failed due to out-of-bounds range ({:08X} bytes " @@ -1268,17 +1270,21 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect, return false; } } + uint32_t xe_page_size = static_cast<uint32_t>(xe::memory::page_size()); + + uint32_t page_size_mask = xe_page_size - 1; // Attempt host change (hopefully won't fail). // We can only do this if our size matches system page granularity. uint32_t page_count = end_page_number - start_page_number + 1; - if (page_size_ == xe::memory::page_size() || - (((page_count * page_size_) % xe::memory::page_size() == 0) && - ((start_page_number * page_size_) % xe::memory::page_size() == 0))) { + if (page_size_ == xe_page_size || - ((((page_count << page_size_shift_) & page_size_mask) == 0) && + (((start_page_number << page_size_shift_) & page_size_mask) == 0))) { memory::PageAccess old_protect_access; - if (!xe::memory::Protect(TranslateRelative(start_page_number * page_size_), - page_count * page_size_, ToPageAccess(protect), - old_protect ? &old_protect_access : nullptr)) { + if (!xe::memory::Protect( + TranslateRelative(start_page_number << page_size_shift_), + page_count << page_size_shift_, ToPageAccess(protect), + old_protect ? 
&old_protect_access : nullptr)) { XELOGE("BaseHeap::Protect failed due to host VirtualProtect failure"); return false; } @@ -1303,7 +1309,7 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect, bool BaseHeap::QueryRegionInfo(uint32_t base_address, HeapAllocationInfo* out_info) { - uint32_t start_page_number = (base_address - heap_base_) / page_size_; + uint32_t start_page_number = (base_address - heap_base_) >> page_size_shift_; if (start_page_number > page_table_.size()) { XELOGE("BaseHeap::QueryRegionInfo base page out of range"); return false; @@ -1321,9 +1327,10 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address, if (start_page_entry.state) { // Committed/reserved region. out_info->allocation_base = - heap_base_ + start_page_entry.base_address * page_size_; + heap_base_ + (start_page_entry.base_address << page_size_shift_); out_info->allocation_protect = start_page_entry.allocation_protect; - out_info->allocation_size = start_page_entry.region_page_count * page_size_; + out_info->allocation_size = start_page_entry.region_page_count + << page_size_shift_; out_info->state = start_page_entry.state; out_info->protect = start_page_entry.current_protect; @@ -1358,7 +1365,7 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address, } bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) { - uint32_t page_number = (address - heap_base_) / page_size_; + uint32_t page_number = (address - heap_base_) >> page_size_shift_; if (page_number > page_table_.size()) { XELOGE("BaseHeap::QuerySize base page out of range"); *out_size = 0; @@ -1366,12 +1373,12 @@ bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) { } auto global_lock = global_critical_region_.Acquire(); auto page_entry = page_table_[page_number]; - *out_size = (page_entry.region_page_count * page_size_); + *out_size = (page_entry.region_page_count << page_size_shift_); return true; } bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) { - uint32_t page_number = (*in_out_address - heap_base_) / page_size_; + uint32_t page_number = (*in_out_address - heap_base_) >> page_size_shift_; if (page_number > page_table_.size()) { XELOGE("BaseHeap::QuerySize base page out of range"); *out_size = 0; @@ -1379,13 +1386,13 @@ bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) { } auto global_lock = global_critical_region_.Acquire(); auto page_entry = page_table_[page_number]; - *in_out_address = (page_entry.base_address * page_size_); - *out_size = (page_entry.region_page_count * page_size_); + *in_out_address = (page_entry.base_address << page_size_shift_); + *out_size = (page_entry.region_page_count << page_size_shift_); return true; } bool BaseHeap::QueryProtect(uint32_t address, uint32_t* out_protect) { - uint32_t page_number = (address - heap_base_) / page_size_; + uint32_t page_number = (address - heap_base_) >> page_size_shift_; if (page_number > page_table_.size()) { XELOGE("BaseHeap::QueryProtect base page out of range"); *out_protect = 0; @@ -1403,8 +1410,8 @@ xe::memory::PageAccess BaseHeap::QueryRangeAccess(uint32_t low_address, (high_address - heap_base_) >= heap_size_) { return xe::memory::PageAccess::kNoAccess; } - uint32_t low_page_number = (low_address - heap_base_) / page_size_; - uint32_t high_page_number = (high_address - heap_base_) / page_size_; + uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_; + uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_; uint32_t protect = kMemoryProtectRead 
| kMemoryProtectWrite; { auto global_lock = global_critical_region_.Acquire(); @@ -1446,6 +1453,8 @@ void PhysicalHeap::Initialize(Memory* memory, uint8_t* membase, page_size, host_address_offset); parent_heap_ = parent_heap; system_page_size_ = uint32_t(xe::memory::page_size()); + xenia_assert(xe::is_pow2(system_page_size_)); + system_page_shift_ = xe::log2_floor(system_page_size_); system_page_count_ = (size_t(heap_size_) + host_address_offset + (system_page_size_ - 1)) / @@ -1665,10 +1674,11 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, } uint32_t system_page_first = - (heap_relative_address + host_address_offset()) / system_page_size_; + (heap_relative_address + host_address_offset()) >> system_page_shift_; + swcache::PrefetchL1(&system_page_flags_[system_page_first >> 6]); uint32_t system_page_last = - (heap_relative_address + length - 1 + host_address_offset()) / - system_page_size_; + (heap_relative_address + length - 1 + host_address_offset()) >> + system_page_shift_; system_page_last = std::min(system_page_last, system_page_count_ - 1); assert_true(system_page_first <= system_page_last); @@ -1677,10 +1687,40 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, xe::memory::PageAccess protect_access = enable_data_providers ? xe::memory::PageAccess::kNoAccess : xe::memory::PageAccess::kReadOnly; + + auto global_lock = global_critical_region_.Acquire(); + if (enable_invalidation_notifications) { + EnableAccessCallbacksInner<true>(system_page_first, system_page_last, + protect_access); + } else { + EnableAccessCallbacksInner<false>(system_page_first, system_page_last, + protect_access); + } +} + +template <bool enable_invalidation_notifications> +XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner( + const uint32_t system_page_first, const uint32_t system_page_last, + xe::memory::PageAccess protect_access) XE_RESTRICT { uint8_t* protect_base = membase_ + heap_base_; uint32_t protect_system_page_first = UINT32_MAX; - auto global_lock = global_critical_region_.Acquire(); - for (uint32_t i = system_page_first; i <= system_page_last; ++i) { + + SystemPageFlagsBlock* XE_RESTRICT sys_page_flags = system_page_flags_.data(); + PageEntry* XE_RESTRICT page_table_ptr = page_table_.data(); + + // chrispy: profiling shows quite a bit of time spent in this loop, but + // very little spent actually calling Protect; some of the work here may be + // avoidable and repetitive + uint32_t i = system_page_first; + + uint32_t first_guest_page = SystemPagenumToGuestPagenum(system_page_first); + uint32_t last_guest_page = SystemPagenumToGuestPagenum(system_page_last); + + uint32_t guest_one = + SystemPagenumToGuestPagenum(1); + + uint32_t system_one = GuestPagenumToSystemPagenum(1); + for (; i <= system_page_last; ++i) { // Check if need to enable callbacks for the page and raise its protection. // // If enabling invalidation notifications: // ... // // Enabling data providers doesn't need to be deferred - providers will be // polled for the last time without releasing the lock.
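The dispatch introduced above hoists a loop-invariant flag into a template parameter: EnableAccessCallbacks() branches once on enable_invalidation_notifications while holding the lock, and each instantiation of EnableAccessCallbacksInner compiles only its own arm of the per-page test (the if constexpr further below). The transform in miniature, with hypothetical names:

#include <cstdint>

template <bool kNotify>
void ProcessPages(uint32_t first, uint32_t last) {
  for (uint32_t i = first; i <= last; ++i) {
    if constexpr (kNotify) {
      // notification bookkeeping, compiled only into ProcessPages<true>
    } else {
      // provider bookkeeping, compiled only into ProcessPages<false>
    }
  }
}

void Process(uint32_t first, uint32_t last, bool notify) {
  // One branch outside the loop instead of one per page.
  if (notify) {
    ProcessPages<true>(first, last);
  } else {
    ProcessPages<false>(first, last);
  }
}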
- SystemPageFlagsBlock& page_flags_block = system_page_flags_[i >> 6]; + SystemPageFlagsBlock& page_flags_block = sys_page_flags[i >> 6]; + +#if XE_ARCH_AMD64 == 1 + // x86 modulus shift + uint64_t page_flags_bit = uint64_t(1) << i; +#else uint64_t page_flags_bit = uint64_t(1) << (i & 63); - uint32_t guest_page_number = - xe::sat_sub(i * system_page_size_, host_address_offset()) / page_size_; +#endif + + uint32_t guest_page_number = SystemPagenumToGuestPagenum(i); + //swcache::PrefetchL1(&page_table_ptr[guest_page_number + 8]); xe::memory::PageAccess current_page_access = - ToPageAccess(page_table_[guest_page_number].current_protect); + ToPageAccess(page_table_ptr[guest_page_number].current_protect); bool protect_system_page = false; // Don't do anything with inaccessible pages - don't protect, don't enable // callbacks - because real access violations are needed there. And don't @@ -1715,7 +1762,7 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, // reason. if (current_page_access != xe::memory::PageAccess::kNoAccess) { // TODO(Triang3l): Enable data providers. - if (enable_invalidation_notifications) { + if constexpr (enable_invalidation_notifications) { if (current_page_access != xe::memory::PageAccess::kReadOnly && (page_flags_block.notify_on_invalidation & page_flags_bit) == 0) { // TODO(Triang3l): Check if data providers are already enabled. @@ -1733,21 +1780,22 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, } else { if (protect_system_page_first != UINT32_MAX) { xe::memory::Protect( - protect_base + protect_system_page_first * system_page_size_, - (i - protect_system_page_first) * system_page_size_, + protect_base + (protect_system_page_first << system_page_shift_), + (i - protect_system_page_first) << system_page_shift_, protect_access); protect_system_page_first = UINT32_MAX; } } } + if (protect_system_page_first != UINT32_MAX) { xe::memory::Protect( - protect_base + protect_system_page_first * system_page_size_, - (system_page_last + 1 - protect_system_page_first) * system_page_size_, + protect_base + (protect_system_page_first << system_page_shift_), + (system_page_last + 1 - protect_system_page_first) + << system_page_shift_, protect_access); } } - bool PhysicalHeap::TriggerCallbacks( global_unique_lock_type global_lock_locked_once, uint32_t virtual_address, uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) { @@ -1774,10 +1822,10 @@ bool PhysicalHeap::TriggerCallbacks( } uint32_t system_page_first = - (heap_relative_address + host_address_offset()) / system_page_size_; + (heap_relative_address + host_address_offset()) >> system_page_shift_; uint32_t system_page_last = - (heap_relative_address + length - 1 + host_address_offset()) / - system_page_size_; + (heap_relative_address + length - 1 + host_address_offset()) >> + system_page_shift_; system_page_last = std::min(system_page_last, system_page_count_ - 1); assert_true(system_page_first <= system_page_last); uint32_t block_index_first = system_page_first >> 6; @@ -1810,11 +1858,11 @@ bool PhysicalHeap::TriggerCallbacks( } uint32_t physical_address_offset = GetPhysicalAddress(heap_base_); uint32_t physical_address_start = - xe::sat_sub(system_page_first * system_page_size_, + xe::sat_sub(system_page_first << system_page_shift_, host_address_offset()) + physical_address_offset; uint32_t physical_length = std::min( - xe::sat_sub(system_page_last * system_page_size_ + system_page_size_, + xe::sat_sub((system_page_last << system_page_shift_) + system_page_size_, 
host_address_offset()) + physical_address_offset - physical_address_start, heap_size_ - (physical_address_start - physical_address_offset)); @@ -1858,8 +1906,8 @@ bool PhysicalHeap::TriggerCallbacks( unwatch_first += host_address_offset(); unwatch_last += host_address_offset(); assert_true(unwatch_first <= unwatch_last); - system_page_first = unwatch_first / system_page_size_; - system_page_last = unwatch_last / system_page_size_; + system_page_first = unwatch_first >> system_page_shift_; + system_page_last = unwatch_last >> system_page_shift_; block_index_first = system_page_first >> 6; block_index_last = system_page_last >> 6; } @@ -1874,8 +1922,8 @@ bool PhysicalHeap::TriggerCallbacks( (uint64_t(1) << (i & 63))) != 0; if (unprotect_page) { uint32_t guest_page_number = - xe::sat_sub(i * system_page_size_, host_address_offset()) / - page_size_; + xe::sat_sub(i << system_page_shift_, host_address_offset()) >> + page_size_shift_; if (ToPageAccess(page_table_[guest_page_number].current_protect) != xe::memory::PageAccess::kReadWrite) { unprotect_page = false; } @@ -1888,8 +1936,9 @@ bool PhysicalHeap::TriggerCallbacks( } else { if (unprotect_system_page_first != UINT32_MAX) { xe::memory::Protect( - protect_base + unprotect_system_page_first * system_page_size_, - (i - unprotect_system_page_first) * system_page_size_, + protect_base + + (unprotect_system_page_first << system_page_shift_), + (i - unprotect_system_page_first) << system_page_shift_, xe::memory::PageAccess::kReadWrite); unprotect_system_page_first = UINT32_MAX; } } } if (unprotect_system_page_first != UINT32_MAX) { xe::memory::Protect( - protect_base + unprotect_system_page_first * system_page_size_, - (system_page_last + 1 - unprotect_system_page_first) * - system_page_size_, + protect_base + (unprotect_system_page_first << system_page_shift_), + (system_page_last + 1 - unprotect_system_page_first) + << system_page_shift_, xe::memory::PageAccess::kReadWrite); } } diff --git a/src/xenia/memory.h b/src/xenia/memory.h index 3d4cf5637..672115d5c 100644 --- a/src/xenia/memory.h +++ b/src/xenia/memory.h @@ -216,6 +216,7 @@ class BaseHeap { uint32_t heap_base_; uint32_t heap_size_; uint32_t page_size_; + uint32_t page_size_shift_; uint32_t host_address_offset_; uint32_t unreserved_page_count_; xe::global_critical_region global_critical_region_; @@ -270,18 +271,36 @@ class PhysicalHeap : public BaseHeap { void EnableAccessCallbacks(uint32_t physical_address, uint32_t length, bool enable_invalidation_notifications, bool enable_data_providers); + template <bool enable_invalidation_notifications> + XE_NOINLINE void EnableAccessCallbacksInner( + const uint32_t system_page_first, const uint32_t system_page_last, + xe::memory::PageAccess protect_access) XE_RESTRICT; + // Returns true if any page in the range was watched.
bool TriggerCallbacks(global_unique_lock_type global_lock_locked_once, - uint32_t virtual_address, uint32_t length, bool is_write, - bool unwatch_exact_range, bool unprotect = true); + uint32_t virtual_address, uint32_t length, + bool is_write, bool unwatch_exact_range, + bool unprotect = true); uint32_t GetPhysicalAddress(uint32_t address) const; + uint32_t SystemPagenumToGuestPagenum(uint32_t num) const { + return ((num << system_page_shift_) - host_address_offset()) >> page_size_shift_; + } + + uint32_t GuestPagenumToSystemPagenum(uint32_t num) { + num <<= page_size_shift_; + num += host_address_offset(); + num >>= system_page_shift_; + return num; + } protected: VirtualHeap* parent_heap_; uint32_t system_page_size_; uint32_t system_page_count_; + uint32_t system_page_shift_; + uint32_t padding1_; struct SystemPageFlagsBlock { // Whether writing to each page should result trigger invalidation @@ -458,9 +477,9 @@ class Memory { // TODO(Triang3l): Implement data providers - this is why locking depth of 1 // will be required in the future. bool TriggerPhysicalMemoryCallbacks( - global_unique_lock_type global_lock_locked_once, - uint32_t virtual_address, uint32_t length, bool is_write, - bool unwatch_exact_range, bool unprotect = true); + global_unique_lock_type global_lock_locked_once, uint32_t virtual_address, + uint32_t length, bool is_write, bool unwatch_exact_range, + bool unprotect = true); // Allocates virtual memory from the 'system' heap. // System memory is kept separate from game memory but is still accessible @@ -509,10 +528,10 @@ class Memory { const void* host_address); bool AccessViolationCallback(global_unique_lock_type global_lock_locked_once, - void* host_address, bool is_write); + void* host_address, bool is_write); static bool AccessViolationCallbackThunk( - global_unique_lock_type global_lock_locked_once, - void* context, void* host_address, bool is_write); + global_unique_lock_type global_lock_locked_once, void* context, + void* host_address, bool is_write); std::filesystem::path file_name_; uint32_t system_page_size_ = 0;
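The arithmetic change running through memory.cc replaces every division and multiplication by page_size_ or system_page_size_ with a shift by the cached page_size_shift_ / system_page_shift_, which the new xenia_assert(xe::is_pow2(...)) lines in Initialize() make valid. The equivalence, spelled out with the heap fields as explicit parameters for illustration:

#include <cstdint>

// address -> page index; identical to (address - heap_base) / page_size
// whenever page_size == (1u << page_size_shift).
inline uint32_t PageNumber(uint32_t address, uint32_t heap_base,
                           uint32_t page_size_shift) {
  return (address - heap_base) >> page_size_shift;
}

// page index -> byte offset; identical to page_number * page_size.
inline uint32_t PageOffset(uint32_t page_number, uint32_t page_size_shift) {
  return page_number << page_size_shift;
}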