From eb8154908c6330b323d593994d8b6b9025a4b8d2 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Sat, 17 Sep 2022 04:04:53 -0700
Subject: [PATCH] atomic cas: use prefetchw if available

remove useless MemoryBarrier; remove double memory barrier in the wait PM4 cmd
add an int64 cvar type; use an int64 cvar for the x64 feature mask
rework some functions that were frontend bound according to VTune, placing
 some of their code in separate noinline functions; profiling afterwards
 indicated L1 cache misses decreased and the functions' performance increased
remove the long vpinsrd dependency-chain code in conversion.h; instead do a
 normal load+bswap, or movbe if available
much faster entry table via split_map; code size could still be improved
GetResolveInfo was very large and had an impact on the icache; mark callees
 noinline + use the MSVC optimize("s") pragma
use log2 shifts instead of integer divides in memory
minor optimizations in PhysicalHeap::EnableAccessCallbacks; the majority of
 time in the function is spent looping, NOT calling Protect! Someone should
 optimize this function and rework the algorithm completely
remove the wonky scheduling log message; it was spammy and unhelpful
the lock count was unnecessary for the critical-section mutex;
 CRITICAL_SECTION is already a recursive mutex
brief notes, I gotta run
---
 src/xenia/apu/conversion.h                    |  39 +
 src/xenia/base/cvar.h                         |   5 +-
 src/xenia/base/mutex.cc                       |  25 +-
 src/xenia/base/platform.h                     |  16 +-
 src/xenia/base/platform_amd64.cc              | 138 +--
 src/xenia/base/platform_amd64.h               |  11 +-
 src/xenia/base/threading.h                    |   6 +
 src/xenia/base/threading_win.cc               |  13 +-
 src/xenia/cpu/backend/x64/x64_backend.h       |   2 +-
 src/xenia/cpu/backend/x64/x64_emitter.cc      |  69 +-
 src/xenia/cpu/backend/x64/x64_emitter.h       |   4 +-
 src/xenia/cpu/backend/x64/x64_sequences.cc    |   2 +-
 src/xenia/cpu/entry_table.cc                  |  35 +-
 src/xenia/cpu/entry_table.h                   |   5 +-
 src/xenia/gpu/command_processor.cc            |   7 +-
 .../gpu/d3d12/d3d12_command_processor.cc      | 919 ++++++++++--------
 src/xenia/gpu/d3d12/d3d12_command_processor.h |  37 +-
 src/xenia/gpu/d3d12/deferred_command_list.cc  |  17 +-
 src/xenia/gpu/d3d12/pipeline_cache.cc         |  20 +-
 src/xenia/gpu/d3d12/pipeline_cache.h          |   4 +-
 src/xenia/gpu/draw_util.cc                    |  14 +-
 src/xenia/gpu/draw_util.h                     |   2 +
 .../gpu/pm4_command_processor_implement.h     |  27 +-
 src/xenia/gpu/primitive_processor.h           |  18 +-
 src/xenia/gpu/shared_memory.cc                |   1 +
 src/xenia/gpu/texture_util.cc                 |  12 +-
 src/xenia/gpu/texture_util.h                  |   8 +
 src/xenia/gpu/xenos.cc                        |   8 +-
 src/xenia/gpu/xenos.h                         |  15 +-
 src/xenia/hid/input_system.cc                 |   4 +-
 src/xenia/hid/input_system.h                  |   4 +-
 .../kernel/xboxkrnl/xboxkrnl_threading.cc     |  10 +-
 src/xenia/kernel/xthread.cc                   |   3 +-
 src/xenia/memory.cc                           | 145 ++-
 src/xenia/memory.h                            |  35 +-
 35 files changed, 942 insertions(+), 738 deletions(-)

diff --git a/src/xenia/apu/conversion.h b/src/xenia/apu/conversion.h
index 211243348..0f807d67b 100644
--- a/src/xenia/apu/conversion.h
+++ b/src/xenia/apu/conversion.h
@@ -20,6 +20,8 @@ namespace apu { namespace conversion { #if XE_ARCH_AMD64 + +#if 0 inline void sequential_6_BE_to_interleaved_6_LE(float* output, const float* input, size_t ch_sample_count) { @@ -41,7 +43,44 @@ inline void sequential_6_BE_to_interleaved_6_LE(float* output, out[sample * 6 + 5] = sample2; } } +#else +XE_NOINLINE +static void _generic_sequential_6_BE_to_interleaved_6_LE( + float* XE_RESTRICT output, const float* XE_RESTRICT input, + unsigned ch_sample_count) { + for (unsigned sample = 0; sample < ch_sample_count; sample++) { + for (unsigned channel = 0; channel < 6; channel++) { + unsigned int value = *reinterpret_cast<const unsigned int*>( + &input[channel * ch_sample_count
+ sample]); + *reinterpret_cast(&output[sample * 6 + channel]) = + xe::byte_swap(value); + } + } +} +XE_NOINLINE +static void _movbe_sequential_6_BE_to_interleaved_6_LE( + float* XE_RESTRICT output, const float* XE_RESTRICT input, + unsigned ch_sample_count) { + for (unsigned sample = 0; sample < ch_sample_count; sample++) { + for (unsigned channel = 0; channel < 6; channel++) { + *reinterpret_cast(&output[sample * 6 + channel]) = + _load_be_u32(reinterpret_cast( + &input[channel * ch_sample_count + sample])); + } + } +} + +inline static void sequential_6_BE_to_interleaved_6_LE( + float* output, const float* input, unsigned ch_sample_count) { + if (amd64::GetFeatureFlags() & amd64::kX64EmitMovbe) { + _movbe_sequential_6_BE_to_interleaved_6_LE(output, input, ch_sample_count); + } else { + _generic_sequential_6_BE_to_interleaved_6_LE(output, input, + ch_sample_count); + } +} +#endif inline void sequential_6_BE_to_interleaved_2_LE(float* output, const float* input, size_t ch_sample_count) { diff --git a/src/xenia/base/cvar.h b/src/xenia/base/cvar.h index 61b8faf11..144703665 100644 --- a/src/xenia/base/cvar.h +++ b/src/xenia/base/cvar.h @@ -335,7 +335,8 @@ ICommandVar* define_cmdvar(const char* name, T* default_value, #define DEFINE_uint64(name, default_value, description, category) \ DEFINE_CVar(name, default_value, description, category, false, uint64_t) - +#define DEFINE_int64(name, default_value, description, category) \ + DEFINE_CVar(name, default_value, description, category, false, int64_t) #define DEFINE_double(name, default_value, description, category) \ DEFINE_CVar(name, default_value, description, category, false, double) @@ -383,7 +384,7 @@ ICommandVar* define_cmdvar(const char* name, T* default_value, #define DECLARE_uint32(name) DECLARE_CVar(name, uint32_t) #define DECLARE_uint64(name) DECLARE_CVar(name, uint64_t) - +#define DECLARE_int64(name) DECLARE_CVar(name, int64_t) #define DECLARE_double(name) DECLARE_CVar(name, double) #define DECLARE_string(name) DECLARE_CVar(name, std::string) diff --git a/src/xenia/base/mutex.cc b/src/xenia/base/mutex.cc index 027cd7882..b975e4bc3 100644 --- a/src/xenia/base/mutex.cc +++ b/src/xenia/base/mutex.cc @@ -26,7 +26,7 @@ check this and release the mutex one way to do this is by using FlsAlloc and PFLS_CALLBACK_FUNCTION, which gets called with the fiber local data when a thread exits */ -thread_local unsigned global_mutex_depth = 0; + static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) { return reinterpret_cast(mutex); } @@ -38,29 +38,16 @@ xe_global_mutex::xe_global_mutex() { xe_global_mutex ::~xe_global_mutex() { DeleteCriticalSection(global_critical_section(this)); } + void xe_global_mutex::lock() { - if (global_mutex_depth) { - } else { - EnterCriticalSection(global_critical_section(this)); - } - global_mutex_depth++; + EnterCriticalSection(global_critical_section(this)); } void xe_global_mutex::unlock() { - if (--global_mutex_depth == 0) { - LeaveCriticalSection(global_critical_section(this)); - } + LeaveCriticalSection(global_critical_section(this)); } bool xe_global_mutex::try_lock() { - if (global_mutex_depth) { - ++global_mutex_depth; - return true; - } else { - BOOL success = TryEnterCriticalSection(global_critical_section(this)); - if (success) { - ++global_mutex_depth; - } - return success; - } + BOOL success = TryEnterCriticalSection(global_critical_section(this)); + return success; } CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) { diff --git a/src/xenia/base/platform.h b/src/xenia/base/platform.h index 
e99e8b83d..61749e4c7 100644 --- a/src/xenia/base/platform.h +++ b/src/xenia/base/platform.h @@ -116,15 +116,15 @@ #define XE_LIKELY(...) (!!(__VA_ARGS__)) #define XE_UNLIKELY(...) (!!(__VA_ARGS__)) #define XE_MSVC_ASSUME(...) __assume(__VA_ARGS__) -#define XE_NOALIAS __declspec(noalias) +#define XE_NOALIAS __declspec(noalias) #elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1 #define XE_FORCEINLINE __attribute__((always_inline)) #define XE_NOINLINE __attribute__((noinline)) #define XE_COLD __attribute__((cold)) #define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true) #define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false) -#define XE_NOALIAS -//cant do unevaluated assume +#define XE_NOALIAS +// cant do unevaluated assume #define XE_MSVC_ASSUME(...) static_cast(0) #else #define XE_FORCEINLINE inline @@ -137,7 +137,13 @@ #define XE_MSVC_ASSUME(...) static_cast(0) #endif - +#if XE_COMPILER_HAS_MSVC_EXTENSIONS == 1 +#define XE_MSVC_OPTIMIZE_SMALL() __pragma(optimize("s", on)) +#define XE_MSVC_OPTIMIZE_REVERT() __pragma(optimize("", on)) +#else +#define XE_MSVC_OPTIMIZE_SMALL() +#define XE_MSVC_OPTIMIZE_REVERT() +#endif #if XE_COMPILER_HAS_GNU_EXTENSIONS == 1 #define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__)) #define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__)) @@ -180,7 +186,7 @@ const char kPathSeparator = '/'; const char kGuestPathSeparator = '\\'; } // namespace xe -#if XE_ARCH_AMD64==1 +#if XE_ARCH_AMD64 == 1 #include "platform_amd64.h" #endif #endif // XENIA_BASE_PLATFORM_H_ diff --git a/src/xenia/base/platform_amd64.cc b/src/xenia/base/platform_amd64.cc index 31df3c497..7005420e5 100644 --- a/src/xenia/base/platform_amd64.cc +++ b/src/xenia/base/platform_amd64.cc @@ -7,13 +7,12 @@ ****************************************************************************** */ - #include "xenia/base/cvar.h" #include "xenia/base/platform.h" #include "third_party/xbyak/xbyak/xbyak.h" #include "third_party/xbyak/xbyak/xbyak_util.h" -DEFINE_int32(x64_extension_mask, -1, +DEFINE_int64(x64_extension_mask, -1LL, "Allow the detection and utilization of specific instruction set " "features.\n" " 0 = x86_64 + AVX1\n" @@ -33,79 +32,92 @@ DEFINE_int32(x64_extension_mask, -1, "x64"); namespace xe { namespace amd64 { -static uint32_t g_feature_flags = 0U; +static uint64_t g_feature_flags = 0U; static bool g_did_initialize_feature_flags = false; -uint32_t GetFeatureFlags() { - xenia_assert(g_did_initialize_feature_flags); - return g_feature_flags; +uint64_t GetFeatureFlags() { + xenia_assert(g_did_initialize_feature_flags); + return g_feature_flags; } XE_COLD XE_NOINLINE void InitFeatureFlags() { - uint32_t feature_flags_ = 0U; - - Xbyak::util::Cpu cpu_; + uint64_t feature_flags_ = 0U; + { + Xbyak::util::Cpu cpu_; #define TEST_EMIT_FEATURE(emit, ext) \ if ((cvars::x64_extension_mask & emit) == emit) { \ feature_flags_ |= (cpu_.has(ext) ? 
emit : 0); \ } - TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2); - TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA); - TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT); - TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1); - TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2); - TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE); - TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI); - TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F); - TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL); - TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW); - TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ); - TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI); - TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW); + TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2); + TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA); + TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT); + TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1); + TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2); + TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE); + TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI); + TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F); + TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL); + TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW); + TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ); + TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI); + TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW); #undef TEST_EMIT_FEATURE - /* - fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in - latest version of xbyak -*/ - unsigned int data[4]; - Xbyak::util::Cpu::getCpuid(0x80000001, data); - unsigned amd_flags = data[2]; - if (amd_flags & (1U << 5)) { - if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) { - feature_flags_ |= kX64EmitLZCNT; - } - } - // todo: although not reported by cpuid, zen 1 and zen+ also have fma4 - if (amd_flags & (1U << 16)) { - if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) { - feature_flags_ |= kX64EmitFMA4; - } - } - if (amd_flags & (1U << 21)) { - if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) { - feature_flags_ |= kX64EmitTBM; - } - } - if (amd_flags & (1U << 11)) { - if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) { - feature_flags_ |= kX64EmitXOP; - } - } - if (cpu_.has(Xbyak::util::Cpu::tAMD)) { - bool is_zennish = cpu_.displayFamily >= 0x17; /* - chrispy: according to agner's tables, all amd architectures that - we support (ones with avx) have the same timings for - jrcxz/loop/loope/loopne as for other jmps - */ - feature_flags_ |= kX64FastJrcx; - feature_flags_ |= kX64FastLoop; - if (is_zennish) { - // ik that i heard somewhere that this is the case for zen, but i need to - // verify. cant find my original source for that. - // todo: ask agner? - feature_flags_ |= kX64FlagsIndependentVars; + fix for xbyak bug/omission, amd cpus are never checked for lzcnt. 
fixed in + latest version of xbyak + */ + unsigned int data[4]; + Xbyak::util::Cpu::getCpuid(0x80000001, data); + unsigned amd_flags = data[2]; + if (amd_flags & (1U << 5)) { + if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) { + feature_flags_ |= kX64EmitLZCNT; + } + } + // todo: although not reported by cpuid, zen 1 and zen+ also have fma4 + if (amd_flags & (1U << 16)) { + if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) { + feature_flags_ |= kX64EmitFMA4; + } + } + if (amd_flags & (1U << 21)) { + if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) { + feature_flags_ |= kX64EmitTBM; + } + } + if (amd_flags & (1U << 11)) { + if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) { + feature_flags_ |= kX64EmitXOP; + } + } + if (cpu_.has(Xbyak::util::Cpu::tAMD)) { + bool is_zennish = cpu_.displayFamily >= 0x17; + /* + chrispy: according to agner's tables, all amd architectures + that we support (ones with avx) have the same timings for + jrcxz/loop/loope/loopne as for other jmps + */ + feature_flags_ |= kX64FastJrcx; + feature_flags_ |= kX64FastLoop; + if (is_zennish) { + // ik that i heard somewhere that this is the case for zen, but i need + // to verify. cant find my original source for that. todo: ask agner? + feature_flags_ |= kX64FlagsIndependentVars; + } + } + } + { + unsigned int data[4]; + memset(data, 0, sizeof(data)); + // intel extended features + Xbyak::util::Cpu::getCpuidEx(7, 0, data); + if ((data[2] & (1 << 28)) && + (cvars::x64_extension_mask & kX64EmitMovdir64M)) { + feature_flags_ |= kX64EmitMovdir64M; + } + if ((data[1] & (1 << 9)) && (cvars::x64_extension_mask & kX64FastRepMovs)) { + feature_flags_ |= kX64FastRepMovs; } } g_feature_flags = feature_flags_; diff --git a/src/xenia/base/platform_amd64.h b/src/xenia/base/platform_amd64.h index 326b69139..e5c20c670 100644 --- a/src/xenia/base/platform_amd64.h +++ b/src/xenia/base/platform_amd64.h @@ -13,7 +13,7 @@ namespace xe { namespace amd64 { -enum X64FeatureFlags { +enum X64FeatureFlags : uint64_t { kX64EmitAVX2 = 1 << 0, kX64EmitFMA = 1 << 1, kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount @@ -44,14 +44,13 @@ enum X64FeatureFlags { // instructions, and FX users need the boost kX64EmitFMA4 = 1 << 17, // todo: also use on zen1? kX64EmitTBM = 1 << 18, - // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family - // 17h/19h optimization manuals. allows us to save 1 byte on certain xmm - // instructions by using the legacy sse version if we recently cleared the - // high 128 bits of the + kX64EmitMovdir64M = 1 << 19, + kX64FastRepMovs = 1 << 20 + }; XE_NOALIAS -uint32_t GetFeatureFlags(); +uint64_t GetFeatureFlags(); XE_COLD void InitFeatureFlags(); diff --git a/src/xenia/base/threading.h b/src/xenia/base/threading.h index 67297716b..604819950 100644 --- a/src/xenia/base/threading.h +++ b/src/xenia/base/threading.h @@ -299,6 +299,12 @@ class Event : public WaitHandle { // the nonsignaled state after releasing the appropriate number of waiting // threads. virtual void Pulse() = 0; + #if XE_PLATFORM_WIN32 ==1 + //SetEvent, but if there is a waiter we immediately transfer execution to it + virtual void SetBoostPriority() = 0; + #else + void SetBoostPriority() { Set() } + #endif }; // Models a Win32-like semaphore object. 
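As an illustrative sketch (not from the diff itself): SetBoostPriority() is meant for producer/consumer handoffs such as the command processor's write-pointer update later in this patch; on Win32 it calls NtSetEventBoostPriority, and on other platforms it falls back to Set(). A minimal usage sketch, assuming the existing xe::threading::Event::CreateAutoResetEvent() and xe::threading::Wait() helpers:

#include "xenia/base/threading.h"

// Producer/consumer handoff: the consumer blocks on the event, the producer
// publishes work and then wakes it. Using SetBoostPriority() instead of Set()
// lets the kernel hand the remainder of the producer's timeslice directly to
// the waiting thread, reducing wakeup latency for this pattern.
void ProducerConsumerSketch() {
  auto work_ready = xe::threading::Event::CreateAutoResetEvent(false);

  // Consumer thread:
  //   xe::threading::Wait(work_ready.get(), false);
  //   ... consume the published work ...

  // Producer thread, after publishing work:
  work_ready->SetBoostPriority();  // behaves like Set() if nobody is waiting,
                                   // and on non-Win32 builds it is Set()
}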
diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc index 32ddf7487..01a4eb9be 100644 --- a/src/xenia/base/threading_win.cc +++ b/src/xenia/base/threading_win.cc @@ -39,6 +39,8 @@ XE_NTDLL_IMPORT(NtWaitForSingleObject, cls_NtWaitForSingleObject, NtWaitForSingleObjectPointer); XE_NTDLL_IMPORT(NtSetEvent, cls_NtSetEvent, NtSetEventPointer); +XE_NTDLL_IMPORT(NtSetEventBoostPriority, cls_NtSetEventBoostPriority, + NtSetEventBoostPriorityPointer); // difference between NtClearEvent and NtResetEvent is that NtResetEvent returns // the events state prior to the call, but we dont need that. might need to // check whether one or the other is faster in the kernel though yeah, just @@ -53,6 +55,7 @@ XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore, XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution, NtDelayExecutionPointer); + namespace xe { namespace threading { @@ -137,7 +140,7 @@ void MaybeYield() { #endif #endif // memorybarrier is really not necessary here... - MemoryBarrier(); + // MemoryBarrier(); } void SyncMemory() { MemoryBarrier(); } @@ -288,11 +291,19 @@ class Win32Event : public Win32Handle { void Set() override { NtSetEventPointer.invoke(handle_, nullptr); } void Reset() override { NtClearEventPointer.invoke(handle_); } void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); } + void SetBoostPriority() override { + // no previous state for boostpriority + NtSetEventBoostPriorityPointer.invoke(handle_); + } #else void Set() override { SetEvent(handle_); } void Reset() override { ResetEvent(handle_); } void Pulse() override { PulseEvent(handle_); } + void SetBoostPriority() override { + // no win32 version of boostpriority + SetEvent(handle_); + } #endif }; diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h index d4ded3e83..cb5a375ec 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.h +++ b/src/xenia/cpu/backend/x64/x64_backend.h @@ -23,7 +23,7 @@ #define XE_X64_PROFILER_AVAILABLE 1 #endif -DECLARE_int32(x64_extension_mask); +DECLARE_int64(x64_extension_mask); namespace xe { class Exception; diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 74515d38e..03b8b4abd 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -103,74 +103,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) "FAQ for system requirements at https://xenia.jp"); return; } -#if 1 - feature_flags_ = amd64::GetFeatureFlags(); -#else -#define TEST_EMIT_FEATURE(emit, ext) \ - if ((cvars::x64_extension_mask & emit) == emit) { \ - feature_flags_ |= (cpu_.has(ext) ? 
emit : 0); \ - } - TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2); - TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA); - TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT); - TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1); - TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2); - TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE); - TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI); - TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F); - TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL); - TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW); - TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ); - TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI); - TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW); -#undef TEST_EMIT_FEATURE - /* - fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in - latest version of xbyak -*/ - unsigned int data[4]; - Xbyak::util::Cpu::getCpuid(0x80000001, data); - unsigned amd_flags = data[2]; - if (amd_flags & (1U << 5)) { - if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) { - feature_flags_ |= kX64EmitLZCNT; - } - } - // todo: although not reported by cpuid, zen 1 and zen+ also have fma4 - if (amd_flags & (1U << 16)) { - if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) { - feature_flags_ |= kX64EmitFMA4; - } - } - if (amd_flags & (1U << 21)) { - if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) { - feature_flags_ |= kX64EmitTBM; - } - } - if (amd_flags & (1U << 11)) { - if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) { - feature_flags_ |= kX64EmitXOP; - XELOGCPU("Cpu support XOP!\n\n"); - } - } - if (cpu_.has(Xbyak::util::Cpu::tAMD)) { - bool is_zennish = cpu_.displayFamily >= 0x17; - /* - chrispy: according to agner's tables, all amd architectures that - we support (ones with avx) have the same timings for - jrcxz/loop/loope/loopne as for other jmps - */ - feature_flags_ |= kX64FastJrcx; - feature_flags_ |= kX64FastLoop; - if (is_zennish) { - // ik that i heard somewhere that this is the case for zen, but i need to - // verify. cant find my original source for that. - // todo: ask agner? 
- feature_flags_ |= kX64FlagsIndependentVars; - } - } -#endif + feature_flags_ = amd64::GetFeatureFlags(); + may_use_membase32_as_zero_reg_ = static_cast(reinterpret_cast( processor()->memory()->virtual_membase())) == 0; diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 69e3b80ec..91f4016c1 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -299,7 +299,7 @@ class X64Emitter : public Xbyak::CodeGenerator { void* FindWordConstantOffset(unsigned wordvalue); void* FindDwordConstantOffset(unsigned bytevalue); void* FindQwordConstantOffset(uint64_t bytevalue); - bool IsFeatureEnabled(uint32_t feature_flag) const { + bool IsFeatureEnabled(uint64_t feature_flag) const { return (feature_flags_ & feature_flag) == feature_flag; } @@ -395,7 +395,7 @@ class X64Emitter : public Xbyak::CodeGenerator { XbyakAllocator* allocator_ = nullptr; XexModule* guest_module_ = nullptr; Xbyak::util::Cpu cpu_; - uint32_t feature_flags_ = 0; + uint64_t feature_flags_ = 0; uint32_t current_guest_function_ = 0; Xbyak::Label* epilog_label_ = nullptr; diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 06a37ab91..28b33fd76 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -39,7 +39,7 @@ #include "xenia/cpu/backend/x64/x64_stack_layout.h" #include "xenia/cpu/hir/hir_builder.h" #include "xenia/cpu/processor.h" - +XE_MSVC_OPTIMIZE_SMALL() DEFINE_bool(use_fast_dot_product, false, "Experimental optimization, much shorter sequence on dot products, " "treating inf as overflow instead of using mcxsr" diff --git a/src/xenia/cpu/entry_table.cc b/src/xenia/cpu/entry_table.cc index 1d82f0538..840706171 100644 --- a/src/xenia/cpu/entry_table.cc +++ b/src/xenia/cpu/entry_table.cc @@ -19,16 +19,19 @@ EntryTable::EntryTable() = default; EntryTable::~EntryTable() { auto global_lock = global_critical_region_.Acquire(); - for (auto it : map_) { - Entry* entry = it.second; + for (auto it : map_.Values()) { + Entry* entry = it; delete entry; } } Entry* EntryTable::Get(uint32_t address) { auto global_lock = global_critical_region_.Acquire(); - const auto& it = map_.find(address); - Entry* entry = it != map_.end() ? it->second : nullptr; + uint32_t idx = map_.IndexForKey(address); + if (idx == map_.size() || *map_.KeyAt(idx) != address) { + return nullptr; + } + Entry* entry = *map_.ValueAt(idx); if (entry) { // TODO(benvanik): wait if needed? if (entry->status != Entry::STATUS_READY) { @@ -43,8 +46,12 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) { // https://github.com/facebook/folly/blob/master/folly/AtomicHashMap.h auto global_lock = global_critical_region_.Acquire(); - const auto& it = map_.find(address); - Entry* entry = it != map_.end() ? it->second : nullptr; + + uint32_t idx = map_.IndexForKey(address); + + Entry* entry = idx != map_.size() && *map_.KeyAt(idx) == address + ? *map_.ValueAt(idx) + : nullptr; Entry::Status status; if (entry) { // If we aren't ready yet spin and wait. 
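For context on the split_map lookups in the hunks above and below, here is an illustrative sketch of the idea; the real xe::split_map in xenia/base/split_map.h differs in implementation, and anything beyond the member functions actually called in this diff is an assumption. Keys and values live in two parallel sorted arrays, so a lookup binary-searches a densely packed key array instead of chasing unordered_map buckets and nodes, and the index returned by IndexForKey() can be reused directly by InsertAt() on a miss.

#include <algorithm>
#include <cstdint>
#include <vector>

// Minimal "split map" sketch: sorted keys in one contiguous array, values in
// a parallel array. Lookups touch only the key array, which is far more
// cache-friendly than a node-based hash map for small keys.
template <typename K, typename V>
class SplitMapSketch {
 public:
  // Position of the first key >= key; equals size() if no such key exists.
  uint32_t IndexForKey(const K& key) const {
    return static_cast<uint32_t>(
        std::lower_bound(keys_.begin(), keys_.end(), key) - keys_.begin());
  }
  uint32_t size() const { return static_cast<uint32_t>(keys_.size()); }
  const K* KeyAt(uint32_t idx) const { return &keys_[idx]; }
  V* ValueAt(uint32_t idx) { return &values_[idx]; }
  // Insert at a position previously obtained from IndexForKey for this key.
  void InsertAt(const K& key, const V& value, uint32_t idx) {
    keys_.insert(keys_.begin() + idx, key);
    values_.insert(values_.begin() + idx, value);
  }
  void EraseAt(uint32_t idx) {
    keys_.erase(keys_.begin() + idx);
    values_.erase(values_.begin() + idx);
  }
  std::vector<V>& Values() { return values_; }

 private:
  std::vector<K> keys_;
  std::vector<V> values_;
};

The Get()/GetOrCreate()/Delete() changes in entry_table.cc follow exactly this pattern: probe with IndexForKey(), verify the key stored at that index, and reuse the index for InsertAt() or EraseAt().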
@@ -65,7 +72,8 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) { entry->end_address = 0; entry->status = Entry::STATUS_COMPILING; entry->function = 0; - map_[address] = entry; + map_.InsertAt(address, entry, idx); + // map_[address] = entry; status = Entry::STATUS_NEW; } global_lock.unlock(); @@ -75,18 +83,18 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) { void EntryTable::Delete(uint32_t address) { auto global_lock = global_critical_region_.Acquire(); - const auto itr = map_.find(address); - - if (itr != map_.cend()) { - map_.erase(itr); + // doesnt this leak memory by not deleting the entry? + uint32_t idx = map_.IndexForKey(address); + if (idx != map_.size() && *map_.KeyAt(idx) == address) { + map_.EraseAt(idx); } } std::vector EntryTable::FindWithAddress(uint32_t address) { auto global_lock = global_critical_region_.Acquire(); std::vector fns; - for (auto& it : map_) { - Entry* entry = it.second; + for (auto& it : map_.Values()) { + Entry* entry = it; if (address >= entry->address && address <= entry->end_address) { if (entry->status == Entry::STATUS_READY) { fns.push_back(entry->function); @@ -95,6 +103,5 @@ std::vector EntryTable::FindWithAddress(uint32_t address) { } return fns; } - } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/entry_table.h b/src/xenia/cpu/entry_table.h index 14a3e6c82..2ca2133c2 100644 --- a/src/xenia/cpu/entry_table.h +++ b/src/xenia/cpu/entry_table.h @@ -14,7 +14,7 @@ #include #include "xenia/base/mutex.h" - +#include "xenia/base/split_map.h" namespace xe { namespace cpu { @@ -48,7 +48,8 @@ class EntryTable { private: xe::global_critical_region global_critical_region_; // TODO(benvanik): replace with a better data structure. - std::unordered_map map_; + xe::split_map map_; + //std::unordered_map map_; }; } // namespace cpu diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index ab54438d7..66da46546 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -334,7 +334,7 @@ void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr, void CommandProcessor::UpdateWritePointer(uint32_t value) { write_ptr_index_ = value; - write_ptr_index_event_->Set(); + write_ptr_index_event_->SetBoostPriority(); } void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index, uint32_t value) { @@ -665,6 +665,11 @@ uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index, reader_.set_read_offset(read_index * sizeof(uint32_t)); reader_.set_write_offset(write_index * sizeof(uint32_t)); + // prefetch the wraparound range + // it likely is already in L3 cache, but in a zen system it may be another + // chiplets l3 + reader_.BeginPrefetchedRead( + GetCurrentRingReadCount()); do { if (!ExecutePacket()) { // This probably should be fatal - but we're going to continue anyways. 
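The BeginPrefetchedRead() call added above warms the readable span of the ring buffer before packet parsing begins. The sketch below shows the general technique with plain SSE prefetch intrinsics; the helper names and the choice of the L2 hint are assumptions for illustration, not Xenia's ring-buffer API.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <xmmintrin.h>
#endif

constexpr size_t kCacheLineSize = 64;

// Issue one prefetch per cache line; _MM_HINT_T1 pulls data toward L2, which
// is usually enough for data that will be consumed shortly (for example,
// commands that another chiplet's L3 currently owns).
inline void PrefetchRange(const uint8_t* base, size_t length) {
  for (size_t offset = 0; offset < length; offset += kCacheLineSize) {
    _mm_prefetch(reinterpret_cast<const char*>(base + offset), _MM_HINT_T1);
  }
}

// Prefetch the readable span of a ring buffer, including the wrapped portion.
// Assumes read_offset < ring_size.
inline void PrefetchRingReadableSpan(const uint8_t* ring, size_t ring_size,
                                     size_t read_offset,
                                     size_t readable_bytes) {
  size_t first = std::min(readable_bytes, ring_size - read_offset);
  PrefetchRange(ring + read_offset, first);
  if (readable_bytes > first) {
    PrefetchRange(ring, readable_bytes - first);  // wrapped portion
  }
}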
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 4e7ee919c..a24d468ae 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -380,7 +380,8 @@ ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature( root_signatures_bindful_.emplace(index, root_signature); return root_signature; } - +XE_NOINLINE +XE_COLD uint32_t D3D12CommandProcessor::GetRootBindfulExtraParameterIndices( const DxbcShader* vertex_shader, const DxbcShader* pixel_shader, RootBindfulExtraParameterIndices& indices_out) { @@ -2484,7 +2485,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, return false; } pipeline_cache_->AnalyzeShaderUcode(*vertex_shader); - bool memexport_used_vertex = vertex_shader->is_valid_memexport_used(); + const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used(); // Pixel shader analysis. bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs); @@ -2512,9 +2513,10 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, return true; } } - bool memexport_used_pixel = + + const bool memexport_used_pixel = pixel_shader && pixel_shader->is_valid_memexport_used(); - bool memexport_used = memexport_used_vertex || memexport_used_pixel; + const bool memexport_used = memexport_used_vertex || memexport_used_pixel; if (!BeginSubmission(true)) { return false; @@ -2639,6 +2641,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, previous_viewport_info_args_ = gviargs; previous_viewport_info_ = viewport_info; } + // todo: use SIMD for getscissor + scaling here, should reduce code size more draw_util::Scissor scissor; draw_util::GetScissor(regs, scissor); scissor.offset[0] *= draw_resolution_scale_x; @@ -2711,102 +2714,13 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // Gather memexport ranges and ensure the heaps for them are resident, and // also load the data surrounding the export and to fill the regions that // won't be modified by the shaders. - struct MemExportRange { - uint32_t base_address_dwords; - uint32_t size_dwords; - }; - MemExportRange memexport_ranges[512]; - uint32_t memexport_range_count = 0; - if (memexport_used_vertex) { - for (uint32_t constant_index : - vertex_shader->memexport_stream_constants()) { - const auto& memexport_stream = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4); - if (memexport_stream.index_count == 0) { - continue; - } - uint32_t memexport_format_size = - GetSupportedMemExportFormatSize(memexport_stream.format); - if (memexport_format_size == 0) { - XELOGE("Unsupported memexport format {}", - FormatInfo::GetName( - xenos::TextureFormat(uint32_t(memexport_stream.format)))); - return false; - } - uint32_t memexport_size_dwords = - memexport_stream.index_count * memexport_format_size; - // Try to reduce the number of shared memory operations when writing - // different elements into the same buffer through different exports - // (happens in 4D5307E6). - bool memexport_range_reused = false; - for (uint32_t i = 0; i < memexport_range_count; ++i) { - MemExportRange& memexport_range = memexport_ranges[i]; - if (memexport_range.base_address_dwords == - memexport_stream.base_address) { - memexport_range.size_dwords = - std::max(memexport_range.size_dwords, memexport_size_dwords); - memexport_range_reused = true; - break; - } - } - // Add a new range if haven't expanded an existing one. 
- if (!memexport_range_reused) { - MemExportRange& memexport_range = - memexport_ranges[memexport_range_count++]; - memexport_range.base_address_dwords = memexport_stream.base_address; - memexport_range.size_dwords = memexport_size_dwords; - } - } - } - if (memexport_used_pixel) { - for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) { - const auto& memexport_stream = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4); - if (memexport_stream.index_count == 0) { - continue; - } - uint32_t memexport_format_size = - GetSupportedMemExportFormatSize(memexport_stream.format); - if (memexport_format_size == 0) { - XELOGE("Unsupported memexport format {}", - FormatInfo::GetName( - xenos::TextureFormat(uint32_t(memexport_stream.format)))); - return false; - } - uint32_t memexport_size_dwords = - memexport_stream.index_count * memexport_format_size; - bool memexport_range_reused = false; - for (uint32_t i = 0; i < memexport_range_count; ++i) { - MemExportRange& memexport_range = memexport_ranges[i]; - if (memexport_range.base_address_dwords == - memexport_stream.base_address) { - memexport_range.size_dwords = - std::max(memexport_range.size_dwords, memexport_size_dwords); - memexport_range_reused = true; - break; - } - } - if (!memexport_range_reused) { - MemExportRange& memexport_range = - memexport_ranges[memexport_range_count++]; - memexport_range.base_address_dwords = memexport_stream.base_address; - memexport_range.size_dwords = memexport_size_dwords; - } - } - } - for (uint32_t i = 0; i < memexport_range_count; ++i) { - const MemExportRange& memexport_range = memexport_ranges[i]; - if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2, - memexport_range.size_dwords << 2)) { - XELOGE( - "Failed to request memexport stream at 0x{:08X} (size {}) in the " - "shared memory", - memexport_range.base_address_dwords << 2, - memexport_range.size_dwords << 2); - return false; - } - } + memexport_range_count_ = 0; + if (memexport_used_vertex || memexport_used_pixel) { + bool retflag; + bool retval = GatherMemexportRangesAndMakeResident(retflag); + if (retflag) return retval; + } // Primitive topology. D3D_PRIMITIVE_TOPOLOGY primitive_topology; if (primitive_processing_result.IsTessellated()) { @@ -2876,10 +2790,11 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // Draw. if (primitive_processing_result.index_buffer_type == PrimitiveProcessor::ProcessedIndexBufferType::kNone) { - if (memexport_used) { - shared_memory_->UseForWriting(); - } else { + if (!memexport_used) { shared_memory_->UseForReading(); + + } else { + shared_memory_->UseForWriting(); } SubmitBarriers(); deferred_command_list_.D3DDrawInstanced( @@ -2903,22 +2818,11 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // If the shared memory is a UAV, it can't be used as an index buffer // (UAV is a read/write state, index buffer is a read-only state). // Need to copy the indices to a buffer in the index buffer state. 
- scratch_index_buffer = RequestScratchGPUBuffer( - index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST); - if (scratch_index_buffer == nullptr) { - return false; - } - shared_memory_->UseAsCopySource(); - SubmitBarriers(); - deferred_command_list_.D3DCopyBufferRegion( - scratch_index_buffer, 0, shared_memory_->GetBuffer(), - primitive_processing_result.guest_index_base, - index_buffer_view.SizeInBytes); - PushTransitionBarrier(scratch_index_buffer, - D3D12_RESOURCE_STATE_COPY_DEST, - D3D12_RESOURCE_STATE_INDEX_BUFFER); - index_buffer_view.BufferLocation = - scratch_index_buffer->GetGPUVirtualAddress(); + bool retflag; + bool retval = HandleMemexportGuestDMA( + scratch_index_buffer, index_buffer_view, + primitive_processing_result.guest_index_base, retflag); + if (retflag) return retval; } else { index_buffer_view.BufferLocation = shared_memory_->GetGPUAddress() + @@ -2956,66 +2860,199 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } if (memexport_used) { - // Make sure this memexporting draw is ordered with other work using shared - // memory as a UAV. - // TODO(Triang3l): Find some PM4 command that can be used for indication of - // when memexports should be awaited? - shared_memory_->MarkUAVWritesCommitNeeded(); - // Invalidate textures in memexported memory and watch for changes. - for (uint32_t i = 0; i < memexport_range_count; ++i) { - const MemExportRange& memexport_range = memexport_ranges[i]; - shared_memory_->RangeWrittenByGpu( - memexport_range.base_address_dwords << 2, - memexport_range.size_dwords << 2, false); - } - if (cvars::d3d12_readback_memexport) { - // Read the exported data on the CPU. - uint32_t memexport_total_size = 0; - for (uint32_t i = 0; i < memexport_range_count; ++i) { - memexport_total_size += memexport_ranges[i].size_dwords << 2; + HandleMemexportDrawOrdering_AndReadback(); + } + + return true; +} +XE_COLD +XE_NOINLINE +bool D3D12CommandProcessor::HandleMemexportGuestDMA( + ID3D12Resource*& scratch_index_buffer, + D3D12_INDEX_BUFFER_VIEW& index_buffer_view, uint32_t guest_index_base, + // xe::gpu::PrimitiveProcessor::ProcessingResult& + // primitive_processing_result, + bool& retflag) { + retflag = true; + scratch_index_buffer = RequestScratchGPUBuffer( + index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST); + if (scratch_index_buffer == nullptr) { + return false; + } + shared_memory_->UseAsCopySource(); + SubmitBarriers(); + deferred_command_list_.D3DCopyBufferRegion( + scratch_index_buffer, 0, shared_memory_->GetBuffer(), guest_index_base, + index_buffer_view.SizeInBytes); + PushTransitionBarrier(scratch_index_buffer, D3D12_RESOURCE_STATE_COPY_DEST, + D3D12_RESOURCE_STATE_INDEX_BUFFER); + index_buffer_view.BufferLocation = + scratch_index_buffer->GetGPUVirtualAddress(); + retflag = false; + return {}; +} +XE_NOINLINE +XE_COLD +bool D3D12CommandProcessor::GatherMemexportRangesAndMakeResident( + bool& retflag) { + auto vertex_shader = static_cast(active_vertex_shader()); + auto pixel_shader = static_cast(active_pixel_shader()); + const xe::gpu::RegisterFile& regs = *register_file_; + const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used(); + const bool memexport_used_pixel = + pixel_shader && pixel_shader->is_valid_memexport_used(); + retflag = true; + if (memexport_used_vertex) { + for (uint32_t constant_index : + vertex_shader->memexport_stream_constants()) { + const auto& memexport_stream = regs.Get( + XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4); + if 
(memexport_stream.index_count == 0) { + continue; } - if (memexport_total_size != 0) { - ID3D12Resource* readback_buffer = - RequestReadbackBuffer(memexport_total_size); - if (readback_buffer != nullptr) { - shared_memory_->UseAsCopySource(); - SubmitBarriers(); - ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer(); - uint32_t readback_buffer_offset = 0; - for (uint32_t i = 0; i < memexport_range_count; ++i) { - const MemExportRange& memexport_range = memexport_ranges[i]; - uint32_t memexport_range_size = memexport_range.size_dwords << 2; - deferred_command_list_.D3DCopyBufferRegion( - readback_buffer, readback_buffer_offset, shared_memory_buffer, - memexport_range.base_address_dwords << 2, memexport_range_size); - readback_buffer_offset += memexport_range_size; - } - if (AwaitAllQueueOperationsCompletion()) { - D3D12_RANGE readback_range; - readback_range.Begin = 0; - readback_range.End = memexport_total_size; - void* readback_mapping; - if (SUCCEEDED(readback_buffer->Map(0, &readback_range, - &readback_mapping))) { - const uint32_t* readback_dwords = - reinterpret_cast(readback_mapping); - for (uint32_t i = 0; i < memexport_range_count; ++i) { - const MemExportRange& memexport_range = memexport_ranges[i]; - std::memcpy(memory_->TranslatePhysical( - memexport_range.base_address_dwords << 2), - readback_dwords, memexport_range.size_dwords << 2); - readback_dwords += memexport_range.size_dwords; - } - D3D12_RANGE readback_write_range = {}; - readback_buffer->Unmap(0, &readback_write_range); + uint32_t memexport_format_size = + GetSupportedMemExportFormatSize(memexport_stream.format); + if (memexport_format_size == 0) { + XELOGE("Unsupported memexport format {}", + FormatInfo::GetName( + xenos::TextureFormat(uint32_t(memexport_stream.format)))); + return false; + } + uint32_t memexport_size_dwords = + memexport_stream.index_count * memexport_format_size; + // Try to reduce the number of shared memory operations when writing + // different elements into the same buffer through different exports + // (happens in 4D5307E6). + bool memexport_range_reused = false; + for (uint32_t i = 0; i < memexport_range_count_; ++i) { + MemExportRange& memexport_range = memexport_ranges_[i]; + if (memexport_range.base_address_dwords == + memexport_stream.base_address) { + memexport_range.size_dwords = + std::max(memexport_range.size_dwords, memexport_size_dwords); + memexport_range_reused = true; + break; + } + } + // Add a new range if haven't expanded an existing one. 
+ if (!memexport_range_reused) { + MemExportRange& memexport_range = + memexport_ranges_[memexport_range_count_++]; + memexport_range.base_address_dwords = memexport_stream.base_address; + memexport_range.size_dwords = memexport_size_dwords; + } + } + } + if (memexport_used_pixel) { + for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) { + const auto& memexport_stream = regs.Get( + XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4); + if (memexport_stream.index_count == 0) { + continue; + } + uint32_t memexport_format_size = + GetSupportedMemExportFormatSize(memexport_stream.format); + if (memexport_format_size == 0) { + XELOGE("Unsupported memexport format {}", + FormatInfo::GetName( + xenos::TextureFormat(uint32_t(memexport_stream.format)))); + return false; + } + uint32_t memexport_size_dwords = + memexport_stream.index_count * memexport_format_size; + bool memexport_range_reused = false; + for (uint32_t i = 0; i < memexport_range_count_; ++i) { + MemExportRange& memexport_range = memexport_ranges_[i]; + if (memexport_range.base_address_dwords == + memexport_stream.base_address) { + memexport_range.size_dwords = + std::max(memexport_range.size_dwords, memexport_size_dwords); + memexport_range_reused = true; + break; + } + } + if (!memexport_range_reused) { + MemExportRange& memexport_range = + memexport_ranges_[memexport_range_count_++]; + memexport_range.base_address_dwords = memexport_stream.base_address; + memexport_range.size_dwords = memexport_size_dwords; + } + } + } + for (uint32_t i = 0; i < memexport_range_count_; ++i) { + const MemExportRange& memexport_range = memexport_ranges_[i]; + if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2, + memexport_range.size_dwords << 2)) { + XELOGE( + "Failed to request memexport stream at 0x{:08X} (size {}) in the " + "shared memory", + memexport_range.base_address_dwords << 2, + memexport_range.size_dwords << 2); + return false; + } + } + retflag = false; + return {}; +} +XE_NOINLINE +XE_COLD +void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() { + // Make sure this memexporting draw is ordered with other work using shared + // memory as a UAV. + // TODO(Triang3l): Find some PM4 command that can be used for indication of + // when memexports should be awaited? + shared_memory_->MarkUAVWritesCommitNeeded(); + // Invalidate textures in memexported memory and watch for changes. + for (uint32_t i = 0; i < memexport_range_count_; ++i) { + const MemExportRange& memexport_range = memexport_ranges_[i]; + shared_memory_->RangeWrittenByGpu(memexport_range.base_address_dwords << 2, + memexport_range.size_dwords << 2, false); + } + if (cvars::d3d12_readback_memexport) { + // Read the exported data on the CPU. 
+ uint32_t memexport_total_size = 0; + for (uint32_t i = 0; i < memexport_range_count_; ++i) { + memexport_total_size += memexport_ranges_[i].size_dwords << 2; + } + if (memexport_total_size != 0) { + ID3D12Resource* readback_buffer = + RequestReadbackBuffer(memexport_total_size); + if (readback_buffer != nullptr) { + shared_memory_->UseAsCopySource(); + SubmitBarriers(); + ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer(); + uint32_t readback_buffer_offset = 0; + for (uint32_t i = 0; i < memexport_range_count_; ++i) { + const MemExportRange& memexport_range = memexport_ranges_[i]; + uint32_t memexport_range_size = memexport_range.size_dwords << 2; + deferred_command_list_.D3DCopyBufferRegion( + readback_buffer, readback_buffer_offset, shared_memory_buffer, + memexport_range.base_address_dwords << 2, memexport_range_size); + readback_buffer_offset += memexport_range_size; + } + if (AwaitAllQueueOperationsCompletion()) { + D3D12_RANGE readback_range; + readback_range.Begin = 0; + readback_range.End = memexport_total_size; + void* readback_mapping; + if (SUCCEEDED(readback_buffer->Map(0, &readback_range, + &readback_mapping))) { + const uint32_t* readback_dwords = + reinterpret_cast(readback_mapping); + for (uint32_t i = 0; i < memexport_range_count_; ++i) { + const MemExportRange& memexport_range = memexport_ranges_[i]; + std::memcpy(memory_->TranslatePhysical( + memexport_range.base_address_dwords << 2), + readback_dwords, memexport_range.size_dwords << 2); + readback_dwords += memexport_range.size_dwords; } + D3D12_RANGE readback_write_range = {}; + readback_buffer->Unmap(0, &readback_write_range); } } } } } - - return true; } void D3D12CommandProcessor::InitializeTrace() { @@ -3065,23 +3102,33 @@ bool D3D12CommandProcessor::IssueCopy() { if (!BeginSubmission(true)) { return false; } - uint32_t written_address, written_length; - if (!render_target_cache_->Resolve(*memory_, *shared_memory_, *texture_cache_, - written_address, written_length)) { - return false; + + if (!cvars::d3d12_readback_resolve) { + uint32_t written_address, written_length; + return render_target_cache_->Resolve(*memory_, *shared_memory_, + *texture_cache_, written_address, + written_length); + } else { + return IssueCopy_ReadbackResolvePath(); } - if (cvars::d3d12_readback_resolve && - !texture_cache_->IsDrawResolutionScaled() && written_length) { - // Read the resolved data on the CPU. - ID3D12Resource* readback_buffer = RequestReadbackBuffer(written_length); - if (readback_buffer != nullptr) { - shared_memory_->UseAsCopySource(); - SubmitBarriers(); - ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer(); - deferred_command_list_.D3DCopyBufferRegion( - readback_buffer, 0, shared_memory_buffer, written_address, - written_length); - if (AwaitAllQueueOperationsCompletion()) { + return true; +} +XE_NOINLINE +bool D3D12CommandProcessor::IssueCopy_ReadbackResolvePath() { + uint32_t written_address, written_length; + if (render_target_cache_->Resolve(*memory_, *shared_memory_, *texture_cache_, + written_address, written_length)) { + if (!texture_cache_->IsDrawResolutionScaled() && written_length) { + // Read the resolved data on the CPU. 
+ ID3D12Resource* readback_buffer = RequestReadbackBuffer(written_length); + if (readback_buffer != nullptr) { + shared_memory_->UseAsCopySource(); + SubmitBarriers(); + ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer(); + deferred_command_list_.D3DCopyBufferRegion( + readback_buffer, 0, shared_memory_buffer, written_address, + written_length); + if (AwaitAllQueueOperationsCompletion()) { #if 1 D3D12_RANGE readback_range; readback_range.Begin = 0; @@ -3099,23 +3146,25 @@ bool D3D12CommandProcessor::IssueCopy() { } #else - dma::XeDMAJob job{}; - job.destination = memory_->TranslatePhysical(written_address); - job.size = written_length; - job.source = nullptr; - job.userdata1 = (void*)readback_buffer; - job.precall = DmaPrefunc; - job.postcall = DmaPostfunc; + dma::XeDMAJob job{}; + job.destination = memory_->TranslatePhysical(written_address); + job.size = written_length; + job.source = nullptr; + job.userdata1 = (void*)readback_buffer; + job.precall = DmaPrefunc; + job.postcall = DmaPostfunc; - readback_available_ = GetDMAC()->PushDMAJob(&job); + readback_available_ = GetDMAC()->PushDMAJob(&job); #endif + } } } + } else { + return false; } return true; } - void D3D12CommandProcessor::CheckSubmissionFence(uint64_t await_submission) { if (await_submission >= submission_current_) { if (submission_open_) { @@ -4707,195 +4756,11 @@ bool D3D12CommandProcessor::UpdateBindings( ~(1u << kRootParameter_Bindless_DescriptorIndicesPixel); } } else { - // - // Bindful descriptors path. - // - - // See what descriptors need to be updated. - // Samplers have already been checked. - bool write_textures_vertex = - texture_count_vertex && - (!bindful_textures_written_vertex_ || - current_texture_layout_uid_vertex_ != texture_layout_uid_vertex || - !texture_cache_->AreActiveTextureSRVKeysUpToDate( - current_texture_srv_keys_vertex_.data(), textures_vertex.data(), - texture_count_vertex)); - bool write_textures_pixel = - texture_count_pixel && - (!bindful_textures_written_pixel_ || - current_texture_layout_uid_pixel_ != texture_layout_uid_pixel || - !texture_cache_->AreActiveTextureSRVKeysUpToDate( - current_texture_srv_keys_pixel_.data(), textures_pixel->data(), - texture_count_pixel)); - bool write_samplers_vertex = - sampler_count_vertex && !bindful_samplers_written_vertex_; - bool write_samplers_pixel = - sampler_count_pixel && !bindful_samplers_written_pixel_; - bool edram_rov_used = render_target_cache_->GetPath() == - RenderTargetCache::Path::kPixelShaderInterlock; - - // Allocate the descriptors. - size_t view_count_partial_update = 0; - if (write_textures_vertex) { - view_count_partial_update += texture_count_vertex; - } - if (write_textures_pixel) { - view_count_partial_update += texture_count_pixel; - } - // All the constants + shared memory SRV and UAV + textures. - size_t view_count_full_update = - 2 + texture_count_vertex + texture_count_pixel; - if (edram_rov_used) { - // + EDRAM UAV. 
- ++view_count_full_update; - } - D3D12_CPU_DESCRIPTOR_HANDLE view_cpu_handle; - D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle; - uint32_t descriptor_size_view = provider.GetViewDescriptorSize(); - uint64_t view_heap_index = RequestViewBindfulDescriptors( - draw_view_bindful_heap_index_, uint32_t(view_count_partial_update), - uint32_t(view_count_full_update), view_cpu_handle, view_gpu_handle); - if (view_heap_index == - ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { - XELOGE("Failed to allocate view descriptors"); - return false; - } - size_t sampler_count_partial_update = 0; - if (write_samplers_vertex) { - sampler_count_partial_update += sampler_count_vertex; - } - if (write_samplers_pixel) { - sampler_count_partial_update += sampler_count_pixel; - } - D3D12_CPU_DESCRIPTOR_HANDLE sampler_cpu_handle = {}; - D3D12_GPU_DESCRIPTOR_HANDLE sampler_gpu_handle = {}; - uint32_t descriptor_size_sampler = provider.GetSamplerDescriptorSize(); - uint64_t sampler_heap_index = - ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid; - if (sampler_count_vertex != 0 || sampler_count_pixel != 0) { - sampler_heap_index = RequestSamplerBindfulDescriptors( - draw_sampler_bindful_heap_index_, - uint32_t(sampler_count_partial_update), - uint32_t(sampler_count_vertex + sampler_count_pixel), - sampler_cpu_handle, sampler_gpu_handle); - if (sampler_heap_index == - ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { - XELOGE("Failed to allocate sampler descriptors"); - return false; - } - } - if (draw_view_bindful_heap_index_ != view_heap_index) { - // Need to update all view descriptors. - write_textures_vertex = texture_count_vertex != 0; - write_textures_pixel = texture_count_pixel != 0; - bindful_textures_written_vertex_ = false; - bindful_textures_written_pixel_ = false; - // If updating fully, write the shared memory SRV and UAV descriptors and, - // if needed, the EDRAM descriptor. - gpu_handle_shared_memory_and_edram_ = view_gpu_handle; - shared_memory_->WriteRawSRVDescriptor(view_cpu_handle); - view_cpu_handle.ptr += descriptor_size_view; - view_gpu_handle.ptr += descriptor_size_view; - shared_memory_->WriteRawUAVDescriptor(view_cpu_handle); - view_cpu_handle.ptr += descriptor_size_view; - view_gpu_handle.ptr += descriptor_size_view; - if (edram_rov_used) { - render_target_cache_->WriteEdramUintPow2UAVDescriptor(view_cpu_handle, - 2); - view_cpu_handle.ptr += descriptor_size_view; - view_gpu_handle.ptr += descriptor_size_view; - } - current_graphics_root_up_to_date_ &= - ~(1u << kRootParameter_Bindful_SharedMemoryAndEdram); - } - if (sampler_heap_index != - ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid && - draw_sampler_bindful_heap_index_ != sampler_heap_index) { - write_samplers_vertex = sampler_count_vertex != 0; - write_samplers_pixel = sampler_count_pixel != 0; - bindful_samplers_written_vertex_ = false; - bindful_samplers_written_pixel_ = false; - } - - // Write the descriptors. 
- if (write_textures_vertex) { - assert_true(current_graphics_root_bindful_extras_.textures_vertex != - RootBindfulExtraParameterIndices::kUnavailable); - gpu_handle_textures_vertex_ = view_gpu_handle; - for (size_t i = 0; i < texture_count_vertex; ++i) { - texture_cache_->WriteActiveTextureBindfulSRV(textures_vertex[i], - view_cpu_handle); - view_cpu_handle.ptr += descriptor_size_view; - view_gpu_handle.ptr += descriptor_size_view; - } - current_texture_layout_uid_vertex_ = texture_layout_uid_vertex; - current_texture_srv_keys_vertex_.resize( - std::max(current_texture_srv_keys_vertex_.size(), - size_t(texture_count_vertex))); - texture_cache_->WriteActiveTextureSRVKeys( - current_texture_srv_keys_vertex_.data(), textures_vertex.data(), - texture_count_vertex); - bindful_textures_written_vertex_ = true; - current_graphics_root_up_to_date_ &= - ~(1u << current_graphics_root_bindful_extras_.textures_vertex); - } - if (write_textures_pixel) { - assert_true(current_graphics_root_bindful_extras_.textures_pixel != - RootBindfulExtraParameterIndices::kUnavailable); - gpu_handle_textures_pixel_ = view_gpu_handle; - for (size_t i = 0; i < texture_count_pixel; ++i) { - texture_cache_->WriteActiveTextureBindfulSRV((*textures_pixel)[i], - view_cpu_handle); - view_cpu_handle.ptr += descriptor_size_view; - view_gpu_handle.ptr += descriptor_size_view; - } - current_texture_layout_uid_pixel_ = texture_layout_uid_pixel; - current_texture_srv_keys_pixel_.resize(std::max( - current_texture_srv_keys_pixel_.size(), size_t(texture_count_pixel))); - texture_cache_->WriteActiveTextureSRVKeys( - current_texture_srv_keys_pixel_.data(), textures_pixel->data(), - texture_count_pixel); - bindful_textures_written_pixel_ = true; - current_graphics_root_up_to_date_ &= - ~(1u << current_graphics_root_bindful_extras_.textures_pixel); - } - if (write_samplers_vertex) { - assert_true(current_graphics_root_bindful_extras_.samplers_vertex != - RootBindfulExtraParameterIndices::kUnavailable); - gpu_handle_samplers_vertex_ = sampler_gpu_handle; - for (size_t i = 0; i < sampler_count_vertex; ++i) { - texture_cache_->WriteSampler(current_samplers_vertex_[i], - sampler_cpu_handle); - sampler_cpu_handle.ptr += descriptor_size_sampler; - sampler_gpu_handle.ptr += descriptor_size_sampler; - } - // Current samplers have already been updated. - bindful_samplers_written_vertex_ = true; - current_graphics_root_up_to_date_ &= - ~(1u << current_graphics_root_bindful_extras_.samplers_vertex); - } - if (write_samplers_pixel) { - assert_true(current_graphics_root_bindful_extras_.samplers_pixel != - RootBindfulExtraParameterIndices::kUnavailable); - gpu_handle_samplers_pixel_ = sampler_gpu_handle; - for (size_t i = 0; i < sampler_count_pixel; ++i) { - texture_cache_->WriteSampler(current_samplers_pixel_[i], - sampler_cpu_handle); - sampler_cpu_handle.ptr += descriptor_size_sampler; - sampler_gpu_handle.ptr += descriptor_size_sampler; - } - // Current samplers have already been updated. - bindful_samplers_written_pixel_ = true; - current_graphics_root_up_to_date_ &= - ~(1u << current_graphics_root_bindful_extras_.samplers_pixel); - } - - // Wrote new descriptors on the current page. 
- draw_view_bindful_heap_index_ = view_heap_index; - if (sampler_heap_index != - ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { - draw_sampler_bindful_heap_index_ = sampler_heap_index; - } + bool retflag; + bool retval = UpdateBindings_BindfulPath( + texture_layout_uid_vertex, textures_vertex, texture_layout_uid_pixel, + textures_pixel, sampler_count_vertex, sampler_count_pixel, retflag); + if (retflag) return retval; } // Update the root parameters. @@ -4967,47 +4832,255 @@ bool D3D12CommandProcessor::UpdateBindings( << kRootParameter_Bindless_ViewHeap; } } else { - if (!(current_graphics_root_up_to_date_ & - (1u << kRootParameter_Bindful_SharedMemoryAndEdram))) { - deferred_command_list_.D3DSetGraphicsRootDescriptorTable( - kRootParameter_Bindful_SharedMemoryAndEdram, - gpu_handle_shared_memory_and_edram_); - current_graphics_root_up_to_date_ |= - 1u << kRootParameter_Bindful_SharedMemoryAndEdram; - } - uint32_t extra_index; - extra_index = current_graphics_root_bindful_extras_.textures_pixel; - if (extra_index != RootBindfulExtraParameterIndices::kUnavailable && - !(current_graphics_root_up_to_date_ & (1u << extra_index))) { - deferred_command_list_.D3DSetGraphicsRootDescriptorTable( - extra_index, gpu_handle_textures_pixel_); - current_graphics_root_up_to_date_ |= 1u << extra_index; - } - extra_index = current_graphics_root_bindful_extras_.samplers_pixel; - if (extra_index != RootBindfulExtraParameterIndices::kUnavailable && - !(current_graphics_root_up_to_date_ & (1u << extra_index))) { - deferred_command_list_.D3DSetGraphicsRootDescriptorTable( - extra_index, gpu_handle_samplers_pixel_); - current_graphics_root_up_to_date_ |= 1u << extra_index; - } - extra_index = current_graphics_root_bindful_extras_.textures_vertex; - if (extra_index != RootBindfulExtraParameterIndices::kUnavailable && - !(current_graphics_root_up_to_date_ & (1u << extra_index))) { - deferred_command_list_.D3DSetGraphicsRootDescriptorTable( - extra_index, gpu_handle_textures_vertex_); - current_graphics_root_up_to_date_ |= 1u << extra_index; - } - extra_index = current_graphics_root_bindful_extras_.samplers_vertex; - if (extra_index != RootBindfulExtraParameterIndices::kUnavailable && - !(current_graphics_root_up_to_date_ & (1u << extra_index))) { - deferred_command_list_.D3DSetGraphicsRootDescriptorTable( - extra_index, gpu_handle_samplers_vertex_); - current_graphics_root_up_to_date_ |= 1u << extra_index; - } + UpdateBindings_UpdateRootBindful(); } return true; } +XE_COLD +XE_NOINLINE +void D3D12CommandProcessor::UpdateBindings_UpdateRootBindful() { + if (!(current_graphics_root_up_to_date_ & + (1u << kRootParameter_Bindful_SharedMemoryAndEdram))) { + deferred_command_list_.D3DSetGraphicsRootDescriptorTable( + kRootParameter_Bindful_SharedMemoryAndEdram, + gpu_handle_shared_memory_and_edram_); + current_graphics_root_up_to_date_ |= + 1u << kRootParameter_Bindful_SharedMemoryAndEdram; + } + uint32_t extra_index; + extra_index = current_graphics_root_bindful_extras_.textures_pixel; + if (extra_index != RootBindfulExtraParameterIndices::kUnavailable && + !(current_graphics_root_up_to_date_ & (1u << extra_index))) { + deferred_command_list_.D3DSetGraphicsRootDescriptorTable( + extra_index, gpu_handle_textures_pixel_); + current_graphics_root_up_to_date_ |= 1u << extra_index; + } + extra_index = current_graphics_root_bindful_extras_.samplers_pixel; + if (extra_index != RootBindfulExtraParameterIndices::kUnavailable && + !(current_graphics_root_up_to_date_ & (1u << extra_index))) { + 
deferred_command_list_.D3DSetGraphicsRootDescriptorTable( + extra_index, gpu_handle_samplers_pixel_); + current_graphics_root_up_to_date_ |= 1u << extra_index; + } + extra_index = current_graphics_root_bindful_extras_.textures_vertex; + if (extra_index != RootBindfulExtraParameterIndices::kUnavailable && + !(current_graphics_root_up_to_date_ & (1u << extra_index))) { + deferred_command_list_.D3DSetGraphicsRootDescriptorTable( + extra_index, gpu_handle_textures_vertex_); + current_graphics_root_up_to_date_ |= 1u << extra_index; + } + extra_index = current_graphics_root_bindful_extras_.samplers_vertex; + if (extra_index != RootBindfulExtraParameterIndices::kUnavailable && + !(current_graphics_root_up_to_date_ & (1u << extra_index))) { + deferred_command_list_.D3DSetGraphicsRootDescriptorTable( + extra_index, gpu_handle_samplers_vertex_); + current_graphics_root_up_to_date_ |= 1u << extra_index; + } +} +XE_NOINLINE +XE_COLD +bool D3D12CommandProcessor::UpdateBindings_BindfulPath( + const size_t texture_layout_uid_vertex, + const std::vector& textures_vertex, + const size_t texture_layout_uid_pixel, + const std::vector* textures_pixel, + const size_t sampler_count_vertex, const size_t sampler_count_pixel, + bool& retflag) { + retflag = true; + auto& provider = this->GetD3D12Provider(); + size_t texture_count_pixel = textures_pixel->size(); + size_t texture_count_vertex = textures_vertex.size(); + // + // Bindful descriptors path. + // + + // See what descriptors need to be updated. + // Samplers have already been checked. + bool write_textures_vertex = + texture_count_vertex && + (!bindful_textures_written_vertex_ || + current_texture_layout_uid_vertex_ != texture_layout_uid_vertex || + !texture_cache_->AreActiveTextureSRVKeysUpToDate( + current_texture_srv_keys_vertex_.data(), textures_vertex.data(), + texture_count_vertex)); + bool write_textures_pixel = + texture_count_pixel && + (!bindful_textures_written_pixel_ || + current_texture_layout_uid_pixel_ != texture_layout_uid_pixel || + !texture_cache_->AreActiveTextureSRVKeysUpToDate( + current_texture_srv_keys_pixel_.data(), textures_pixel->data(), + texture_count_pixel)); + bool write_samplers_vertex = + sampler_count_vertex && !bindful_samplers_written_vertex_; + bool write_samplers_pixel = + sampler_count_pixel && !bindful_samplers_written_pixel_; + bool edram_rov_used = render_target_cache_->GetPath() == + RenderTargetCache::Path::kPixelShaderInterlock; + + // Allocate the descriptors. + size_t view_count_partial_update = 0; + if (write_textures_vertex) { + view_count_partial_update += texture_count_vertex; + } + if (write_textures_pixel) { + view_count_partial_update += texture_count_pixel; + } + // All the constants + shared memory SRV and UAV + textures. + size_t view_count_full_update = + 2 + texture_count_vertex + texture_count_pixel; + if (edram_rov_used) { + // + EDRAM UAV. 
+ ++view_count_full_update; + } + D3D12_CPU_DESCRIPTOR_HANDLE view_cpu_handle; + D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle; + uint32_t descriptor_size_view = provider.GetViewDescriptorSize(); + uint64_t view_heap_index = RequestViewBindfulDescriptors( + draw_view_bindful_heap_index_, uint32_t(view_count_partial_update), + uint32_t(view_count_full_update), view_cpu_handle, view_gpu_handle); + if (view_heap_index == + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { + XELOGE("Failed to allocate view descriptors"); + return false; + } + size_t sampler_count_partial_update = 0; + if (write_samplers_vertex) { + sampler_count_partial_update += sampler_count_vertex; + } + if (write_samplers_pixel) { + sampler_count_partial_update += sampler_count_pixel; + } + D3D12_CPU_DESCRIPTOR_HANDLE sampler_cpu_handle = {}; + D3D12_GPU_DESCRIPTOR_HANDLE sampler_gpu_handle = {}; + uint32_t descriptor_size_sampler = provider.GetSamplerDescriptorSize(); + uint64_t sampler_heap_index = + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid; + if (sampler_count_vertex != 0 || sampler_count_pixel != 0) { + sampler_heap_index = RequestSamplerBindfulDescriptors( + draw_sampler_bindful_heap_index_, + uint32_t(sampler_count_partial_update), + uint32_t(sampler_count_vertex + sampler_count_pixel), + sampler_cpu_handle, sampler_gpu_handle); + if (sampler_heap_index == + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { + XELOGE("Failed to allocate sampler descriptors"); + return false; + } + } + if (draw_view_bindful_heap_index_ != view_heap_index) { + // Need to update all view descriptors. + write_textures_vertex = texture_count_vertex != 0; + write_textures_pixel = texture_count_pixel != 0; + bindful_textures_written_vertex_ = false; + bindful_textures_written_pixel_ = false; + // If updating fully, write the shared memory SRV and UAV descriptors and, + // if needed, the EDRAM descriptor. + gpu_handle_shared_memory_and_edram_ = view_gpu_handle; + shared_memory_->WriteRawSRVDescriptor(view_cpu_handle); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + shared_memory_->WriteRawUAVDescriptor(view_cpu_handle); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + if (edram_rov_used) { + render_target_cache_->WriteEdramUintPow2UAVDescriptor(view_cpu_handle, 2); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + } + current_graphics_root_up_to_date_ &= + ~(1u << kRootParameter_Bindful_SharedMemoryAndEdram); + } + if (sampler_heap_index != + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid && + draw_sampler_bindful_heap_index_ != sampler_heap_index) { + write_samplers_vertex = sampler_count_vertex != 0; + write_samplers_pixel = sampler_count_pixel != 0; + bindful_samplers_written_vertex_ = false; + bindful_samplers_written_pixel_ = false; + } + + // Write the descriptors. 
+ if (write_textures_vertex) { + assert_true(current_graphics_root_bindful_extras_.textures_vertex != + RootBindfulExtraParameterIndices::kUnavailable); + gpu_handle_textures_vertex_ = view_gpu_handle; + for (size_t i = 0; i < texture_count_vertex; ++i) { + texture_cache_->WriteActiveTextureBindfulSRV(textures_vertex[i], + view_cpu_handle); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + } + current_texture_layout_uid_vertex_ = texture_layout_uid_vertex; + current_texture_srv_keys_vertex_.resize(std::max( + current_texture_srv_keys_vertex_.size(), size_t(texture_count_vertex))); + texture_cache_->WriteActiveTextureSRVKeys( + current_texture_srv_keys_vertex_.data(), textures_vertex.data(), + texture_count_vertex); + bindful_textures_written_vertex_ = true; + current_graphics_root_up_to_date_ &= + ~(1u << current_graphics_root_bindful_extras_.textures_vertex); + } + if (write_textures_pixel) { + assert_true(current_graphics_root_bindful_extras_.textures_pixel != + RootBindfulExtraParameterIndices::kUnavailable); + gpu_handle_textures_pixel_ = view_gpu_handle; + for (size_t i = 0; i < texture_count_pixel; ++i) { + texture_cache_->WriteActiveTextureBindfulSRV((*textures_pixel)[i], + view_cpu_handle); + view_cpu_handle.ptr += descriptor_size_view; + view_gpu_handle.ptr += descriptor_size_view; + } + current_texture_layout_uid_pixel_ = texture_layout_uid_pixel; + current_texture_srv_keys_pixel_.resize(std::max( + current_texture_srv_keys_pixel_.size(), size_t(texture_count_pixel))); + texture_cache_->WriteActiveTextureSRVKeys( + current_texture_srv_keys_pixel_.data(), textures_pixel->data(), + texture_count_pixel); + bindful_textures_written_pixel_ = true; + current_graphics_root_up_to_date_ &= + ~(1u << current_graphics_root_bindful_extras_.textures_pixel); + } + if (write_samplers_vertex) { + assert_true(current_graphics_root_bindful_extras_.samplers_vertex != + RootBindfulExtraParameterIndices::kUnavailable); + gpu_handle_samplers_vertex_ = sampler_gpu_handle; + for (size_t i = 0; i < sampler_count_vertex; ++i) { + texture_cache_->WriteSampler(current_samplers_vertex_[i], + sampler_cpu_handle); + sampler_cpu_handle.ptr += descriptor_size_sampler; + sampler_gpu_handle.ptr += descriptor_size_sampler; + } + // Current samplers have already been updated. + bindful_samplers_written_vertex_ = true; + current_graphics_root_up_to_date_ &= + ~(1u << current_graphics_root_bindful_extras_.samplers_vertex); + } + if (write_samplers_pixel) { + assert_true(current_graphics_root_bindful_extras_.samplers_pixel != + RootBindfulExtraParameterIndices::kUnavailable); + gpu_handle_samplers_pixel_ = sampler_gpu_handle; + for (size_t i = 0; i < sampler_count_pixel; ++i) { + texture_cache_->WriteSampler(current_samplers_pixel_[i], + sampler_cpu_handle); + sampler_cpu_handle.ptr += descriptor_size_sampler; + sampler_gpu_handle.ptr += descriptor_size_sampler; + } + // Current samplers have already been updated. + bindful_samplers_written_pixel_ = true; + current_graphics_root_up_to_date_ &= + ~(1u << current_graphics_root_bindful_extras_.samplers_pixel); + } + + // Wrote new descriptors on the current page. 
+ draw_view_bindful_heap_index_ = view_heap_index; + if (sampler_heap_index != + ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { + draw_sampler_bindful_heap_index_ = sampler_heap_index; + } + retflag = false; + return {}; +} uint32_t D3D12CommandProcessor::GetSupportedMemExportFormatSize( xenos::ColorFormat format) { @@ -5043,7 +5116,7 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { if (size == 0) { return nullptr; } -#if 0 +#if 1 if (readback_available_) { GetDMAC()->WaitJobDone(readback_available_); readback_available_ = 0; diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 37d048d29..ba2c17a82 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -45,7 +45,10 @@ namespace xe { namespace gpu { namespace d3d12 { - +struct MemExportRange { + uint32_t base_address_dwords; + uint32_t size_dwords; +}; class D3D12CommandProcessor final : public CommandProcessor { public: #include "../pm4_command_processor_declare.h" @@ -287,8 +290,21 @@ class D3D12CommandProcessor final : public CommandProcessor { bool IssueDraw(xenos::PrimitiveType primitive_type, uint32_t index_count, IndexBufferInfo* index_buffer_info, bool major_mode_explicit) override; + XE_COLD + XE_NOINLINE + bool HandleMemexportGuestDMA(ID3D12Resource*& scratch_index_buffer, + D3D12_INDEX_BUFFER_VIEW& index_buffer_view, + uint32_t guest_index_base, + bool& retflag); + XE_NOINLINE + XE_COLD + bool GatherMemexportRangesAndMakeResident(bool& retflag); + XE_NOINLINE + XE_COLD + void HandleMemexportDrawOrdering_AndReadback(); bool IssueCopy() override; - + XE_NOINLINE + bool IssueCopy_ReadbackResolvePath(); void InitializeTrace() override; private: @@ -363,6 +379,8 @@ class D3D12CommandProcessor final : public CommandProcessor { }; // Gets the indices of optional root parameters. Returns the total parameter // count. 
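// The UpdateBindings_BindfulPath / UpdateBindings_UpdateRootBindful split above
// follows a cold-path-extraction pattern. A standalone sketch of the idea
// (illustrative names only, not part of this patch): the rarely taken body is
// moved into a separate non-inlined function so the hot caller stays small and
// icache-friendly, and `retflag` tells the caller whether the helper already
// produced the caller's return value.
bool RareSlowPath(int input, bool& retflag) {  // would carry XE_NOINLINE/XE_COLD
  retflag = true;
  if (input < 0) {
    return false;  // caller must return this failure immediately
  }
  // ... large, rarely executed body ...
  retflag = false;  // caller should continue with its normal tail
  return {};
}

bool HotCaller(int input) {
  if (input != 0) {  // uncommon case
    bool retflag;
    bool retval = RareSlowPath(input, retflag);
    if (retflag) return retval;
  }
  // common fast path
  return true;
}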
+ XE_NOINLINE + XE_COLD static uint32_t GetRootBindfulExtraParameterIndices( const DxbcShader* vertex_shader, const DxbcShader* pixel_shader, RootBindfulExtraParameterIndices& indices_out); @@ -437,6 +455,18 @@ class D3D12CommandProcessor final : public CommandProcessor { bool UpdateBindings(const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader, ID3D12RootSignature* root_signature); + XE_COLD + XE_NOINLINE + void UpdateBindings_UpdateRootBindful(); + XE_NOINLINE + XE_COLD + bool UpdateBindings_BindfulPath( + const size_t texture_layout_uid_vertex, + const std::vector& textures_vertex, + const size_t texture_layout_uid_pixel, + const std::vector* textures_pixel, + const size_t sampler_count_vertex, const size_t sampler_count_pixel, + bool& retflag); // Returns dword count for one element for a memexport format, or 0 if it's // not supported by the D3D12 command processor (if it's smaller that 1 dword, @@ -743,6 +773,9 @@ class D3D12CommandProcessor final : public CommandProcessor { draw_util::GetViewportInfoArgs previous_viewport_info_args_; draw_util::ViewportInfo previous_viewport_info_; + // scratch memexport data + MemExportRange memexport_ranges_[512]; + uint32_t memexport_range_count_ = 0; }; } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/deferred_command_list.cc b/src/xenia/gpu/d3d12/deferred_command_list.cc index c27c8b226..0d647331f 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.cc +++ b/src/xenia/gpu/d3d12/deferred_command_list.cc @@ -266,22 +266,9 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, void* DeferredCommandList::WriteCommand(Command command, size_t arguments_size_bytes) { - size_t arguments_size_elements = round_up(arguments_size_bytes, sizeof(uintmax_t), false); - //(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t); - #if 0 - size_t offset = command_stream_.size(); - command_stream_.resize(offset + kCommandHeaderSizeElements + - arguments_size_elements); - CommandHeader& header = - *reinterpret_cast(command_stream_.data() + offset); - header.command = command; - header.arguments_size_elements = uint32_t(arguments_size_elements); - return command_stream_.data() + (offset + kCommandHeaderSizeElements); - #else - size_t offset = command_stream_.size(); constexpr size_t kCommandHeaderSizeBytes = kCommandHeaderSizeElements * sizeof(uintmax_t); @@ -290,9 +277,9 @@ void* DeferredCommandList::WriteCommand(Command command, CommandHeader& header = *reinterpret_cast(command_stream_.data() + offset); header.command = command; - header.arguments_size_elements = uint32_t(arguments_size_elements) / sizeof(uintmax_t); + header.arguments_size_elements = + uint32_t(arguments_size_elements) / sizeof(uintmax_t); return command_stream_.data() + (offset + kCommandHeaderSizeBytes); - #endif } } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index d9914e566..29501b299 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -183,7 +183,7 @@ void PipelineCache::Shutdown() { // creating them. if (!creation_threads_.empty()) { { - std::lock_guard lock(creation_request_lock_); + std::lock_guard lock(creation_request_lock_); creation_threads_shutdown_from_ = 0; } creation_request_cond_.notify_all(); @@ -681,7 +681,7 @@ void PipelineCache::InitializeShaderStorage( if (!creation_threads_.empty()) { // Submit the pipeline for creation to any available thread. 
{ - std::lock_guard lock(creation_request_lock_); + std::lock_guard lock(creation_request_lock_); creation_queue_.push_back(new_pipeline); } creation_request_cond_.notify_one(); @@ -695,7 +695,7 @@ void PipelineCache::InitializeShaderStorage( CreateQueuedPipelinesOnProcessorThread(); if (creation_threads_.size() > creation_thread_original_count) { { - std::lock_guard lock(creation_request_lock_); + std::lock_guard lock(creation_request_lock_); creation_threads_shutdown_from_ = creation_thread_original_count; // Assuming the queue is empty because of // CreateQueuedPipelinesOnProcessorThread. @@ -708,7 +708,7 @@ void PipelineCache::InitializeShaderStorage( bool await_creation_completion_event; { // Cleanup so additional threads can be created later again. - std::lock_guard lock(creation_request_lock_); + std::lock_guard lock(creation_request_lock_); creation_threads_shutdown_from_ = SIZE_MAX; // If the invocation is blocking, all the shader storage // initialization is expected to be done before proceeding, to avoid @@ -813,7 +813,7 @@ void PipelineCache::EndSubmission() { // Await creation of all queued pipelines. bool await_creation_completion_event; { - std::lock_guard lock(creation_request_lock_); + std::lock_guard lock(creation_request_lock_); // Assuming the creation queue is already empty (because the processor // thread also worked on creating the leftover pipelines), so only check // if there are threads with pipelines currently being created. @@ -834,7 +834,7 @@ bool PipelineCache::IsCreatingPipelines() { if (creation_threads_.empty()) { return false; } - std::lock_guard lock(creation_request_lock_); + std::lock_guard lock(creation_request_lock_); return !creation_queue_.empty() || creation_threads_busy_ != 0; } @@ -1076,7 +1076,7 @@ bool PipelineCache::ConfigurePipeline( if (!creation_threads_.empty()) { // Submit the pipeline for creation to any available thread. { - std::lock_guard lock(creation_request_lock_); + std::lock_guard lock(creation_request_lock_); creation_queue_.push_back(new_pipeline); } creation_request_cond_.notify_one(); @@ -3314,7 +3314,7 @@ void PipelineCache::CreationThread(size_t thread_index) { // Check if need to shut down or set the completion event and dequeue the // pipeline if there is any. { - std::unique_lock lock(creation_request_lock_); + std::unique_lock lock(creation_request_lock_); if (thread_index >= creation_threads_shutdown_from_ || creation_queue_.empty()) { if (creation_completion_set_event_ && creation_threads_busy_ == 0) { @@ -3345,7 +3345,7 @@ void PipelineCache::CreationThread(size_t thread_index) { // completion event if needed (at the next iteration, or in some other // thread). { - std::lock_guard lock(creation_request_lock_); + std::lock_guard lock(creation_request_lock_); --creation_threads_busy_; } } @@ -3356,7 +3356,7 @@ void PipelineCache::CreateQueuedPipelinesOnProcessorThread() { while (true) { Pipeline* pipeline_to_create; { - std::lock_guard lock(creation_request_lock_); + std::lock_guard lock(creation_request_lock_); if (creation_queue_.empty()) { break; } diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index 37e73cae4..43e528d35 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -403,8 +403,8 @@ class PipelineCache { // Pipeline creation threads. 
void CreationThread(size_t thread_index); void CreateQueuedPipelinesOnProcessorThread(); - std::mutex creation_request_lock_; - std::condition_variable creation_request_cond_; + xe_mutex creation_request_lock_; + std::condition_variable_any creation_request_cond_; // Protected with creation_request_lock_, notify_one creation_request_cond_ // when set. std::deque creation_queue_; diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 24b1eefdc..5c62c50c3 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -650,7 +650,8 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs, } return normalized_color_mask; } - +XE_NOINLINE +XE_NOALIAS xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth) { @@ -737,7 +738,7 @@ const ResolveCopyShaderInfo {"Resolve Copy Full 64bpp", true, 2, 4, 5, 3}, {"Resolve Copy Full 128bpp", true, 2, 4, 4, 3}, }; - +XE_MSVC_OPTIMIZE_SMALL() bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, TraceWriter& trace_writer, uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y, @@ -869,7 +870,8 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, y1 = y0 + int32_t(xenos::kMaxResolveSize); } // fails in forza horizon 1 - assert_true(x0 < x1 && y0 < y1); + //x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100 + assert_true(x0 <= x1 && y0 <= y1); if (x0 >= x1 || y0 >= y1) { XELOGE("Resolve region is empty"); return false; @@ -1108,7 +1110,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32; info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32; info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32; - + #if 0 XELOGD( "Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially " "modified memory range 0x{:08X} to 0x{:08X})", @@ -1119,10 +1121,10 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, xenos::ColorRenderTargetFormat(color_edram_info.format)), FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start, copy_dest_extent_end); - + #endif return true; } - +XE_MSVC_OPTIMIZE_REVERT() ResolveCopyShaderIndex ResolveInfo::GetCopyShader( uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y, ResolveCopyShaderConstants& constants_out, uint32_t& group_count_x_out, diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index 420bafcf2..15c014520 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -475,6 +475,8 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA( // To avoid passing values that the shader won't understand (even though // Direct3D 9 shouldn't pass them anyway). 
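// The pipeline cache above swaps std::mutex/std::condition_variable for
// xe_mutex/std::condition_variable_any. That works because
// std::condition_variable_any only requires the lock type to be BasicLockable
// (lock()/unlock()), unlike std::condition_variable, which is tied to
// std::unique_lock<std::mutex>. A standalone standard-C++ sketch (TinyMutex is
// a stand-in for xe_mutex, not xenia code):
#include <condition_variable>
#include <deque>
#include <mutex>

struct TinyMutex {  // any BasicLockable type works here
  std::mutex inner;
  void lock() { inner.lock(); }
  void unlock() { inner.unlock(); }
};

TinyMutex queue_lock;
std::condition_variable_any queue_cond;
std::deque<int> work_queue;

void Push(int item) {
  {
    std::lock_guard<TinyMutex> guard(queue_lock);
    work_queue.push_back(item);
  }
  queue_cond.notify_one();
}

int Pop() {
  std::unique_lock<TinyMutex> guard(queue_lock);
  queue_cond.wait(guard, [] { return !work_queue.empty(); });
  int item = work_queue.front();
  work_queue.pop_front();
  return item;
}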
+XE_NOINLINE +XE_NOALIAS xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth); diff --git a/src/xenia/gpu/pm4_command_processor_implement.h b/src/xenia/gpu/pm4_command_processor_implement.h index 53b81b888..1c877a9ab 100644 --- a/src/xenia/gpu/pm4_command_processor_implement.h +++ b/src/xenia/gpu/pm4_command_processor_implement.h @@ -14,6 +14,11 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr, new (&reader_) RingBuffer(memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)); reader_.set_write_offset(count * sizeof(uint32_t)); + // prefetch the wraparound range + // it likely is already in L3 cache, but in a zen system it may be another + // chiplets l3 + reader_.BeginPrefetchedRead( + COMMAND_PROCESSOR::GetCurrentRingReadCount()); do { if (COMMAND_PROCESSOR::ExecutePacket()) { continue; @@ -30,11 +35,6 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr, } bool COMMAND_PROCESSOR::ExecutePacket() { - // prefetch the wraparound range - // it likely is already in L3 cache, but in a zen system it may be another - // chiplets l3 - reader_.BeginPrefetchedRead( - COMMAND_PROCESSOR::GetCurrentRingReadCount()); const uint32_t packet = reader_.ReadAndSwap(); const uint32_t packet_type = packet >> 30; @@ -495,7 +495,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM( } else { xe::threading::Sleep(std::chrono::milliseconds(wait / 0x100)); } - xe::threading::SyncMemory(); + // xe::threading::SyncMemory(); ReturnFromWait(); if (!worker_running_) { @@ -599,27 +599,28 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_COND_WRITE( value = register_file_->values[poll_reg_addr].u32; } bool matched = false; + value &= mask; switch (wait_info & 0x7) { case 0x0: // Never. matched = false; break; case 0x1: // Less than reference. - matched = (value & mask) < ref; + matched = value < ref; break; case 0x2: // Less than or equal to reference. - matched = (value & mask) <= ref; + matched = value <= ref; break; case 0x3: // Equal to reference. - matched = (value & mask) == ref; + matched = value == ref; break; case 0x4: // Not equal to reference. - matched = (value & mask) != ref; + matched = value != ref; break; case 0x5: // Greater than or equal to reference. - matched = (value & mask) >= ref; + matched = value >= ref; break; case 0x6: // Greater than reference. - matched = (value & mask) > ref; + matched = value > ref; break; case 0x7: // Always matched = true; @@ -1064,7 +1065,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE( assert_true(count - 2 >= size_dwords); auto shader = COMMAND_PROCESSOR::LoadShader( shader_type, uint32_t(reader_.read_ptr()), - reinterpret_cast(reader_.read_ptr()), size_dwords); + reinterpret_cast(reader_.read_ptr()), size_dwords); switch (shader_type) { case xenos::ShaderType::kVertex: active_vertex_shader_ = shader; diff --git a/src/xenia/gpu/primitive_processor.h b/src/xenia/gpu/primitive_processor.h index aac84885d..7c2e96e3b 100644 --- a/src/xenia/gpu/primitive_processor.h +++ b/src/xenia/gpu/primitive_processor.h @@ -430,7 +430,7 @@ class PrimitiveProcessor { --count; uint32_t index = *(source++) & low_bits_mask_guest_endian; *(dest++) = index != reset_index_guest_endian - ? xenos::GpuSwap(index, HostSwap) + ? 
xenos::GpuSwapInline(index, HostSwap) : UINT32_MAX; } if (count >= kSimdVectorU32Elements) { @@ -442,10 +442,10 @@ class PrimitiveProcessor { __m128i host_swap_shuffle; if constexpr (HostSwap != xenos::Endian::kNone) { host_swap_shuffle = _mm_set_epi32( - int32_t(xenos::GpuSwap(uint32_t(0x0F0E0D0C), HostSwap)), - int32_t(xenos::GpuSwap(uint32_t(0x0B0A0908), HostSwap)), - int32_t(xenos::GpuSwap(uint32_t(0x07060504), HostSwap)), - int32_t(xenos::GpuSwap(uint32_t(0x03020100), HostSwap))); + int32_t(xenos::GpuSwapInline(uint32_t(0x0F0E0D0C), HostSwap)), + int32_t(xenos::GpuSwapInline(uint32_t(0x0B0A0908), HostSwap)), + int32_t(xenos::GpuSwapInline(uint32_t(0x07060504), HostSwap)), + int32_t(xenos::GpuSwapInline(uint32_t(0x03020100), HostSwap))); } #endif // XE_ARCH_AMD64 while (count >= kSimdVectorU32Elements) { @@ -490,7 +490,7 @@ class PrimitiveProcessor { while (count--) { uint32_t index = *(source++) & low_bits_mask_guest_endian; *(dest++) = index != reset_index_guest_endian - ? xenos::GpuSwap(index, HostSwap) + ? xenos::GpuSwapInline(index, HostSwap) : UINT32_MAX; } } @@ -510,19 +510,19 @@ class PrimitiveProcessor { }; struct To24Swapping8In16IndexTransform { uint32_t operator()(uint32_t index) const { - return xenos::GpuSwap(index, xenos::Endian::k8in16) & + return xenos::GpuSwapInline(index, xenos::Endian::k8in16) & xenos::kVertexIndexMask; } }; struct To24Swapping8In32IndexTransform { uint32_t operator()(uint32_t index) const { - return xenos::GpuSwap(index, xenos::Endian::k8in32) & + return xenos::GpuSwapInline(index, xenos::Endian::k8in32) & xenos::kVertexIndexMask; } }; struct To24Swapping16In32IndexTransform { uint32_t operator()(uint32_t index) const { - return xenos::GpuSwap(index, xenos::Endian::k16in32) & + return xenos::GpuSwapInline(index, xenos::Endian::k16in32) & xenos::kVertexIndexMask; } }; diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc index ffd77246e..38a8c54e9 100644 --- a/src/xenia/gpu/shared_memory.cc +++ b/src/xenia/gpu/shared_memory.cc @@ -388,6 +388,7 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length, bool any_data_resolved = false; uint32_t block_first = page_first >> 6; + swcache::PrefetchL1(&system_page_flags_[block_first]); uint32_t block_last = page_last >> 6; uint32_t range_start = UINT32_MAX; diff --git a/src/xenia/gpu/texture_util.cc b/src/xenia/gpu/texture_util.cc index b20194a78..cbe6c62bd 100644 --- a/src/xenia/gpu/texture_util.cc +++ b/src/xenia/gpu/texture_util.cc @@ -464,7 +464,8 @@ TextureGuestLayout GetGuestTextureLayout( return layout; } - +XE_NOINLINE +XE_NOALIAS int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch, uint32_t bytes_per_block_log2) { // https://github.com/gildor2/UModel/blob/de8fbd3bc922427ea056b7340202dcdcc19ccff5/Unreal/UnTexture.cpp#L489 @@ -481,7 +482,8 @@ int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch, return ((offset & ~0x1FF) << 3) + ((y & 16) << 7) + ((offset & 0x1C0) << 2) + (((((y & 8) >> 2) + (x >> 3)) & 3) << 6) + (offset & 0x3F); } - +XE_NOINLINE +XE_NOALIAS int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch, uint32_t height, uint32_t bytes_per_block_log2) { // Reconstructed from disassembly of XGRAPHICS::TileVolume. 
@@ -509,7 +511,8 @@ int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch, address += offset2 & 63; return address; } - +XE_NOINLINE +XE_NOALIAS uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom, uint32_t pitch, uint32_t bytes_per_block_log2) { @@ -538,7 +541,8 @@ uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom, } return upper_bound; } - +XE_NOINLINE +XE_NOALIAS uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom, uint32_t back, uint32_t pitch, uint32_t height, diff --git a/src/xenia/gpu/texture_util.h b/src/xenia/gpu/texture_util.h index bcc080de3..a6513a0c0 100644 --- a/src/xenia/gpu/texture_util.h +++ b/src/xenia/gpu/texture_util.h @@ -280,8 +280,12 @@ void GetTextureTotalSize(xenos::DataDimension dimension, // bytes_per_block_log2 is log2_floor according to how Direct3D 9 calculates it, // but k_32_32_32 textures are never tiled anyway likely. +XE_NOINLINE +XE_NOALIAS int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch, uint32_t bytes_per_block_log2); +XE_NOINLINE +XE_NOALIAS int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch, uint32_t height, uint32_t bytes_per_block_log2); // Because (0, 0, 0) within each 32x32x4-block tile is stored in memory first, @@ -308,9 +312,13 @@ inline uint32_t GetTiledAddressLowerBound3D(uint32_t left, uint32_t top, // Supporting the right > pitch and bottom > height (in tiles) cases also, for // estimation how far addresses can actually go even potentially beyond the // subresource stride. +XE_NOINLINE +XE_NOALIAS uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom, uint32_t pitch, uint32_t bytes_per_block_log2); +XE_NOINLINE +XE_NOALIAS uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom, uint32_t back, uint32_t pitch, uint32_t height, diff --git a/src/xenia/gpu/xenos.cc b/src/xenia/gpu/xenos.cc index f15c621cd..997e9a48a 100644 --- a/src/xenia/gpu/xenos.cc +++ b/src/xenia/gpu/xenos.cc @@ -125,8 +125,8 @@ float Float7e3To32(uint32_t f10) { // Based on CFloat24 from d3dref9.dll and the 6e4 code from: // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2). - -uint32_t Float32To20e4(float f32, bool round_to_nearest_even) { +XE_NOALIAS +uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept { if (!(f32 > 0.0f)) { // Positive only, and not -0 or NaN. return 0; @@ -150,8 +150,8 @@ uint32_t Float32To20e4(float f32, bool round_to_nearest_even) { } return (f32u32 >> 3) & 0xFFFFFF; } - -float Float20e4To32(uint32_t f24) { +XE_NOALIAS +float Float20e4To32(uint32_t f24) noexcept { f24 &= 0xFFFFFF; if (!f24) { return 0.0f; diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 8c03be479..8e9fd5c11 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -421,10 +421,12 @@ float Float7e3To32(uint32_t f10); // floating-point number. // Converts an IEEE-754 32-bit floating-point number to Xenos floating-point // depth, rounding to the nearest even or towards zero. -uint32_t Float32To20e4(float f32, bool round_to_nearest_even); +XE_NOALIAS +uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept; // Converts Xenos floating-point depth in bits 0:23 (not clamping) to an // IEEE-754 32-bit floating-point number. 
-float Float20e4To32(uint32_t f24); +XE_NOALIAS +float Float20e4To32(uint32_t f24) noexcept; // Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit // floating-point number. constexpr float UNorm24To32(uint32_t n24) { @@ -1045,9 +1047,9 @@ inline uint16_t GpuSwap(uint16_t value, Endian endianness) { return value; } } -XE_NOINLINE +XE_FORCEINLINE XE_NOALIAS -static uint32_t GpuSwap(uint32_t value, Endian endianness) { +static uint32_t GpuSwapInline(uint32_t value, Endian endianness) { switch (endianness) { default: case Endian::kNone: @@ -1065,6 +1067,11 @@ static uint32_t GpuSwap(uint32_t value, Endian endianness) { return ((value >> 16) & 0xFFFF) | (value << 16); } } +XE_NOINLINE +XE_NOALIAS +static uint32_t GpuSwap(uint32_t value, Endian endianness) { + return GpuSwapInline(value, endianness); +} inline float GpuSwap(float value, Endian endianness) { union { diff --git a/src/xenia/hid/input_system.cc b/src/xenia/hid/input_system.cc index 588faefe3..a21ce5a7b 100644 --- a/src/xenia/hid/input_system.cc +++ b/src/xenia/hid/input_system.cc @@ -137,8 +137,8 @@ X_INPUT_VIBRATION InputSystem::ModifyVibrationLevel( modified_vibration.right_motor_speed = 0; return modified_vibration; } -std::unique_lock InputSystem::lock() { - return std::unique_lock{lock_}; +std::unique_lock InputSystem::lock() { + return std::unique_lock{lock_}; } } // namespace hid } // namespace xe diff --git a/src/xenia/hid/input_system.h b/src/xenia/hid/input_system.h index 333116499..c294edc64 100644 --- a/src/xenia/hid/input_system.h +++ b/src/xenia/hid/input_system.h @@ -48,7 +48,7 @@ class InputSystem { void UpdateUsedSlot(uint8_t slot, bool connected); uint8_t GetConnectedSlots() const { return connected_slot; } - std::unique_lock lock(); + std::unique_lock lock(); private: xe::ui::Window* window_ = nullptr; @@ -57,7 +57,7 @@ class InputSystem { X_INPUT_VIBRATION ModifyVibrationLevel(X_INPUT_VIBRATION* vibration); uint8_t connected_slot = 0b0001; - xe_unlikely_mutex lock_; + xe_mutex lock_; }; } // namespace hid diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc index 8e66ac683..b5bb6c57b 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc @@ -911,11 +911,17 @@ dword_result_t NtSignalAndWaitForSingleObjectEx_entry(dword_t signal_handle, DECLARE_XBOXKRNL_EXPORT3(NtSignalAndWaitForSingleObjectEx, kThreading, kImplemented, kBlocking, kHighFrequency); +static void PrefetchForCAS(const void* value) { + if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) { + swcache::PrefetchW(value); + } +} + uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) { // XELOGD( // "KfAcquireSpinLock({:08X})", // lock_ptr); - + PrefetchForCAS(lock); // Lock. while (!xe::atomic_cas(0, 1, lock)) { // Spin! @@ -956,6 +962,7 @@ DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented, void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { // Lock. auto lock = reinterpret_cast(lock_ptr.host_address()); + PrefetchForCAS(lock); while (!xe::atomic_cas(0, 1, lock)) { #if XE_ARCH_AMD64 == 1 // todo: this is just a nop if they don't have SMT, which is not great @@ -973,6 +980,7 @@ DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading, dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { // Lock. 
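// Aside on PrefetchForCAS above (standalone sketch, not part of the
// surrounding function or this patch): a compare-and-swap has to own the cache
// line in exclusive state, so issuing a write-intent prefetch before the CAS
// loop avoids first fetching the line shared and then upgrading it. The patch
// gates PREFETCHW behind a CPUID-derived flag (amd64::kX64EmitPrefetchW); the
// generic compiler intrinsics look like this, with illustrative names only.
#if defined(_MSC_VER)
#include <intrin.h>  // _m_prefetchw
#endif
static inline void prefetch_for_write(const void* p) {
#if defined(_MSC_VER)
  _m_prefetchw(p);  // PREFETCHW; call only on CPUs that report support
#else
  __builtin_prefetch(p, /*rw=*/1, /*locality=*/3);  // GCC/Clang write prefetch
#endif
}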
auto lock = reinterpret_cast(lock_ptr.host_address()); + PrefetchForCAS(lock); if (!xe::atomic_cas(0, 1, lock)) { return 0; } diff --git a/src/xenia/kernel/xthread.cc b/src/xenia/kernel/xthread.cc index b842c2c08..084485c66 100644 --- a/src/xenia/kernel/xthread.cc +++ b/src/xenia/kernel/xthread.cc @@ -763,7 +763,8 @@ void XThread::SetActiveCpu(uint8_t cpu_index) { thread_->set_affinity_mask(uint64_t(1) << cpu_index); } } else { - XELOGW("Too few processor cores - scheduling will be wonky"); + //there no good reason why we need to log this... we don't perfectly emulate the 360's scheduler in any way + // XELOGW("Too few processor cores - scheduling will be wonky"); } } diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc index 16e2b8336..f29eb21dc 100644 --- a/src/xenia/memory.cc +++ b/src/xenia/memory.cc @@ -713,6 +713,8 @@ void BaseHeap::Initialize(Memory* memory, uint8_t* membase, HeapType heap_type, heap_base_ = heap_base; heap_size_ = heap_size; page_size_ = page_size; + xenia_assert(xe::is_pow2(page_size_)); + page_size_shift_ = xe::log2_floor(page_size_); host_address_offset_ = host_address_offset; page_table_.resize(heap_size / page_size); unreserved_page_count_ = uint32_t(page_table_.size()); @@ -1234,14 +1236,14 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect, // fails and returns without modifying the access protection of any pages in // the specified region." - uint32_t start_page_number = (address - heap_base_) / page_size_; + uint32_t start_page_number = (address - heap_base_) >> page_size_shift_; if (start_page_number >= page_table_.size()) { XELOGE("BaseHeap::Protect failed due to out-of-bounds base address {:08X}", address); return false; } uint32_t end_page_number = - uint32_t((uint64_t(address) + size - 1 - heap_base_) / page_size_); + uint32_t((uint64_t(address) + size - 1 - heap_base_) >> page_size_shift_); if (end_page_number >= page_table_.size()) { XELOGE( "BaseHeap::Protect failed due to out-of-bounds range ({:08X} bytes " @@ -1268,17 +1270,21 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect, return false; } } + uint32_t xe_page_size = static_cast(xe::memory::page_size()); + + uint32_t page_size_mask = xe_page_size - 1; // Attempt host change (hopefully won't fail). // We can only do this if our size matches system page granularity. uint32_t page_count = end_page_number - start_page_number + 1; - if (page_size_ == xe::memory::page_size() || - (((page_count * page_size_) % xe::memory::page_size() == 0) && - ((start_page_number * page_size_) % xe::memory::page_size() == 0))) { + if (page_size_ == xe_page_size || + ((((page_count << page_size_shift_) & page_size_mask) == 0) && + (((start_page_number << page_size_shift_) & page_size_mask) == 0))) { memory::PageAccess old_protect_access; - if (!xe::memory::Protect(TranslateRelative(start_page_number * page_size_), - page_count * page_size_, ToPageAccess(protect), - old_protect ? &old_protect_access : nullptr)) { + if (!xe::memory::Protect( + TranslateRelative(start_page_number << page_size_shift_), + page_count << page_size_shift_, ToPageAccess(protect), + old_protect ? 
&old_protect_access : nullptr)) { XELOGE("BaseHeap::Protect failed due to host VirtualProtect failure"); return false; } @@ -1303,7 +1309,7 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect, bool BaseHeap::QueryRegionInfo(uint32_t base_address, HeapAllocationInfo* out_info) { - uint32_t start_page_number = (base_address - heap_base_) / page_size_; + uint32_t start_page_number = (base_address - heap_base_) >> page_size_shift_; if (start_page_number > page_table_.size()) { XELOGE("BaseHeap::QueryRegionInfo base page out of range"); return false; @@ -1321,9 +1327,10 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address, if (start_page_entry.state) { // Committed/reserved region. out_info->allocation_base = - heap_base_ + start_page_entry.base_address * page_size_; + heap_base_ + (start_page_entry.base_address << page_size_shift_); out_info->allocation_protect = start_page_entry.allocation_protect; - out_info->allocation_size = start_page_entry.region_page_count * page_size_; + out_info->allocation_size = start_page_entry.region_page_count + << page_size_shift_; out_info->state = start_page_entry.state; out_info->protect = start_page_entry.current_protect; @@ -1358,7 +1365,7 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address, } bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) { - uint32_t page_number = (address - heap_base_) / page_size_; + uint32_t page_number = (address - heap_base_) >> page_size_shift_; if (page_number > page_table_.size()) { XELOGE("BaseHeap::QuerySize base page out of range"); *out_size = 0; @@ -1366,12 +1373,12 @@ bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) { } auto global_lock = global_critical_region_.Acquire(); auto page_entry = page_table_[page_number]; - *out_size = (page_entry.region_page_count * page_size_); + *out_size = (page_entry.region_page_count << page_size_shift_); return true; } bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) { - uint32_t page_number = (*in_out_address - heap_base_) / page_size_; + uint32_t page_number = (*in_out_address - heap_base_) >> page_size_shift_; if (page_number > page_table_.size()) { XELOGE("BaseHeap::QuerySize base page out of range"); *out_size = 0; @@ -1379,13 +1386,13 @@ bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) { } auto global_lock = global_critical_region_.Acquire(); auto page_entry = page_table_[page_number]; - *in_out_address = (page_entry.base_address * page_size_); - *out_size = (page_entry.region_page_count * page_size_); + *in_out_address = (page_entry.base_address << page_size_shift_); + *out_size = (page_entry.region_page_count << page_size_shift_); return true; } bool BaseHeap::QueryProtect(uint32_t address, uint32_t* out_protect) { - uint32_t page_number = (address - heap_base_) / page_size_; + uint32_t page_number = (address - heap_base_) >> page_size_shift_; if (page_number > page_table_.size()) { XELOGE("BaseHeap::QueryProtect base page out of range"); *out_protect = 0; @@ -1403,8 +1410,8 @@ xe::memory::PageAccess BaseHeap::QueryRangeAccess(uint32_t low_address, (high_address - heap_base_) >= heap_size_) { return xe::memory::PageAccess::kNoAccess; } - uint32_t low_page_number = (low_address - heap_base_) / page_size_; - uint32_t high_page_number = (high_address - heap_base_) / page_size_; + uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_; + uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_; uint32_t protect = kMemoryProtectRead 
| kMemoryProtectWrite; { auto global_lock = global_critical_region_.Acquire(); @@ -1446,6 +1453,8 @@ void PhysicalHeap::Initialize(Memory* memory, uint8_t* membase, page_size, host_address_offset); parent_heap_ = parent_heap; system_page_size_ = uint32_t(xe::memory::page_size()); + xenia_assert(xe::is_pow2(system_page_size_)); + system_page_shift_ = xe::log2_floor(system_page_size_); system_page_count_ = (size_t(heap_size_) + host_address_offset + (system_page_size_ - 1)) / @@ -1665,10 +1674,11 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, } uint32_t system_page_first = - (heap_relative_address + host_address_offset()) / system_page_size_; + (heap_relative_address + host_address_offset()) >> system_page_shift_; + swcache::PrefetchL1(&system_page_flags_[system_page_first >> 6]); uint32_t system_page_last = - (heap_relative_address + length - 1 + host_address_offset()) / - system_page_size_; + (heap_relative_address + length - 1 + host_address_offset()) >> + system_page_shift_; system_page_last = std::min(system_page_last, system_page_count_ - 1); assert_true(system_page_first <= system_page_last); @@ -1677,10 +1687,40 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, xe::memory::PageAccess protect_access = enable_data_providers ? xe::memory::PageAccess::kNoAccess : xe::memory::PageAccess::kReadOnly; + + auto global_lock = global_critical_region_.Acquire(); + if (enable_invalidation_notifications) { + EnableAccessCallbacksInner(system_page_first, system_page_last, + protect_access); + } else { + EnableAccessCallbacksInner(system_page_first, system_page_last, + protect_access); + } +} + +template +XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner( + const uint32_t system_page_first, const uint32_t system_page_last, + xe::memory::PageAccess protect_access) XE_RESTRICT { uint8_t* protect_base = membase_ + heap_base_; uint32_t protect_system_page_first = UINT32_MAX; - auto global_lock = global_critical_region_.Acquire(); - for (uint32_t i = system_page_first; i <= system_page_last; ++i) { + + SystemPageFlagsBlock* XE_RESTRICT sys_page_flags = system_page_flags_.data(); + PageEntry* XE_RESTRICT page_table_ptr = page_table_.data(); + + // chrispy: a lot of time is spent in this loop, and i think some of the work + // may be avoidable and repetitive profiling shows quite a bit of time spent + // in this loop, but very little spent actually calling Protect + uint32_t i = system_page_first; + + uint32_t first_guest_page = SystemPagenumToGuestPagenum(system_page_first); + uint32_t last_guest_page = SystemPagenumToGuestPagenum(system_page_last); + + uint32_t guest_one = + SystemPagenumToGuestPagenum(1); + + uint32_t system_one = GuestPagenumToSystemPagenum(1); + for (; i <= system_page_last; ++i) { // Check if need to enable callbacks for the page and raise its protection. // // If enabling invalidation notifications: @@ -1702,12 +1742,19 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, // // Enabling data providers doesn't need to be deferred - providers will be // polled for the last time without releasing the lock. 
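// The enable_invalidation_notifications template parameter above turns what
// used to be a per-iteration runtime test into a compile-time branch.
// Standalone sketch of the pattern (illustrative names, not xenia code):
#include <cstdint>

template <bool kNotify>
void ProcessPages(uint32_t first, uint32_t last) {
  for (uint32_t i = first; i <= last; ++i) {
    if constexpr (kNotify) {
      // work only needed when invalidation notifications are enabled
    } else {
      // cheaper path
    }
  }
}

void ProcessPagesDispatch(uint32_t first, uint32_t last, bool notify) {
  // One runtime check outside the loop picks the specialized instantiation.
  if (notify) {
    ProcessPages<true>(first, last);
  } else {
    ProcessPages<false>(first, last);
  }
}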
- SystemPageFlagsBlock& page_flags_block = system_page_flags_[i >> 6]; + SystemPageFlagsBlock& page_flags_block = sys_page_flags[i >> 6]; + +#if XE_ARCH_AMD64 == 1 + // x86 modulus shift + uint64_t page_flags_bit = uint64_t(1) << i; +#else uint64_t page_flags_bit = uint64_t(1) << (i & 63); - uint32_t guest_page_number = - xe::sat_sub(i * system_page_size_, host_address_offset()) / page_size_; +#endif + + uint32_t guest_page_number = SystemPagenumToGuestPagenum(i); + //swcache::PrefetchL1(&page_table_ptr[guest_page_number + 8]); xe::memory::PageAccess current_page_access = - ToPageAccess(page_table_[guest_page_number].current_protect); + ToPageAccess(page_table_ptr[guest_page_number].current_protect); bool protect_system_page = false; // Don't do anything with inaccessible pages - don't protect, don't enable // callbacks - because real access violations are needed there. And don't @@ -1715,7 +1762,7 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, // reason. if (current_page_access != xe::memory::PageAccess::kNoAccess) { // TODO(Triang3l): Enable data providers. - if (enable_invalidation_notifications) { + if constexpr (enable_invalidation_notifications) { if (current_page_access != xe::memory::PageAccess::kReadOnly && (page_flags_block.notify_on_invalidation & page_flags_bit) == 0) { // TODO(Triang3l): Check if data providers are already enabled. @@ -1733,21 +1780,22 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, } else { if (protect_system_page_first != UINT32_MAX) { xe::memory::Protect( - protect_base + protect_system_page_first * system_page_size_, - (i - protect_system_page_first) * system_page_size_, + protect_base + (protect_system_page_first << system_page_shift_), + (i - protect_system_page_first) << system_page_shift_, protect_access); protect_system_page_first = UINT32_MAX; } } } + if (protect_system_page_first != UINT32_MAX) { xe::memory::Protect( - protect_base + protect_system_page_first * system_page_size_, - (system_page_last + 1 - protect_system_page_first) * system_page_size_, + protect_base + (protect_system_page_first << system_page_shift_), + (system_page_last + 1 - protect_system_page_first) + << system_page_shift_, protect_access); } } - bool PhysicalHeap::TriggerCallbacks( global_unique_lock_type global_lock_locked_once, uint32_t virtual_address, uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) { @@ -1774,10 +1822,10 @@ bool PhysicalHeap::TriggerCallbacks( } uint32_t system_page_first = - (heap_relative_address + host_address_offset()) / system_page_size_; + (heap_relative_address + host_address_offset()) >> system_page_shift_; uint32_t system_page_last = - (heap_relative_address + length - 1 + host_address_offset()) / - system_page_size_; + (heap_relative_address + length - 1 + host_address_offset()) >> + system_page_shift_; system_page_last = std::min(system_page_last, system_page_count_ - 1); assert_true(system_page_first <= system_page_last); uint32_t block_index_first = system_page_first >> 6; @@ -1810,11 +1858,11 @@ bool PhysicalHeap::TriggerCallbacks( } uint32_t physical_address_offset = GetPhysicalAddress(heap_base_); uint32_t physical_address_start = - xe::sat_sub(system_page_first * system_page_size_, + xe::sat_sub(system_page_first << system_page_shift_, host_address_offset()) + physical_address_offset; uint32_t physical_length = std::min( - xe::sat_sub(system_page_last * system_page_size_ + system_page_size_, + xe::sat_sub((system_page_last << system_page_shift_) + system_page_size_, 
host_address_offset()) + physical_address_offset - physical_address_start, heap_size_ - (physical_address_start - physical_address_offset)); @@ -1858,8 +1906,8 @@ bool PhysicalHeap::TriggerCallbacks( unwatch_first += host_address_offset(); unwatch_last += host_address_offset(); assert_true(unwatch_first <= unwatch_last); - system_page_first = unwatch_first / system_page_size_; - system_page_last = unwatch_last / system_page_size_; + system_page_first = unwatch_first >> system_page_shift_; + system_page_last = unwatch_last >> system_page_shift_; block_index_first = system_page_first >> 6; block_index_last = system_page_last >> 6; } @@ -1874,8 +1922,8 @@ bool PhysicalHeap::TriggerCallbacks( (uint64_t(1) << (i & 63))) != 0; if (unprotect_page) { uint32_t guest_page_number = - xe::sat_sub(i * system_page_size_, host_address_offset()) / - page_size_; + xe::sat_sub(i << system_page_shift_, host_address_offset()) >> + page_size_shift_; if (ToPageAccess(page_table_[guest_page_number].current_protect) != xe::memory::PageAccess::kReadWrite) { unprotect_page = false; @@ -1888,8 +1936,9 @@ bool PhysicalHeap::TriggerCallbacks( } else { if (unprotect_system_page_first != UINT32_MAX) { xe::memory::Protect( - protect_base + unprotect_system_page_first * system_page_size_, - (i - unprotect_system_page_first) * system_page_size_, + protect_base + + (unprotect_system_page_first << system_page_shift_), + (i - unprotect_system_page_first) << system_page_shift_, xe::memory::PageAccess::kReadWrite); unprotect_system_page_first = UINT32_MAX; } @@ -1897,9 +1946,9 @@ bool PhysicalHeap::TriggerCallbacks( } if (unprotect_system_page_first != UINT32_MAX) { xe::memory::Protect( - protect_base + unprotect_system_page_first * system_page_size_, - (system_page_last + 1 - unprotect_system_page_first) * - system_page_size_, + protect_base + (unprotect_system_page_first << system_page_shift_), + (system_page_last + 1 - unprotect_system_page_first) + << system_page_shift_, xe::memory::PageAccess::kReadWrite); } } diff --git a/src/xenia/memory.h b/src/xenia/memory.h index 3d4cf5637..672115d5c 100644 --- a/src/xenia/memory.h +++ b/src/xenia/memory.h @@ -216,6 +216,7 @@ class BaseHeap { uint32_t heap_base_; uint32_t heap_size_; uint32_t page_size_; + uint32_t page_size_shift_; uint32_t host_address_offset_; uint32_t unreserved_page_count_; xe::global_critical_region global_critical_region_; @@ -270,18 +271,36 @@ class PhysicalHeap : public BaseHeap { void EnableAccessCallbacks(uint32_t physical_address, uint32_t length, bool enable_invalidation_notifications, bool enable_data_providers); + template + XE_NOINLINE void EnableAccessCallbacksInner( + const uint32_t system_page_first, const uint32_t system_page_last, + xe::memory::PageAccess protect_access) XE_RESTRICT; + // Returns true if any page in the range was watched. 
bool TriggerCallbacks(global_unique_lock_type global_lock_locked_once, - uint32_t virtual_address, uint32_t length, bool is_write, - bool unwatch_exact_range, bool unprotect = true); + uint32_t virtual_address, uint32_t length, + bool is_write, bool unwatch_exact_range, + bool unprotect = true); uint32_t GetPhysicalAddress(uint32_t address) const; + uint32_t SystemPagenumToGuestPagenum(uint32_t num) const { + return ((num << system_page_shift_) - host_address_offset()) >> page_size_shift_; + } + + uint32_t GuestPagenumToSystemPagenum(uint32_t num) { + num <<= page_size_shift_; + num += host_address_offset(); + num >>= system_page_shift_; + return num; + } protected: VirtualHeap* parent_heap_; uint32_t system_page_size_; uint32_t system_page_count_; + uint32_t system_page_shift_; + uint32_t padding1_; struct SystemPageFlagsBlock { // Whether writing to each page should result trigger invalidation @@ -458,9 +477,9 @@ class Memory { // TODO(Triang3l): Implement data providers - this is why locking depth of 1 // will be required in the future. bool TriggerPhysicalMemoryCallbacks( - global_unique_lock_type global_lock_locked_once, - uint32_t virtual_address, uint32_t length, bool is_write, - bool unwatch_exact_range, bool unprotect = true); + global_unique_lock_type global_lock_locked_once, uint32_t virtual_address, + uint32_t length, bool is_write, bool unwatch_exact_range, + bool unprotect = true); // Allocates virtual memory from the 'system' heap. // System memory is kept separate from game memory but is still accessible @@ -509,10 +528,10 @@ class Memory { const void* host_address); bool AccessViolationCallback(global_unique_lock_type global_lock_locked_once, - void* host_address, bool is_write); + void* host_address, bool is_write); static bool AccessViolationCallbackThunk( - global_unique_lock_type global_lock_locked_once, - void* context, void* host_address, bool is_write); + global_unique_lock_type global_lock_locked_once, void* context, + void* host_address, bool is_write); std::filesystem::path file_name_; uint32_t system_page_size_ = 0;
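// The heap changes above replace divides and multiplies by page_size_ and
// system_page_size_ with shifts by precomputed log2 values (page_size_shift_,
// system_page_shift_), which is valid because both sizes are asserted to be
// powers of two. Standalone sketch of the equivalences (plain C++,
// illustrative only):
#include <cassert>
#include <cstdint>

static uint32_t log2_floor_u32(uint32_t v) {
  assert(v != 0);
  uint32_t shift = 0;
  while (v > 1) {
    v >>= 1;
    ++shift;
  }
  return shift;
}

int main() {
  const uint32_t page_size = 0x1000;                      // 4 KiB, power of two
  const uint32_t page_shift = log2_floor_u32(page_size);  // 12
  const uint32_t address = 0x12345;

  uint32_t page_number = address >> page_shift;           // address / page_size
  uint32_t page_base = page_number << page_shift;         // page_number * page_size
  uint32_t offset_in_page = address & (page_size - 1);    // address % page_size

  assert(page_number == address / page_size);
  assert(page_base + offset_in_page == address);
  return 0;
}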