From 82dcf3f951766f21d52be24bbca0e0eddf48a9db Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Tue, 13 Dec 2022 10:45:19 -0800
Subject: [PATCH 1/6] faster/more compact MatchValueAndRef

Made commandprocessor GetcurrentRingReadcount inline, it was made noinline to match PGO decisions but i think PGO can make extra reg allocation decisions that make this inlining choice yield gains, whereas if we do it manually we lose a tiny bit of performance

Working on a more compact vectorized version of GetScissor to save icache on cmd processor thread
Add WriteRegisterForceinline, will probably end up amending to remove it

add ERMS path for vastcpy

Adding WriteRegistersFromMemCommonSense (name will be changed later), name is because i realized my approach with optimizing writeregisters has been backwards,  instead of handling all checks more quickly within the loop that writes the registers, we need a different loop for each range with unique handling. we also manage to hoist a lot of the logic out of the loops

Use 100ns delay for MaybeYield, noticed we often return almost immediately from the syscall so we end up wasting some cpu, instead we give up the cpu for the min waitable time (on my system, this is 0.5 ms)

Added a note about affinity mask/dynamic process affinity updates to threading_win

Add TextureFetchConstantsWritten
---
 src/xenia/base/memory.cc                      |  14 +-
 src/xenia/base/threading_win.cc               |   8 +-
 .../gpu/d3d12/d3d12_command_processor.cc      | 218 ++++++++++++++++--
 src/xenia/gpu/d3d12/d3d12_command_processor.h |  13 +-
 src/xenia/gpu/draw_util.cc                    |  85 +++++--
 src/xenia/gpu/draw_util.h                     |   6 +-
 src/xenia/gpu/pm4_command_processor_declare.h |   2 +-
 .../gpu/pm4_command_processor_implement.h     |  87 +++----
 src/xenia/gpu/texture_cache.h                 |   9 +
 9 files changed, 345 insertions(+), 97 deletions(-)

diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc
index b83e545d2..cf6788f96 100644
--- a/src/xenia/base/memory.cc
+++ b/src/xenia/base/memory.cc
@@ -9,8 +9,8 @@
 
 #include "xenia/base/memory.h"
 #include "xenia/base/cvar.h"
-#include "xenia/base/platform.h"
 #include "xenia/base/logging.h"
+#include "xenia/base/platform.h"
 
 #if XE_ARCH_ARM64
 #include <arm_neon.h>
@@ -59,7 +59,7 @@ static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
 
   CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
   CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
-  
+
   for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
     xe::swcache::CacheLine line0, line1, line2, line3;
 
@@ -173,7 +173,12 @@ static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr,
     _movdir64b(physaddr + i, rdmapping + i);
   }
 }
-
+static void vastcpy_impl_repmovs(CacheLine* XE_RESTRICT physaddr,
+                                 CacheLine* XE_RESTRICT rdmapping,
+                                 uint32_t written_length) {
+  __movsq((unsigned long long*)physaddr, (unsigned long long*)rdmapping,
+          written_length / 8);
+}
 XE_COLD
 static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
                           CacheLine* XE_RESTRICT rdmapping,
@@ -189,6 +194,9 @@ static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
   if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) {
     XELOGI("Selecting MOVDIR64M vastcpy.");
     dispatch_to_use = vastcpy_impl_movdir64m;
+  } else if (amd64::GetFeatureFlags() & amd64::kX64FastRepMovs) {
+    XELOGI("Selecting rep movs vastcpy.");
+    dispatch_to_use = vastcpy_impl_repmovs;
   } else {
     XELOGI("Selecting generic AVX vastcpy.");
     dispatch_to_use = vastcpy_impl_avx;
diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc
index a8aa7889c..88ecb145c 100644
--- a/src/xenia/base/threading_win.cc
+++ b/src/xenia/base/threading_win.cc
@@ -60,6 +60,12 @@ namespace xe {
 namespace threading {
 
 void EnableAffinityConfiguration() {
+  // chrispy: i don't think this is necessary,
+  // affinity always seems to be the system mask? research more
+  // also, maybe if ignore_thread_affinities is on we should use
+  // SetProcessAffinityUpdateMode to allow windows to dynamically update
+  // our process' affinity (by default windows cannot change the affinity itself
+  // at runtime, user code must do it)
   HANDLE process_handle = GetCurrentProcess();
   DWORD_PTR process_affinity_mask;
   DWORD_PTR system_affinity_mask;
@@ -117,7 +123,7 @@ void set_name(const std::string_view name) {
 
 // checked ntoskrnl, it does not modify delay, so we can place this as a
 // constant and avoid creating a stack variable
-static const LARGE_INTEGER sleepdelay0_for_maybeyield{{0LL}};
+static const LARGE_INTEGER sleepdelay0_for_maybeyield{{~0u, -1}};
 
 void MaybeYield() {
 #if 0
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index fc236aa6a..70b09501e 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -1712,8 +1712,10 @@ void D3D12CommandProcessor::ShutdownContext() {
 
   CommandProcessor::ShutdownContext();
 }
-// todo: bit-pack the bools and use bitarith to reduce branches
-void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
+
+XE_FORCEINLINE
+void D3D12CommandProcessor::WriteRegisterForceinline(uint32_t index,
+                                                     uint32_t value) {
   __m128i to_rangecheck = _mm_set1_epi16(static_cast<short>(index));
 
   __m128i lower_bounds = _mm_setr_epi16(
@@ -1783,6 +1785,10 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
     return;
   }
 }
+// todo: bit-pack the bools and use bitarith to reduce branches
+void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
+  WriteRegisterForceinline(index, value);
+}
 
 void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
                                                   uint32_t* base,
@@ -1911,8 +1917,22 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
 void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
                                                        uint32_t base,
                                                        uint32_t num_registers) {
-  WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base,
-                                                       num_registers);
+  // WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base,
+  //                                                     num_registers);
+
+  RingBuffer::ReadRange range =
+      ring->BeginRead(num_registers * sizeof(uint32_t));
+
+  XE_LIKELY_IF(!range.second) {
+    WriteRegistersFromMemCommonSense(
+        base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
+        num_registers);
+
+    ring->EndRead(range);
+  }
+  else {
+    return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers);
+  }
 }
 
 template <uint32_t register_lower_bound, uint32_t register_upper_bound>
@@ -1926,6 +1946,145 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
          bounds_may_have_reg<register_lower_bound, register_upper_bound>(
              last_reg);
 }
+void D3D12CommandProcessor::WriteShaderConstantsFromMem(
+    uint32_t start_index, uint32_t* base, uint32_t num_registers) {
+  if (frame_open_) {
+    bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
+    bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
+    if (cbuffer_pixel_uptodate || cbuffer_vertex_uptodate) {
+      // super naive, could just do some bit magic and interval checking,
+      // but we just need this hoisted out of the copy so we do a bulk copy
+      // because its the actual load/swap/store we're getting murdered by
+      // this precheck followed by copy_and_swap_32_unaligned reduced the cpu
+      // usage from packettype0/writeregistersfrommem from 10-11% of cpu time
+      // spent on xenia to like 1%
+      // chrispy: todo, this can be reduced even further, should be split into
+      // two loops and should skip whole words, this could net us even bigger
+      // gains
+      uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
+      uint32_t end_map_index =
+          (start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
+      if (!cbuffer_vertex_uptodate) {
+        if (256 >= end_map_index) {
+          goto skip_map_checks;
+        }
+        map_index = 256;
+      }
+      for (; map_index < end_map_index; ++map_index) {
+        uint32_t float_constant_index = map_index;
+        if (float_constant_index >= 256) {
+          float_constant_index -= 256;
+          if (current_float_constant_map_pixel_[float_constant_index >> 6] &
+              (1ull << (float_constant_index & 63))) {
+            cbuffer_pixel_uptodate = false;
+            if (!cbuffer_vertex_uptodate) {
+              break;
+            }
+          }
+        } else {
+          if (current_float_constant_map_vertex_[float_constant_index >> 6] &
+              (1ull << (float_constant_index & 63))) {
+            cbuffer_vertex_uptodate = false;
+            if (!cbuffer_pixel_uptodate) {
+              break;
+            } else {
+              map_index = 255;  // skip to checking pixel
+              continue;  // increment will put us at 256, then the check will
+                         // happen
+            }
+          }
+        }
+      }
+    skip_map_checks:;
+    }
+    cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
+    cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
+  }
+  // maybe use non-temporal copy if possible...
+  copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
+                             num_registers);
+}
+
+void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index,
+                                                 uint32_t* base,
+                                                 uint32_t num_registers) {
+  cbuffer_binding_bool_loop_.up_to_date = false;
+  copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
+                             num_registers);
+}
+void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index,
+                                              uint32_t* base,
+                                              uint32_t num_registers) {
+  cbuffer_binding_fetch_.up_to_date = false;
+
+  uint32_t first_fetch =
+      ((start_index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
+  uint32_t last_fetch =  // i think last_fetch should be inclusive if its modulo
+                         // is nz...
+      (((start_index + num_registers) - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) /
+       6);
+  texture_cache_->TextureFetchConstantsWritten(first_fetch, last_fetch);
+
+  copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
+                             num_registers);
+}
+
+void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
+    uint32_t start_index, uint32_t* base, uint32_t num_registers) {
+  uint32_t end = start_index + num_registers;
+
+  uint32_t current_index = start_index;
+
+  auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
+    return std::min<uint32_t>(regnum, end) - current_index;
+  };
+
+#define DO_A_RANGE_CALLBACK(start_range, end_range, index, base, n)     \
+  WriteRegisterRangeFromMem_WithKnownBound<(start_range), (end_range)>( \
+      index, base, n)
+
+#define DO_A_RANGE(start_range, end_range, cb)                     \
+  if (current_index < (end_range)) {                               \
+    uint32_t ntowrite = get_end_before_qty(end_range);             \
+    cb((start_range), (end_range), current_index, base, ntowrite); \
+    current_index += ntowrite;                                     \
+    base += ntowrite;                                              \
+  }                                                                \
+  if (current_index >= end) {                                      \
+    return;                                                        \
+  }
+
+  if (start_index >= XE_GPU_REG_SHADER_CONSTANT_000_X) {  // fairly common
+    goto shader_vars_start;
+  }
+
+#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
+  copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
+  DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
+  DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
+             DO_A_RANGE_CALLBACK);
+  DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
+             REGULAR_WRITE_CALLBACK);
+
+#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
+                                        n)                                   \
+  WriteShaderConstantsFromMem(index, base, n)
+shader_vars_start:
+  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
+             XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
+             WRITE_SHADER_CONSTANTS_CALLBACK);
+
+#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
+  WriteFetchFromMem(ind, b, n)
+  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
+             XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
+             WRITE_FETCH_CONSTANTS_CALLBACK);
+#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
+  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
+             XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
+  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
+             REGULAR_WRITE_CALLBACK);
+}
 template <uint32_t register_lower_bound, uint32_t register_upper_bound>
 XE_FORCEINLINE void
 D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
@@ -1935,12 +2094,21 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
   constexpr auto bounds_has_bounds =
       bounds_may_have_bounds<register_lower_bound, register_upper_bound>;
 
+  bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
+  bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
+
+  bool skip_uptodate_checks =
+      (!cbuffer_pixel_uptodate && !cbuffer_vertex_uptodate) || (!frame_open_);
   for (uint32_t i = 0; i < num_registers; ++i) {
     uint32_t data = xe::load_and_swap<uint32_t>(range + i);
-
-    {
-      uint32_t index = base + i;
-      uint32_t value = data;
+    uint32_t index = base + i;
+    uint32_t value = data;
+    // cant if constexpr this one or we get unreferenced label errors, and if we
+    // move the label into the else we get errors about a jump from one if
+    // constexpr into another
+    if (register_lower_bound == 0 && register_upper_bound == 0xFFFF) {
+      D3D12CommandProcessor::WriteRegisterForceinline(index, value);
+    } else {
       XE_MSVC_ASSUME(index >= register_lower_bound &&
                      index < register_upper_bound);
       register_file_->values[index].u32 = value;
@@ -1968,27 +2136,13 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
         HandleSpecialRegisterWrite(index, value);
         goto write_done;
       }
-      XE_MSVC_ASSUME(index >= register_lower_bound &&
-                     index < register_upper_bound);
-      if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
-                                      XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) {
-        if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
-            index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
-          cbuffer_binding_fetch_.up_to_date = false;
-          // texture cache is never nullptr
-          texture_cache_->TextureFetchConstantWritten(
-              (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
-
-          goto write_done;
-        }
-      }
       XE_MSVC_ASSUME(index >= register_lower_bound &&
                      index < register_upper_bound);
       if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X,
                                       XE_GPU_REG_SHADER_CONSTANT_511_W)) {
         if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
             index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
-          if (frame_open_) {
+          if (!skip_uptodate_checks) {
             uint32_t float_constant_index =
                 (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
             if (float_constant_index >= 256) {
@@ -2002,12 +2156,30 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
                                                      6] &
                   (1ull << (float_constant_index & 63))) {
                 cbuffer_binding_float_vertex_.up_to_date = false;
+                if (!cbuffer_pixel_uptodate) {
+                  skip_uptodate_checks = true;
+                }
               }
             }
           }
           goto write_done;
         }
       }
+      XE_MSVC_ASSUME(index >= register_lower_bound &&
+                     index < register_upper_bound);
+      if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
+                                      XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) {
+        if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
+            index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
+          cbuffer_binding_fetch_.up_to_date = false;
+          // texture cache is never nullptr
+          texture_cache_->TextureFetchConstantWritten(
+              (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
+
+          goto write_done;
+        }
+      }
+
       XE_MSVC_ASSUME(index >= register_lower_bound &&
                      index < register_upper_bound);
       if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h
index 75f23cf03..0fcd76593 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.h
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h
@@ -211,12 +211,21 @@ class D3D12CommandProcessor final : public CommandProcessor {
  protected:
   bool SetupContext() override;
   void ShutdownContext() override;
-
+  XE_FORCEINLINE
+  void WriteRegisterForceinline(uint32_t index, uint32_t value);
   void WriteRegister(uint32_t index, uint32_t value) override;
   XE_FORCEINLINE
   virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
                                      uint32_t num_registers) override;
-
+  //SHADER_CONSTANT_blah_XWYZ
+  void WriteShaderConstantsFromMem(uint32_t start_index, uint32_t* base,
+                                        uint32_t num_registers);
+  void WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base,
+                            uint32_t num_registers);
+  void WriteFetchFromMem(uint32_t start_index, uint32_t* base,
+                         uint32_t num_registers);
+  void WriteRegistersFromMemCommonSense(uint32_t start_index,  uint32_t* base,
+                                     uint32_t num_registers) ;
   template <uint32_t register_lower_bound, uint32_t register_upper_bound>
   XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
       uint32_t start_index, uint32_t* base, uint32_t num_registers);
diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc
index af977d4d5..98d2802ee 100644
--- a/src/xenia/gpu/draw_util.cc
+++ b/src/xenia/gpu/draw_util.cc
@@ -551,30 +551,60 @@ void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
     viewport_info_out.ndc_offset[i] = ndc_offset[i];
   }
 }
-void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
-                bool clamp_to_surface_pitch) {
+template <bool clamp_to_surface_pitch>
+XE_NOINLINE static void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs,
+                                       Scissor& XE_RESTRICT scissor_out) {
   auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
-  int32_t tl_x = int32_t(pa_sc_window_scissor_tl.tl_x);
-  int32_t tl_y = int32_t(pa_sc_window_scissor_tl.tl_y);
   auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
-  int32_t br_x = int32_t(pa_sc_window_scissor_br.br_x);
-  int32_t br_y = int32_t(pa_sc_window_scissor_br.br_y);
-  if (!pa_sc_window_scissor_tl.window_offset_disable) {
-    auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
-    tl_x += pa_sc_window_offset.window_x_offset;
-    tl_y += pa_sc_window_offset.window_y_offset;
-    br_x += pa_sc_window_offset.window_x_offset;
-    br_y += pa_sc_window_offset.window_y_offset;
+  auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
+  auto pa_sc_screen_scissor_tl = regs.Get<reg::PA_SC_SCREEN_SCISSOR_TL>();
+  auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
+  uint32_t surface_pitch = 0;
+  if constexpr (clamp_to_surface_pitch) {
+    surface_pitch = regs.Get<reg::RB_SURFACE_INFO>().surface_pitch;
   }
+  uint32_t pa_sc_window_scissor_tl_tl_x = pa_sc_window_scissor_tl.tl_x,
+           pa_sc_window_scissor_tl_tl_y = pa_sc_window_scissor_tl.tl_y,
+           pa_sc_window_scissor_br_br_x = pa_sc_window_scissor_br.br_x,
+           pa_sc_window_scissor_br_br_y = pa_sc_window_scissor_br.br_y,
+           pa_sc_window_offset_window_x_offset =
+               pa_sc_window_offset.window_x_offset,
+           pa_sc_window_offset_window_y_offset =
+               pa_sc_window_offset.window_y_offset,
+           pa_sc_screen_scissor_tl_tl_x = pa_sc_screen_scissor_tl.tl_x,
+           pa_sc_screen_scissor_tl_tl_y = pa_sc_screen_scissor_tl.tl_y,
+           pa_sc_screen_scissor_br_br_x = pa_sc_screen_scissor_br.br_x,
+           pa_sc_screen_scissor_br_br_y = pa_sc_screen_scissor_br.br_y;
+
+  int32_t tl_x = int32_t(pa_sc_window_scissor_tl_tl_x);
+  int32_t tl_y = int32_t(pa_sc_window_scissor_tl_tl_y);
+
+  int32_t br_x = int32_t(pa_sc_window_scissor_br_br_x);
+  int32_t br_y = int32_t(pa_sc_window_scissor_br_br_y);
+
+  // chrispy: put this here to make it clear that the shift by 31 is extracting
+  // this field
+  XE_MAYBE_UNUSED
+  uint32_t window_offset_disable_reference =
+      pa_sc_window_scissor_tl.window_offset_disable;
+  int32_t window_offset_disable_mask =
+      ~(static_cast<int32_t>(pa_sc_window_scissor_tl.value) >> 31);
+  // if (!pa_sc_window_scissor_tl.window_offset_disable) {
+
+  tl_x += pa_sc_window_offset_window_x_offset & window_offset_disable_mask;
+  tl_y += pa_sc_window_offset_window_y_offset & window_offset_disable_mask;
+  br_x += pa_sc_window_offset_window_x_offset & window_offset_disable_mask;
+  br_y += pa_sc_window_offset_window_y_offset & window_offset_disable_mask;
+  //}
   // Screen scissor is not used by Direct3D 9 (always 0, 0 to 8192, 8192), but
   // still handled here for completeness.
-  auto pa_sc_screen_scissor_tl = regs.Get<reg::PA_SC_SCREEN_SCISSOR_TL>();
-  tl_x = std::max(tl_x, int32_t(pa_sc_screen_scissor_tl.tl_x));
-  tl_y = std::max(tl_y, int32_t(pa_sc_screen_scissor_tl.tl_y));
-  auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
-  br_x = std::min(br_x, int32_t(pa_sc_screen_scissor_br.br_x));
-  br_y = std::min(br_y, int32_t(pa_sc_screen_scissor_br.br_y));
-  if (clamp_to_surface_pitch) {
+
+  tl_x = std::max(tl_x, int32_t(pa_sc_screen_scissor_tl_tl_x));
+  tl_y = std::max(tl_y, int32_t(pa_sc_screen_scissor_tl_tl_y));
+
+  br_x = std::min(br_x, int32_t(pa_sc_screen_scissor_br_br_x));
+  br_y = std::min(br_y, int32_t(pa_sc_screen_scissor_br_br_y));
+  if constexpr (clamp_to_surface_pitch) {
     // Clamp the horizontal scissor to surface_pitch for safety, in case that's
     // not done by the guest for some reason (it's not when doing draws without
     // clipping in Direct3D 9, for instance), to prevent overflow - this is
@@ -582,7 +612,7 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
     // rasterization without render target width at all (pixel shader
     // interlock-based custom RB implementations) and using conventional render
     // targets, but padded to EDRAM tiles.
-    uint32_t surface_pitch = regs.Get<reg::RB_SURFACE_INFO>().surface_pitch;
+
     tl_x = std::min(tl_x, int32_t(surface_pitch));
     br_x = std::min(br_x, int32_t(surface_pitch));
   }
@@ -601,6 +631,15 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
   scissor_out.extent[1] = uint32_t(br_y - tl_y);
 }
 
+void GetScissor(const RegisterFile& XE_RESTRICT regs,
+                Scissor& XE_RESTRICT scissor_out, bool clamp_to_surface_pitch) {
+  if (clamp_to_surface_pitch) {
+    return GetScissorTmpl<true>(regs, scissor_out);
+  } else {
+    return GetScissorTmpl<false>(regs, scissor_out);
+  }
+}
+
 uint32_t GetNormalizedColorMask(const RegisterFile& regs,
                                 uint32_t pixel_shader_writes_color_targets) {
   if (regs.Get<reg::RB_MODECONTROL>().edram_mode !=
@@ -863,7 +902,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
     y1 = y0 + int32_t(xenos::kMaxResolveSize);
   }
   // fails in forza horizon 1
-  //x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100
+  // x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100
   assert_true(x0 <= x1 && y0 <= y1);
   if (x0 >= x1 || y0 >= y1) {
     XELOGE("Resolve region is empty");
@@ -1103,7 +1142,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
   info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32;
   info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32;
   info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32;
-  #if 0
+#if 0
   XELOGD(
       "Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially "
       "modified memory range 0x{:08X} to 0x{:08X})",
@@ -1114,7 +1153,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
                      xenos::ColorRenderTargetFormat(color_edram_info.format)),
       FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start,
       copy_dest_extent_end);
-  #endif
+#endif
   return true;
 }
 XE_MSVC_OPTIMIZE_REVERT()
diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h
index ececbaac9..8196830b8 100644
--- a/src/xenia/gpu/draw_util.h
+++ b/src/xenia/gpu/draw_util.h
@@ -433,13 +433,15 @@ struct GetViewportInfoArgs {
 void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
                          ViewportInfo& viewport_info_out);
 
-struct Scissor {
+struct alignas(16) Scissor {
   // Offset from render target UV = 0 to +UV.
   uint32_t offset[2];
   // Extent can be zero.
   uint32_t extent[2];
 };
-void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
+
+void GetScissor(const RegisterFile& XE_RESTRICT regs,
+                Scissor& XE_RESTRICT scissor_out,
                 bool clamp_to_surface_pitch = true);
 
 // Returns the color component write mask for the draw command taking into
diff --git a/src/xenia/gpu/pm4_command_processor_declare.h b/src/xenia/gpu/pm4_command_processor_declare.h
index da0888f21..de6b66f6f 100644
--- a/src/xenia/gpu/pm4_command_processor_declare.h
+++ b/src/xenia/gpu/pm4_command_processor_declare.h
@@ -109,7 +109,7 @@ XE_NOINLINE
 XE_COLD
 bool HitUnimplementedOpcode(uint32_t opcode, uint32_t count) XE_RESTRICT;
 
-XE_NOINLINE
+XE_FORCEINLINE
 XE_NOALIAS
 uint32_t GetCurrentRingReadCount();
 
diff --git a/src/xenia/gpu/pm4_command_processor_implement.h b/src/xenia/gpu/pm4_command_processor_implement.h
index 2fc63cef2..21b9553d4 100644
--- a/src/xenia/gpu/pm4_command_processor_implement.h
+++ b/src/xenia/gpu/pm4_command_processor_implement.h
@@ -4,7 +4,6 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
                                               uint32_t count) XE_RESTRICT {
   SCOPE_profile_cpu_f("gpu");
 
-
   trace_writer_.WriteIndirectBufferStart(ptr, count * sizeof(uint32_t));
   if (count != 0) {
     RingBuffer old_reader = reader_;
@@ -32,10 +31,9 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
     trace_writer_.WriteIndirectBufferEnd();
     reader_ = old_reader;
   } else {
-	  //rare, but i've seen it happen! (and then a division by 0 occurs)
+    // rare, but i've seen it happen! (and then a division by 0 occurs)
     return;
   }
-
 }
 
 bool COMMAND_PROCESSOR::ExecutePacket() {
@@ -88,8 +86,9 @@ bool COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(uint32_t count) {
          count * sizeof(uint32_t));
   return false;
 }
-    /*
-	Todo: optimize this function this one along with execute packet type III are the most frequently called functions for PM4
+/*
+    Todo: optimize this function this one along with execute packet type III are
+   the most frequently called functions for PM4
 */
 XE_NOINLINE
 bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT {
@@ -99,7 +98,6 @@ bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT {
 
   uint32_t count = ((packet >> 16) & 0x3FFF) + 1;
 
-
   if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >=
       count * sizeof(uint32_t)) {
     trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
@@ -143,7 +141,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType2(uint32_t packet) XE_RESTRICT {
   trace_writer_.WritePacketEnd();
   return true;
 }
-XE_NOINLINE
+XE_FORCEINLINE
 XE_NOALIAS
 uint32_t COMMAND_PROCESSOR::GetCurrentRingReadCount() {
   return reader_.read_count();
@@ -446,41 +444,46 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER(
   return true;
 }
 
-XE_NOINLINE
+/*
+        chrispy: this is fine to inline, as a noinline function it compiled down
+   to 54 bytes
+*/
 static bool MatchValueAndRef(uint32_t value, uint32_t ref, uint32_t wait_info) {
-  /*
-	Todo: should subtract values from each other twice with the sides inverted and then create a mask from the sign bits 
-	then use the wait_info value in order to select the bits that correctly implement the condition
-	If neither subtraction has the signbit set then that means the value is equal
-  */
-  bool matched = false;
-  switch (wait_info & 0x7) {
-    case 0x0:  // Never.
-      matched = false;
-      break;
-    case 0x1:  // Less than reference.
-      matched = value < ref;
-      break;
-    case 0x2:  // Less than or equal to reference.
-      matched = value <= ref;
-      break;
-    case 0x3:  // Equal to reference.
-      matched = value == ref;
-      break;
-    case 0x4:  // Not equal to reference.
-      matched = value != ref;
-      break;
-    case 0x5:  // Greater than or equal to reference.
-      matched = value >= ref;
-      break;
-    case 0x6:  // Greater than reference.
-      matched = value > ref;
-      break;
-    case 0x7:  // Always
-      matched = true;
-      break;
-  }
-  return matched;
+// smaller code is generated than the #else path, although whether it is faster
+// i do not know. i don't think games do an enormous number of cond_write
+// though, so we have picked
+// the path with the smaller codegen.
+// we do technically have more instructions executed vs the switch case method,
+// but we have no mispredicts and most of our instructions are 0.25/0.3
+// throughput
+#if 1
+  uint32_t value_minus_ref =
+      static_cast<uint32_t>(static_cast<int32_t>(value - ref) >> 31);
+  uint32_t ref_minus_value =
+      static_cast<uint32_t>(static_cast<int32_t>(ref - value) >> 31);
+  uint32_t eqmask = ~(value_minus_ref | ref_minus_value);
+  uint32_t nemask = (value_minus_ref | ref_minus_value);
+
+  uint32_t value_lt_mask = value_minus_ref;
+  uint32_t value_gt_mask = ref_minus_value;
+  uint32_t value_lte_mask = value_lt_mask | eqmask;
+  uint32_t value_gte_mask = value_gt_mask | eqmask;
+
+  uint32_t bits_for_selecting =
+      (value_lt_mask & (1 << 1)) | (value_lte_mask & (1 << 2)) |
+      (eqmask & (1 << 3)) | (nemask & (1 << 4)) | (value_gte_mask & (1 << 5)) |
+      (value_gt_mask & (1 << 6)) | (1 << 7);
+
+  return (bits_for_selecting >> (wait_info & 7)) & 1;
+
+#else
+
+  return ((((value < ref) << 1) | ((value <= ref) << 2) |
+           ((value == ref) << 3) | ((value != ref) << 4) |
+           ((value >= ref) << 5) | ((value > ref) << 6) | (1 << 7)) >>
+          (wait_info & 7)) &
+         1;
+#endif
 }
 XE_NOINLINE
 bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
@@ -1128,7 +1131,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY(
 }
 
 uint32_t COMMAND_PROCESSOR::ExecutePrimaryBuffer(uint32_t read_index,
-                                                uint32_t write_index) {
+                                                 uint32_t write_index) {
   SCOPE_profile_cpu_f("gpu");
 #if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
   // If we have a pending trace stream open it now. That way we ensure we get
diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h
index ff29fcb38..606367b66 100644
--- a/src/xenia/gpu/texture_cache.h
+++ b/src/xenia/gpu/texture_cache.h
@@ -104,6 +104,15 @@ class TextureCache {
   void TextureFetchConstantWritten(uint32_t index) {
     texture_bindings_in_sync_ &= ~(UINT32_C(1) << index);
   }
+  void TextureFetchConstantsWritten(uint32_t first_index, uint32_t last_index) {
+    // generate a mask of all bits from before the first index, and xor it with
+    // all bits before the last index this produces a mask covering only the
+    // bits between first and last
+    uint32_t res = ((1U << first_index) - 1) ^ ((1U << last_index) - 1);
+    // todo: check that this is right
+
+    texture_bindings_in_sync_ &= ~res;
+  }
 
   virtual void RequestTextures(uint32_t used_texture_mask);
 

From ab6d9dade0c62f5a5c689cb6e7a4474fba9cb9a9 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Wed, 14 Dec 2022 07:53:21 -0800
Subject: [PATCH 2/6] add avx2 codepath for copy_and_swap_32_unaligned use the
 new writerange approach in WriteRegisterRangeFromMem_WithKnownBound

---
 src/xenia/base/memory.cc                      |  46 +++-
 .../gpu/d3d12/d3d12_command_processor.cc      | 243 +++++++++---------
 src/xenia/gpu/d3d12/d3d12_command_processor.h |   3 +
 3 files changed, 163 insertions(+), 129 deletions(-)

diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc
index cf6788f96..41972aae9 100644
--- a/src/xenia/base/memory.cc
+++ b/src/xenia/base/memory.cc
@@ -309,15 +309,45 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
                                 size_t count) {
   auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
   auto src = reinterpret_cast<const uint32_t*>(src_ptr);
-  __m128i shufmask =
-      _mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
-                   0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
-
   size_t i;
-  for (i = 0; i + 4 <= count; i += 4) {
-    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
-    __m128i output = _mm_shuffle_epi8(input, shufmask);
-    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+  // chrispy: this optimization mightt backfire if our unaligned load spans two
+  // cachelines... which it probably will
+  if (amd64::GetFeatureFlags() & amd64::kX64EmitAVX2) {
+    __m256i shufmask = _mm256_set_epi8(
+        0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, 0x06, 0x07,
+        0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B,
+        0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
+    // with vpshufb being a 0.5 through instruction, it makes the most sense to
+    // double up on our iters
+    for (i = 0; i + 16 <= count; i += 16) {
+      __m256i input1 =
+          _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
+      __m256i input2 =
+          _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i + 8]));
+
+      __m256i output1 = _mm256_shuffle_epi8(input1, shufmask);
+      __m256i output2 = _mm256_shuffle_epi8(input2, shufmask);
+
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1);
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2);
+    }
+    for (; i + 8 <= count; i += 8) {
+      __m256i input =
+          _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
+      __m256i output = _mm256_shuffle_epi8(input, shufmask);
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output);
+    }
+  } else {
+    __m128i shufmask =
+        _mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
+                     0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
+
+    for (i = 0; i + 4 <= count; i += 4) {
+      __m128i input =
+          _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+      __m128i output = _mm_shuffle_epi8(input, shufmask);
+      _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+    }
   }
   XE_WORKAROUND_CONSTANT_RETURN_IF(count % 4 == 0);
   for (; i < count; ++i) {  // handle residual elements
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index 70b09501e..09315be09 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -1903,11 +1903,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
   uint32_t num_regs_firstrange =
       static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
 
-  D3D12CommandProcessor::WriteRegistersFromMem(
+  D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
       base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
       num_regs_firstrange);
 
-  D3D12CommandProcessor::WriteRegistersFromMem(
+  D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
       base + num_regs_firstrange,
       reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
       num_registers - num_regs_firstrange);
@@ -1948,6 +1948,7 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
 }
 void D3D12CommandProcessor::WriteShaderConstantsFromMem(
     uint32_t start_index, uint32_t* base, uint32_t num_registers) {
+#if 1
   if (frame_open_) {
     bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
     bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
@@ -1964,12 +1965,36 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
       uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
       uint32_t end_map_index =
           (start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
+
+      if (map_index < 256 && cbuffer_vertex_uptodate) {
+        for (; map_index < end_map_index; ++map_index) {
+          if (current_float_constant_map_vertex_[map_index >> 6] &
+              (1ull << map_index)) {
+            cbuffer_vertex_uptodate = false;
+            break;
+          }
+        }
+      }
+      if (end_map_index > 256 && cbuffer_pixel_uptodate) {
+        for (; map_index < end_map_index; ++map_index) {
+          uint32_t float_constant_index = map_index;
+          float_constant_index -= 256;
+          if (current_float_constant_map_pixel_[float_constant_index >> 6] &
+              (1ull << float_constant_index)) {
+            cbuffer_pixel_uptodate = false;
+            break;
+          }
+        }
+      }
+
+#if 0
       if (!cbuffer_vertex_uptodate) {
         if (256 >= end_map_index) {
           goto skip_map_checks;
         }
         map_index = 256;
       }
+
       for (; map_index < end_map_index; ++map_index) {
         uint32_t float_constant_index = map_index;
         if (float_constant_index >= 256) {
@@ -1996,10 +2021,17 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
         }
       }
     skip_map_checks:;
+#endif
     }
     cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
     cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
   }
+#else
+  if (frame_open_) {
+    cbuffer_binding_float_pixel_.up_to_date = false;
+    cbuffer_binding_float_vertex_.up_to_date = false;
+  }
+#endif
   // maybe use non-temporal copy if possible...
   copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
                              num_registers);
@@ -2038,10 +2070,16 @@ void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
   auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
     return std::min<uint32_t>(regnum, end) - current_index;
   };
-
-#define DO_A_RANGE_CALLBACK(start_range, end_range, index, base, n)     \
-  WriteRegisterRangeFromMem_WithKnownBound<(start_range), (end_range)>( \
-      index, base, n)
+#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
+  copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
+#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
+  WriteFetchFromMem(ind, b, n)
+#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
+  WritePossiblySpecialRegistersFromMem(ind, bs, n)
+#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
+                                        n)                                   \
+  WriteShaderConstantsFromMem(index, base, n)
+#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
 
 #define DO_A_RANGE(start_range, end_range, cb)                     \
   if (current_index < (end_range)) {                               \
@@ -2054,145 +2092,108 @@ void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
     return;                                                        \
   }
 
-  if (start_index >= XE_GPU_REG_SHADER_CONSTANT_000_X) {  // fairly common
-    goto shader_vars_start;
-  }
-
-#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
-  copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
   DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
+
   DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
-             DO_A_RANGE_CALLBACK);
+             SPECIAL_REG_RANGE_CALLBACK);
   DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
              REGULAR_WRITE_CALLBACK);
 
-#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
-                                        n)                                   \
-  WriteShaderConstantsFromMem(index, base, n)
-shader_vars_start:
   DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
              XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
              WRITE_SHADER_CONSTANTS_CALLBACK);
 
-#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
-  WriteFetchFromMem(ind, b, n)
   DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
              XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
              WRITE_FETCH_CONSTANTS_CALLBACK);
-#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
   DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
              XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
   DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
              REGULAR_WRITE_CALLBACK);
 }
+void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem(
+    uint32_t start_index, uint32_t* base, uint32_t numregs) {
+  uint32_t end = numregs + start_index;
+  for (uint32_t index = start_index; index < end; ++index, ++base) {
+    uint32_t value = xe::load_and_swap<uint32_t>(base);
+
+    register_file_->values[index].u32 = value;
+
+    unsigned expr = 0;
+
+    expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8);
+
+    expr |= (index == XE_GPU_REG_COHER_STATUS_HOST);
+
+    expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
+             (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
+
+    if (expr == 0) {
+    } else {
+      HandleSpecialRegisterWrite(index, value);
+    }
+  }
+}
 template <uint32_t register_lower_bound, uint32_t register_upper_bound>
 XE_FORCEINLINE void
 D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
-    uint32_t base, uint32_t* range, uint32_t num_registers) {
-  constexpr auto bounds_has_reg =
-      bounds_may_have_reg<register_lower_bound, register_upper_bound>;
-  constexpr auto bounds_has_bounds =
-      bounds_may_have_bounds<register_lower_bound, register_upper_bound>;
+    uint32_t start_index, uint32_t* base, uint32_t num_registers) {
+  uint32_t end = start_index + num_registers;
 
-  bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
-  bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
+  uint32_t current_index = start_index;
 
-  bool skip_uptodate_checks =
-      (!cbuffer_pixel_uptodate && !cbuffer_vertex_uptodate) || (!frame_open_);
-  for (uint32_t i = 0; i < num_registers; ++i) {
-    uint32_t data = xe::load_and_swap<uint32_t>(range + i);
-    uint32_t index = base + i;
-    uint32_t value = data;
-    // cant if constexpr this one or we get unreferenced label errors, and if we
-    // move the label into the else we get errors about a jump from one if
-    // constexpr into another
-    if (register_lower_bound == 0 && register_upper_bound == 0xFFFF) {
-      D3D12CommandProcessor::WriteRegisterForceinline(index, value);
-    } else {
-      XE_MSVC_ASSUME(index >= register_lower_bound &&
-                     index < register_upper_bound);
-      register_file_->values[index].u32 = value;
+  auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
+    return std::min<uint32_t>(regnum, end) - current_index;
+  };
+#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
+  copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
+#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
+  WriteFetchFromMem(ind, b, n)
+#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
+  WritePossiblySpecialRegistersFromMem(ind, bs, n)
+#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
+                                        n)                                   \
+  WriteShaderConstantsFromMem(index, base, n)
+#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
 
-      unsigned expr = 0;
-
-      if constexpr (bounds_has_bounds(XE_GPU_REG_SCRATCH_REG0,
-                                      XE_GPU_REG_SCRATCH_REG7)) {
-        expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8);
-      }
-      if constexpr (bounds_has_reg(XE_GPU_REG_COHER_STATUS_HOST)) {
-        expr |= (index == XE_GPU_REG_COHER_STATUS_HOST);
-      }
-      if constexpr (bounds_has_bounds(XE_GPU_REG_DC_LUT_RW_INDEX,
-                                      XE_GPU_REG_DC_LUT_30_COLOR)) {
-        expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
-                 (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
-      }
-      // chrispy: reordered for msvc branch probability (assumes
-      // if is taken and else is not)
-      if (XE_LIKELY(expr == 0)) {
-        XE_MSVC_REORDER_BARRIER();
-
-      } else {
-        HandleSpecialRegisterWrite(index, value);
-        goto write_done;
-      }
-      XE_MSVC_ASSUME(index >= register_lower_bound &&
-                     index < register_upper_bound);
-      if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X,
-                                      XE_GPU_REG_SHADER_CONSTANT_511_W)) {
-        if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
-            index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
-          if (!skip_uptodate_checks) {
-            uint32_t float_constant_index =
-                (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
-            if (float_constant_index >= 256) {
-              float_constant_index -= 256;
-              if (current_float_constant_map_pixel_[float_constant_index >> 6] &
-                  (1ull << (float_constant_index & 63))) {
-                cbuffer_binding_float_pixel_.up_to_date = false;
-              }
-            } else {
-              if (current_float_constant_map_vertex_[float_constant_index >>
-                                                     6] &
-                  (1ull << (float_constant_index & 63))) {
-                cbuffer_binding_float_vertex_.up_to_date = false;
-                if (!cbuffer_pixel_uptodate) {
-                  skip_uptodate_checks = true;
-                }
-              }
-            }
-          }
-          goto write_done;
-        }
-      }
-      XE_MSVC_ASSUME(index >= register_lower_bound &&
-                     index < register_upper_bound);
-      if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
-                                      XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) {
-        if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
-            index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
-          cbuffer_binding_fetch_.up_to_date = false;
-          // texture cache is never nullptr
-          texture_cache_->TextureFetchConstantWritten(
-              (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
-
-          goto write_done;
-        }
-      }
-
-      XE_MSVC_ASSUME(index >= register_lower_bound &&
-                     index < register_upper_bound);
-      if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
-                                      XE_GPU_REG_SHADER_CONSTANT_LOOP_31)) {
-        if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 &&
-            index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) {
-          cbuffer_binding_bool_loop_.up_to_date = false;
-          goto write_done;
-        }
-      }
-    }
-  write_done:;
+#define DO_A_RANGE(start_range, end_range, cb)                     \
+  if (current_index < (end_range)) {                               \
+    uint32_t ntowrite = get_end_before_qty(end_range);             \
+    cb((start_range), (end_range), current_index, base, ntowrite); \
+    current_index += ntowrite;                                     \
+    base += ntowrite;                                              \
+  }                                                                \
+  if (current_index >= end) {                                      \
+    return;                                                        \
   }
+
+#define REFRESH_MSVC_RANGE()                              \
+  XE_MSVC_ASSUME(current_index >= register_lower_bound && \
+                 current_index < register_upper_bound)
+
+  REFRESH_MSVC_RANGE();
+  DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
+  REFRESH_MSVC_RANGE();
+  DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
+             SPECIAL_REG_RANGE_CALLBACK);
+  REFRESH_MSVC_RANGE();
+  DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
+             REGULAR_WRITE_CALLBACK);
+  REFRESH_MSVC_RANGE();
+  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
+             XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
+             WRITE_SHADER_CONSTANTS_CALLBACK);
+  REFRESH_MSVC_RANGE();
+  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
+             XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
+             WRITE_FETCH_CONSTANTS_CALLBACK);
+  REFRESH_MSVC_RANGE();
+  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
+             XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
+  REFRESH_MSVC_RANGE();
+  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
+             REGULAR_WRITE_CALLBACK);
+
 }
 template <uint32_t register_lower_bound, uint32_t register_upper_bound>
 XE_FORCEINLINE void
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h
index 0fcd76593..7bf11c9b7 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.h
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h
@@ -226,6 +226,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
                          uint32_t num_registers);
   void WriteRegistersFromMemCommonSense(uint32_t start_index,  uint32_t* base,
                                      uint32_t num_registers) ;
+
+  void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base,
+                                           uint32_t num_registers);
   template <uint32_t register_lower_bound, uint32_t register_upper_bound>
   XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
       uint32_t start_index, uint32_t* base, uint32_t num_registers);

From 080b6f4cbd501bcaa1f6b81da7424b2f34f350fc Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Wed, 14 Dec 2022 09:33:14 -0800
Subject: [PATCH 3/6] Partially vectorized GetScissor (loading and unpacking
 the bitfields from the registers is still scalar)

---
 .../gpu/d3d12/d3d12_command_processor.cc      |  9 +-
 src/xenia/gpu/draw_util.cc                    | 87 ++++++++++++++++++-
 2 files changed, 92 insertions(+), 4 deletions(-)

diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index 09315be09..5c7c6722f 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -2193,7 +2193,6 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
   REFRESH_MSVC_RANGE();
   DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
              REGULAR_WRITE_CALLBACK);
-
 }
 template <uint32_t register_lower_bound, uint32_t register_upper_bound>
 XE_FORCEINLINE void
@@ -2799,11 +2798,17 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   // todo: use SIMD for getscissor + scaling here, should reduce code size more
   draw_util::Scissor scissor;
   draw_util::GetScissor(regs, scissor);
+#if XE_ARCH_AMD64 == 1
+  __m128i* scisp = (__m128i*)&scissor;
+  *scisp = _mm_mullo_epi32(
+      *scisp, _mm_setr_epi32(draw_resolution_scale_x, draw_resolution_scale_y,
+                             draw_resolution_scale_x, draw_resolution_scale_y));
+#else
   scissor.offset[0] *= draw_resolution_scale_x;
   scissor.offset[1] *= draw_resolution_scale_y;
   scissor.extent[0] *= draw_resolution_scale_x;
   scissor.extent[1] *= draw_resolution_scale_y;
-
+#endif
   // Update viewport, scissor, blend factor and stencil reference.
   UpdateFixedFunctionState(viewport_info, scissor, primitive_polygonal,
                            normalized_depth_control);
diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc
index 98d2802ee..e6461e8bd 100644
--- a/src/xenia/gpu/draw_util.cc
+++ b/src/xenia/gpu/draw_util.cc
@@ -552,8 +552,90 @@ void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
   }
 }
 template <bool clamp_to_surface_pitch>
-XE_NOINLINE static void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs,
-                                       Scissor& XE_RESTRICT scissor_out) {
+static inline
+void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs,
+                           Scissor& XE_RESTRICT scissor_out) {
+#if XE_ARCH_AMD64 == 1
+  auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
+  auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
+  auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
+  auto pa_sc_screen_scissor_tl = regs.Get<reg::PA_SC_SCREEN_SCISSOR_TL>();
+  auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
+  uint32_t surface_pitch = 0;
+  if constexpr (clamp_to_surface_pitch) {
+    surface_pitch = regs.Get<reg::RB_SURFACE_INFO>().surface_pitch;
+  }
+  uint32_t pa_sc_window_scissor_tl_tl_x = pa_sc_window_scissor_tl.tl_x,
+           pa_sc_window_scissor_tl_tl_y = pa_sc_window_scissor_tl.tl_y,
+           pa_sc_window_scissor_br_br_x = pa_sc_window_scissor_br.br_x,
+           pa_sc_window_scissor_br_br_y = pa_sc_window_scissor_br.br_y,
+           pa_sc_window_offset_window_x_offset =
+               pa_sc_window_offset.window_x_offset,
+           pa_sc_window_offset_window_y_offset =
+               pa_sc_window_offset.window_y_offset,
+           pa_sc_screen_scissor_tl_tl_x = pa_sc_screen_scissor_tl.tl_x,
+           pa_sc_screen_scissor_tl_tl_y = pa_sc_screen_scissor_tl.tl_y,
+           pa_sc_screen_scissor_br_br_x = pa_sc_screen_scissor_br.br_x,
+           pa_sc_screen_scissor_br_br_y = pa_sc_screen_scissor_br.br_y;
+
+  int32_t tl_x = int32_t(pa_sc_window_scissor_tl_tl_x);
+  int32_t tl_y = int32_t(pa_sc_window_scissor_tl_tl_y);
+
+  int32_t br_x = int32_t(pa_sc_window_scissor_br_br_x);
+  int32_t br_y = int32_t(pa_sc_window_scissor_br_br_y);
+
+  __m128i tmp1 = _mm_setr_epi32(tl_x, tl_y, br_x, br_y);
+  __m128i pa_sc_scissor = _mm_setr_epi32(
+      pa_sc_screen_scissor_tl_tl_x, pa_sc_screen_scissor_tl_tl_y,
+      pa_sc_screen_scissor_br_br_x, pa_sc_screen_scissor_br_br_y);
+  __m128i xyoffsetadd = _mm_cvtsi64x_si128(
+      static_cast<unsigned long long>(pa_sc_window_offset_window_x_offset) |
+      (static_cast<unsigned long long>(pa_sc_window_offset_window_y_offset)
+       << 32));
+  xyoffsetadd = _mm_unpacklo_epi64(xyoffsetadd, xyoffsetadd);
+  // chrispy: put this here to make it clear that the shift by 31 is extracting
+  // this field
+  XE_MAYBE_UNUSED
+  uint32_t window_offset_disable_reference =
+      pa_sc_window_scissor_tl.window_offset_disable;
+
+  __m128i offset_disable_mask = _mm_set1_epi32(pa_sc_window_scissor_tl.value);
+
+  __m128i addend = _mm_blendv_epi8(xyoffsetadd, _mm_setzero_si128(),
+                                   _mm_srai_epi32(offset_disable_mask, 31));
+
+  tmp1 = _mm_add_epi32(tmp1, addend);
+
+  //}
+  // Screen scissor is not used by Direct3D 9 (always 0, 0 to 8192, 8192), but
+  // still handled here for completeness.
+  __m128i lomax = _mm_max_epi32(tmp1, pa_sc_scissor);
+  __m128i himin = _mm_min_epi32(tmp1, pa_sc_scissor);
+
+  tmp1 = _mm_blend_epi16(lomax, himin, 0b11110000);
+
+  if constexpr (clamp_to_surface_pitch) {
+    // Clamp the horizontal scissor to surface_pitch for safety, in case that's
+    // not done by the guest for some reason (it's not when doing draws without
+    // clipping in Direct3D 9, for instance), to prevent overflow - this is
+    // important for host implementations, both based on target-indepedent
+    // rasterization without render target width at all (pixel shader
+    // interlock-based custom RB implementations) and using conventional render
+    // targets, but padded to EDRAM tiles.
+    tmp1 = _mm_blend_epi16(
+        tmp1, _mm_min_epi32(tmp1, _mm_set1_epi32(surface_pitch)),
+                            0b00110011);
+  }
+
+  tmp1 = _mm_max_epi32(tmp1, _mm_setzero_si128());
+
+  __m128i tl_in_high = _mm_unpacklo_epi64(tmp1, tmp1);
+
+  __m128i final_br = _mm_max_epi32(tmp1, tl_in_high);
+  final_br = _mm_sub_epi32(final_br, tl_in_high);
+  __m128i scissor_res = _mm_blend_epi16(tmp1, final_br, 0b11110000);
+  _mm_storeu_si128((__m128i*)&scissor_out, scissor_res);
+#else
   auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
   auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
   auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
@@ -629,6 +711,7 @@ XE_NOINLINE static void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs,
   scissor_out.offset[1] = uint32_t(tl_y);
   scissor_out.extent[0] = uint32_t(br_x - tl_x);
   scissor_out.extent[1] = uint32_t(br_y - tl_y);
+#endif
 }
 
 void GetScissor(const RegisterFile& XE_RESTRICT regs,

From 7a0fd0f32a65582071c1618a0c92e4c9727efcfc Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Wed, 14 Dec 2022 09:56:33 -0800
Subject: [PATCH 4/6] Remove MaybeYields when vsync is off

---
 src/xenia/gpu/pm4_command_processor_implement.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/xenia/gpu/pm4_command_processor_implement.h b/src/xenia/gpu/pm4_command_processor_implement.h
index 21b9553d4..89dcdf7ca 100644
--- a/src/xenia/gpu/pm4_command_processor_implement.h
+++ b/src/xenia/gpu/pm4_command_processor_implement.h
@@ -523,19 +523,18 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
         PrepareForWait();
         if (!cvars::vsync) {
           // User wants it fast and dangerous.
-          xe::threading::MaybeYield();
+          // do nothing
         } else {
           xe::threading::Sleep(std::chrono::milliseconds(wait / 0x100));
+          ReturnFromWait();
         }
-        // xe::threading::SyncMemory();
-        ReturnFromWait();
 
         if (!worker_running_) {
           // Short-circuited exit.
           return false;
         }
       } else {
-        xe::threading::MaybeYield();
+        //xe::threading::MaybeYield();
       }
     }
   } while (!matched);

From 754293ffc302374dbe025455cefa0ad1c78ad733 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Wed, 14 Dec 2022 10:13:13 -0800
Subject: [PATCH 5/6] Fix mistake with fetch constant dirty mask

---
 src/xenia/gpu/texture_cache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h
index 606367b66..717273275 100644
--- a/src/xenia/gpu/texture_cache.h
+++ b/src/xenia/gpu/texture_cache.h
@@ -108,7 +108,7 @@ class TextureCache {
     // generate a mask of all bits from before the first index, and xor it with
     // all bits before the last index this produces a mask covering only the
     // bits between first and last
-    uint32_t res = ((1U << first_index) - 1) ^ ((1U << last_index) - 1);
+    uint32_t res = ((1U << first_index) - 1) ^ ((1U << (last_index + 1)) - 1);
     // todo: check that this is right
 
     texture_bindings_in_sync_ &= ~res;

From f931c34ecb3406f68e3d7d10536bd61dc1d442a7 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Wed, 14 Dec 2022 11:34:33 -0800
Subject: [PATCH 6/6] Cleaned up for commit,  moved
 WriteRegistersFromMemCommonSense code into WriteRegistersFromMem optimized
 copy_and_swap_32_unaligned further

---
 src/xenia/base/memory.cc                      |  23 ++-
 .../gpu/d3d12/d3d12_command_processor.cc      | 135 +++---------------
 src/xenia/gpu/d3d12/d3d12_command_processor.h |  11 +-
 .../gpu/pm4_command_processor_implement.h     |   1 -
 4 files changed, 49 insertions(+), 121 deletions(-)

diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc
index 41972aae9..4fb537226 100644
--- a/src/xenia/base/memory.cc
+++ b/src/xenia/base/memory.cc
@@ -327,15 +327,34 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
 
       __m256i output1 = _mm256_shuffle_epi8(input1, shufmask);
       __m256i output2 = _mm256_shuffle_epi8(input2, shufmask);
-
+	  //chrispy: todo, benchmark this w/ and w/out these prefetches here on multiple machines
+	  //finding a good distance for prefetchw in particular is probably important
+	  //for when we're writing across 2 cachelines
+	  #if 0
+      if (i + 48 <= count) {
+        swcache::PrefetchNTA(&src[i + 32]);
+        if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
+          swcache::PrefetchW(&dest[i + 32]);
+        }
+      }
+	  #endif
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1);
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2);
     }
-    for (; i + 8 <= count; i += 8) {
+    if (i + 8 <= count) {
       __m256i input =
           _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
       __m256i output = _mm256_shuffle_epi8(input, shufmask);
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output);
+      i += 8;
+    }
+    if (i + 4 <= count) {
+      __m128i input =
+          _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+      __m128i output =
+          _mm_shuffle_epi8(input, _mm256_castsi256_si128(shufmask));
+      _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+      i += 4;
     }
   } else {
     __m128i shufmask =
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index 5c7c6722f..3d31e50de 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -1793,10 +1793,8 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
 void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
                                                   uint32_t* base,
                                                   uint32_t num_registers) {
-  for (uint32_t i = 0; i < num_registers; ++i) {
-    uint32_t data = xe::load_and_swap<uint32_t>(base + i);
-    D3D12CommandProcessor::WriteRegister(start_index + i, data);
-  }
+  WriteRegisterRangeFromMem_WithKnownBound<0, 0xFFFF>(start_index, base,
+                                                      num_registers);
 }
 
 void D3D12CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring,
@@ -1903,11 +1901,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
   uint32_t num_regs_firstrange =
       static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
 
-  D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
+  D3D12CommandProcessor::WriteRegistersFromMem(
       base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
       num_regs_firstrange);
 
-  D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
+  D3D12CommandProcessor::WriteRegistersFromMem(
       base + num_regs_firstrange,
       reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
       num_registers - num_regs_firstrange);
@@ -1917,14 +1915,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
 void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
                                                        uint32_t base,
                                                        uint32_t num_registers) {
-  // WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base,
-  //                                                     num_registers);
-
   RingBuffer::ReadRange range =
       ring->BeginRead(num_registers * sizeof(uint32_t));
 
   XE_LIKELY_IF(!range.second) {
-    WriteRegistersFromMemCommonSense(
+    WriteRegistersFromMem(
         base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
         num_registers);
 
@@ -1946,9 +1941,9 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
          bounds_may_have_reg<register_lower_bound, register_upper_bound>(
              last_reg);
 }
+XE_FORCEINLINE
 void D3D12CommandProcessor::WriteShaderConstantsFromMem(
     uint32_t start_index, uint32_t* base, uint32_t num_registers) {
-#if 1
   if (frame_open_) {
     bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
     bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
@@ -1986,57 +1981,16 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
           }
         }
       }
-
-#if 0
-      if (!cbuffer_vertex_uptodate) {
-        if (256 >= end_map_index) {
-          goto skip_map_checks;
-        }
-        map_index = 256;
-      }
-
-      for (; map_index < end_map_index; ++map_index) {
-        uint32_t float_constant_index = map_index;
-        if (float_constant_index >= 256) {
-          float_constant_index -= 256;
-          if (current_float_constant_map_pixel_[float_constant_index >> 6] &
-              (1ull << (float_constant_index & 63))) {
-            cbuffer_pixel_uptodate = false;
-            if (!cbuffer_vertex_uptodate) {
-              break;
-            }
-          }
-        } else {
-          if (current_float_constant_map_vertex_[float_constant_index >> 6] &
-              (1ull << (float_constant_index & 63))) {
-            cbuffer_vertex_uptodate = false;
-            if (!cbuffer_pixel_uptodate) {
-              break;
-            } else {
-              map_index = 255;  // skip to checking pixel
-              continue;  // increment will put us at 256, then the check will
-                         // happen
-            }
-          }
-        }
-      }
-    skip_map_checks:;
-#endif
     }
     cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
     cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
   }
-#else
-  if (frame_open_) {
-    cbuffer_binding_float_pixel_.up_to_date = false;
-    cbuffer_binding_float_vertex_.up_to_date = false;
-  }
-#endif
+
   // maybe use non-temporal copy if possible...
   copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
                              num_registers);
 }
-
+XE_FORCEINLINE
 void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index,
                                                  uint32_t* base,
                                                  uint32_t num_registers) {
@@ -2044,6 +1998,7 @@ void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index,
   copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
                              num_registers);
 }
+XE_FORCEINLINE
 void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index,
                                               uint32_t* base,
                                               uint32_t num_registers) {
@@ -2061,56 +2016,6 @@ void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index,
                              num_registers);
 }
 
-void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
-    uint32_t start_index, uint32_t* base, uint32_t num_registers) {
-  uint32_t end = start_index + num_registers;
-
-  uint32_t current_index = start_index;
-
-  auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
-    return std::min<uint32_t>(regnum, end) - current_index;
-  };
-#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
-  copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
-#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
-  WriteFetchFromMem(ind, b, n)
-#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
-  WritePossiblySpecialRegistersFromMem(ind, bs, n)
-#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
-                                        n)                                   \
-  WriteShaderConstantsFromMem(index, base, n)
-#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
-
-#define DO_A_RANGE(start_range, end_range, cb)                     \
-  if (current_index < (end_range)) {                               \
-    uint32_t ntowrite = get_end_before_qty(end_range);             \
-    cb((start_range), (end_range), current_index, base, ntowrite); \
-    current_index += ntowrite;                                     \
-    base += ntowrite;                                              \
-  }                                                                \
-  if (current_index >= end) {                                      \
-    return;                                                        \
-  }
-
-  DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
-
-  DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
-             SPECIAL_REG_RANGE_CALLBACK);
-  DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
-             REGULAR_WRITE_CALLBACK);
-
-  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
-             XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
-             WRITE_SHADER_CONSTANTS_CALLBACK);
-
-  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
-             XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
-             WRITE_FETCH_CONSTANTS_CALLBACK);
-  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
-             XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
-  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
-             REGULAR_WRITE_CALLBACK);
-}
 void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem(
     uint32_t start_index, uint32_t* base, uint32_t numregs) {
   uint32_t end = numregs + start_index;
@@ -2156,15 +2061,18 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
   WriteShaderConstantsFromMem(index, base, n)
 #define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
 
-#define DO_A_RANGE(start_range, end_range, cb)                     \
-  if (current_index < (end_range)) {                               \
-    uint32_t ntowrite = get_end_before_qty(end_range);             \
-    cb((start_range), (end_range), current_index, base, ntowrite); \
-    current_index += ntowrite;                                     \
-    base += ntowrite;                                              \
-  }                                                                \
-  if (current_index >= end) {                                      \
-    return;                                                        \
+#define DO_A_RANGE(start_range, end_range, cb)                       \
+  if constexpr (start_range >= register_lower_bound ||               \
+                end_range > register_lower_bound) {                 \
+    if (current_index < (end_range)) {                               \
+      uint32_t ntowrite = get_end_before_qty(end_range);             \
+      cb((start_range), (end_range), current_index, base, ntowrite); \
+      current_index += ntowrite;                                     \
+      base += ntowrite;                                              \
+    }                                                                \
+    if (current_index >= end) {                                      \
+      return;                                                        \
+    }                                                                \
   }
 
 #define REFRESH_MSVC_RANGE()                              \
@@ -2172,6 +2080,7 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
                  current_index < register_upper_bound)
 
   REFRESH_MSVC_RANGE();
+
   DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
   REFRESH_MSVC_RANGE();
   DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h
index 7bf11c9b7..1fbfba23c 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.h
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h
@@ -214,18 +214,19 @@ class D3D12CommandProcessor final : public CommandProcessor {
   XE_FORCEINLINE
   void WriteRegisterForceinline(uint32_t index, uint32_t value);
   void WriteRegister(uint32_t index, uint32_t value) override;
-  XE_FORCEINLINE
+  
   virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
                                      uint32_t num_registers) override;
-  //SHADER_CONSTANT_blah_XWYZ
+  /*helper functions for WriteRegistersFromMem*/
+  XE_FORCEINLINE
   void WriteShaderConstantsFromMem(uint32_t start_index, uint32_t* base,
-                                        uint32_t num_registers);
+                                     uint32_t num_registers);
+  XE_FORCEINLINE
   void WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base,
                             uint32_t num_registers);
+  XE_FORCEINLINE
   void WriteFetchFromMem(uint32_t start_index, uint32_t* base,
                          uint32_t num_registers);
-  void WriteRegistersFromMemCommonSense(uint32_t start_index,  uint32_t* base,
-                                     uint32_t num_registers) ;
 
   void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base,
                                            uint32_t num_registers);
diff --git a/src/xenia/gpu/pm4_command_processor_implement.h b/src/xenia/gpu/pm4_command_processor_implement.h
index 89dcdf7ca..4f0aaa330 100644
--- a/src/xenia/gpu/pm4_command_processor_implement.h
+++ b/src/xenia/gpu/pm4_command_processor_implement.h
@@ -534,7 +534,6 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
           return false;
         }
       } else {
-        //xe::threading::MaybeYield();
       }
     }
   } while (!matched);