faster/more compact MatchValueAndRef

Made CommandProcessor::GetCurrentRingReadCount inline. It was made noinline to match PGO's decisions, but I think PGO can make extra register-allocation decisions that make its inlining choice yield gains, whereas if we force noinline manually we lose a tiny bit of performance.

Working on a more compact vectorized version of GetScissor to save icache on the command processor thread.
Add WriteRegisterForceinline; will probably end up amending to remove it.

Add ERMS (Enhanced REP MOVSB/STOSB) path for vastcpy.

Adding WriteRegistersFromMemCommonSense (name will be changed later). The name is because I realized my approach to optimizing WriteRegisters has been backwards: instead of handling all the checks more quickly within the one loop that writes the registers, we need a different loop for each register range, each with its own unique handling. We also manage to hoist a lot of the logic out of the loops. See the sketch below.
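A minimal sketch of the per-range idea (hypothetical names and boundaries, not the macro-based implementation in the diff below):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>

// One specialized writer per register range, so each range's checks are
// hoisted out of its copy loop instead of being re-evaluated per register.
using RangeWriter = void (*)(uint32_t index, const uint32_t* src, uint32_t n);

void WriteRegistersByRange(uint32_t start, const uint32_t* base,
                           uint32_t count,
                           const std::pair<uint32_t, RangeWriter>* ranges,
                           size_t num_ranges) {
  uint32_t cur = start;
  const uint32_t end = start + count;
  // ranges[r].first is the exclusive end of range r, in ascending order.
  for (size_t r = 0; r < num_ranges && cur < end; ++r) {
    if (cur >= ranges[r].first) continue;  // write position is past this range
    uint32_t n = std::min(ranges[r].first, end) - cur;
    ranges[r].second(cur, base, n);  // dedicated loop for this range
    cur += n;
    base += n;
  }
}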

Use a 100 ns delay for MaybeYield. I noticed we often return almost immediately from the syscall, so we end up wasting some CPU; instead we now give up the CPU for the minimum waitable time (on my system, this is 0.5 ms).
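For reference, a sketch of how the new delay constant encodes that (based on NT relative-wait semantics; the name here is illustrative):

#include <windows.h>

// {~0u, -1} packs LowPart/HighPart so QuadPart == -1. A negative interval is
// relative and counted in 100 ns units, so this requests the shortest delay
// the kernel accepts, which it rounds up to the waitable-timer granularity
// (~0.5 ms on the author's system).
static const LARGE_INTEGER kMinRelativeDelay{{~0u, -1}};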

Added a note about affinity mask/dynamic process affinity updates to threading_win

Add TextureFetchConstantsWritten
chss95cs@gmail.com 2022-12-13 10:45:19 -08:00
parent 43d7fc5158
commit 82dcf3f951
9 changed files with 345 additions and 97 deletions


@@ -9,8 +9,8 @@
#include "xenia/base/memory.h"
#include "xenia/base/cvar.h"
#include "xenia/base/platform.h"
#include "xenia/base/logging.h"
#include "xenia/base/platform.h"
#if XE_ARCH_ARM64
#include <arm_neon.h>
@@ -59,7 +59,7 @@ static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
xe::swcache::CacheLine line0, line1, line2, line3;
@@ -173,7 +173,12 @@ static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr,
_movdir64b(physaddr + i, rdmapping + i);
}
}
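// ERMS path: __movsq emits rep movsq, and on CPUs reporting Enhanced REP
// MOVSB/STOSB the microcoded copy should be competitive with the AVX loop
// while costing far less icache (written_length is in bytes, hence the /8
// below to get the quadword count).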
static void vastcpy_impl_repmovs(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
__movsq((unsigned long long*)physaddr, (unsigned long long*)rdmapping,
written_length / 8);
}
XE_COLD
static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
@@ -189,6 +194,9 @@ static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) {
XELOGI("Selecting MOVDIR64M vastcpy.");
dispatch_to_use = vastcpy_impl_movdir64m;
} else if (amd64::GetFeatureFlags() & amd64::kX64FastRepMovs) {
XELOGI("Selecting rep movs vastcpy.");
dispatch_to_use = vastcpy_impl_repmovs;
} else {
XELOGI("Selecting generic AVX vastcpy.");
dispatch_to_use = vastcpy_impl_avx;


@@ -60,6 +60,12 @@ namespace xe {
namespace threading {
void EnableAffinityConfiguration() {
// chrispy: i don't think this is necessary,
// affinity always seems to be the system mask? research more
// also, maybe if ignore_thread_affinities is on we should use
// SetProcessAffinityUpdateMode to allow windows to dynamically update
// our process' affinity (by default windows cannot change the affinity itself
// at runtime, user code must do it)
HANDLE process_handle = GetCurrentProcess();
DWORD_PTR process_affinity_mask;
DWORD_PTR system_affinity_mask;
@@ -117,7 +123,7 @@ void set_name(const std::string_view name) {
// checked ntoskrnl, it does not modify delay, so we can place this as a
// constant and avoid creating a stack variable
static const LARGE_INTEGER sleepdelay0_for_maybeyield{{0LL}};
static const LARGE_INTEGER sleepdelay0_for_maybeyield{{~0u, -1}};
void MaybeYield() {
#if 0


@@ -1712,8 +1712,10 @@ void D3D12CommandProcessor::ShutdownContext() {
CommandProcessor::ShutdownContext();
}
// todo: bit-pack the bools and use bitarith to reduce branches
void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
XE_FORCEINLINE
void D3D12CommandProcessor::WriteRegisterForceinline(uint32_t index,
uint32_t value) {
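// Broadcast the 16-bit register index to every lane so a single packed
// compare can range-check it against all the special-register bounds at
// once, rather than with a chain of scalar compares and branches.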
__m128i to_rangecheck = _mm_set1_epi16(static_cast<short>(index));
__m128i lower_bounds = _mm_setr_epi16(
@@ -1783,6 +1785,10 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
return;
}
}
// todo: bit-pack the bools and use bitarith to reduce branches
void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
WriteRegisterForceinline(index, value);
}
void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
uint32_t* base,
@@ -1911,8 +1917,22 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
uint32_t base,
uint32_t num_registers) {
WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base,
num_registers);
// WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base,
// num_registers);
RingBuffer::ReadRange range =
ring->BeginRead(num_registers * sizeof(uint32_t));
XE_LIKELY_IF(!range.second) {
WriteRegistersFromMemCommonSense(
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
num_registers);
ring->EndRead(range);
}
else {
return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers);
}
}
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
@@ -1926,6 +1946,145 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
bounds_may_have_reg<register_lower_bound, register_upper_bound>(
last_reg);
}
void D3D12CommandProcessor::WriteShaderConstantsFromMem(
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
if (frame_open_) {
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
if (cbuffer_pixel_uptodate || cbuffer_vertex_uptodate) {
// super naive, could just do some bit magic and interval checking, but we
// just need this hoisted out of the copy; we do a bulk precheck followed by
// copy_and_swap_32_unaligned because it's the actual load/swap/store we're
// getting murdered by. this precheck reduced the cpu usage of
// packettype0/writeregistersfrommem from 10-11% of the cpu time spent on
// xenia to about 1%
// chrispy: todo, this can be reduced even further; this should be split into
// two loops and should skip whole words, which could net us even bigger
// gains
uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
uint32_t end_map_index =
(start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
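// each float constant is 4 dwords (X/Y/Z/W); map indices 0-255 are vertex
// constants, 256-511 are pixel constants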
if (!cbuffer_vertex_uptodate) {
if (256 >= end_map_index) {
goto skip_map_checks;
}
map_index = 256;
}
for (; map_index < end_map_index; ++map_index) {
uint32_t float_constant_index = map_index;
if (float_constant_index >= 256) {
float_constant_index -= 256;
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
(1ull << (float_constant_index & 63))) {
cbuffer_pixel_uptodate = false;
if (!cbuffer_vertex_uptodate) {
break;
}
}
} else {
if (current_float_constant_map_vertex_[float_constant_index >> 6] &
(1ull << (float_constant_index & 63))) {
cbuffer_vertex_uptodate = false;
if (!cbuffer_pixel_uptodate) {
break;
} else {
map_index = 255; // skip to checking pixel
continue; // increment will put us at 256, then the check will
// happen
}
}
}
}
skip_map_checks:;
}
cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
}
// maybe use non-temporal copy if possible...
copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
num_registers);
}
void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index,
uint32_t* base,
uint32_t num_registers) {
cbuffer_binding_bool_loop_.up_to_date = false;
copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
num_registers);
}
void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index,
uint32_t* base,
uint32_t num_registers) {
cbuffer_binding_fetch_.up_to_date = false;
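// each texture fetch constant occupies 6 dwords in the register file, hence
// the divisions by 6 below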
uint32_t first_fetch =
((start_index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
uint32_t last_fetch = // i think last_fetch should be inclusive if its
// remainder is nonzero...
(((start_index + num_registers) - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) /
6);
texture_cache_->TextureFetchConstantsWritten(first_fetch, last_fetch);
copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
num_registers);
}
void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
uint32_t end = start_index + num_registers;
uint32_t current_index = start_index;
auto get_end_before_qty = [&end, &current_index](uint32_t regnum) {
// current_index must be captured by reference, not copied, so each
// successive range sees the advanced write position
return std::min<uint32_t>(regnum, end) - current_index;
};
#define DO_A_RANGE_CALLBACK(start_range, end_range, index, base, n) \
WriteRegisterRangeFromMem_WithKnownBound<(start_range), (end_range)>( \
index, base, n)
#define DO_A_RANGE(start_range, end_range, cb) \
if (current_index < (end_range)) { \
uint32_t ntowrite = get_end_before_qty(end_range); \
cb((start_range), (end_range), current_index, base, ntowrite); \
current_index += ntowrite; \
base += ntowrite; \
} \
if (current_index >= end) { \
return; \
}
if (start_index >= XE_GPU_REG_SHADER_CONSTANT_000_X) { // fairly common
goto shader_vars_start;
}
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
DO_A_RANGE_CALLBACK);
DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
REGULAR_WRITE_CALLBACK);
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
n) \
WriteShaderConstantsFromMem(index, base, n)
shader_vars_start:
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
WRITE_SHADER_CONSTANTS_CALLBACK);
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
WriteFetchFromMem(ind, b, n)
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
WRITE_FETCH_CONSTANTS_CALLBACK);
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
REGULAR_WRITE_CALLBACK);
}
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
XE_FORCEINLINE void
D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
@@ -1935,12 +2094,21 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
constexpr auto bounds_has_bounds =
bounds_may_have_bounds<register_lower_bound, register_upper_bound>;
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
bool skip_uptodate_checks =
(!cbuffer_pixel_uptodate && !cbuffer_vertex_uptodate) || (!frame_open_);
for (uint32_t i = 0; i < num_registers; ++i) {
uint32_t data = xe::load_and_swap<uint32_t>(range + i);
{
uint32_t index = base + i;
uint32_t value = data;
uint32_t index = base + i;
uint32_t value = data;
// can't if constexpr this one or we get unreferenced-label errors, and if we
// move the label into the else we get errors about a jump from one if
// constexpr into another
if (register_lower_bound == 0 && register_upper_bound == 0xFFFF) {
D3D12CommandProcessor::WriteRegisterForceinline(index, value);
} else {
XE_MSVC_ASSUME(index >= register_lower_bound &&
index < register_upper_bound);
register_file_->values[index].u32 = value;
@@ -1968,27 +2136,13 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
HandleSpecialRegisterWrite(index, value);
goto write_done;
}
XE_MSVC_ASSUME(index >= register_lower_bound &&
index < register_upper_bound);
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) {
if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
cbuffer_binding_fetch_.up_to_date = false;
// texture cache is never nullptr
texture_cache_->TextureFetchConstantWritten(
(index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
goto write_done;
}
}
XE_MSVC_ASSUME(index >= register_lower_bound &&
index < register_upper_bound);
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X,
XE_GPU_REG_SHADER_CONSTANT_511_W)) {
if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
if (frame_open_) {
if (!skip_uptodate_checks) {
uint32_t float_constant_index =
(index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
if (float_constant_index >= 256) {
@@ -2002,12 +2156,30 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
6] &
(1ull << (float_constant_index & 63))) {
cbuffer_binding_float_vertex_.up_to_date = false;
if (!cbuffer_pixel_uptodate) {
skip_uptodate_checks = true;
}
}
}
}
goto write_done;
}
}
XE_MSVC_ASSUME(index >= register_lower_bound &&
index < register_upper_bound);
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) {
if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
cbuffer_binding_fetch_.up_to_date = false;
// texture cache is never nullptr
texture_cache_->TextureFetchConstantWritten(
(index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
goto write_done;
}
}
XE_MSVC_ASSUME(index >= register_lower_bound &&
index < register_upper_bound);
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,


@@ -211,12 +211,21 @@ class D3D12CommandProcessor final : public CommandProcessor {
protected:
bool SetupContext() override;
void ShutdownContext() override;
XE_FORCEINLINE
void WriteRegisterForceinline(uint32_t index, uint32_t value);
void WriteRegister(uint32_t index, uint32_t value) override;
XE_FORCEINLINE
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers) override;
// SHADER_CONSTANT_nnn_XYZW
void WriteShaderConstantsFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers);
void WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers);
void WriteFetchFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers);
void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base,
uint32_t num_registers);
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
uint32_t start_index, uint32_t* base, uint32_t num_registers);


@@ -551,30 +551,60 @@ void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
viewport_info_out.ndc_offset[i] = ndc_offset[i];
}
}
void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
bool clamp_to_surface_pitch) {
template <bool clamp_to_surface_pitch>
XE_NOINLINE static void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs,
Scissor& XE_RESTRICT scissor_out) {
auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
int32_t tl_x = int32_t(pa_sc_window_scissor_tl.tl_x);
int32_t tl_y = int32_t(pa_sc_window_scissor_tl.tl_y);
auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
int32_t br_x = int32_t(pa_sc_window_scissor_br.br_x);
int32_t br_y = int32_t(pa_sc_window_scissor_br.br_y);
if (!pa_sc_window_scissor_tl.window_offset_disable) {
auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
tl_x += pa_sc_window_offset.window_x_offset;
tl_y += pa_sc_window_offset.window_y_offset;
br_x += pa_sc_window_offset.window_x_offset;
br_y += pa_sc_window_offset.window_y_offset;
auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
auto pa_sc_screen_scissor_tl = regs.Get<reg::PA_SC_SCREEN_SCISSOR_TL>();
auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
uint32_t surface_pitch = 0;
if constexpr (clamp_to_surface_pitch) {
surface_pitch = regs.Get<reg::RB_SURFACE_INFO>().surface_pitch;
}
uint32_t pa_sc_window_scissor_tl_tl_x = pa_sc_window_scissor_tl.tl_x,
pa_sc_window_scissor_tl_tl_y = pa_sc_window_scissor_tl.tl_y,
pa_sc_window_scissor_br_br_x = pa_sc_window_scissor_br.br_x,
pa_sc_window_scissor_br_br_y = pa_sc_window_scissor_br.br_y,
pa_sc_window_offset_window_x_offset =
pa_sc_window_offset.window_x_offset,
pa_sc_window_offset_window_y_offset =
pa_sc_window_offset.window_y_offset,
pa_sc_screen_scissor_tl_tl_x = pa_sc_screen_scissor_tl.tl_x,
pa_sc_screen_scissor_tl_tl_y = pa_sc_screen_scissor_tl.tl_y,
pa_sc_screen_scissor_br_br_x = pa_sc_screen_scissor_br.br_x,
pa_sc_screen_scissor_br_br_y = pa_sc_screen_scissor_br.br_y;
int32_t tl_x = int32_t(pa_sc_window_scissor_tl_tl_x);
int32_t tl_y = int32_t(pa_sc_window_scissor_tl_tl_y);
int32_t br_x = int32_t(pa_sc_window_scissor_br_br_x);
int32_t br_y = int32_t(pa_sc_window_scissor_br_br_y);
// chrispy: put this here to make it clear that the shift by 31 is extracting
// this field
XE_MAYBE_UNUSED
uint32_t window_offset_disable_reference =
pa_sc_window_scissor_tl.window_offset_disable;
int32_t window_offset_disable_mask =
~(static_cast<int32_t>(pa_sc_window_scissor_tl.value) >> 31);
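// the arithmetic shift smears bit 31 (window_offset_disable) across the
// whole word; inverted, the mask is all ones when window offsets apply and
// zero when disabled, making the four adds below branchless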
// if (!pa_sc_window_scissor_tl.window_offset_disable) {
tl_x += pa_sc_window_offset_window_x_offset & window_offset_disable_mask;
tl_y += pa_sc_window_offset_window_y_offset & window_offset_disable_mask;
br_x += pa_sc_window_offset_window_x_offset & window_offset_disable_mask;
br_y += pa_sc_window_offset_window_y_offset & window_offset_disable_mask;
//}
// Screen scissor is not used by Direct3D 9 (always 0, 0 to 8192, 8192), but
// still handled here for completeness.
auto pa_sc_screen_scissor_tl = regs.Get<reg::PA_SC_SCREEN_SCISSOR_TL>();
tl_x = std::max(tl_x, int32_t(pa_sc_screen_scissor_tl.tl_x));
tl_y = std::max(tl_y, int32_t(pa_sc_screen_scissor_tl.tl_y));
auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
br_x = std::min(br_x, int32_t(pa_sc_screen_scissor_br.br_x));
br_y = std::min(br_y, int32_t(pa_sc_screen_scissor_br.br_y));
if (clamp_to_surface_pitch) {
tl_x = std::max(tl_x, int32_t(pa_sc_screen_scissor_tl_tl_x));
tl_y = std::max(tl_y, int32_t(pa_sc_screen_scissor_tl_tl_y));
br_x = std::min(br_x, int32_t(pa_sc_screen_scissor_br_br_x));
br_y = std::min(br_y, int32_t(pa_sc_screen_scissor_br_br_y));
if constexpr (clamp_to_surface_pitch) {
// Clamp the horizontal scissor to surface_pitch for safety, in case that's
// not done by the guest for some reason (it's not when doing draws without
// clipping in Direct3D 9, for instance), to prevent overflow - this is
@@ -582,7 +612,7 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
// rasterization without render target width at all (pixel shader
// interlock-based custom RB implementations) and using conventional render
// targets, but padded to EDRAM tiles.
uint32_t surface_pitch = regs.Get<reg::RB_SURFACE_INFO>().surface_pitch;
tl_x = std::min(tl_x, int32_t(surface_pitch));
br_x = std::min(br_x, int32_t(surface_pitch));
}
@@ -601,6 +631,15 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
scissor_out.extent[1] = uint32_t(br_y - tl_y);
}
void GetScissor(const RegisterFile& XE_RESTRICT regs,
Scissor& XE_RESTRICT scissor_out, bool clamp_to_surface_pitch) {
if (clamp_to_surface_pitch) {
return GetScissorTmpl<true>(regs, scissor_out);
} else {
return GetScissorTmpl<false>(regs, scissor_out);
}
}
uint32_t GetNormalizedColorMask(const RegisterFile& regs,
uint32_t pixel_shader_writes_color_targets) {
if (regs.Get<reg::RB_MODECONTROL>().edram_mode !=
@@ -863,7 +902,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
y1 = y0 + int32_t(xenos::kMaxResolveSize);
}
// fails in forza horizon 1
//x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100
// x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100
assert_true(x0 <= x1 && y0 <= y1);
if (x0 >= x1 || y0 >= y1) {
XELOGE("Resolve region is empty");
@@ -1103,7 +1142,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32;
info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32;
info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32;
#if 0
#if 0
XELOGD(
"Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially "
"modified memory range 0x{:08X} to 0x{:08X})",
@@ -1114,7 +1153,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
xenos::ColorRenderTargetFormat(color_edram_info.format)),
FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start,
copy_dest_extent_end);
#endif
#endif
return true;
}
XE_MSVC_OPTIMIZE_REVERT()


@@ -433,13 +433,15 @@ struct GetViewportInfoArgs {
void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
ViewportInfo& viewport_info_out);
struct Scissor {
struct alignas(16) Scissor {
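// 16-byte alignment so the vectorized GetScissor can (presumably) store
// offset and extent together with one SIMD write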
// Offset from render target UV = 0 to +UV.
uint32_t offset[2];
// Extent can be zero.
uint32_t extent[2];
};
void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
void GetScissor(const RegisterFile& XE_RESTRICT regs,
Scissor& XE_RESTRICT scissor_out,
bool clamp_to_surface_pitch = true);
// Returns the color component write mask for the draw command taking into


@@ -109,7 +109,7 @@ XE_NOINLINE
XE_COLD
bool HitUnimplementedOpcode(uint32_t opcode, uint32_t count) XE_RESTRICT;
XE_NOINLINE
XE_FORCEINLINE
XE_NOALIAS
uint32_t GetCurrentRingReadCount();


@@ -4,7 +4,6 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
uint32_t count) XE_RESTRICT {
SCOPE_profile_cpu_f("gpu");
trace_writer_.WriteIndirectBufferStart(ptr, count * sizeof(uint32_t));
if (count != 0) {
RingBuffer old_reader = reader_;
@@ -32,10 +31,9 @@
trace_writer_.WriteIndirectBufferEnd();
reader_ = old_reader;
} else {
//rare, but i've seen it happen! (and then a division by 0 occurs)
// rare, but i've seen it happen! (and then a division by 0 occurs)
return;
}
}
bool COMMAND_PROCESSOR::ExecutePacket() {
@@ -88,8 +86,9 @@ bool COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(uint32_t count) {
count * sizeof(uint32_t));
return false;
}
/*
Todo: optimize this function this one along with execute packet type III are the most frequently called functions for PM4
/*
Todo: optimize this function; it and ExecutePacketType3 are the most
frequently called functions for PM4
*/
XE_NOINLINE
bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT {
@@ -99,7 +98,6 @@
uint32_t count = ((packet >> 16) & 0x3FFF) + 1;
if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >=
count * sizeof(uint32_t)) {
trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
@@ -143,7 +141,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType2(uint32_t packet) XE_RESTRICT {
trace_writer_.WritePacketEnd();
return true;
}
XE_NOINLINE
XE_FORCEINLINE
XE_NOALIAS
uint32_t COMMAND_PROCESSOR::GetCurrentRingReadCount() {
return reader_.read_count();
@@ -446,41 +444,46 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER(
return true;
}
XE_NOINLINE
/*
chrispy: this is fine to inline, as a noinline function it compiled down
to 54 bytes
*/
static bool MatchValueAndRef(uint32_t value, uint32_t ref, uint32_t wait_info) {
/*
Todo: should subtract the values from each other twice with the sides
inverted, then create a mask from the sign bits, then use the wait_info value
to select the bits that correctly implement the condition. If neither
subtraction has the sign bit set, the values are equal.
*/
bool matched = false;
switch (wait_info & 0x7) {
case 0x0: // Never.
matched = false;
break;
case 0x1: // Less than reference.
matched = value < ref;
break;
case 0x2: // Less than or equal to reference.
matched = value <= ref;
break;
case 0x3: // Equal to reference.
matched = value == ref;
break;
case 0x4: // Not equal to reference.
matched = value != ref;
break;
case 0x5: // Greater than or equal to reference.
matched = value >= ref;
break;
case 0x6: // Greater than reference.
matched = value > ref;
break;
case 0x7: // Always
matched = true;
break;
}
return matched;
// smaller code is generated than for the #else path, although whether it is
// faster i do not know. i don't think games do an enormous number of
// cond_writes though, so we have picked the path with the smaller codegen.
// we do technically execute more instructions than the switch-based version,
// but we have no mispredicts and most of our instructions are 0.25/0.3
// cycles of throughput
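// worked example: value=3, ref=5 -> value_minus_ref==~0u, ref_minus_value==0,
// so bits 1 (lt), 2 (lte) and 4 (ne) are set in bits_for_selecting;
// wait_info==1 (less than) then selects bit 1 -> match. note that deriving
// the orderings from the sign of the 32-bit difference matches the unsigned
// comparisons of the #else path only while |value - ref| < 2^31.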
#if 1
uint32_t value_minus_ref =
static_cast<uint32_t>(static_cast<int32_t>(value - ref) >> 31);
uint32_t ref_minus_value =
static_cast<uint32_t>(static_cast<int32_t>(ref - value) >> 31);
uint32_t eqmask = ~(value_minus_ref | ref_minus_value);
uint32_t nemask = (value_minus_ref | ref_minus_value);
uint32_t value_lt_mask = value_minus_ref;
uint32_t value_gt_mask = ref_minus_value;
uint32_t value_lte_mask = value_lt_mask | eqmask;
uint32_t value_gte_mask = value_gt_mask | eqmask;
uint32_t bits_for_selecting =
(value_lt_mask & (1 << 1)) | (value_lte_mask & (1 << 2)) |
(eqmask & (1 << 3)) | (nemask & (1 << 4)) | (value_gte_mask & (1 << 5)) |
(value_gt_mask & (1 << 6)) | (1 << 7);
return (bits_for_selecting >> (wait_info & 7)) & 1;
#else
return ((((value < ref) << 1) | ((value <= ref) << 2) |
((value == ref) << 3) | ((value != ref) << 4) |
((value >= ref) << 5) | ((value > ref) << 6) | (1 << 7)) >>
(wait_info & 7)) &
1;
#endif
}
XE_NOINLINE
bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
@@ -1128,7 +1131,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY(
}
uint32_t COMMAND_PROCESSOR::ExecutePrimaryBuffer(uint32_t read_index,
uint32_t write_index) {
uint32_t write_index) {
SCOPE_profile_cpu_f("gpu");
#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
// If we have a pending trace stream open it now. That way we ensure we get


@@ -104,6 +104,15 @@ class TextureCache {
void TextureFetchConstantWritten(uint32_t index) {
texture_bindings_in_sync_ &= ~(UINT32_C(1) << index);
}
void TextureFetchConstantsWritten(uint32_t first_index, uint32_t last_index) {
// generate a mask of all bits before the first index and xor it with a mask
// of all bits before the last index; this produces a mask covering only the
// bits in [first_index, last_index)
uint32_t res = ((1U << first_index) - 1) ^ ((1U << last_index) - 1);
// todo: check that this is right
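// e.g. first_index=2, last_index=5: (0b11 ^ 0b11111) == 0b11100, so sync
// bits 2..4 are cleared; note (1U << last_index) would be undefined if
// last_index ever reached 32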
texture_bindings_in_sync_ &= ~res;
}
virtual void RequestTextures(uint32_t used_texture_mask);