From d6188c5d7ebcd1c2ee2a014dcbc6accb83b5ad7a Mon Sep 17 00:00:00 2001
From: Triang3l
Date: Sun, 9 Jan 2022 14:58:38 +0300
Subject: [PATCH] [GPU] Reuse base+index*stride in vfetch_mini instead of
 reloading the index GPR

The wheel shader in 4D530910 does vfetch_full to r0 with the index from
r0.x, and then vfetch_mini. Thanks @Gliniak for the finding :3

Also a small formatting cleanup in commented-out code.
---
 src/xenia/gpu/dxbc_shader_translator.cc       |   4 +-
 src/xenia/gpu/dxbc_shader_translator.h        |   4 +-
 src/xenia/gpu/dxbc_shader_translator_fetch.cc | 124 +++++++++++-------
 src/xenia/gpu/shader.h                        |   7 +
 src/xenia/gpu/shader_translator.cc            |   2 +-
 src/xenia/gpu/shader_translator_disasm.cc     |   9 +-
 src/xenia/gpu/ucode.h                         |  19 ++-
 7 files changed, 103 insertions(+), 66 deletions(-)

diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc
index 9f52bf046..350ea6895 100644
--- a/src/xenia/gpu/dxbc_shader_translator.cc
+++ b/src/xenia/gpu/dxbc_shader_translator.cc
@@ -849,7 +849,7 @@ void DxbcShaderTranslator::StartTranslation() {
   system_temp_aL_ = PushSystemTemp(0b1111);
   system_temp_loop_count_ = PushSystemTemp(0b1111);
   system_temp_grad_h_lod_ = PushSystemTemp(0b1111);
-  system_temp_grad_v_ = PushSystemTemp(0b0111);
+  system_temp_grad_v_vfetch_address_ = PushSystemTemp(0b1111);
 
   // Zero general-purpose registers to prevent crashes when the game
   // references them after only initializing them conditionally.
@@ -1039,7 +1039,7 @@ void DxbcShaderTranslator::CompleteShaderCode() {
   // - system_temp_aL_.
   // - system_temp_loop_count_.
   // - system_temp_grad_h_lod_.
-  // - system_temp_grad_v_.
+  // - system_temp_grad_v_vfetch_address_.
   PopSystemTemp(6);
 
   // Write memexported data to the shared memory UAV.
diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h
index 3bdde19be..0a25cef21 100644
--- a/src/xenia/gpu/dxbc_shader_translator.h
+++ b/src/xenia/gpu/dxbc_shader_translator.h
@@ -1104,7 +1104,9 @@ class DxbcShaderTranslator : public ShaderTranslator {
   uint32_t system_temp_loop_count_;
   // Explicitly set texture gradients and LOD.
   uint32_t system_temp_grad_h_lod_;
-  uint32_t system_temp_grad_v_;
+  // .w stores `base + index * stride` in bytes from the last vfetch_full as
+  // it may be needed by vfetch_mini.
+  uint32_t system_temp_grad_v_vfetch_address_;
 
   // The bool constant number containing the condition for the currently
   // processed exec (or the last - unless a label has reset this), or
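Note on the register repurposing above: the vertical-gradient temp previously used only .xyz, leaving .w dead, which is why the zero-initialization mask grows from 0b0111 to 0b1111 once .w starts carrying the saved vfetch address. A minimal C++ sketch of the new layout, assuming (as the mask change suggests) that PushSystemTemp's argument selects the components to zero-initialize; the struct and constant names here are invented for illustration:

    #include <cstdint>

    // Hypothetical mirror of what the renamed system temp now holds.
    struct GradVVfetchAddressTemp {
      float grad_v[3];          // .xyz: explicitly set vertical texture gradients
      uint32_t vfetch_address;  // .w: base + index * stride from the last
                                //     vfetch_full, in bytes
    };

    constexpr uint32_t kZeroInitMaskOld = 0b0111;  // only .xyz were live
    constexpr uint32_t kZeroInitMaskNew = 0b1111;  // .w is now live as well
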
diff --git a/src/xenia/gpu/dxbc_shader_translator_fetch.cc b/src/xenia/gpu/dxbc_shader_translator_fetch.cc
index 192b29a33..63480b76c 100644
--- a/src/xenia/gpu/dxbc_shader_translator_fetch.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_fetch.cc
@@ -59,47 +59,67 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
   // fetch constants on the CPU when proper bound checks are added - vfetch
   // may be conditional, so fetch constants may also be used conditionally.
 
-  // - Load the byte address in physical memory to system_temp_result_.w (so
-  //   it's not overwritten by data loads until the last one).
-  dxbc::Dest address_dest(dxbc::Dest::R(system_temp_result_, 0b1000));
-  dxbc::Src address_src(dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
-  if (instr.attributes.stride) {
-    // Convert the index to an integer by flooring or by rounding to the
-    // nearest (as floor(index + 0.5) because rounding to the nearest even
-    // makes no sense for addressing, both 1.5 and 2.5 would be 2).
-    // http://web.archive.org/web/20100302145413/http://msdn.microsoft.com:80/en-us/library/bb313960.aspx
-    {
-      bool index_operand_temp_pushed = false;
-      dxbc::Src index_operand(
-          LoadOperand(instr.operands[0], 0b0001, index_operand_temp_pushed)
-              .SelectFromSwizzled(0));
-      if (instr.attributes.is_index_rounded) {
-        a_.OpAdd(address_dest, index_operand, dxbc::Src::LF(0.5f));
-        a_.OpRoundNI(address_dest, address_src);
-      } else {
-        a_.OpRoundNI(address_dest, index_operand);
-      }
-      if (index_operand_temp_pushed) {
-        PopSystemTemp();
-      }
-    }
-    a_.OpFToI(address_dest, address_src);
-    // Extract the byte address from the fetch constant to
-    // system_temp_result_.z.
-    a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0100),
-             fetch_constant_src.SelectFromSwizzled(0),
-             dxbc::Src::LU(~uint32_t(3)));
-    // Merge the index and the base address.
-    a_.OpIMAd(address_dest, address_src,
-              dxbc::Src::LU(instr.attributes.stride * sizeof(uint32_t)),
-              dxbc::Src::R(system_temp_result_, dxbc::Src::kZZZZ));
-  } else {
-    // Fetching from the same location - extract the byte address of the
-    // beginning of the buffer.
-    a_.OpAnd(address_dest, fetch_constant_src.SelectFromSwizzled(0),
-             dxbc::Src::LU(~uint32_t(3)));
-  }
+  // - Load the part of the byte address in the physical memory that is the
+  //   same in a vfetch_full and its vfetch_mini invocations to
+  //   system_temp_grad_v_vfetch_address_.w. The index operand GPR must not
+  //   be reloaded in vfetch_mini because it may have been overwritten since
+  //   the vfetch_full, but that shouldn't have an effect on the address the
+  //   vfetch_mini actually fetches from.
+  dxbc::Src address_src(
+      dxbc::Src::R(system_temp_grad_v_vfetch_address_, dxbc::Src::kWWWW));
+  if (!instr.is_mini_fetch) {
+    dxbc::Dest address_dest(
+        dxbc::Dest::R(system_temp_grad_v_vfetch_address_, 0b1000));
+    if (instr.attributes.stride) {
+      // Convert the index to an integer by flooring or by rounding to the
+      // nearest (as floor(index + 0.5) because rounding to the nearest even
+      // makes no sense for addressing, both 1.5 and 2.5 would be 2).
+      {
+        bool index_operand_temp_pushed = false;
+        dxbc::Src index_operand(
+            LoadOperand(instr.operands[0], 0b0001, index_operand_temp_pushed)
+                .SelectFromSwizzled(0));
+        if (instr.attributes.is_index_rounded) {
+          a_.OpAdd(address_dest, index_operand, dxbc::Src::LF(0.5f));
+          a_.OpRoundNI(address_dest, address_src);
+        } else {
+          a_.OpRoundNI(address_dest, index_operand);
+        }
+        if (index_operand_temp_pushed) {
+          PopSystemTemp();
+        }
+      }
+      a_.OpFToI(address_dest, address_src);
+      // Extract the byte address from the fetch constant to
+      // system_temp_result_.w (which is not used yet).
+      a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b1000),
+               fetch_constant_src.SelectFromSwizzled(0),
+               dxbc::Src::LU(~uint32_t(3)));
+      // Merge the index and the base address.
+      a_.OpIMAd(address_dest, address_src,
+                dxbc::Src::LU(instr.attributes.stride * sizeof(uint32_t)),
+                dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
+    } else {
+      // Fetching from the same location - extract the byte address of the
+      // beginning of the buffer.
+      a_.OpAnd(address_dest, fetch_constant_src.SelectFromSwizzled(0),
+               dxbc::Src::LU(~uint32_t(3)));
+    }
+  }
+
+  dxbc::Dest address_temp_dest(dxbc::Dest::R(system_temp_result_, 0b1000));
+  dxbc::Src address_temp_src(
+      dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
+
+  // - From now on, if any additional offset must be applied to the
+  //   `base + index * stride` part of the address, it must be done by
+  //   writing to system_temp_result_.w (address_temp_dest) instead of
+  //   system_temp_grad_v_vfetch_address_.w (which must stay the same across
+  //   the vfetch_full and all its vfetch_mini invocations), and by changing
+  //   address_src to address_temp_src afterwards. system_temp_result_.w can
+  //   be used for this purpose safely because it won't be overwritten until
+  //   the last dword is loaded (after which the address won't be needed
+  //   anymore).
 
   // Add the word offset from the instruction (signed), plus the offset of the
   // first needed word within the element.
   uint32_t first_word_index;
@@ -108,8 +128,9 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
       instr.attributes.offset + int32_t(first_word_index);
   if (first_word_buffer_offset) {
     // Add the constant word offset.
-    a_.OpIAdd(address_dest, address_src,
+    a_.OpIAdd(address_temp_dest, address_src,
               dxbc::Src::LI(first_word_buffer_offset * sizeof(uint32_t)));
+    address_src = address_temp_src;
   }
 
   // - Load needed words to system_temp_result_, words 0, 1, 2, 3 to X, Y, Z, W
@@ -159,9 +180,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
           ~((uint32_t(1) << (word_index + word_count)) - uint32_t(1));
       if (word_index != word_index_previous) {
         // Go to the word in the buffer.
-        a_.OpIAdd(address_dest, address_src,
+        a_.OpIAdd(address_temp_dest, address_src,
                   dxbc::Src::LU((word_index - word_index_previous) * sizeof(uint32_t)));
+        address_src = address_temp_src;
        word_index_previous = word_index;
       }
       // Can ld_raw either to the first multiple components, or to any scalar
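For reference, the value that this hunk saves once per vfetch_full in system_temp_grad_v_vfetch_address_.w is, in scalar terms, the following. This is a minimal C++ sketch of the math the emitted DXBC performs, not Xenia code; the helper name and parameters are invented, the stride is in dwords, the base comes from the first fetch-constant dword with its two low bits masked off, and when the stride is zero only the base extraction runs:

    #include <cmath>
    #include <cstdint>

    uint32_t SharedVfetchByteAddress(float index, bool is_index_rounded,
                                     uint32_t fetch_constant_dword0,
                                     uint32_t stride_dwords) {
      // Round to the nearest as floor(index + 0.5) - rounding to the nearest
      // even would map both 1.5 and 2.5 to 2, which makes no sense for
      // addressing - or simply floor.
      float index_floored = is_index_rounded ? std::floor(index + 0.5f)
                                             : std::floor(index);
      // ftoi, base extraction (and), then merge (imad), as translated above.
      int32_t index_int = int32_t(index_floored);
      uint32_t base_bytes = fetch_constant_dword0 & ~uint32_t(3);
      return base_bytes + uint32_t(index_int) * stride_dwords *
                              uint32_t(sizeof(uint32_t));
    }

A vfetch_mini never reruns this computation; its own dword offset is applied on top of the saved value in the system_temp_result_.w scratch, as the two OpIAdd changes above show.
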
@@ -592,7 +614,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
     case FetchOpcode::kSetTextureGradientsVert: {
       bool grad_operand_temp_pushed = false;
       a_.OpMov(
-          dxbc::Dest::R(system_temp_grad_v_, 0b0111),
+          dxbc::Dest::R(system_temp_grad_v_vfetch_address_, 0b0111),
           LoadOperand(instr.operands[0], 0b0111, grad_operand_temp_pushed));
       if (grad_operand_temp_pushed) {
         PopSystemTemp();
       }
@@ -1521,15 +1543,15 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
         // Extract gradient exponent biases from the fetch constant and merge
         // them with the LOD bias.
         a_.OpIBFE(dxbc::Dest::R(grad_h_lod_temp, 0b0011), dxbc::Src::LU(5),
-                 dxbc::Src::LU(22, 27, 0, 0),
-                 RequestTextureFetchConstantWord(tfetch_index, 4));
+                  dxbc::Src::LU(22, 27, 0, 0),
+                  RequestTextureFetchConstantWord(tfetch_index, 4));
         a_.OpIMAd(dxbc::Dest::R(grad_h_lod_temp, 0b0011),
-                 dxbc::Src::R(grad_h_lod_temp), dxbc::Src::LI(int32_t(1) << 23),
-                 dxbc::Src::LF(1.0f));
+                  dxbc::Src::R(grad_h_lod_temp),
+                  dxbc::Src::LI(int32_t(1) << 23), dxbc::Src::LF(1.0f));
         a_.OpMul(dxbc::Dest::R(grad_v_temp, 0b1000), lod_src,
-                dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kYYYY));
+                 dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kYYYY));
         a_.OpMul(lod_dest, lod_src,
-                dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kXXXX));
+                 dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kXXXX));
 #endif
         // Obtain the gradients and apply biases to them.
         if (instr.attributes.use_register_gradients) {
@@ -1540,11 +1562,11 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
           // done in getCompTexLOD, so don't do it here too.
 #if 0
           a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
-                  dxbc::Src::R(system_temp_grad_v_),
-                  dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
+                   dxbc::Src::R(system_temp_grad_v_vfetch_address_),
+                   dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
 #else
           a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
-                  dxbc::Src::R(system_temp_grad_v_), lod_src);
+                   dxbc::Src::R(system_temp_grad_v_vfetch_address_), lod_src);
 #endif
           // TODO(Triang3l): Are cube map register gradients unnormalized if
           // the coordinates themselves are unnormalized?
@@ -1586,8 +1608,8 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
           // done in getCompTexLOD, so don't do it here too.
 #if 0
           a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
-                  dxbc::Src::R(grad_v_temp),
-                  dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
+                   dxbc::Src::R(grad_v_temp),
+                   dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
 #else
           a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
                    dxbc::Src::R(grad_v_temp), lod_src);
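The pattern this fixes, reconstructed from the commit message (hypothetical disassembly for illustration, not an actual dump from 4D530910):

    vfetch_full r0.xyzw, r0.x, vf95, RoundIndex=true
    vfetch_mini r1.xyzw

The vfetch_full writes its result to r0, clobbering the r0.x it took the index from; a vfetch_mini that reloaded r0.x would therefore compute a garbage address, while reusing the saved base + index * stride gives the correct one.
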
diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h
index f7c52cab0..2ce81409a 100644
--- a/src/xenia/gpu/shader.h
+++ b/src/xenia/gpu/shader.h
@@ -440,6 +440,13 @@ struct ParsedVertexFetchInstruction {
   // Number of source operands.
   size_t operand_count = 0;
   // Describes each source operand.
+  // Note that for vfetch_mini, which inherits the operands from vfetch_full,
+  // the index operand register may have been overwritten between the
+  // vfetch_full and the vfetch_mini (happens in 4D530910 for wheels), but
+  // that should have no effect on the index actually used for fetching. A
+  // copy of the index therefore must be stored by vfetch_full (the base
+  // address, stride and rounding may be pre-applied to it since they will be
+  // the same in the vfetch_full and all its vfetch_mini instructions).
   InstructionOperand operands[2];
 
   struct Attributes {
diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc
index b1d1a060e..612edbfbf 100644
--- a/src/xenia/gpu/shader_translator.cc
+++ b/src/xenia/gpu/shader_translator.cc
@@ -876,7 +876,7 @@ bool ParseVertexFetchInstruction(const VertexFetchInstruction& op,
   instr.attributes.stride = full_op.stride();
   instr.attributes.exp_adjust = op.exp_adjust();
   instr.attributes.prefetch_count = op.prefetch_count();
-  instr.attributes.is_index_rounded = op.is_index_rounded();
+  instr.attributes.is_index_rounded = full_op.is_index_rounded();
   instr.attributes.is_signed = op.is_signed();
   instr.attributes.is_integer = !op.is_normalized();
   instr.attributes.signed_rf_mode = op.signed_rf_mode();
diff --git a/src/xenia/gpu/shader_translator_disasm.cc b/src/xenia/gpu/shader_translator_disasm.cc
index cf7e94d52..8dd72413a 100644
--- a/src/xenia/gpu/shader_translator_disasm.cc
+++ b/src/xenia/gpu/shader_translator_disasm.cc
@@ -328,13 +328,12 @@ void ParsedVertexFetchInstruction::Disassemble(StringBuffer* out) const {
   if (!is_mini_fetch) {
     out->Append(", ");
     DisassembleSourceOperand(operands[0], out);
-    out->Append(", ");
-    out->AppendFormat("vf{}", 95 - operands[1].storage_index);
+    out->AppendFormat(", vf{}", 95 - operands[1].storage_index);
+    if (attributes.is_index_rounded) {
+      out->Append(", RoundIndex=true");
+    }
   }
 
-  if (attributes.is_index_rounded) {
-    out->Append(", RoundIndex=true");
-  }
   if (attributes.exp_adjust) {
     out->AppendFormat(", ExpAdjust={}", attributes.exp_adjust);
   }
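The disassembler change above means the RoundIndex flag is now printed only for vfetch_full, where the underlying bit is actually meaningful (hypothetical before/after output for a mini fetch whose stale is_index_rounded bit happened to be set):

    before: vfetch_mini r1.xyzw, RoundIndex=true
    after:  vfetch_mini r1.xyzw

This matches the parsing change in shader_translator.cc, where attributes.is_index_rounded is now always taken from the owning vfetch_full (full_op) rather than from the instruction being parsed.
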
diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h
index 798fd5367..e86387535 100644
--- a/src/xenia/gpu/ucode.h
+++ b/src/xenia/gpu/ucode.h
@@ -599,6 +599,8 @@ struct alignas(uint32_t) VertexFetchInstruction {
   // Required condition value of the comparison (true or false).
   bool predicate_condition() const { return data_.pred_condition == 1; }
   // Vertex fetch constant index [0-95].
+  // Applicable only to vfetch_full (the address from vfetch_full is reused
+  // in vfetch_mini).
   uint32_t fetch_constant_index() const {
     return data_.const_index * 3 + data_.const_index_sel;
   }
@@ -606,6 +608,8 @@ struct alignas(uint32_t) VertexFetchInstruction {
   uint32_t dest() const { return data_.dst_reg; }
   uint32_t dest_swizzle() const { return data_.dst_swiz; }
   bool is_dest_relative() const { return data_.dst_reg_am; }
+  // The source is applicable only to vfetch_full (the address from
+  // vfetch_full is reused in vfetch_mini).
   uint32_t src() const { return data_.src_reg; }
   uint32_t src_swizzle() const { return data_.src_swiz; }
   bool is_src_relative() const { return data_.src_reg_am; }
@@ -644,18 +648,21 @@ struct alignas(uint32_t) VertexFetchInstruction {
   xenos::SignedRepeatingFractionMode signed_rf_mode() const {
     return data_.signed_rf_mode_all;
   }
+  // If true, the floating-point index is rounded to the nearest integer
+  // (likely as floor(index + 0.5) because rounding to the nearest even makes
+  // no sense for addressing, both 1.5 and 2.5 would be 2). Otherwise, it's
+  // floored (rounded towards negative infinity).
+  // Applicable only to vfetch_full (the address from vfetch_full is reused
+  // in vfetch_mini).
+  // http://web.archive.org/web/20090914055358/http://msdn.microsoft.com/en-us/library/bb313960.aspx
   bool is_index_rounded() const { return data_.is_index_rounded == 1; }
   // Dword stride, [0, 255].
+  // Applicable only to vfetch_full (the address from vfetch_full is reused
+  // in vfetch_mini).
   uint32_t stride() const { return data_.stride; }
   // Dword offset, [-4194304, 4194303].
   int32_t offset() const { return data_.offset; }
 
-  void AssignFromFull(const VertexFetchInstruction& full) {
-    data_.stride = full.data_.stride;
-    data_.const_index = full.data_.const_index;
-    data_.const_index_sel = full.data_.const_index_sel;
-  }
-
  private:
   struct Data {
     struct {
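Taken together, the parsing convention the patch relies on can be summarized as follows (a simplified C++ sketch of the relevant ParseVertexFetchInstruction assignments, abridged to the address-related attributes; op is the instruction being parsed, full_op the preceding vfetch_full, and for a full fetch the two are the same instruction):

    // Everything feeding the shared `base + index * stride` comes from the
    // vfetch_full; only the dword offset is per-instruction.
    instr.attributes.stride = full_op.stride();
    instr.attributes.is_index_rounded = full_op.is_index_rounded();  // the fix
    instr.attributes.offset = op.offset();

With the address computed once by the vfetch_full and nothing else read from a mini fetch's address fields, copying the stride and fetch constant bits into the mini instruction via AssignFromFull is presumably no longer needed anywhere, which is why the helper is removed.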