[GPU] Reuse base+index*stride in vfetch_mini instead of reloading the index GPR

The wheel shader in 4D530910 does a vfetch_full to r0 with the index taken from r0.x (so the fetch overwrites the very register the index came from), and then a vfetch_mini.
Thanks @Gliniak for the finding :3
Also small formatting cleanup in commented-out code.
Triang3l 2022-01-09 14:58:38 +03:00
parent 600c14b3f0
commit d6188c5d7e
7 changed files with 103 additions and 66 deletions
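
To make the addressing concrete, here is a minimal standalone C++ sketch of the scheme this commit implements (illustrative names, not xenia's actual translator API): vfetch_full converts the index and caches base + index * stride once, and every following vfetch_mini derives its address from that cached value instead of re-reading the index GPR, which the full fetch may already have clobbered.

    #include <cmath>
    #include <cstdint>

    // Illustrative model of the vfetch addressing; all values in bytes.
    struct VfetchState {
      uint32_t full_address;  // base + index * stride, cached by vfetch_full
    };

    // vfetch_full: convert the floating-point index from the GPR to an integer
    // (floor, or floor(index + 0.5) when rounding is requested) and cache the
    // part of the address shared with any following vfetch_mini.
    void VfetchFull(VfetchState& state, uint32_t fetch_constant_base,
                    float index_gpr, uint32_t stride_bytes, bool index_rounded) {
      float index = std::floor(index_rounded ? index_gpr + 0.5f : index_gpr);
      uint32_t base = fetch_constant_base & ~uint32_t(3);  // dword-aligned base
      state.full_address = base + uint32_t(int32_t(index)) * stride_bytes;
      // The fetched data may now overwrite the GPR the index was read from -
      // exactly what the 4D530910 wheel shader does with r0.
    }

    // vfetch_mini: apply only its own word offset to the cached address; the
    // index GPR is never touched again.
    uint32_t VfetchMiniAddress(const VfetchState& state, int32_t offset_words) {
      return state.full_address + uint32_t(offset_words) * 4u;
    }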


@@ -849,7 +849,7 @@ void DxbcShaderTranslator::StartTranslation() {
system_temp_aL_ = PushSystemTemp(0b1111);
system_temp_loop_count_ = PushSystemTemp(0b1111);
system_temp_grad_h_lod_ = PushSystemTemp(0b1111);
system_temp_grad_v_ = PushSystemTemp(0b0111);
system_temp_grad_v_vfetch_address_ = PushSystemTemp(0b1111);
// Zero general-purpose registers to prevent crashes when the game
// references them after only initializing them conditionally.
@@ -1039,7 +1039,7 @@ void DxbcShaderTranslator::CompleteShaderCode() {
// - system_temp_aL_.
// - system_temp_loop_count_.
// - system_temp_grad_h_lod_.
// - system_temp_grad_v_.
// - system_temp_grad_v_vfetch_address_.
PopSystemTemp(6);
// Write memexported data to the shared memory UAV.
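
For orientation, PushSystemTemp and PopSystemTemp act as a stack of r# registers, so the 6 passed to PopSystemTemp must stay in sync with the pushes in StartTranslation; the rename is therefore mirrored in both lists. A minimal sketch of the assumed discipline (illustrative, not the real allocator):

    #include <cstdint>

    // Illustrative stack discipline: every Push in StartTranslation must be
    // balanced by the count given to Pop in CompleteShaderCode, otherwise
    // later allocations would alias registers that are still live.
    class SystemTempStack {
      uint32_t count_ = 0;

     public:
      // Reserve the next r# register; the mask selects components to zero.
      uint32_t Push(uint32_t zero_mask) { (void)zero_mask; return count_++; }
      // Release the `count` most recently reserved registers.
      void Pop(uint32_t count) { count_ -= count; }
    };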


@@ -1104,7 +1104,9 @@ class DxbcShaderTranslator : public ShaderTranslator {
uint32_t system_temp_loop_count_;
// Explicitly set texture gradients and LOD.
uint32_t system_temp_grad_h_lod_;
uint32_t system_temp_grad_v_;
// .w stores `base + index * stride` in bytes from the last vfetch_full as it
// may be needed by vfetch_mini.
uint32_t system_temp_grad_v_vfetch_address_;
// The bool constant number containing the condition for the currently
// processed exec (or the last - unless a label has reset this), or
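
Since the explicitly set gradients only occupy .xyz, .w is free to carry the cached fetch address; conceptually (illustrative layout only):

    // Conceptual contents of system_temp_grad_v_vfetch_address_:
    struct GradVVfetchAddress {
      float grad_v[3];          // .xyz: explicitly set vertical gradients
      uint32_t vfetch_address;  // .w: base + index * stride, in bytes, from
                                //     the last vfetch_full, for vfetch_mini
    };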


@@ -59,47 +59,67 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
// fetch constants on the CPU when proper bound checks are added - vfetch may
// be conditional, so fetch constants may also be used conditionally.
// - Load the byte address in physical memory to system_temp_result_.w (so
// it's not overwritten by data loads until the last one).
// - Load the part of the byte address in the physical memory that is the same
// in vfetch_full and vfetch_mini to system_temp_grad_v_vfetch_address_.w
// (the index operand GPR must not be reloaded in vfetch_mini because it
// might have been overwritten since the vfetch_full, and that must not
// affect the address vfetch_mini fetches from).
dxbc::Dest address_dest(dxbc::Dest::R(system_temp_result_, 0b1000));
dxbc::Src address_src(dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
if (instr.attributes.stride) {
// Convert the index to an integer by flooring or by rounding to the nearest
// (as floor(index + 0.5) because rounding to the nearest even makes no
// sense for addressing, both 1.5 and 2.5 would be 2).
// http://web.archive.org/web/20100302145413/http://msdn.microsoft.com:80/en-us/library/bb313960.aspx
{
bool index_operand_temp_pushed = false;
dxbc::Src index_operand(
LoadOperand(instr.operands[0], 0b0001, index_operand_temp_pushed)
.SelectFromSwizzled(0));
if (instr.attributes.is_index_rounded) {
a_.OpAdd(address_dest, index_operand, dxbc::Src::LF(0.5f));
a_.OpRoundNI(address_dest, address_src);
} else {
a_.OpRoundNI(address_dest, index_operand);
}
if (index_operand_temp_pushed) {
PopSystemTemp();
dxbc::Src address_src(
dxbc::Src::R(system_temp_grad_v_vfetch_address_, dxbc::Src::kWWWW));
if (!instr.is_mini_fetch) {
dxbc::Dest address_dest(
dxbc::Dest::R(system_temp_grad_v_vfetch_address_, 0b1000));
if (instr.attributes.stride) {
// Convert the index to an integer by flooring or by rounding to the
// nearest (as floor(index + 0.5) because rounding to the nearest even
// makes no sense for addressing, both 1.5 and 2.5 would be 2).
{
bool index_operand_temp_pushed = false;
dxbc::Src index_operand(
LoadOperand(instr.operands[0], 0b0001, index_operand_temp_pushed)
.SelectFromSwizzled(0));
if (instr.attributes.is_index_rounded) {
a_.OpAdd(address_dest, index_operand, dxbc::Src::LF(0.5f));
a_.OpRoundNI(address_dest, address_src);
} else {
a_.OpRoundNI(address_dest, index_operand);
}
if (index_operand_temp_pushed) {
PopSystemTemp();
}
}
a_.OpFToI(address_dest, address_src);
// Extract the byte address from the fetch constant to
// system_temp_result_.w (which is not used yet).
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b1000),
fetch_constant_src.SelectFromSwizzled(0),
dxbc::Src::LU(~uint32_t(3)));
// Merge the index and the base address.
a_.OpIMAd(address_dest, address_src,
dxbc::Src::LU(instr.attributes.stride * sizeof(uint32_t)),
dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
} else {
// Fetching from the same location - extract the byte address of the
// beginning of the buffer.
a_.OpAnd(address_dest, fetch_constant_src.SelectFromSwizzled(0),
dxbc::Src::LU(~uint32_t(3)));
}
a_.OpFToI(address_dest, address_src);
// Extract the byte address from the fetch constant to
// system_temp_result_.z.
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0100),
fetch_constant_src.SelectFromSwizzled(0),
dxbc::Src::LU(~uint32_t(3)));
// Merge the index and the base address.
a_.OpIMAd(address_dest, address_src,
dxbc::Src::LU(instr.attributes.stride * sizeof(uint32_t)),
dxbc::Src::R(system_temp_result_, dxbc::Src::kZZZZ));
} else {
// Fetching from the same location - extract the byte address of the
// beginning of the buffer.
a_.OpAnd(address_dest, fetch_constant_src.SelectFromSwizzled(0),
dxbc::Src::LU(~uint32_t(3)));
}
dxbc::Dest address_temp_dest(dxbc::Dest::R(system_temp_result_, 0b1000));
dxbc::Src address_temp_src(
dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
// - From now on, if any additional offset must be applied to the
// `base + index * stride` part of the address, it must be done by writing
// to system_temp_result_.w (address_temp_dest) instead of
// system_temp_grad_v_vfetch_address_.w (since it must stay the same for the
// vfetch_full and all its vfetch_mini invocations), and changing
// address_src to address_temp_src afterwards. system_temp_result_.w can be
// used for this purpose safely because it won't be overwritten until the
// last dword is loaded (after which the address won't be needed anymore).
// Add the word offset from the instruction (signed), plus the offset of the
// first needed word within the element.
uint32_t first_word_index;
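
The rounding comment above can be verified directly: round-half-to-even sends both 1.5 and 2.5 to 2, while floor(index + 0.5) keeps every index mapped to a distinct element. A quick standalone check (plain C++, unrelated to the translator itself):

    #include <cfenv>
    #include <cmath>
    #include <cstdio>

    int main() {
      std::fesetround(FE_TONEAREST);  // round half to even
      // rint(1.5) == 2 and rint(2.5) == 2 - useless for addressing.
      std::printf("%.0f %.0f\n", std::rint(1.5), std::rint(2.5));
      // floor(x + 0.5) yields 2 and 3 - distinct elements per index.
      std::printf("%.0f %.0f\n", std::floor(1.5 + 0.5), std::floor(2.5 + 0.5));
    }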
@@ -108,8 +128,9 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
instr.attributes.offset + int32_t(first_word_index);
if (first_word_buffer_offset) {
// Add the constant word offset.
a_.OpIAdd(address_dest, address_src,
a_.OpIAdd(address_temp_dest, address_src,
dxbc::Src::LI(first_word_buffer_offset * sizeof(uint32_t)));
address_src = address_temp_src;
}
// - Load needed words to system_temp_result_, words 0, 1, 2, 3 to X, Y, Z, W
@@ -159,9 +180,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
~((uint32_t(1) << (word_index + word_count)) - uint32_t(1));
if (word_index != word_index_previous) {
// Go to the word in the buffer.
a_.OpIAdd(address_dest, address_src,
a_.OpIAdd(address_temp_dest, address_src,
dxbc::Src::LU((word_index - word_index_previous) *
sizeof(uint32_t)));
address_src = address_temp_src;
word_index_previous = word_index;
}
// Can ld_raw either to the first multiple components, or to any scalar
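
Both IAdd sites above follow the same copy-on-adjust pattern: the first offset writes the adjusted address to system_temp_result_.w and redirects address_src there, leaving system_temp_grad_v_vfetch_address_.w intact for the next vfetch_mini, and subsequent steps add only the delta from the previously loaded word. The same idea in plain C++ (hypothetical sketch):

    #include <cstdint>

    // Copy-on-adjust, mirroring the address_src/address_temp_src handover:
    // `saved` is never written after vfetch_full computes it; the first
    // adjustment moves the working address into the scratch slot.
    struct AddressCursor {
      uint32_t saved;        // base + index * stride, survives for vfetch_mini
      uint32_t scratch = 0;  // working copy once any offset is applied
      const uint32_t* current = &saved;

      void AddWords(int32_t delta_words) {
        scratch = *current + uint32_t(delta_words) * uint32_t(sizeof(uint32_t));
        current = &scratch;  // all later reads use the adjusted copy
      }
    };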
@@ -592,7 +614,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
case FetchOpcode::kSetTextureGradientsVert: {
bool grad_operand_temp_pushed = false;
a_.OpMov(
dxbc::Dest::R(system_temp_grad_v_, 0b0111),
dxbc::Dest::R(system_temp_grad_v_vfetch_address_, 0b0111),
LoadOperand(instr.operands[0], 0b0111, grad_operand_temp_pushed));
if (grad_operand_temp_pushed) {
PopSystemTemp();
@@ -1521,15 +1543,15 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
// Extract gradient exponent biases from the fetch constant and merge
// them with the LOD bias.
a_.OpIBFE(dxbc::Dest::R(grad_h_lod_temp, 0b0011), dxbc::Src::LU(5),
dxbc::Src::LU(22, 27, 0, 0),
RequestTextureFetchConstantWord(tfetch_index, 4));
dxbc::Src::LU(22, 27, 0, 0),
RequestTextureFetchConstantWord(tfetch_index, 4));
a_.OpIMAd(dxbc::Dest::R(grad_h_lod_temp, 0b0011),
dxbc::Src::R(grad_h_lod_temp), dxbc::Src::LI(int32_t(1) << 23),
dxbc::Src::LF(1.0f));
dxbc::Src::R(grad_h_lod_temp),
dxbc::Src::LI(int32_t(1) << 23), dxbc::Src::LF(1.0f));
a_.OpMul(dxbc::Dest::R(grad_v_temp, 0b1000), lod_src,
dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kYYYY));
dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kYYYY));
a_.OpMul(lod_dest, lod_src,
dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kXXXX));
dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kXXXX));
#endif
// Obtain the gradients and apply biases to them.
if (instr.attributes.use_register_gradients) {
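
The IBFE + IMAd pair above is a float bit trick worth spelling out: adding a signed exponent bias b, shifted into bit 23, to the bits of 1.0f produces the float 2^b (while the biased exponent stays in range), so one integer multiply-add turns the fetch-constant bias fields into gradient scale factors. A standalone check, assuming IEEE-754 floats:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // 2^bias via integer math on the exponent field: bits(1.0f) + (bias << 23)
    // reinterpreted as float equals exp2(bias) for in-range biased exponents.
    float Exp2ViaBits(int32_t bias) {
      float one = 1.0f;
      int32_t bits;
      std::memcpy(&bits, &one, sizeof(bits));
      bits += bias * (int32_t(1) << 23);  // the same constant as in OpIMAd
      float result;
      std::memcpy(&result, &bits, sizeof(result));
      return result;
    }

    int main() {
      // Expected output: 0.25 1 32
      std::printf("%g %g %g\n", Exp2ViaBits(-2), Exp2ViaBits(0), Exp2ViaBits(5));
    }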
@@ -1540,11 +1562,11 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
// done in getCompTexLOD, so don't do it here too.
#if 0
a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
dxbc::Src::R(system_temp_grad_v_),
dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
dxbc::Src::R(system_temp_grad_v_vfetch_address_),
dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
#else
a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
dxbc::Src::R(system_temp_grad_v_), lod_src);
dxbc::Src::R(system_temp_grad_v_vfetch_address_), lod_src);
#endif
// TODO(Triang3l): Are cube map register gradients unnormalized if
// the coordinates themselves are unnormalized?
@@ -1586,8 +1608,8 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
// done in getCompTexLOD, so don't do it here too.
#if 0
a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
dxbc::Src::R(grad_v_temp),
dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
dxbc::Src::R(grad_v_temp),
dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
#else
a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
dxbc::Src::R(grad_v_temp), lod_src);


@@ -440,6 +440,13 @@ struct ParsedVertexFetchInstruction {
// Number of source operands.
size_t operand_count = 0;
// Describes each source operand.
// Note that for vfetch_mini, which inherits the operands from vfetch_full,
// the index operand register may have been overwritten between the vfetch_full
// and
// the vfetch_mini (happens in 4D530910 for wheels), but that should have no
// effect on the index actually used for fetching. A copy of the index
// therefore must be stored by vfetch_full (the base address, stride and
// rounding may be pre-applied to it since they will be the same in the
// vfetch_full and all its vfetch_mini instructions).
InstructionOperand operands[2];
struct Attributes {


@@ -876,7 +876,7 @@ bool ParseVertexFetchInstruction(const VertexFetchInstruction& op,
instr.attributes.stride = full_op.stride();
instr.attributes.exp_adjust = op.exp_adjust();
instr.attributes.prefetch_count = op.prefetch_count();
instr.attributes.is_index_rounded = op.is_index_rounded();
instr.attributes.is_index_rounded = full_op.is_index_rounded();
instr.attributes.is_signed = op.is_signed();
instr.attributes.is_integer = !op.is_normalized();
instr.attributes.signed_rf_mode = op.signed_rf_mode();


@@ -328,13 +328,12 @@ void ParsedVertexFetchInstruction::Disassemble(StringBuffer* out) const {
if (!is_mini_fetch) {
out->Append(", ");
DisassembleSourceOperand(operands[0], out);
out->Append(", ");
out->AppendFormat("vf{}", 95 - operands[1].storage_index);
out->AppendFormat(", vf{}", 95 - operands[1].storage_index);
if (attributes.is_index_rounded) {
out->Append(", RoundIndex=true");
}
}
if (attributes.is_index_rounded) {
out->Append(", RoundIndex=true");
}
if (attributes.exp_adjust) {
out->AppendFormat(", ExpAdjust={}", attributes.exp_adjust);
}
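
After the change, RoundIndex=true is printed only inside the !is_mini_fetch branch, since a mini fetch has no index of its own to round. For a hypothetical full fetch whose index operand disassembles to r0.x, with operands[1].storage_index == 0 and rounding enabled, these lines append:

    , r0.x, vf95, RoundIndex=true

(vf95 because the constant number is printed as 95 - storage_index.)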


@@ -599,6 +599,8 @@ struct alignas(uint32_t) VertexFetchInstruction {
// Required condition value of the comparison (true or false).
bool predicate_condition() const { return data_.pred_condition == 1; }
// Vertex fetch constant index [0-95].
// Applicable only to vfetch_full (the address from vfetch_full is reused in
// vfetch_mini).
uint32_t fetch_constant_index() const {
return data_.const_index * 3 + data_.const_index_sel;
}
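
The multiplication packs three fetch constants into each of the 32 constant registers; since the result covers [0, 95], const_index presumably spans [0, 31] and const_index_sel [0, 2]. A worked instance of the formula:

    // const_index == 10, const_index_sel == 2:
    uint32_t fetch_constant = 10 * 3 + 2;  // == 32, i.e. vf32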
@@ -606,6 +608,8 @@ struct alignas(uint32_t) VertexFetchInstruction {
uint32_t dest() const { return data_.dst_reg; }
uint32_t dest_swizzle() const { return data_.dst_swiz; }
bool is_dest_relative() const { return data_.dst_reg_am; }
// The source is applicable only to vfetch_full (the address from vfetch_full
// is reused in vfetch_mini).
uint32_t src() const { return data_.src_reg; }
uint32_t src_swizzle() const { return data_.src_swiz; }
bool is_src_relative() const { return data_.src_reg_am; }
@@ -644,18 +648,21 @@ struct alignas(uint32_t) VertexFetchInstruction {
xenos::SignedRepeatingFractionMode signed_rf_mode() const {
return data_.signed_rf_mode_all;
}
// If true, the floating-point index is rounded to the nearest integer (likely
// as floor(index + 0.5) because rounding to the nearest even makes no sense
// for addressing, both 1.5 and 2.5 would be 2).
// Otherwise, it's floored (rounded towards negative infinity).
// Applicable only to vfetch_full (the address from vfetch_full is reused in
// vfetch_mini).
// http://web.archive.org/web/20090914055358/http://msdn.microsoft.com/en-us/library/bb313960.aspx
bool is_index_rounded() const { return data_.is_index_rounded == 1; }
// Dword stride, [0, 255].
// Applicable only to vfetch_full (the address from vfetch_full is reused in
// vfetch_mini).
uint32_t stride() const { return data_.stride; }
// Dword offset, [-4194304, 4194303].
int32_t offset() const { return data_.offset; }
void AssignFromFull(const VertexFetchInstruction& full) {
data_.stride = full.data_.stride;
data_.const_index = full.data_.const_index;
data_.const_index_sel = full.data_.const_index_sel;
}
private:
struct Data {
struct {
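
AssignFromFull is what makes the reuse work on the ucode side: a parser keeps the last full fetch around and patches each mini fetch with its addressing fields before translation. A rough usage sketch (hypothetical surrounding code, not xenia's actual control flow):

    #include <vector>

    // Hypothetical driver loop: let each vfetch_mini inherit the stride and
    // fetch constant index of the preceding vfetch_full.
    void PatchMiniFetches(std::vector<VertexFetchInstruction>& ops) {
      VertexFetchInstruction* previous_full = nullptr;
      for (VertexFetchInstruction& op : ops) {
        if (op.is_mini_fetch()) {
          if (previous_full) {
            op.AssignFromFull(*previous_full);
          }
        } else {
          previous_full = &op;
        }
      }
    }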