[GPU] Scale gradients by SSAA factor

2020-12-20 16:35:15 +03:00 · 2020-12-20 16:35:15 +03:00 · c7fbe0e6d5
parent e6fa0ad139
commit c7fbe0e6d5
4 changed files with 97 additions and 12 deletions
--- a/src/xenia/gpu/dxbc_shader_translator_fetch.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_fetch.cc
@ -17,6 +17,7 @@
 #include "xenia/base/math.h"
 #include "xenia/base/string.h"
 #include "xenia/gpu/dxbc_shader_translator.h"
+#include "xenia/gpu/gpu_flags.h"

 namespace xe {
 namespace gpu {
@ -630,21 +631,44 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
  if (instr.opcode == FetchOpcode::kGetTextureGradients) {
    // Handle before doing anything that actually needs the texture.
    bool grad_operand_temp_pushed = false;
-    DxbcSrc grad_operand =
-        LoadOperand(instr.operands[0], 0b0011, grad_operand_temp_pushed);
-    if (used_result_components & 0b0101) {
-      DxbcOpDerivRTXFine(
-          DxbcDest::R(system_temp_result_, used_result_components & 0b0101),
+    DxbcSrc grad_operand = LoadOperand(
+        instr.operands[0],
+        ((used_result_nonzero_components & 0b0011) ? 0b0001 : 0) |
+            ((used_result_nonzero_components & 0b1100) ? 0b0010 : 0),
+        grad_operand_temp_pushed);
+    if (used_result_nonzero_components & 0b0101) {
+      DxbcOpDerivRTXCoarse(DxbcDest::R(system_temp_result_,
+                                       used_result_nonzero_components & 0b0101),
                           grad_operand.SwizzleSwizzled(0b010000));
    }
-    if (used_result_components & 0b1010) {
-      DxbcOpDerivRTYFine(
-          DxbcDest::R(system_temp_result_, used_result_components & 0b1010),
+    if (used_result_nonzero_components & 0b1010) {
+      DxbcOpDerivRTYCoarse(DxbcDest::R(system_temp_result_,
+                                       used_result_nonzero_components & 0b1010),
                           grad_operand.SwizzleSwizzled(0b01000000));
    }
    if (grad_operand_temp_pushed) {
      PopSystemTemp();
    }
+    if (!edram_rov_used_ && cvars::ssaa_scale_gradients) {
+      // Scale the gradients to guest pixels with SSAA.
+      uint32_t ssaa_scale_temp = PushSystemTemp();
+      system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index;
+      DxbcOpMovC(DxbcDest::R(ssaa_scale_temp,
+                             (used_result_nonzero_components & 0b0011) |
+                                 (used_result_nonzero_components >> 2)),
+                 DxbcSrc::CB(cbuffer_index_system_constants_,
+                             uint32_t(CbufferRegister::kSystemConstants),
+                             kSysConst_SampleCountLog2_Vec,
+                             kSysConst_SampleCountLog2_Comp |
+                                 ((kSysConst_SampleCountLog2_Comp + 1) << 2)),
+                 DxbcSrc::LF(2.0f), DxbcSrc::LF(1.0f));
+      DxbcOpMul(
+          DxbcDest::R(system_temp_result_, used_result_nonzero_components),
+          DxbcSrc::R(system_temp_result_),
+          DxbcSrc::R(ssaa_scale_temp, 0b01000100));
+      // Release ssaa_scale_temp.
+      PopSystemTemp();
+    }
    StoreResult(instr.result, DxbcSrc::R(system_temp_result_));
    return;
  }
@ -1387,12 +1411,27 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
          DxbcOpExp(lod_dest, lod_src);
          // FIXME(Triang3l): Gradient exponent adjustment is currently not done
          // in getCompTexLOD, so don't do it here too.
+          bool ssaa_scale_gradients =
+              !instr.attributes.use_register_gradients && !edram_rov_used_ &&
+              cvars::ssaa_scale_gradients;
 #if 0
          // Extract gradient exponent biases from the fetch constant and merge
          // them with the LOD bias.
          DxbcOpIBFE(DxbcDest::R(grad_h_lod_temp, 0b0011), DxbcSrc::LU(5),
                     DxbcSrc::LU(22, 27, 0, 0),
                     RequestTextureFetchConstantWord(tfetch_index, 4));
+          if (ssaa_scale_gradients) {
+            // Adjust the gradient scales to include the SSAA scale.
+            system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index;
+            DxbcOpIAdd(DxbcDest::R(grad_h_lod_temp, 0b0011),
+                       DxbcSrc::R(grad_h_lod_temp),
+                       DxbcSrc::CB(
+                           cbuffer_index_system_constants_,
+                           uint32_t(CbufferRegister::kSystemConstants),
+                           kSysConst_SampleCountLog2_Vec,
+                           kSysConst_SampleCountLog2_Comp |
+                               ((kSysConst_SampleCountLog2_Comp + 1) << 2)));
+          }
          DxbcOpIMAd(DxbcDest::R(grad_h_lod_temp, 0b0011),
                     DxbcSrc::R(grad_h_lod_temp), DxbcSrc::LI(int32_t(1) << 23),
                     DxbcSrc::LF(1.0f));
@ -1400,6 +1439,32 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
                    DxbcSrc::R(grad_h_lod_temp, DxbcSrc::kYYYY));
          DxbcOpMul(lod_dest, lod_src,
                    DxbcSrc::R(grad_h_lod_temp, DxbcSrc::kXXXX));
+#else
+          if (ssaa_scale_gradients) {
+            // Adjust the gradient scales in each direction to include the SSAA
+            // scale - for ddy scale, grad_v_temp.w, not grad_h_lod_temp.w, must
+            // be used.
+            // ddy.
+            system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index;
+            DxbcOpMovC(DxbcDest::R(grad_v_temp, 0b1000),
+                       DxbcSrc::CB(cbuffer_index_system_constants_,
+                                   uint32_t(CbufferRegister::kSystemConstants),
+                                   kSysConst_SampleCountLog2_Vec)
+                           .Select(kSysConst_SampleCountLog2_Comp + 1),
+                       DxbcSrc::LF(2.0f), DxbcSrc::LF(1.0f));
+            DxbcOpMul(DxbcDest::R(grad_v_temp, 0b1000), lod_src,
+                      DxbcSrc::R(grad_v_temp, DxbcSrc::kWWWW));
+            // ddx (after ddy handling, because the ddy code uses lod_src, and
+            // it's being overwritten now).
+            system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index;
+            DxbcOpIf(true,
+                     DxbcSrc::CB(cbuffer_index_system_constants_,
+                                 uint32_t(CbufferRegister::kSystemConstants),
+                                 kSysConst_SampleCountLog2_Vec)
+                         .Select(kSysConst_SampleCountLog2_Comp));
+            DxbcOpMul(lod_dest, lod_src, DxbcSrc::LF(2.0f));
+            DxbcOpEndIf();
+          }
 #endif
          // Obtain the gradients and apply biases to them.
          if (instr.attributes.use_register_gradients) {
@ -1458,8 +1523,12 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
                      DxbcSrc::R(grad_v_temp),
                      DxbcSrc::R(grad_v_temp, DxbcSrc::kWWWW));
 #else
-            DxbcOpMul(DxbcDest::R(grad_v_temp, grad_mask),
-                      DxbcSrc::R(grad_v_temp), lod_src);
+            // With SSAA gradient scaling, the scale is separate in each
+            // direction.
+            DxbcOpMul(
+                DxbcDest::R(grad_v_temp, grad_mask), DxbcSrc::R(grad_v_temp),
+                ssaa_scale_gradients ? DxbcSrc::R(grad_v_temp, DxbcSrc::kWWWW)
+                                     : lod_src);
 #endif
          }
          if (instr.dimension == xenos::FetchOpDimension::k1D) {
--- a/src/xenia/gpu/gpu_flags.cc
+++ b/src/xenia/gpu/gpu_flags.cc
@ -40,6 +40,14 @@ DEFINE_bool(
    "be fully covered when MSAA is used with fullscreen passes.",
    "GPU");

+DEFINE_bool(
+    ssaa_scale_gradients, true,
+    "When using SSAA instead of native MSAA, adjust texture coordinate "
+    "derivatives used for mipmap selection, and getGradients results, to guest "
+    "pixels as if true MSAA rather than SSAA was used.\n"
+    "Reduces bandwidth usage of texture fetching.",
+    "GPU");
+
 DEFINE_string(
    depth_float24_conversion, "",
    "Method for converting 32-bit Z values to 20e4 floating point when using "
--- a/src/xenia/gpu/gpu_flags.h
+++ b/src/xenia/gpu/gpu_flags.h
@ -22,6 +22,8 @@ DECLARE_bool(gpu_allow_invalid_fetch_constants);

 DECLARE_bool(half_pixel_offset);

+DECLARE_bool(ssaa_scale_gradients);
+
 DECLARE_string(depth_float24_conversion);

 DECLARE_int32(query_occlusion_fake_sample_count);
--- a/src/xenia/gpu/ucode.h
+++ b/src/xenia/gpu/ucode.h
@ -551,6 +551,12 @@ enum class FetchOpcode : uint32_t {
  kGetTextureComputedLod = 17,

  // Source is 2-component. XZ = ddx(source.xy), YW = ddy(source.xy).
+  // TODO(Triang3l): Verify whether it's coarse or fine (on Adreno 200, for
+  // instance). This is using the texture unit, where the LOD is computed for
+  // the whole quad (according to the Direct3D 11.3 specification), so likely
+  // coarse; ddx / ddy from the Shader Model 4 era is also compiled by FXC to
+  // deriv_rtx/rty_coarse when targeting Shader Model 5, and on TeraScale,
+  // coarse / fine selection only appeared on Direct3D 11 GPUs.
  kGetTextureGradients = 18,

  // Gets the weights used in a bilinear fetch.