[DXBC] Multiplication signed zero handling

2020-10-30 22:14:38 +03:00 · 2020-10-30 22:14:38 +03:00 · feb8258a5e
parent adebaba799
commit feb8258a5e
3 changed files with 124 additions and 144 deletions
--- a/src/xenia/gpu/dxbc_shader_translator_alu.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc
@ -68,32 +68,34 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
      break;
    case AluVectorOpcode::kMul:
    case AluVectorOpcode::kMad: {
-      bool is_mad = instr.vector_opcode == AluVectorOpcode::kMad;
+      // Not using DXBC mad to prevent fused multiply-add (mul followed by add
-      if (is_mad) {
+      // may be optimized into non-fused mad by the driver in the identical
-        DxbcOpMAd(per_component_dest, operands[0], operands[1], operands[2]);
+      // operands case also).
      } else {
      DxbcOpMul(per_component_dest, operands[0], operands[1]);
-      }
+      uint32_t multiplicands_different =
      // Shader Model 3: 0 or denormal * anything = 0.
      // FIXME(Triang3l): Signed zero needs research and handling.
      uint32_t absolute_different =
          used_result_components &
-          ~instr.vector_operands[0].GetAbsoluteIdenticalComponents(
+          ~instr.vector_operands[0].GetIdenticalMultiplicandComponents(
              instr.vector_operands[1]);
-      if (absolute_different) {
+      if (multiplicands_different) {
        // Shader Model 3: +-0 or denormal * anything = +0.
        uint32_t is_zero_temp = PushSystemTemp();
-        DxbcOpMin(DxbcDest::R(is_zero_temp, absolute_different),
+        DxbcOpMin(DxbcDest::R(is_zero_temp, multiplicands_different),
                  operands[0].Abs(), operands[1].Abs());
        // min isn't required to flush denormals, eq is.
-        DxbcOpEq(DxbcDest::R(is_zero_temp, absolute_different),
+        DxbcOpEq(DxbcDest::R(is_zero_temp, multiplicands_different),
                 DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f));
-        DxbcOpMovC(DxbcDest::R(system_temp_result_, absolute_different),
+        // Not replacing true `0 + term` with movc of the term because +0 + -0
-                   DxbcSrc::R(is_zero_temp),
+        // should result in +0, not -0.
-                   is_mad ? operands[2] : DxbcSrc::LF(0.0f),
+        DxbcOpMovC(DxbcDest::R(system_temp_result_, multiplicands_different),
                   DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f),
                   DxbcSrc::R(system_temp_result_));
        // Release is_zero_temp.
        PopSystemTemp();
      }
      if (instr.vector_opcode == AluVectorOpcode::kMad) {
        DxbcOpAdd(per_component_dest, DxbcSrc::R(system_temp_result_),
                  operands[2]);
      }
    } break;
    case AluVectorOpcode::kMax:
@ -179,69 +181,41 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
        component_count = 4;
      }
      result_swizzle = DxbcSrc::kXXXX;
-      uint32_t absolute_different =
+      uint32_t multiplicands_different =
          uint32_t((1 << component_count) - 1) &
-          ~instr.vector_operands[0].GetAbsoluteIdenticalComponents(
+          ~instr.vector_operands[0].GetIdenticalMultiplicandComponents(
              instr.vector_operands[1]);
-      if (absolute_different) {
+      for (uint32_t i = 0; i < component_count; ++i) {
-        // Shader Model 3: 0 or denormal * anything = 0.
+        DxbcOpMul(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001),
        // FIXME(Triang3l): Signed zero needs research and handling.
        // Add component products only if non-zero. For dp4, 16 scalar
        // operations in the worst case (as opposed to always 20 for
        // eq/movc/eq/movc/dp4 or min/eq/movc/movc/dp4 for preparing operands
        // for dp4).
        DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0001),
                  operands[0].SelectFromSwizzled(0),
                  operands[1].SelectFromSwizzled(0));
        if (absolute_different & 0b0001) {
          DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0010),
                    operands[0].SelectFromSwizzled(0).Abs(),
                    operands[1].SelectFromSwizzled(0).Abs());
          DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0010),
                   DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY),
                   DxbcSrc::LF(0.0f));
          DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001),
                     DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY),
                     DxbcSrc::LF(0.0f),
                     DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX));
        }
        for (uint32_t i = 1; i < component_count; ++i) {
          bool component_different = (absolute_different & (1 << i)) != 0;
          DxbcOpMAd(DxbcDest::R(system_temp_result_,
                                component_different ? 0b0010 : 0b0001),
                  operands[0].SelectFromSwizzled(i),
-                    operands[1].SelectFromSwizzled(i),
+                  operands[1].SelectFromSwizzled(i));
-                    DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX));
+        if ((multiplicands_different & (1 << i)) != 0) {
-          if (component_different) {
+          // Shader Model 3: +-0 or denormal * anything = +0 (also not replacing
          // true `0 + term` with movc of the term because +0 + -0 should result
          // in +0, not -0).
          DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100),
                    operands[0].SelectFromSwizzled(i).Abs(),
                    operands[1].SelectFromSwizzled(i).Abs());
          DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100),
                   DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ),
                   DxbcSrc::LF(0.0f));
-            DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001),
+          DxbcOpMovC(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001),
                     DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ),
                     DxbcSrc::LF(0.0f),
                     DxbcSrc::R(system_temp_result_,
                                i ? DxbcSrc::kYYYY : DxbcSrc::kXXXX));
        }
        if (i) {
          // Not using DXBC dp# to avoid fused multiply-add, PC GPUs are scalar
          // as of 2020 anyway, and not using mad for the same reason (mul
          // followed by add may be optimized into non-fused mad by the driver
          // in the identical operands case also).
          DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001),
                    DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX),
                    DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY));
        }
      }
      } else {
      if (component_count == 2) {
          DxbcOpDP2(DxbcDest::R(system_temp_result_, 0b0001), operands[0],
                    operands[1]);
        } else if (component_count == 3) {
          DxbcOpDP3(DxbcDest::R(system_temp_result_, 0b0001), operands[0],
                    operands[1]);
        } else {
          assert_true(component_count == 4);
          DxbcOpDP4(DxbcDest::R(system_temp_result_, 0b0001), operands[0],
                    operands[1]);
        }
      }
      if (component_count == 2) {
        // Add the third operand. Since floating-point addition isn't
        // associative, even though adding this in multiply-add for the first
        // component would be faster, it's safer to add here, in the end.
        DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001),
                  DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX),
                  operands[2].SelectFromSwizzled(0));
@ -592,14 +566,13 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
        DxbcOpMov(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::LF(1.0f));
      }
      if (used_result_components & 0b0010) {
        // Shader Model 3: 0 or denormal * anything = 0.
        // FIXME(Triang3l): Signed zero needs research and handling.
        DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0010),
                  operands[0].SelectFromSwizzled(1),
                  operands[1].SelectFromSwizzled(1));
-        if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents(
+        if (!(instr.vector_operands[0].GetIdenticalMultiplicandComponents(
                  instr.vector_operands[1]) &
              0b0010)) {
          // Shader Model 3: +-0 or denormal * anything = +0.
          DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100),
                    operands[0].SelectFromSwizzled(1).Abs(),
                    operands[1].SelectFromSwizzled(1).Abs());
@ -700,8 +673,7 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
      DxbcOpMul(ps_dest, operand_0_a, operand_0_b);
      if (instr.scalar_operands[0].components[0] !=
          instr.scalar_operands[0].components[1]) {
-        // Shader Model 3: 0 or denormal * anything = 0.
+        // Shader Model 3: +-0 or denormal * anything = +0.
        // FIXME(Triang3l): Signed zero needs research and handling.
        uint32_t is_zero_temp = PushSystemTemp();
        DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(),
                  operand_0_b.Abs());
@ -714,28 +686,16 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
        PopSystemTemp();
      }
      break;
-    case AluScalarOpcode::kMulsPrev: {
+    case AluScalarOpcode::kMulsPrev:
      // Shader Model 3: 0 or denormal * anything = 0.
      // FIXME(Triang3l): Signed zero needs research and handling.
      uint32_t is_zero_temp = PushSystemTemp();
      DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(),
                ps_src.Abs());
      // min isn't required to flush denormals, eq is.
      DxbcOpEq(DxbcDest::R(is_zero_temp, 0b0001),
               DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f));
      DxbcOpMul(ps_dest, operand_0_a, ps_src);
      DxbcOpMovC(ps_dest, DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX),
                 DxbcSrc::LF(0.0f), ps_src);
      // Release is_zero_temp.
      PopSystemTemp();
    } break;
    case AluScalarOpcode::kMulsPrev2: {
      uint32_t test_temp = PushSystemTemp();
      if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) {
        // Check if need to select the src0.a * ps case.
        // ps != -FLT_MAX.
        DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX));
-      // isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since -FLT_MAX
+        // isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since
-      // is already loaded to an SGPR, this is also false if it's NaN.
+        // -FLT_MAX is already loaded to an SGPR, this is also false if it's
        // NaN.
        DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(),
                 DxbcSrc::LF(-FLT_MAX));
        DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
@ -749,23 +709,27 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
                  DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
        // src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked
        // for NaN).
-      DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f), operand_0_b);
+        DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f),
                 operand_0_b);
        DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
                  DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
                  DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
        DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX));
-      // Shader Model 3: 0 or denormal * anything = 0.
+      }
-      // ps is already known to be not NaN or Infinity, so multiplying it by 0
+      // Shader Model 3: +-0 or denormal * anything = +0.
-      // will result in 0. However, src0.a can be anything, so the result should
+      DxbcOpMin(DxbcDest::R(test_temp, 0b0001), operand_0_a.Abs(),
-      // be zero if ps is zero.
+                ps_src.Abs());
-      // FIXME(Triang3l): Signed zero needs research and handling.
+      // min isn't required to flush denormals, eq is.
-      DxbcOpEq(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(0.0f));
+      DxbcOpEq(DxbcDest::R(test_temp, 0b0001),
               DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f));
      DxbcOpMul(ps_dest, operand_0_a, ps_src);
      DxbcOpMovC(ps_dest, DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
                 DxbcSrc::LF(0.0f), ps_src);
      if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) {
        DxbcOpElse();
        DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX));
        DxbcOpEndIf();
      }
      // Release test_temp.
      PopSystemTemp();
    } break;
@ -1023,11 +987,10 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
    case AluScalarOpcode::kMulsc0:
    case AluScalarOpcode::kMulsc1:
      DxbcOpMul(ps_dest, operand_0_a, operand_1);
-      if (!(instr.scalar_operands[0].GetAbsoluteIdenticalComponents(
+      if (!(instr.scalar_operands[0].GetIdenticalMultiplicandComponents(
                instr.scalar_operands[1]) &
            0b0001)) {
-        // Shader Model 3: 0 or denormal * anything = 0.
+        // Shader Model 3: +-0 or denormal * anything = +0.
        // FIXME(Triang3l): Signed zero needs research and handling.
        uint32_t is_zero_temp = PushSystemTemp();
        DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(),
                  operand_1.Abs());
--- a/src/xenia/gpu/shader.h
+++ b/src/xenia/gpu/shader.h
@ -212,14 +212,19 @@ struct InstructionOperand {
    return false;
  }
-  // Returns which components of two operands are identical, but may have
+  // Returns which components of two operands are identical, so that
-  // different signs (for simplicity of usage with GetComponent, treating the
+  // multiplication of them would result in pow2 with + sign, including in case
-  // rightmost component as replicated).
+  // they're zero (because -0 * |-0|, or -0 * +0, is -0), for providing a fast
-  uint32_t GetAbsoluteIdenticalComponents(
+  // path in emulation of the Shader Model 3 +-0 * x = +0 multiplication
  // behavior (disregarding component_count for simplicity of usage with
  // GetComponent, treating the rightmost component as replicated).
  uint32_t GetIdenticalMultiplicandComponents(
      const InstructionOperand& other) const {
    if (storage_source != other.storage_source ||
        storage_index != other.storage_index ||
-        storage_addressing_mode != other.storage_addressing_mode) {
+        storage_addressing_mode != other.storage_addressing_mode ||
        is_absolute_value != other.is_absolute_value ||
        (!is_absolute_value && is_negated != other.is_negated)) {
      return 0;
    }
    uint32_t identical_components = 0;
@ -229,15 +234,14 @@ struct InstructionOperand {
    }
    return identical_components;
  }
-  // Returns which components of two operands will always be bitwise equal, but
+  // Returns which components of two operands will always be bitwise equal
-  // may have different signs (disregarding component_count for simplicity of
+  // (disregarding component_count for simplicity of usage with GetComponent,
-  // usage with GetComponent, treating the rightmost component as replicated).
+  // treating the rightmost component as replicated).
  uint32_t GetIdenticalComponents(const InstructionOperand& other) const {
-    if (is_negated != other.is_negated ||
+    if (is_negated != other.is_negated) {
        is_absolute_value != other.is_absolute_value) {
      return 0;
    }
-    return GetAbsoluteIdenticalComponents(other);
+    return GetIdenticalMultiplicandComponents(other);
  }
 };
--- a/src/xenia/gpu/ucode.h
+++ b/src/xenia/gpu/ucode.h
@ -800,13 +800,26 @@ static_assert_size(TextureFetchInstruction, 12);
 //   Both are valid only within the current ALU clause. They are not modified
 //   when the instruction that would write them fails its predication check.
 // - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for
-//   multiplication (0 or denormal * anything = 0) wherever it's present (mul,
+//   multiplication (+-0 or denormal * anything = +0) wherever it's present
-//   mad, dp, etc.) and for NaN in min/max. It's very important to respect this
+//   (mul, mad, dp, etc.) and for NaN in min/max. It's very important to respect
-//   rule for multiplication, as games often rely on it in vector normalization
+//   this rule for multiplication, as games often rely on it in vector
-//   (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of things in
+//   normalization (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of
-//   games - causes white screen in Halo 3, white specular on characters in GTA
+//   things in games - causes white screen in Halo 3, white specular on
-//   IV.
+//   characters in GTA IV. The result is always positive zero in this case, no
-// TODO(Triang3l): Investigate signed zero handling in multiplication.
+//   matter what the signs of the other operands are, according to R5xx
 //   Acceleration section 8.7.5 "Legacy multiply behavior" and testing on
 //   Adreno 200. This means that the following need to be taken into account
 //   (according to 8.7.2 "ALU Non-Transcendental Floating Point"):
 //   - +0 * -0 is -0 with IEEE conformance, however, with this legacy SM3
 //     handling, it should result in +0.
 //   - +0 + -0 is +0, so multiply-add should not be replaced with conditional
 //     move of the third operand in case of zero multiplicands, because the term
 //     may be -0, while the result should be +0 in this case.
 //   http://developer.amd.com/wordpress/media/2013/10/R5xx_Acceleration_v1.5.pdf
 //   Multiply-add also appears to be not fused (the SM3 behavior instruction on
 //   GCN is called v_mad_legacy_f32, not v_fma_legacy_f32) - shader translators
 //   should not use instructions that may be interpreted by the host GPU as
 //   fused multiply-add.
 enum class AluScalarOpcode : uint32_t {
  // Floating-Point Add