Merge pull request #9973 from JosJuice/jit-fma-negation-order

Jit: Use accurate negation order for FMA instructions
2021-07-31 20:18:49 -04:00 · 2021-07-31 20:18:49 -04:00 · 627832355e
parent 7c365349ee 08b358a829
commit 627832355e
3 changed files with 104 additions and 165 deletions
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@ -345,13 +345,18 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
    RegCache::Realize(Ra, Rb, Rc, Rd);
  }

-  X64Reg scratch_xmm = !use_fma && inst.SUBOP5 == 30 ? XMM1 : XMM0;
-  X64Reg result_xmm = scratch_xmm == XMM0 ? XMM1 : XMM0;
+  const bool subtract = inst.SUBOP5 == 28 || inst.SUBOP5 == 30;  // msub, nmsub
+  const bool negate = inst.SUBOP5 == 30 || inst.SUBOP5 == 31;    // nmsub, nmadd
+  const bool madds0 = inst.SUBOP5 == 14;
+  const bool madds1 = inst.SUBOP5 == 15;
+
+  X64Reg scratch_xmm = XMM0;
+  X64Reg result_xmm = XMM1;
  if (software_fma)
  {
    for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
    {
-      if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15)  // (i == 0 || madds0) && !madds1
+      if ((i == 0 || madds0) && !madds1)
      {
        if (round_input)
          Force25BitPrecision(XMM1, Rc, XMM2);
@ -381,7 +386,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
        MOVHLPS(XMM2, Rb.GetSimpleReg());
      }

-      if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30)  // nsub, nmsub
+      if (subtract)
        XORPS(XMM2, MConst(psSignBits));

      BitSet32 registers_in_use = CallerSavedRegistersInUse();
@ -399,128 +404,87 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
    {
      result_xmm = XMM0;
    }
-
-    if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31)  // nmsub, nmadd
-      XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
  }
  else
  {
-    switch (inst.SUBOP5)
+    if (madds0)
    {
-    case 14:  // madds0
      MOVDDUP(result_xmm, Rc);
      if (round_input)
        Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
-      break;
-    case 15:  // madds1
+    }
+    else if (madds1)
+    {
      avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3);
      if (round_input)
        Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
-      break;
-    default:
-      if (single && round_input)
+    }
+    else
+    {
+      if (round_input)
        Force25BitPrecision(result_xmm, Rc, scratch_xmm);
      else
        MOVAPD(result_xmm, Rc);
-      break;
    }

    if (use_fma)
    {
-      switch (inst.SUBOP5)
+      if (subtract)
      {
-      case 28:  // msub
        if (packed)
          VFMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
        else
          VFMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
-        break;
-      case 14:  // madds0
-      case 15:  // madds1
-      case 29:  // madd
-        if (packed)
-          VFMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
-        else
-          VFMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
-        break;
-      // PowerPC and x86 define NMADD/NMSUB differently
-      // x86: D = -A*C (+/-) B
-      // PPC: D = -(A*C (+/-) B)
-      // so we have to swap them; the ADD/SUB here isn't a typo.
-      case 30:  // nmsub
-        if (packed)
-          VFNMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
-        else
-          VFNMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
-        break;
-      case 31:  // nmadd
-        if (packed)
-          VFNMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
-        else
-          VFNMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
-        break;
-      }
-    }
-    else
-    {
-      if (inst.SUBOP5 == 30)  // nmsub
-      {
-        // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
-        // so handle it separately.
-        MOVAPD(scratch_xmm, Rb);
-        if (packed)
-        {
-          MULPD(result_xmm, Ra);
-          SUBPD(scratch_xmm, R(result_xmm));
-        }
-        else
-        {
-          MULSD(result_xmm, Ra);
-          SUBSD(scratch_xmm, R(result_xmm));
-        }
-        result_xmm = scratch_xmm;
      }
      else
      {
        if (packed)
-        {
-          MULPD(result_xmm, Ra);
-          if (inst.SUBOP5 == 28)  // msub
-            SUBPD(result_xmm, Rb);
-          else  //(n)madd(s[01])
-            ADDPD(result_xmm, Rb);
-        }
+          VFMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
        else
-        {
-          MULSD(result_xmm, Ra);
-          if (inst.SUBOP5 == 28)
-            SUBSD(result_xmm, Rb);
-          else
-            ADDSD(result_xmm, Rb);
-        }
-        if (inst.SUBOP5 == 31)  // nmadd
-          XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
+          VFMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
+      }
+    }
+    else
+    {
+      if (packed)
+      {
+        MULPD(result_xmm, Ra);
+        if (subtract)
+          SUBPD(result_xmm, Rb);
+        else
+          ADDPD(result_xmm, Rb);
+      }
+      else
+      {
+        MULSD(result_xmm, Ra);
+        if (subtract)
+          SUBSD(result_xmm, Rb);
+        else
+          ADDSD(result_xmm, Rb);
      }
    }
  }

+  // Using x64's nmadd/nmsub would require us to swap the sign of the addend
+  // (i.e. PPC nmadd maps to x64 nmsub), which can cause problems with signed zeroes.
+  // Also, PowerPC's nmadd/nmsub round before the final negation unlike x64's nmadd/nmsub.
+  // So, negate using a separate instruction instead of using x64's nmadd/nmsub.
+  if (negate)
+    XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
+
  if (SConfig::GetInstance().bAccurateNaNs && result_xmm == XMM0)
  {
    // HandleNaNs needs to clobber XMM0
-    MOVAPD(XMM1, R(result_xmm));
-    result_xmm = XMM1;
+    MOVAPD(Rd, R(result_xmm));
+    result_xmm = Rd;
  }

+  HandleNaNs(inst, result_xmm, result_xmm, XMM0);
+
  if (single)
-  {
-    HandleNaNs(inst, result_xmm, result_xmm, XMM0);
    FinalizeSingleResult(Rd, R(result_xmm), packed, true);
-  }
  else
-  {
-    HandleNaNs(inst, result_xmm, result_xmm, XMM0);
    FinalizeDoubleResult(Rd, R(result_xmm));
-  }
 }

 void Jit64::fsign(UGeckoInstruction inst)
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@ -178,18 +178,24 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
    case 25:
      m_float_emit.FMUL(VD, VA, VC);
      break;
-    case 28:
+    case 28:  // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
      m_float_emit.FNMSUB(VD, VA, VC, VB);
-      break;  // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
-    case 29:
+      break;
+    case 29:  // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
      m_float_emit.FMADD(VD, VA, VC, VB);
-      break;  // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
-    case 30:
-      m_float_emit.FMSUB(VD, VA, VC, VB);
-      break;  // fnmsub: "D = -(A*C - B)" vs "Vd = Va + (-Vn)*Vm"
-    case 31:
-      m_float_emit.FNMADD(VD, VA, VC, VB);
-      break;  // fnmadd: "D = -(A*C + B)" vs "Vd = (-Va) + (-Vn)*Vm"
+      break;
+    // While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic],
+    // the subtly different definitions affect how signed zeroes are handled.
+    // Also, PowerPC's nmadd/nmsub perform rounding before the final negation.
+    // So, negate using a separate instruction instead of using AArch64's nmadd/msub.
+    case 30:  // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
+      m_float_emit.FNMSUB(VD, VA, VC, VB);
+      m_float_emit.FNEG(VD, VD);
+      break;
+    case 31:  // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
+      m_float_emit.FMADD(VD, VA, VC, VB);
+      m_float_emit.FNEG(VD, VD);
+      break;
    default:
      ASSERT_MSG(DYNA_REC, 0, "fp_arith");
      break;
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@ -147,26 +147,29 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
  ARM64Reg V0 = ARM64Reg::INVALID_REG;
  ARM64Reg V1Q = ARM64Reg::INVALID_REG;

-  if (round_c || (d != b && (d == a || d == c)))
-  {
-    V0Q = fpr.GetReg();
-    V0 = reg_encoder(V0Q);
-  }
+  const auto allocate_v0_if_needed = [&] {
+    if (V0Q == ARM64Reg::INVALID_REG)
+    {
+      V0Q = fpr.GetReg();
+      V0 = reg_encoder(V0Q);
+    }
+  };

  if (round_c)
  {
    ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");

+    allocate_v0_if_needed();
    V1Q = fpr.GetReg();

    Force25BitPrecision(reg_encoder(V1Q), VC, V0);
    VC = reg_encoder(V1Q);
  }

+  ARM64Reg result_reg = VD;
  switch (op5)
  {
-  case 14:  // ps_madds0
-    // d = a * c.ps0 + b
+  case 14:  // ps_madds0: d = a * c.ps0 + b
    if (VD == VB)
    {
      m_float_emit.FMLA(size, VD, VA, VC, 0);
@ -178,13 +181,13 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
    }
    else
    {
+      allocate_v0_if_needed();
      m_float_emit.MOV(V0, VB);
      m_float_emit.FMLA(size, V0, VA, VC, 0);
-      m_float_emit.MOV(VD, V0);
+      result_reg = V0;
    }
    break;
-  case 15:  // ps_madds1
-    // d = a * c.ps1 + b
+  case 15:  // ps_madds1: d = a * c.ps1 + b
    if (VD == VB)
    {
      m_float_emit.FMLA(size, VD, VA, VC, 1);
@ -196,34 +199,29 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
    }
    else
    {
+      allocate_v0_if_needed();
      m_float_emit.MOV(V0, VB);
      m_float_emit.FMLA(size, V0, VA, VC, 1);
-      m_float_emit.MOV(VD, V0);
+      result_reg = V0;
    }
    break;
-  case 28:  // ps_msub
-    // d = a * c - b
-    if (VD == VB)
-    {
-      // d = -(-a * c + b)
-      // rounding is incorrect if the rounding mode is +/- infinity
-      m_float_emit.FMLS(size, VD, VA, VC);
-      m_float_emit.FNEG(size, VD, VD);
-    }
-    else if (VD != VA && VD != VC)
+  case 28:  // ps_msub:  d = a * c - b
+  case 30:  // ps_nmsub: d = -(a * c - b)
+    if (VD != VA && VD != VC)
    {
      m_float_emit.FNEG(size, VD, VB);
      m_float_emit.FMLA(size, VD, VA, VC);
    }
    else
    {
+      allocate_v0_if_needed();
      m_float_emit.FNEG(size, V0, VB);
      m_float_emit.FMLA(size, V0, VA, VC);
-      m_float_emit.MOV(VD, V0);
+      result_reg = V0;
    }
    break;
-  case 29:  // ps_madd
-    // d = a * c + b
+  case 29:  // ps_madd:  d = a * c + b
+  case 31:  // ps_nmadd: d = -(a * c + b)
    if (VD == VB)
    {
      m_float_emit.FMLA(size, VD, VA, VC);
@ -235,53 +233,10 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
    }
    else
    {
+      allocate_v0_if_needed();
      m_float_emit.MOV(V0, VB);
      m_float_emit.FMLA(size, V0, VA, VC);
-      m_float_emit.MOV(VD, V0);
-    }
-    break;
-  case 30:  // ps_nmsub
-    // d = -(a * c - b)
-    // =>
-    // d = -a * c + b
-    // Note: PowerPC rounds before the final negation.
-    // We don't handle this at the moment because it's
-    // only relevant when rounding to +/- infinity.
-    if (VD == VB)
-    {
-      m_float_emit.FMLS(size, VD, VA, VC);
-    }
-    else if (VD != VA && VD != VC)
-    {
-      m_float_emit.MOV(VD, VB);
-      m_float_emit.FMLS(size, VD, VA, VC);
-    }
-    else
-    {
-      m_float_emit.MOV(V0, VB);
-      m_float_emit.FMLS(size, V0, VA, VC);
-      m_float_emit.MOV(VD, V0);
-    }
-    break;
-  case 31:  // ps_nmadd
-    // d = -(a * c + b)
-    if (VD == VB)
-    {
-      m_float_emit.FMLA(size, VD, VA, VC);
-      m_float_emit.FNEG(size, VD, VD);
-    }
-    else if (VD != VA && VD != VC)
-    {
-      // d = -a * c - b
-      // See rounding note at ps_nmsub.
-      m_float_emit.FNEG(size, VD, VB);
-      m_float_emit.FMLS(size, VD, VA, VC);
-    }
-    else
-    {
-      m_float_emit.MOV(V0, VB);
-      m_float_emit.FMLA(size, V0, VA, VC);
-      m_float_emit.FNEG(size, VD, V0);
+      result_reg = V0;
    }
    break;
  default:
@ -289,6 +244,20 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
    break;
  }

+  switch (op5)
+  {
+  case 30:  // ps_nmsub
+  case 31:  // ps_nmadd
+    // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
+    // for any of AArch64's FMA instructions, so we negate using a separate instruction.
+    m_float_emit.FNEG(size, VD, result_reg);
+    break;
+  default:
+    if (result_reg != VD)
+      m_float_emit.MOV(VD, result_reg);
+    break;
+  }
+
  if (V0Q != ARM64Reg::INVALID_REG)
    fpr.Unlock(V0Q);
  if (V1Q != ARM64Reg::INVALID_REG)