From 2c38d6419e4e4299960b35e071c8cef411d18995 Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Sun, 23 May 2021 23:00:57 +0200
Subject: [PATCH] Jit64: Emulate FMA accurately when determinism is enabled

When determinism is enabled, we either want all CPUs to use FMA or we
want no CPUs to use FMA. Until now, Jit64 has been doing the latter.
However, this is inaccurate behavior; all CPUs since Haswell support
FMA, and getting JitArm64 to match the exact inaccurate rounding used
by Jit64 would be a bit annoying. This commit switches us over to
using FMA on all CPUs when determinism is enabled, with older CPUs
calling the std::fma function.
---
 .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp  | 291 +++++++++++-------
 .../Core/PowerPC/Jit64Common/EmuCodeBlock.cpp |   3 +-
 2 files changed, 186 insertions(+), 108 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index fa5d9d6d72..ead87a64df 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -3,6 +3,8 @@
 // Refer to the license.txt file included.
 
 #include <cmath>
+#include <cstddef>
+#include <limits>
 #include <vector>
 
 #include "Common/Assert.h"
@@ -239,138 +241,213 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   JITDISABLE(bJITFloatingPointOff);
   FALLBACK_IF(inst.Rc);
 
+  // While we don't know if any games are actually affected (replays seem to work with all the usual
+  // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
+  // be extra careful and use software FMA on CPUs that don't have hardware FMA.
+  const bool software_fma = !cpu_info.bFMA && Core::WantsDeterminism();
+
   int a = inst.FA;
   int b = inst.FB;
   int c = inst.FC;
   int d = inst.FD;
   bool single = inst.OPCD == 4 || inst.OPCD == 59;
   bool round_input = single && !js.op->fprIsSingle[c];
-  bool packed = inst.OPCD == 4 || (!cpu_info.bAtom && single && js.op->fprIsDuplicated[a] &&
-                                   js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
+  bool packed =
+      inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
+                         js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
 
-  // While we don't know if any games are actually affected (replays seem to work with all the usual
-  // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
-  // be extra careful and don't use FMA, even if in theory it might be okay.
-  // Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
-  // to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
-  // instances on different computers giving identical results.
-  const bool use_fma = cpu_info.bFMA && !Core::WantsDeterminism();
-
-  // For use_fma == true:
-  // Statistics suggests b is a lot less likely to be unbound in practice, so
-  // if we have to pick one of a or b to bind, let's make it b.
-  RCOpArg Ra = fpr.Use(a, RCMode::Read);
-  RCOpArg Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
-  RCOpArg Rc = fpr.Use(c, RCMode::Read);
-  RCX64Reg Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
-  RegCache::Realize(Ra, Rb, Rc, Rd);
-
-  switch (inst.SUBOP5)
+  RCOpArg Ra;
+  RCOpArg Rb;
+  RCOpArg Rc;
+  RCX64Reg Rd;
+  RCX64Reg scratch_guard;
+  if (software_fma)
   {
-  case 14:
-    MOVDDUP(XMM1, Rc);
-    if (round_input)
-      Force25BitPrecision(XMM1, R(XMM1), XMM0);
-    break;
-  case 15:
-    avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, Rc, Rc, 3);
-    if (round_input)
-      Force25BitPrecision(XMM1, R(XMM1), XMM0);
-    break;
-  default:
-    bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::WantsDeterminism());
-    X64Reg tmp1 = special ? XMM0 : XMM1;
-    X64Reg tmp2 = special ? XMM1 : XMM0;
-    if (single && round_input)
-      Force25BitPrecision(tmp1, Rc, tmp2);
-    else
-      MOVAPD(tmp1, Rc);
-    break;
-  }
-
-  if (use_fma)
-  {
-    switch (inst.SUBOP5)
-    {
-    case 28:  // msub
-      if (packed)
-        VFMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    case 14:  // madds0
-    case 15:  // madds1
-    case 29:  // madd
-      if (packed)
-        VFMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    // PowerPC and x86 define NMADD/NMSUB differently
-    // x86: D = -A*C (+/-) B
-    // PPC: D = -(A*C (+/-) B)
-    // so we have to swap them; the ADD/SUB here isn't a typo.
-    case 30:  // nmsub
-      if (packed)
-        VFNMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFNMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    case 31:  // nmadd
-      if (packed)
-        VFNMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFNMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    }
-  }
-  else if (inst.SUBOP5 == 30)  // nmsub
-  {
-    // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it
-    // separately.
-    MOVAPD(XMM1, Rb);
-    if (packed)
-    {
-      MULPD(XMM0, Ra);
-      SUBPD(XMM1, R(XMM0));
-    }
-    else
-    {
-      MULSD(XMM0, Ra);
-      SUBSD(XMM1, R(XMM0));
-    }
+    scratch_guard = fpr.Scratch(XMM2);
+    Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
+    Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
+    Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
+    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
+    RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
   }
   else
   {
+    // For cpu_info.bFMA == true:
+    // Statistics suggests b is a lot less likely to be unbound in practice, so
+    // if we have to pick one of a or b to bind, let's make it b.
+    Ra = fpr.Use(a, RCMode::Read);
+    Rb = cpu_info.bFMA ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
+    Rc = fpr.Use(c, RCMode::Read);
+    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
+    RegCache::Realize(Ra, Rb, Rc, Rd);
+  }
+
+  X64Reg result_reg = XMM0;
+  if (software_fma)
+  {
+    for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
+    {
+      if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15)  // (i == 0 || madds0) && !madds1
+      {
+        if (round_input)
+          Force25BitPrecision(XMM1, Rc, XMM2);
+        else
+          MOVSD(XMM1, Rc);
+      }
+      else
+      {
+        MOVHLPS(XMM1, Rc.GetSimpleReg());
+        if (round_input)
+          Force25BitPrecision(XMM1, R(XMM1), XMM2);
+      }
+
+      // Write the result from the previous loop iteration into Rd so we don't lose it.
+      // It's important that this is done after reading Rc above, in case we have madds1 and c == d.
+      if (packed && i == 0)
+        MOVLHPS(Rd, XMM0);
+
+      if (i == 0)
+      {
+        MOVSD(XMM0, Ra);
+        MOVSD(XMM2, Rb);
+      }
+      else
+      {
+        MOVHLPS(XMM0, Ra.GetSimpleReg());
+        MOVHLPS(XMM2, Rb.GetSimpleReg());
+      }
+
+      if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30)  // msub, nmsub
+        XORPS(XMM2, MConst(psSignBits));
+
+      BitSet32 registers_in_use = CallerSavedRegistersInUse();
+      ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
+      ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
+      ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
+    }
+
     if (packed)
     {
-      MULPD(XMM1, Ra);
-      if (inst.SUBOP5 == 28)  // msub
-        SUBPD(XMM1, Rb);
-      else  //(n)madd(s[01])
-        ADDPD(XMM1, Rb);
+      MOVSD(Rd, XMM0);
+      result_reg = Rd;
+    }
+
+    if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31)  // nmsub, nmadd
+      XORPD(result_reg, MConst(packed ? psSignBits2 : psSignBits));
+  }
+  else
+  {
+    switch (inst.SUBOP5)
+    {
+    case 14:  // madds0
+      MOVDDUP(XMM0, Rc);
+      if (round_input)
+        Force25BitPrecision(XMM0, R(XMM0), XMM1);
+      break;
+    case 15:  // madds1
+      avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, Rc, Rc, 3);
+      if (round_input)
+        Force25BitPrecision(XMM0, R(XMM0), XMM1);
+      break;
+    default:
+      if (single && round_input)
+        Force25BitPrecision(XMM0, Rc, XMM1);
+      else
+        MOVAPD(XMM0, Rc);
+      break;
+    }
+
+    if (cpu_info.bFMA)
+    {
+      switch (inst.SUBOP5)
+      {
+      case 28:  // msub
+        if (packed)
+          VFMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      case 14:  // madds0
+      case 15:  // madds1
+      case 29:  // madd
+        if (packed)
+          VFMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      // PowerPC and x86 define NMADD/NMSUB differently
+      // x86: D = -A*C (+/-) B
+      // PPC: D = -(A*C (+/-) B)
+      // so we have to swap them; the ADD/SUB here isn't a typo.
+      case 30:  // nmsub
+        if (packed)
+          VFNMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFNMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      case 31:  // nmadd
+        if (packed)
+          VFNMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFNMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      }
     }
     else
     {
-      MULSD(XMM1, Ra);
-      if (inst.SUBOP5 == 28)
-        SUBSD(XMM1, Rb);
+      // No hardware support for FMA, and determinism is not enabled. In this case we inaccurately
+      // do the multiplication and addition/subtraction in two separate operations for performance.
+
+      if (inst.SUBOP5 == 30)  // nmsub
+      {
+        // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
+        // so handle it separately.
+        MOVAPD(XMM1, Rb);
+        if (packed)
+        {
+          MULPD(XMM0, Ra);
+          SUBPD(XMM1, R(XMM0));
+        }
+        else
+        {
+          MULSD(XMM0, Ra);
+          SUBSD(XMM1, R(XMM0));
+        }
+        result_reg = XMM1;
+      }
       else
-        ADDSD(XMM1, Rb);
+      {
+        if (packed)
+        {
+          MULPD(XMM0, Ra);
+          if (inst.SUBOP5 == 28)  // msub
+            SUBPD(XMM0, Rb);
+          else  //(n)madd(s[01])
+            ADDPD(XMM0, Rb);
+        }
+        else
+        {
+          MULSD(XMM0, Ra);
+          if (inst.SUBOP5 == 28)
+            SUBSD(XMM0, Rb);
+          else
+            ADDSD(XMM0, Rb);
+        }
+        if (inst.SUBOP5 == 31)  // nmadd
+          XORPD(XMM0, MConst(packed ? psSignBits2 : psSignBits));
+      }
     }
-    if (inst.SUBOP5 == 31)  // nmadd
-      XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits));
   }
 
   if (single)
   {
-    HandleNaNs(inst, Rd, XMM1);
-    ForceSinglePrecision(Rd, Rd, packed, true);
+    HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1);
+    ForceSinglePrecision(Rd, R(result_reg), packed, true);
   }
   else
   {
-    HandleNaNs(inst, XMM1, XMM1);
-    MOVSD(Rd, R(XMM1));
+    HandleNaNs(inst, result_reg, result_reg, XMM1);
+    MOVSD(Rd, R(result_reg));
   }
   SetFPRFIfNeeded(Rd);
 }
diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
index 1de9547b89..dbd2cd3497 100644
--- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
@@ -828,7 +828,8 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&,
     else
     {
       (this->*sseOp)(XMM0, arg2, imm);
-      MOVAPD(regOp, R(XMM0));
+      if (regOp != XMM0)
+        MOVAPD(regOp, R(XMM0));
     }
   }
   else
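
Notes (not part of the patch):

The determinism problem the commit message describes comes down to rounding: an FMA
(hardware or std::fma) rounds a*c + b once, while separate multiply and add instructions
round twice, so the two strategies can disagree in the last bit. A minimal host-side
sketch of the divergence, in plain standard C++ unrelated to Dolphin's code (the
constants are just one input known to expose the difference):

  #include <cmath>
  #include <cstdio>

  int main()
  {
    const double a = 1.0 + 0x1p-27;     // 1 + 2^-27, exactly representable
    const double b = -(1.0 + 0x1p-26);  // exactly -(1 + 2^-26)

    // Two roundings: a*a = 1 + 2^-26 + 2^-54 rounds to 1 + 2^-26 (the 2^-54
    // term is lost), then adding b rounds again. The intermediate is volatile
    // so the compiler cannot contract the sequence into an FMA.
    volatile double product = a * a;
    const double split = product + b;  // 0.0

    // One rounding: the exact value of a*a + b is 2^-54, which is representable.
    const double fused = std::fma(a, a, b);  // 0x1p-54

    std::printf("split = %a\nfused = %a\n", split, fused);
    return 0;
  }

Two Dolphin instances disagreeing on this one bit is exactly the cross-CPU divergence
that breaks netplay; the patch makes every host take the single-rounding path.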
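The software path needs only one std::fma call for all four operations because it leans
on two sign identities: msub/nmsub negate B before the call (the XORPS), and nmsub/nmadd
negate the final result (the XORPD). A reference model of what one scalar lane computes,
using a hypothetical helper name that mirrors the emitted code's data flow rather than
any actual Dolphin function:

  #include <cmath>

  // SUBOP5 values: 29 = madd, 28 = msub, 31 = nmadd, 30 = nmsub.
  double fmadd_lane(int subop5, double a, double b, double c)
  {
    // msub and nmsub subtract b: emitted as one XORPS flipping b's sign bit.
    const bool subtract_b = subop5 == 28 || subop5 == 30;
    // nmsub and nmadd negate the whole expression, since PPC computes
    // d = -(a*c +/- b): emitted as one XORPD on the result.
    const bool negate_result = subop5 == 30 || subop5 == 31;

    const double result = std::fma(a, c, subtract_b ? -b : b);
    return negate_result ? -result : result;
  }

Both negations only flip a sign bit and introduce no extra rounding, so the single
rounding inside std::fma is the one that must match the hardware-FMA path.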
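One idiom in the new emitter loop is worth spelling out: the loop counts lanes downward
(high half first, so the final scalar result ends up in lane 0), and because i is
unsigned, a condition like i >= 0 would never be false. Instead the loop runs until the
decrement past zero wraps around. A standalone illustration of the same loop shape:

  #include <cstddef>
  #include <cstdio>
  #include <limits>

  int main()
  {
    const bool packed = true;
    // Visits i = 1, then i = 0; stops when 0-- wraps to SIZE_MAX.
    for (size_t i = packed ? 1 : 0; i != std::numeric_limits<size_t>::max(); --i)
      std::printf("lane %zu\n", i);
    return 0;
  }

This is also what the <cstddef> and <limits> includes at the top of the patch are for.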