Jit64: Preserve inputs when software_fma && m_accurate_nans

When writing the software FMA code, I didn't realize that we can't overwrite d when d is the same register as one of the inputs and HandleNaNs is going to be called, because the inputs have to be preserved in that case. This commit makes the software FMA path preserve them.
2022-10-08 19:05:51 +02:00 · 2022-10-08 19:05:51 +02:00 · 5e58a46361
parent 4312840a4b
commit 5e58a46361
1 changed file with 30 additions and 19 deletions
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@ -107,8 +107,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
  if (!m_accurate_nans)
    return;
  ASSERT(xmm != clobber);
  if (inst.OPCD != 4)
  {
    // not paired-single
@ -148,6 +146,8 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
  {
    // paired-single
    ASSERT(xmm != clobber);
    if (cpu_info.bSSE4_1)
    {
      avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, R(xmm), R(xmm), CMP_UNORD);
@ -325,7 +325,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
  FALLBACK_IF(jo.fp_exceptions);
  // We would like to emulate FMA instructions accurately without rounding error if possible, but
-  // unfortunately emulating FMA in software is just too slow on CPUs that are too old to have FMA
+  // unfortunately, emulating FMA in software is just too slow on CPUs that are too old to have FMA
  // instructions, so we have the Config::SESSION_USE_FMA setting to determine whether we should
  // emulate FMA instructions accurately or by performing a multiply followed by a separate add.
  //
@ -346,6 +346,8 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
  int d = inst.FD;
  bool single = inst.OPCD == 4 || inst.OPCD == 59;
  bool round_input = single && !js.op->fprIsSingle[c];
  bool preserve_inputs = m_accurate_nans;
  bool preserve_d = preserve_inputs && (a == d || b == d || c == d);
  bool packed =
      inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
                         js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
@ -356,21 +358,35 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
  const bool madds1 = inst.SUBOP5 == 15;
  const bool madds_accurate_nans = m_accurate_nans && (madds0 || madds1);
  X64Reg scratch_xmm = XMM0;
  X64Reg result_xmm = XMM1;
  X64Reg Rc_duplicated = XMM2;
  RCOpArg Ra;
  RCOpArg Rb;
  RCOpArg Rc;
  RCX64Reg Rd;
-  RCX64Reg scratch_guard;
+  RCX64Reg xmm2_guard;
  RCX64Reg result_xmm_guard;
  RCX64Reg Rc_duplicated_guard;
  X64Reg Rc_duplicated = XMM2;
  if (software_fma)
  {
-    scratch_guard = fpr.Scratch(XMM2);
+    xmm2_guard = fpr.Scratch(XMM2);
    Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
    Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
    Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
-    RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
+    if (preserve_d && packed)
    {
      result_xmm_guard = fpr.Scratch();
      RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard, result_xmm_guard);
      result_xmm = Gen::X64Reg(result_xmm_guard);
    }
    else
    {
      RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard);
      result_xmm = packed ? Gen::X64Reg(Rd) : XMM0;
    }
  }
  else
  {
@ -391,8 +407,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
    }
  }
  X64Reg scratch_xmm = XMM0;
  X64Reg result_xmm = XMM1;
  if (software_fma)
  {
    for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
@ -411,10 +425,11 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
          Force25BitPrecision(XMM1, R(XMM1), XMM2);
      }
-      // Write the result from the previous loop iteration into Rd so we don't lose it.
+      // Write the result from the previous loop iteration into result_xmm so we don't lose it.
-      // It's important that this is done after reading Rc above, in case we have madds1 and c == d.
+      // It's important that this is done after reading Rc above, in case we have madds1 and
      // result_xmm == Rd == Rc.
      if (packed && i == 0)
-        MOVLHPS(Rd, XMM0);
+        MOVLHPS(result_xmm, XMM0);
      if (i == 0)
      {
@ -437,14 +452,9 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
    }
    if (packed)
-    {
+      MOVSD(R(result_xmm), XMM0);
      MOVSD(Rd, XMM0);
      result_xmm = Rd;
    }
    else
-    {
+      DEBUG_ASSERT(result_xmm == XMM0);
      result_xmm = XMM0;
    }
    if (madds_accurate_nans)
    {
@ -530,6 +540,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
    // HandleNaNs needs to clobber XMM0
    MOVAPD(Rd, R(result_xmm));
    result_xmm = Rd;
    DEBUG_ASSERT(!preserve_d);
  }
  // If packed, the clobber register must be XMM0. If not packed, the clobber register is unused.