From 42fd273a692a6ba408763b1bbc701f275b1f59d5 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Mon, 14 Jun 2021 15:17:35 +0200 Subject: [PATCH] JitArm64: Implement FMA-less path for FMA instructions For determinism compatibility with old x64 CPUs. Off by default. --- .../JitArm64/JitArm64_FloatingPoint.cpp | 58 ++++++++++++++----- .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 38 ++++++++++-- 2 files changed, 77 insertions(+), 19 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 05c338e9c4..76a3fc0f66 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -4,8 +4,10 @@ #include "Common/Arm64Emitter.h" #include "Common/CPUDetect.h" #include "Common/CommonTypes.h" +#include "Common/Config/Config.h" #include "Common/StringUtil.h" +#include "Core/Config/SessionSettings.h" #include "Core/ConfigManager.h" #include "Core/Core.h" #include "Core/CoreTiming.h" @@ -89,6 +91,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst) ARM64Reg VA{}, VB{}, VC{}, VD{}; ARM64Reg V0Q = ARM64Reg::INVALID_REG; + ARM64Reg V1Q = ARM64Reg::INVALID_REG; if (packed) { @@ -151,17 +154,26 @@ void JitArm64::fp_arith(UGeckoInstruction inst) VC = reg_encoder(fpr.R(c, type)); VD = reg_encoder(fpr.RW(d, type_out)); + const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA); + if (round_c) { ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single"); V0Q = fpr.GetReg(); - const ARM64Reg V1Q = fpr.GetReg(); + V1Q = fpr.GetReg(); - Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q)); - VC = reg_encoder(V0Q); + Force25BitPrecision(reg_encoder(V1Q), VC, reg_encoder(V0Q)); + VC = reg_encoder(V1Q); + } - fpr.Unlock(V1Q); + ARM64Reg inaccurate_fma_temp_reg = VD; + if (inaccurate_fma && d == b) + { + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetReg(); + + inaccurate_fma_temp_reg = reg_encoder(V0Q); } switch (op5) @@ -178,23 +190,37 @@ void JitArm64::fp_arith(UGeckoInstruction inst) case 25: m_float_emit.FMUL(VD, VA, VC); break; - case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm" - m_float_emit.FNMSUB(VD, VA, VC, VB); - break; - case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm" - m_float_emit.FMADD(VD, VA, VC, VB); - break; // While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic], // the subtly different definitions affect how signed zeroes are handled. // Also, PowerPC's nmadd/nmsub perform rounding before the final negation. - // So, negate using a separate instruction instead of using AArch64's nmadd/msub. + // So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub. + case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm" case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)" - m_float_emit.FNMSUB(VD, VA, VC, VB); - m_float_emit.FNEG(VD, VD); + if (inaccurate_fma) + { + m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC); + m_float_emit.FSUB(VD, inaccurate_fma_temp_reg, VB); + } + else + { + m_float_emit.FNMSUB(VD, VA, VC, VB); + } + if (op5 == 30) + m_float_emit.FNEG(VD, VD); break; + case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm" case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)" - m_float_emit.FMADD(VD, VA, VC, VB); - m_float_emit.FNEG(VD, VD); + if (inaccurate_fma) + { + m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC); + m_float_emit.FADD(VD, inaccurate_fma_temp_reg, VB); + } + else + { + m_float_emit.FMADD(VD, VA, VC, VB); + } + if (op5 == 31) + m_float_emit.FNEG(VD, VD); break; default: ASSERT_MSG(DYNA_REC, 0, "fp_arith"); @@ -204,6 +230,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst) if (V0Q != ARM64Reg::INVALID_REG) fpr.Unlock(V0Q); + if (V1Q != ARM64Reg::INVALID_REG) + fpr.Unlock(V1Q); if (outputs_are_singles) { diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index 1e8edb5036..ead760c43a 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -3,8 +3,10 @@ #include "Common/Arm64Emitter.h" #include "Common/CommonTypes.h" +#include "Common/Config/Config.h" #include "Common/StringUtil.h" +#include "Core/Config/SessionSettings.h" #include "Core/ConfigManager.h" #include "Core/Core.h" #include "Core/CoreTiming.h" @@ -132,6 +134,7 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) const u32 d = inst.FD; const u32 op5 = inst.SUBOP5; + const bool inaccurate_fma = !Config::Get(Config::SESSION_USE_FMA); const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c); const bool round_c = !js.op->fprIsSingle[inst.FC]; const RegType type = singles ? RegType::Single : RegType::Register; @@ -166,11 +169,23 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) VC = reg_encoder(V1Q); } + ARM64Reg inaccurate_fma_temp_reg = VD; + if (inaccurate_fma && d == b) + { + allocate_v0_if_needed(); + inaccurate_fma_temp_reg = V0; + } + ARM64Reg result_reg = VD; switch (op5) { case 14: // ps_madds0: d = a * c.ps0 + b - if (VD == VB) + if (inaccurate_fma) + { + m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC, 0); + m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB); + } + else if (VD == VB) { m_float_emit.FMLA(size, VD, VA, VC, 0); } @@ -188,7 +203,12 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) } break; case 15: // ps_madds1: d = a * c.ps1 + b - if (VD == VB) + if (inaccurate_fma) + { + m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC, 1); + m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB); + } + else if (VD == VB) { m_float_emit.FMLA(size, VD, VA, VC, 1); } @@ -207,7 +227,12 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) break; case 28: // ps_msub: d = a * c - b case 30: // ps_nmsub: d = -(a * c - b) - if (VD != VA && VD != VC) + if (inaccurate_fma) + { + m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC); + m_float_emit.FSUB(size, VD, inaccurate_fma_temp_reg, VB); + } + else if (VD != VA && VD != VC) { m_float_emit.FNEG(size, VD, VB); m_float_emit.FMLA(size, VD, VA, VC); @@ -222,7 +247,12 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) break; case 29: // ps_madd: d = a * c + b case 31: // ps_nmadd: d = -(a * c + b) - if (VD == VB) + if (inaccurate_fma) + { + m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC); + m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB); + } + else if (VD == VB) { m_float_emit.FMLA(size, VD, VA, VC); }