Jit64: Emulate FMA accurately when determinism is enabled
When determinism is enabled, we either want all CPUs to use FMA or we want no CPUs to use FMA. Until now, Jit64 has been doing the latter. However, this is inaccurate behavior, all CPUs since Haswell support FMA, and getting JitArm64 to match the exact inaccurate rounding used by Jit64 would be a bit annoying. This commit switches us over to using FMA on all CPUs when determinism is enabled, with older CPUs calling the std::fma function.
This commit is contained in:
parent
9bc5bd83a9
commit
2c38d6419e
|
@ -3,6 +3,8 @@
|
|||
// Refer to the license.txt file included.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#include "Common/Assert.h"
|
||||
|
@ -239,72 +241,139 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||
JITDISABLE(bJITFloatingPointOff);
|
||||
FALLBACK_IF(inst.Rc);
|
||||
|
||||
// While we don't know if any games are actually affected (replays seem to work with all the usual
|
||||
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so
|
||||
// be extra careful and use software FMA on CPUs that don't have hardware FMA.
|
||||
const bool software_fma = !cpu_info.bFMA && Core::WantsDeterminism();
|
||||
|
||||
int a = inst.FA;
|
||||
int b = inst.FB;
|
||||
int c = inst.FC;
|
||||
int d = inst.FD;
|
||||
bool single = inst.OPCD == 4 || inst.OPCD == 59;
|
||||
bool round_input = single && !js.op->fprIsSingle[c];
|
||||
bool packed = inst.OPCD == 4 || (!cpu_info.bAtom && single && js.op->fprIsDuplicated[a] &&
|
||||
bool packed =
|
||||
inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
|
||||
js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
|
||||
|
||||
// While we don't know if any games are actually affected (replays seem to work with all the usual
|
||||
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so
|
||||
// be extra careful and don't use FMA, even if in theory it might be okay.
|
||||
// Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
|
||||
// to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
|
||||
// instances on different computers giving identical results.
|
||||
const bool use_fma = cpu_info.bFMA && !Core::WantsDeterminism();
|
||||
|
||||
// For use_fma == true:
|
||||
RCOpArg Ra;
|
||||
RCOpArg Rb;
|
||||
RCOpArg Rc;
|
||||
RCX64Reg Rd;
|
||||
RCX64Reg scratch_guard;
|
||||
if (software_fma)
|
||||
{
|
||||
scratch_guard = fpr.Scratch(XMM2);
|
||||
Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
|
||||
Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
|
||||
Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
|
||||
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
|
||||
RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
|
||||
}
|
||||
else
|
||||
{
|
||||
// For cpu_info.bFMA == true:
|
||||
// Statistics suggests b is a lot less likely to be unbound in practice, so
|
||||
// if we have to pick one of a or b to bind, let's make it b.
|
||||
RCOpArg Ra = fpr.Use(a, RCMode::Read);
|
||||
RCOpArg Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
|
||||
RCOpArg Rc = fpr.Use(c, RCMode::Read);
|
||||
RCX64Reg Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
|
||||
Ra = fpr.Use(a, RCMode::Read);
|
||||
Rb = cpu_info.bFMA ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
|
||||
Rc = fpr.Use(c, RCMode::Read);
|
||||
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
|
||||
RegCache::Realize(Ra, Rb, Rc, Rd);
|
||||
}
|
||||
|
||||
X64Reg result_reg = XMM0;
|
||||
if (software_fma)
|
||||
{
|
||||
for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
|
||||
{
|
||||
if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15) // (i == 0 || madds0) && !madds1
|
||||
{
|
||||
if (round_input)
|
||||
Force25BitPrecision(XMM1, Rc, XMM2);
|
||||
else
|
||||
MOVSD(XMM1, Rc);
|
||||
}
|
||||
else
|
||||
{
|
||||
MOVHLPS(XMM1, Rc.GetSimpleReg());
|
||||
if (round_input)
|
||||
Force25BitPrecision(XMM1, R(XMM1), XMM2);
|
||||
}
|
||||
|
||||
// Write the result from the previous loop iteration into Rd so we don't lose it.
|
||||
// It's important that this is done after reading Rc above, in case we have madds1 and c == d.
|
||||
if (packed && i == 0)
|
||||
MOVLHPS(Rd, XMM0);
|
||||
|
||||
if (i == 0)
|
||||
{
|
||||
MOVSD(XMM0, Ra);
|
||||
MOVSD(XMM2, Rb);
|
||||
}
|
||||
else
|
||||
{
|
||||
MOVHLPS(XMM0, Ra.GetSimpleReg());
|
||||
MOVHLPS(XMM2, Rb.GetSimpleReg());
|
||||
}
|
||||
|
||||
if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30) // nsub, nmsub
|
||||
XORPS(XMM2, MConst(psSignBits));
|
||||
|
||||
BitSet32 registers_in_use = CallerSavedRegistersInUse();
|
||||
ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
|
||||
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
|
||||
ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
|
||||
}
|
||||
|
||||
if (packed)
|
||||
{
|
||||
MOVSD(Rd, XMM0);
|
||||
result_reg = Rd;
|
||||
}
|
||||
|
||||
if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31) // nmsub, nmadd
|
||||
XORPD(result_reg, MConst(packed ? psSignBits2 : psSignBits));
|
||||
}
|
||||
else
|
||||
{
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 14:
|
||||
MOVDDUP(XMM1, Rc);
|
||||
case 14: // madds0
|
||||
MOVDDUP(XMM0, Rc);
|
||||
if (round_input)
|
||||
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||
break;
|
||||
case 15:
|
||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, Rc, Rc, 3);
|
||||
case 15: // madds1
|
||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, Rc, Rc, 3);
|
||||
if (round_input)
|
||||
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||
break;
|
||||
default:
|
||||
bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::WantsDeterminism());
|
||||
X64Reg tmp1 = special ? XMM0 : XMM1;
|
||||
X64Reg tmp2 = special ? XMM1 : XMM0;
|
||||
if (single && round_input)
|
||||
Force25BitPrecision(tmp1, Rc, tmp2);
|
||||
Force25BitPrecision(XMM0, Rc, XMM1);
|
||||
else
|
||||
MOVAPD(tmp1, Rc);
|
||||
MOVAPD(XMM0, Rc);
|
||||
break;
|
||||
}
|
||||
|
||||
if (use_fma)
|
||||
if (cpu_info.bFMA)
|
||||
{
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 28: // msub
|
||||
if (packed)
|
||||
VFMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||
VFMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||
else
|
||||
VFMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||
VFMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||
break;
|
||||
case 14: // madds0
|
||||
case 15: // madds1
|
||||
case 29: // madd
|
||||
if (packed)
|
||||
VFMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||
VFMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||
else
|
||||
VFMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||
VFMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||
break;
|
||||
// PowerPC and x86 define NMADD/NMSUB differently
|
||||
// x86: D = -A*C (+/-) B
|
||||
|
@ -312,22 +381,27 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||
// so we have to swap them; the ADD/SUB here isn't a typo.
|
||||
case 30: // nmsub
|
||||
if (packed)
|
||||
VFNMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||
VFNMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||
else
|
||||
VFNMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||
VFNMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||
break;
|
||||
case 31: // nmadd
|
||||
if (packed)
|
||||
VFNMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||
VFNMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||
else
|
||||
VFNMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||
VFNMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (inst.SUBOP5 == 30) // nmsub
|
||||
else
|
||||
{
|
||||
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it
|
||||
// separately.
|
||||
// No hardware support for FMA, and determinism is not enabled. In this case we inaccurately
|
||||
// do the multiplication and addition/subtraction in two separate operations for performance.
|
||||
|
||||
if (inst.SUBOP5 == 30) // nmsub
|
||||
{
|
||||
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
|
||||
// so handle it separately.
|
||||
MOVAPD(XMM1, Rb);
|
||||
if (packed)
|
||||
{
|
||||
|
@ -339,38 +413,41 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||
MULSD(XMM0, Ra);
|
||||
SUBSD(XMM1, R(XMM0));
|
||||
}
|
||||
result_reg = XMM1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (packed)
|
||||
{
|
||||
MULPD(XMM1, Ra);
|
||||
MULPD(XMM0, Ra);
|
||||
if (inst.SUBOP5 == 28) // msub
|
||||
SUBPD(XMM1, Rb);
|
||||
SUBPD(XMM0, Rb);
|
||||
else //(n)madd(s[01])
|
||||
ADDPD(XMM1, Rb);
|
||||
ADDPD(XMM0, Rb);
|
||||
}
|
||||
else
|
||||
{
|
||||
MULSD(XMM1, Ra);
|
||||
MULSD(XMM0, Ra);
|
||||
if (inst.SUBOP5 == 28)
|
||||
SUBSD(XMM1, Rb);
|
||||
SUBSD(XMM0, Rb);
|
||||
else
|
||||
ADDSD(XMM1, Rb);
|
||||
ADDSD(XMM0, Rb);
|
||||
}
|
||||
if (inst.SUBOP5 == 31) // nmadd
|
||||
XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits));
|
||||
XORPD(XMM0, MConst(packed ? psSignBits2 : psSignBits));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (single)
|
||||
{
|
||||
HandleNaNs(inst, Rd, XMM1);
|
||||
ForceSinglePrecision(Rd, Rd, packed, true);
|
||||
HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1);
|
||||
ForceSinglePrecision(Rd, R(result_reg), packed, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
HandleNaNs(inst, XMM1, XMM1);
|
||||
MOVSD(Rd, R(XMM1));
|
||||
HandleNaNs(inst, result_reg, result_reg, XMM1);
|
||||
MOVSD(Rd, R(result_reg));
|
||||
}
|
||||
SetFPRFIfNeeded(Rd);
|
||||
}
|
||||
|
|
|
@ -828,6 +828,7 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&,
|
|||
else
|
||||
{
|
||||
(this->*sseOp)(XMM0, arg2, imm);
|
||||
if (regOp != XMM0)
|
||||
MOVAPD(regOp, R(XMM0));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue