Merge pull request #9970 from JosJuice/jit64-fmaddxx-accurate-nan

Jit64: Fix fmaddXX with accurate NaNs
This commit is contained in:
Mai M 2021-07-28 20:45:00 -04:00 committed by GitHub
commit 28ee0af9a3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 49 additions and 37 deletions

View File

@@ -126,7 +126,7 @@ public:
bool duplicate = false); bool duplicate = false);
void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input); void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input);
void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in, void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in,
Gen::X64Reg clobber = Gen::XMM0); Gen::X64Reg clobber);
void MultiplyImmediate(u32 imm, int a, int d, bool overflow); void MultiplyImmediate(u32 imm, int a, int d, bool overflow);

View File

@@ -257,7 +257,7 @@ void Jit64::fp_arith(UGeckoInstruction inst)
avx_op(avxOp, sseOp, dest, Rop1, Rop2, packed, reversible); avx_op(avxOp, sseOp, dest, Rop1, Rop2, packed, reversible);
} }
HandleNaNs(inst, Rd, dest); HandleNaNs(inst, Rd, dest, XMM0);
if (single) if (single)
FinalizeSingleResult(Rd, Rd, packed, true); FinalizeSingleResult(Rd, Rd, packed, true);
else else
@@ -345,7 +345,8 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
RegCache::Realize(Ra, Rb, Rc, Rd); RegCache::Realize(Ra, Rb, Rc, Rd);
} }
X64Reg result_reg = XMM0; X64Reg scratch_xmm = !use_fma && inst.SUBOP5 == 30 ? XMM1 : XMM0;
X64Reg result_xmm = scratch_xmm == XMM0 ? XMM1 : XMM0;
if (software_fma) if (software_fma)
{ {
for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i) for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
@@ -392,31 +393,35 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
if (packed) if (packed)
{ {
MOVSD(Rd, XMM0); MOVSD(Rd, XMM0);
result_reg = Rd; result_xmm = Rd;
}
else
{
result_xmm = XMM0;
} }
if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31) // nmsub, nmadd if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31) // nmsub, nmadd
XORPD(result_reg, MConst(packed ? psSignBits2 : psSignBits)); XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
} }
else else
{ {
switch (inst.SUBOP5) switch (inst.SUBOP5)
{ {
case 14: // madds0 case 14: // madds0
MOVDDUP(XMM0, Rc); MOVDDUP(result_xmm, Rc);
if (round_input) if (round_input)
Force25BitPrecision(XMM0, R(XMM0), XMM1); Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
break; break;
case 15: // madds1 case 15: // madds1
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, Rc, Rc, 3); avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3);
if (round_input) if (round_input)
Force25BitPrecision(XMM0, R(XMM0), XMM1); Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
break; break;
default: default:
if (single && round_input) if (single && round_input)
Force25BitPrecision(XMM0, Rc, XMM1); Force25BitPrecision(result_xmm, Rc, scratch_xmm);
else else
MOVAPD(XMM0, Rc); MOVAPD(result_xmm, Rc);
break; break;
} }
@@ -426,17 +431,17 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
{ {
case 28: // msub case 28: // msub
if (packed) if (packed)
VFMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra); VFMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
else else
VFMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra); VFMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
break; break;
case 14: // madds0 case 14: // madds0
case 15: // madds1 case 15: // madds1
case 29: // madd case 29: // madd
if (packed) if (packed)
VFMADD132PD(XMM0, Rb.GetSimpleReg(), Ra); VFMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
else else
VFMADD132SD(XMM0, Rb.GetSimpleReg(), Ra); VFMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
break; break;
// PowerPC and x86 define NMADD/NMSUB differently // PowerPC and x86 define NMADD/NMSUB differently
// x86: D = -A*C (+/-) B // x86: D = -A*C (+/-) B
@@ -444,15 +449,15 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
// so we have to swap them; the ADD/SUB here isn't a typo. // so we have to swap them; the ADD/SUB here isn't a typo.
case 30: // nmsub case 30: // nmsub
if (packed) if (packed)
VFNMADD132PD(XMM0, Rb.GetSimpleReg(), Ra); VFNMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
else else
VFNMADD132SD(XMM0, Rb.GetSimpleReg(), Ra); VFNMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
break; break;
case 31: // nmadd case 31: // nmadd
if (packed) if (packed)
VFNMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra); VFNMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
else else
VFNMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra); VFNMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
break; break;
} }
} }
@@ -462,52 +467,59 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
{ {
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
// so handle it separately. // so handle it separately.
MOVAPD(XMM1, Rb); MOVAPD(scratch_xmm, Rb);
if (packed) if (packed)
{ {
MULPD(XMM0, Ra); MULPD(result_xmm, Ra);
SUBPD(XMM1, R(XMM0)); SUBPD(scratch_xmm, R(result_xmm));
} }
else else
{ {
MULSD(XMM0, Ra); MULSD(result_xmm, Ra);
SUBSD(XMM1, R(XMM0)); SUBSD(scratch_xmm, R(result_xmm));
} }
result_reg = XMM1; result_xmm = scratch_xmm;
} }
else else
{ {
if (packed) if (packed)
{ {
MULPD(XMM0, Ra); MULPD(result_xmm, Ra);
if (inst.SUBOP5 == 28) // msub if (inst.SUBOP5 == 28) // msub
SUBPD(XMM0, Rb); SUBPD(result_xmm, Rb);
else //(n)madd(s[01]) else //(n)madd(s[01])
ADDPD(XMM0, Rb); ADDPD(result_xmm, Rb);
} }
else else
{ {
MULSD(XMM0, Ra); MULSD(result_xmm, Ra);
if (inst.SUBOP5 == 28) if (inst.SUBOP5 == 28)
SUBSD(XMM0, Rb); SUBSD(result_xmm, Rb);
else else
ADDSD(XMM0, Rb); ADDSD(result_xmm, Rb);
} }
if (inst.SUBOP5 == 31) // nmadd if (inst.SUBOP5 == 31) // nmadd
XORPD(XMM0, MConst(packed ? psSignBits2 : psSignBits)); XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
} }
} }
} }
if (SConfig::GetInstance().bAccurateNaNs && result_xmm == XMM0)
{
// HandleNaNs needs to clobber XMM0
MOVAPD(XMM1, R(result_xmm));
result_xmm = XMM1;
}
if (single) if (single)
{ {
HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1); HandleNaNs(inst, result_xmm, result_xmm, XMM0);
FinalizeSingleResult(Rd, R(result_reg), packed, true); FinalizeSingleResult(Rd, R(result_xmm), packed, true);
} }
else else
{ {
HandleNaNs(inst, result_reg, result_reg, XMM1); HandleNaNs(inst, result_xmm, result_xmm, XMM0);
FinalizeDoubleResult(Rd, R(result_reg)); FinalizeDoubleResult(Rd, R(result_xmm));
} }
} }

View File

@@ -109,7 +109,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
if (round_input) if (round_input)
Force25BitPrecision(XMM1, R(XMM1), XMM0); Force25BitPrecision(XMM1, R(XMM1), XMM0);
MULPD(XMM1, Ra); MULPD(XMM1, Ra);
HandleNaNs(inst, Rd, XMM1); HandleNaNs(inst, Rd, XMM1, XMM0);
FinalizeSingleResult(Rd, Rd); FinalizeSingleResult(Rd, Rd);
} }