Merge pull request #9970 from JosJuice/jit64-fmaddxx-accurate-nan
Jit64: Fix fmaddXX with accurate NaNs
This commit is contained in:
commit
28ee0af9a3
|
@ -126,7 +126,7 @@ public:
|
||||||
bool duplicate = false);
|
bool duplicate = false);
|
||||||
void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input);
|
void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input);
|
||||||
void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in,
|
void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in,
|
||||||
Gen::X64Reg clobber = Gen::XMM0);
|
Gen::X64Reg clobber);
|
||||||
|
|
||||||
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
|
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
|
||||||
|
|
||||||
|
|
|
@ -257,7 +257,7 @@ void Jit64::fp_arith(UGeckoInstruction inst)
|
||||||
avx_op(avxOp, sseOp, dest, Rop1, Rop2, packed, reversible);
|
avx_op(avxOp, sseOp, dest, Rop1, Rop2, packed, reversible);
|
||||||
}
|
}
|
||||||
|
|
||||||
HandleNaNs(inst, Rd, dest);
|
HandleNaNs(inst, Rd, dest, XMM0);
|
||||||
if (single)
|
if (single)
|
||||||
FinalizeSingleResult(Rd, Rd, packed, true);
|
FinalizeSingleResult(Rd, Rd, packed, true);
|
||||||
else
|
else
|
||||||
|
@ -345,7 +345,8 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
RegCache::Realize(Ra, Rb, Rc, Rd);
|
RegCache::Realize(Ra, Rb, Rc, Rd);
|
||||||
}
|
}
|
||||||
|
|
||||||
X64Reg result_reg = XMM0;
|
X64Reg scratch_xmm = !use_fma && inst.SUBOP5 == 30 ? XMM1 : XMM0;
|
||||||
|
X64Reg result_xmm = scratch_xmm == XMM0 ? XMM1 : XMM0;
|
||||||
if (software_fma)
|
if (software_fma)
|
||||||
{
|
{
|
||||||
for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
|
for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
|
||||||
|
@ -392,31 +393,35 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
if (packed)
|
if (packed)
|
||||||
{
|
{
|
||||||
MOVSD(Rd, XMM0);
|
MOVSD(Rd, XMM0);
|
||||||
result_reg = Rd;
|
result_xmm = Rd;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
result_xmm = XMM0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31) // nmsub, nmadd
|
if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31) // nmsub, nmadd
|
||||||
XORPD(result_reg, MConst(packed ? psSignBits2 : psSignBits));
|
XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 14: // madds0
|
case 14: // madds0
|
||||||
MOVDDUP(XMM0, Rc);
|
MOVDDUP(result_xmm, Rc);
|
||||||
if (round_input)
|
if (round_input)
|
||||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
|
||||||
break;
|
break;
|
||||||
case 15: // madds1
|
case 15: // madds1
|
||||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, Rc, Rc, 3);
|
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3);
|
||||||
if (round_input)
|
if (round_input)
|
||||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
if (single && round_input)
|
if (single && round_input)
|
||||||
Force25BitPrecision(XMM0, Rc, XMM1);
|
Force25BitPrecision(result_xmm, Rc, scratch_xmm);
|
||||||
else
|
else
|
||||||
MOVAPD(XMM0, Rc);
|
MOVAPD(result_xmm, Rc);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -426,17 +431,17 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
case 28: // msub
|
case 28: // msub
|
||||||
if (packed)
|
if (packed)
|
||||||
VFMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
|
VFMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
|
||||||
else
|
else
|
||||||
VFMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
|
VFMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
|
||||||
break;
|
break;
|
||||||
case 14: // madds0
|
case 14: // madds0
|
||||||
case 15: // madds1
|
case 15: // madds1
|
||||||
case 29: // madd
|
case 29: // madd
|
||||||
if (packed)
|
if (packed)
|
||||||
VFMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
|
VFMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
|
||||||
else
|
else
|
||||||
VFMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
|
VFMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
|
||||||
break;
|
break;
|
||||||
// PowerPC and x86 define NMADD/NMSUB differently
|
// PowerPC and x86 define NMADD/NMSUB differently
|
||||||
// x86: D = -A*C (+/-) B
|
// x86: D = -A*C (+/-) B
|
||||||
|
@ -444,15 +449,15 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
// so we have to swap them; the ADD/SUB here isn't a typo.
|
// so we have to swap them; the ADD/SUB here isn't a typo.
|
||||||
case 30: // nmsub
|
case 30: // nmsub
|
||||||
if (packed)
|
if (packed)
|
||||||
VFNMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
|
VFNMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
|
||||||
else
|
else
|
||||||
VFNMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
|
VFNMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
|
||||||
break;
|
break;
|
||||||
case 31: // nmadd
|
case 31: // nmadd
|
||||||
if (packed)
|
if (packed)
|
||||||
VFNMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
|
VFNMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
|
||||||
else
|
else
|
||||||
VFNMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
|
VFNMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -462,52 +467,59 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
|
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
|
||||||
// so handle it separately.
|
// so handle it separately.
|
||||||
MOVAPD(XMM1, Rb);
|
MOVAPD(scratch_xmm, Rb);
|
||||||
if (packed)
|
if (packed)
|
||||||
{
|
{
|
||||||
MULPD(XMM0, Ra);
|
MULPD(result_xmm, Ra);
|
||||||
SUBPD(XMM1, R(XMM0));
|
SUBPD(scratch_xmm, R(result_xmm));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
MULSD(XMM0, Ra);
|
MULSD(result_xmm, Ra);
|
||||||
SUBSD(XMM1, R(XMM0));
|
SUBSD(scratch_xmm, R(result_xmm));
|
||||||
}
|
}
|
||||||
result_reg = XMM1;
|
result_xmm = scratch_xmm;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (packed)
|
if (packed)
|
||||||
{
|
{
|
||||||
MULPD(XMM0, Ra);
|
MULPD(result_xmm, Ra);
|
||||||
if (inst.SUBOP5 == 28) // msub
|
if (inst.SUBOP5 == 28) // msub
|
||||||
SUBPD(XMM0, Rb);
|
SUBPD(result_xmm, Rb);
|
||||||
else //(n)madd(s[01])
|
else //(n)madd(s[01])
|
||||||
ADDPD(XMM0, Rb);
|
ADDPD(result_xmm, Rb);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
MULSD(XMM0, Ra);
|
MULSD(result_xmm, Ra);
|
||||||
if (inst.SUBOP5 == 28)
|
if (inst.SUBOP5 == 28)
|
||||||
SUBSD(XMM0, Rb);
|
SUBSD(result_xmm, Rb);
|
||||||
else
|
else
|
||||||
ADDSD(XMM0, Rb);
|
ADDSD(result_xmm, Rb);
|
||||||
}
|
}
|
||||||
if (inst.SUBOP5 == 31) // nmadd
|
if (inst.SUBOP5 == 31) // nmadd
|
||||||
XORPD(XMM0, MConst(packed ? psSignBits2 : psSignBits));
|
XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (SConfig::GetInstance().bAccurateNaNs && result_xmm == XMM0)
|
||||||
|
{
|
||||||
|
// HandleNaNs needs to clobber XMM0
|
||||||
|
MOVAPD(XMM1, R(result_xmm));
|
||||||
|
result_xmm = XMM1;
|
||||||
|
}
|
||||||
|
|
||||||
if (single)
|
if (single)
|
||||||
{
|
{
|
||||||
HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1);
|
HandleNaNs(inst, result_xmm, result_xmm, XMM0);
|
||||||
FinalizeSingleResult(Rd, R(result_reg), packed, true);
|
FinalizeSingleResult(Rd, R(result_xmm), packed, true);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
HandleNaNs(inst, result_reg, result_reg, XMM1);
|
HandleNaNs(inst, result_xmm, result_xmm, XMM0);
|
||||||
FinalizeDoubleResult(Rd, R(result_reg));
|
FinalizeDoubleResult(Rd, R(result_xmm));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -109,7 +109,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
||||||
if (round_input)
|
if (round_input)
|
||||||
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
||||||
MULPD(XMM1, Ra);
|
MULPD(XMM1, Ra);
|
||||||
HandleNaNs(inst, Rd, XMM1);
|
HandleNaNs(inst, Rd, XMM1, XMM0);
|
||||||
FinalizeSingleResult(Rd, Rd);
|
FinalizeSingleResult(Rd, Rd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue