JIT: skip the 25-bit input rounding (Force25BitPrecision) for fmul-family ops if the input is known to be single precision
This commit is contained in:
parent
d4125231f3
commit
7df50b0710
|
@ -64,12 +64,13 @@ void Jit64::fp_arith(UGeckoInstruction inst)
|
||||||
FALLBACK_IF(inst.Rc);
|
FALLBACK_IF(inst.Rc);
|
||||||
|
|
||||||
bool single = inst.OPCD == 59;
|
bool single = inst.OPCD == 59;
|
||||||
|
bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div
|
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div
|
||||||
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub
|
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub
|
||||||
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add
|
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add
|
||||||
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, single); break; //mul
|
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, round_input); break; //mul
|
||||||
default:
|
default:
|
||||||
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
|
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
|
||||||
}
|
}
|
||||||
|
@ -81,12 +82,12 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
JITDISABLE(bJITFloatingPointOff);
|
JITDISABLE(bJITFloatingPointOff);
|
||||||
FALLBACK_IF(inst.Rc);
|
FALLBACK_IF(inst.Rc);
|
||||||
|
|
||||||
bool single_precision = inst.OPCD == 59;
|
|
||||||
|
|
||||||
int a = inst.FA;
|
int a = inst.FA;
|
||||||
int b = inst.FB;
|
int b = inst.FB;
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
|
bool single_precision = inst.OPCD == 59;
|
||||||
|
bool round_input = single_precision && !jit->js.op->fprIsSingle[c];
|
||||||
|
|
||||||
fpr.Lock(a, b, c, d);
|
fpr.Lock(a, b, c, d);
|
||||||
|
|
||||||
|
@ -98,10 +99,10 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
// instances on different computers giving identical results.
|
// instances on different computers giving identical results.
|
||||||
if (cpu_info.bFMA && !Core::g_want_determinism)
|
if (cpu_info.bFMA && !Core::g_want_determinism)
|
||||||
{
|
{
|
||||||
if (single_precision)
|
if (single_precision && round_input)
|
||||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||||
else
|
else
|
||||||
MOVSD(XMM0, fpr.R(c));
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
// Statistics suggests b is a lot less likely to be unbound in practice, so
|
// Statistics suggests b is a lot less likely to be unbound in practice, so
|
||||||
// if we have to pick one of a or b to bind, let's make it b.
|
// if we have to pick one of a or b to bind, let's make it b.
|
||||||
fpr.BindToRegister(b, true, false);
|
fpr.BindToRegister(b, true, false);
|
||||||
|
@ -128,20 +129,20 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
else if (inst.SUBOP5 == 30) //nmsub
|
else if (inst.SUBOP5 == 30) //nmsub
|
||||||
{
|
{
|
||||||
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
|
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
|
||||||
if (single_precision)
|
if (single_precision && round_input)
|
||||||
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
|
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
|
||||||
else
|
else
|
||||||
MOVSD(XMM1, fpr.R(c));
|
MOVAPD(XMM1, fpr.R(c));
|
||||||
MULSD(XMM1, fpr.R(a));
|
MULSD(XMM1, fpr.R(a));
|
||||||
MOVSD(XMM0, fpr.R(b));
|
MOVSD(XMM0, fpr.R(b));
|
||||||
SUBSD(XMM0, R(XMM1));
|
SUBSD(XMM0, R(XMM1));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (single_precision)
|
if (single_precision && round_input)
|
||||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||||
else
|
else
|
||||||
MOVSD(XMM0, fpr.R(c));
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
MULSD(XMM0, fpr.R(a));
|
MULSD(XMM0, fpr.R(a));
|
||||||
if (inst.SUBOP5 == 28) //msub
|
if (inst.SUBOP5 == 28) //msub
|
||||||
SUBSD(XMM0, fpr.R(b));
|
SUBSD(XMM0, fpr.R(b));
|
||||||
|
|
|
@ -124,6 +124,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
|
||||||
JITDISABLE(bJITPairedOff);
|
JITDISABLE(bJITPairedOff);
|
||||||
FALLBACK_IF(inst.Rc);
|
FALLBACK_IF(inst.Rc);
|
||||||
|
|
||||||
|
bool round_input = !jit->js.op->fprIsSingle[inst.FC];
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 18: // div
|
case 18: // div
|
||||||
|
@ -136,7 +137,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
|
||||||
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst);
|
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst);
|
||||||
break;
|
break;
|
||||||
case 25: // mul
|
case 25: // mul
|
||||||
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, true);
|
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, round_input);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
|
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
|
||||||
|
@ -187,6 +188,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
int a = inst.FA;
|
int a = inst.FA;
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
|
bool round_input = !jit->js.op->fprIsSingle[c];
|
||||||
fpr.Lock(a, c, d);
|
fpr.Lock(a, c, d);
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
|
@ -199,7 +201,8 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
||||||
default:
|
default:
|
||||||
PanicAlert("ps_muls WTF!!!");
|
PanicAlert("ps_muls WTF!!!");
|
||||||
}
|
}
|
||||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||||
MULPD(XMM0, fpr.R(a));
|
MULPD(XMM0, fpr.R(a));
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, false);
|
||||||
ForceSinglePrecisionP(fpr.RX(d), XMM0);
|
ForceSinglePrecisionP(fpr.RX(d), XMM0);
|
||||||
|
@ -306,6 +309,7 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
bool fma = cpu_info.bFMA && !Core::g_want_determinism;
|
bool fma = cpu_info.bFMA && !Core::g_want_determinism;
|
||||||
|
bool round_input = !jit->js.op->fprIsSingle[c];
|
||||||
fpr.Lock(a,b,c,d);
|
fpr.Lock(a,b,c,d);
|
||||||
|
|
||||||
if (fma)
|
if (fma)
|
||||||
|
@ -314,16 +318,21 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
|
||||||
if (inst.SUBOP5 == 14)
|
if (inst.SUBOP5 == 14)
|
||||||
{
|
{
|
||||||
MOVDDUP(XMM0, fpr.R(c));
|
MOVDDUP(XMM0, fpr.R(c));
|
||||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||||
}
|
}
|
||||||
else if (inst.SUBOP5 == 15)
|
else if (inst.SUBOP5 == 15)
|
||||||
{
|
{
|
||||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
|
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
|
||||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||||
|
else
|
||||||
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fma)
|
if (fma)
|
||||||
|
|
|
@ -827,10 +827,21 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
|
||||||
// the same location later).
|
// the same location later).
|
||||||
gprInUse |= code[i].regsOut;
|
gprInUse |= code[i].regsOut;
|
||||||
if (code[i].fregOut >= 0)
|
if (code[i].fregOut >= 0)
|
||||||
{
|
|
||||||
fprInUse[code[i].fregOut] = true;
|
fprInUse[code[i].fregOut] = true;
|
||||||
if (strncmp(code[i].opinfo->opname, "stfd", 4))
|
}
|
||||||
fprInXmm[code[i].fregOut] = true;
|
|
||||||
|
// Forward scan, for flags that need the other direction for calculation
|
||||||
|
BitSet32 fprIsSingle;
|
||||||
|
for (u32 i = 0; i < block->m_num_instructions; i++)
|
||||||
|
{
|
||||||
|
code[i].fprIsSingle = fprIsSingle;
|
||||||
|
if (code[i].fregOut >= 0)
|
||||||
|
{
|
||||||
|
// This instruction outputs float, so we can omit the special rounding done in fmuls/fmadds
|
||||||
|
if (code[i].opinfo->type == OPTYPE_SINGLEFP || code[i].opinfo->type == OPTYPE_PS || strncmp(code[i].opinfo->opname, "lfs", 3))
|
||||||
|
fprIsSingle[code[i].fregOut] = true;
|
||||||
|
else
|
||||||
|
fprIsSingle[code[i].fregOut] = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return address;
|
return address;
|
||||||
|
|
|
@ -51,6 +51,8 @@ struct CodeOp //16B
|
||||||
// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
|
// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
|
||||||
// an XMM only to move it again to a GPR afterwards.
|
// an XMM only to move it again to a GPR afterwards.
|
||||||
BitSet32 fprInXmm;
|
BitSet32 fprInXmm;
|
||||||
|
// whether an fpr is known to be an actual single-precision value at this point in the block.
|
||||||
|
BitSet32 fprIsSingle;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct BlockStats
|
struct BlockStats
|
||||||
|
|
Loading…
Reference in New Issue