JIT: skip weird fmul rounding if the input is known to be single precision

This commit is contained in:
Fiora 2014-10-11 13:07:31 -07:00
parent d4125231f3
commit 7df50b0710
4 changed files with 40 additions and 17 deletions

View File

@ -64,12 +64,13 @@ void Jit64::fp_arith(UGeckoInstruction inst)
FALLBACK_IF(inst.Rc);
bool single = inst.OPCD == 59;
bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
switch (inst.SUBOP5)
{
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, single); break; //mul
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, round_input); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
}
@ -81,12 +82,12 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
JITDISABLE(bJITFloatingPointOff);
FALLBACK_IF(inst.Rc);
bool single_precision = inst.OPCD == 59;
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
int d = inst.FD;
bool single_precision = inst.OPCD == 59;
bool round_input = single_precision && !jit->js.op->fprIsSingle[c];
fpr.Lock(a, b, c, d);
@ -98,10 +99,10 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
// instances on different computers giving identical results.
if (cpu_info.bFMA && !Core::g_want_determinism)
{
if (single_precision)
if (single_precision && round_input)
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
else
MOVSD(XMM0, fpr.R(c));
MOVAPD(XMM0, fpr.R(c));
// Statistics suggest b is a lot less likely to be unbound in practice, so
// if we have to pick one of a or b to bind, let's make it b.
fpr.BindToRegister(b, true, false);
@ -128,20 +129,20 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
else if (inst.SUBOP5 == 30) //nmsub
{
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
if (single_precision)
if (single_precision && round_input)
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
else
MOVSD(XMM1, fpr.R(c));
MOVAPD(XMM1, fpr.R(c));
MULSD(XMM1, fpr.R(a));
MOVSD(XMM0, fpr.R(b));
SUBSD(XMM0, R(XMM1));
}
else
{
if (single_precision)
if (single_precision && round_input)
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
else
MOVSD(XMM0, fpr.R(c));
MOVAPD(XMM0, fpr.R(c));
MULSD(XMM0, fpr.R(a));
if (inst.SUBOP5 == 28) //msub
SUBSD(XMM0, fpr.R(b));

View File

@ -124,6 +124,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
JITDISABLE(bJITPairedOff);
FALLBACK_IF(inst.Rc);
bool round_input = !jit->js.op->fprIsSingle[inst.FC];
switch (inst.SUBOP5)
{
case 18: // div
@ -136,7 +137,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst);
break;
case 25: // mul
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, true);
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, round_input);
break;
default:
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
@ -187,6 +188,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
int d = inst.FD;
int a = inst.FA;
int c = inst.FC;
bool round_input = !jit->js.op->fprIsSingle[c];
fpr.Lock(a, c, d);
switch (inst.SUBOP5)
{
@ -199,6 +201,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
default:
PanicAlert("ps_muls WTF!!!");
}
if (round_input)
Force25BitPrecision(XMM0, R(XMM0), XMM1);
MULPD(XMM0, fpr.R(a));
fpr.BindToRegister(d, false);
@ -306,6 +309,7 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
int c = inst.FC;
int d = inst.FD;
bool fma = cpu_info.bFMA && !Core::g_want_determinism;
bool round_input = !jit->js.op->fprIsSingle[c];
fpr.Lock(a,b,c,d);
if (fma)
@ -314,16 +318,21 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
if (inst.SUBOP5 == 14)
{
MOVDDUP(XMM0, fpr.R(c));
if (round_input)
Force25BitPrecision(XMM0, R(XMM0), XMM1);
}
else if (inst.SUBOP5 == 15)
{
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
if (round_input)
Force25BitPrecision(XMM0, R(XMM0), XMM1);
}
else
{
if (round_input)
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
else
MOVAPD(XMM0, fpr.R(c));
}
if (fma)

View File

@ -827,10 +827,21 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
// the same location later).
gprInUse |= code[i].regsOut;
if (code[i].fregOut >= 0)
{
fprInUse[code[i].fregOut] = true;
if (strncmp(code[i].opinfo->opname, "stfd", 4))
fprInXmm[code[i].fregOut] = true;
}
// Forward scan, for flags that must be propagated in program order.
// fprIsSingle tracks, per FPR, whether the most recent write to it within this
// block is known to have produced a genuine single-precision value.
BitSet32 fprIsSingle;
for (u32 i = 0; i < block->m_num_instructions; i++)
{
	// Record the state as it is *before* instruction i executes; the
	// instruction's own output only affects the ops that follow it.
	code[i].fprIsSingle = fprIsSingle;
	if (code[i].fregOut >= 0)
	{
		// If this instruction outputs a single-precision float, later
		// fmuls/fmadds reading the register can omit their special input
		// rounding. strncmp() returns 0 on a match, so it has to be negated
		// to mean "opname starts with lfs"; without the negation every
		// non-lfs FP write (double-precision arithmetic, lfd, ...) would be
		// flagged as single.
		const bool outputs_single =
			code[i].opinfo->type == OPTYPE_SINGLEFP ||
			code[i].opinfo->type == OPTYPE_PS ||
			!strncmp(code[i].opinfo->opname, "lfs", 3);
		// Any other write invalidates what was previously known about the register.
		fprIsSingle[code[i].fregOut] = outputs_single;
	}
}
return address;

View File

@ -51,6 +51,8 @@ struct CodeOp //16B
// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
// an XMM only to move it again to a GPR afterwards.
BitSet32 fprInXmm;
// Whether each FPR is known to hold an actual single-precision value on entry to this
// instruction, i.e. before the instruction's own output is taken into account.
BitSet32 fprIsSingle;
};
struct BlockStats