JIT: skip weird fmul rounding if the input is known to be single precision
This commit is contained in:
parent
d4125231f3
commit
7df50b0710
|
@ -64,12 +64,13 @@ void Jit64::fp_arith(UGeckoInstruction inst)
|
|||
FALLBACK_IF(inst.Rc);
|
||||
|
||||
bool single = inst.OPCD == 59;
|
||||
bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div
|
||||
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub
|
||||
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add
|
||||
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, single); break; //mul
|
||||
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, round_input); break; //mul
|
||||
default:
|
||||
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
|
||||
}
|
||||
|
@ -81,12 +82,12 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||
JITDISABLE(bJITFloatingPointOff);
|
||||
FALLBACK_IF(inst.Rc);
|
||||
|
||||
bool single_precision = inst.OPCD == 59;
|
||||
|
||||
int a = inst.FA;
|
||||
int b = inst.FB;
|
||||
int c = inst.FC;
|
||||
int d = inst.FD;
|
||||
bool single_precision = inst.OPCD == 59;
|
||||
bool round_input = single_precision && !jit->js.op->fprIsSingle[c];
|
||||
|
||||
fpr.Lock(a, b, c, d);
|
||||
|
||||
|
@ -98,10 +99,10 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||
// instances on different computers giving identical results.
|
||||
if (cpu_info.bFMA && !Core::g_want_determinism)
|
||||
{
|
||||
if (single_precision)
|
||||
if (single_precision && round_input)
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
else
|
||||
MOVSD(XMM0, fpr.R(c));
|
||||
MOVAPD(XMM0, fpr.R(c));
|
||||
// Statistics suggests b is a lot less likely to be unbound in practice, so
|
||||
// if we have to pick one of a or b to bind, let's make it b.
|
||||
fpr.BindToRegister(b, true, false);
|
||||
|
@ -128,20 +129,20 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||
else if (inst.SUBOP5 == 30) //nmsub
|
||||
{
|
||||
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
|
||||
if (single_precision)
|
||||
if (single_precision && round_input)
|
||||
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
|
||||
else
|
||||
MOVSD(XMM1, fpr.R(c));
|
||||
MOVAPD(XMM1, fpr.R(c));
|
||||
MULSD(XMM1, fpr.R(a));
|
||||
MOVSD(XMM0, fpr.R(b));
|
||||
SUBSD(XMM0, R(XMM1));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (single_precision)
|
||||
if (single_precision && round_input)
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
else
|
||||
MOVSD(XMM0, fpr.R(c));
|
||||
MOVAPD(XMM0, fpr.R(c));
|
||||
MULSD(XMM0, fpr.R(a));
|
||||
if (inst.SUBOP5 == 28) //msub
|
||||
SUBSD(XMM0, fpr.R(b));
|
||||
|
|
|
@ -124,6 +124,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
|
|||
JITDISABLE(bJITPairedOff);
|
||||
FALLBACK_IF(inst.Rc);
|
||||
|
||||
bool round_input = !jit->js.op->fprIsSingle[inst.FC];
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 18: // div
|
||||
|
@ -136,7 +137,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
|
|||
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst);
|
||||
break;
|
||||
case 25: // mul
|
||||
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, true);
|
||||
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, round_input);
|
||||
break;
|
||||
default:
|
||||
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
|
||||
|
@ -187,6 +188,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
|||
int d = inst.FD;
|
||||
int a = inst.FA;
|
||||
int c = inst.FC;
|
||||
bool round_input = !jit->js.op->fprIsSingle[c];
|
||||
fpr.Lock(a, c, d);
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
|
@ -199,6 +201,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
|||
default:
|
||||
PanicAlert("ps_muls WTF!!!");
|
||||
}
|
||||
if (round_input)
|
||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
fpr.BindToRegister(d, false);
|
||||
|
@ -306,6 +309,7 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
|
|||
int c = inst.FC;
|
||||
int d = inst.FD;
|
||||
bool fma = cpu_info.bFMA && !Core::g_want_determinism;
|
||||
bool round_input = !jit->js.op->fprIsSingle[c];
|
||||
fpr.Lock(a,b,c,d);
|
||||
|
||||
if (fma)
|
||||
|
@ -314,16 +318,21 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
|
|||
if (inst.SUBOP5 == 14)
|
||||
{
|
||||
MOVDDUP(XMM0, fpr.R(c));
|
||||
if (round_input)
|
||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||
}
|
||||
else if (inst.SUBOP5 == 15)
|
||||
{
|
||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
|
||||
if (round_input)
|
||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (round_input)
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
else
|
||||
MOVAPD(XMM0, fpr.R(c));
|
||||
}
|
||||
|
||||
if (fma)
|
||||
|
|
|
@ -827,10 +827,21 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
|
|||
// the same location later).
|
||||
gprInUse |= code[i].regsOut;
|
||||
if (code[i].fregOut >= 0)
|
||||
{
|
||||
fprInUse[code[i].fregOut] = true;
|
||||
if (strncmp(code[i].opinfo->opname, "stfd", 4))
|
||||
fprInXmm[code[i].fregOut] = true;
|
||||
}
|
||||
|
||||
// Forward scan, for flags that need the other direction for calculation
|
||||
BitSet32 fprIsSingle;
|
||||
for (u32 i = 0; i < block->m_num_instructions; i++)
|
||||
{
|
||||
code[i].fprIsSingle = fprIsSingle;
|
||||
if (code[i].fregOut >= 0)
|
||||
{
|
||||
// This instruction outputs float, so we can omit the special rounding done in fmuls/fmadds
|
||||
if (code[i].opinfo->type == OPTYPE_SINGLEFP || code[i].opinfo->type == OPTYPE_PS || strncmp(code[i].opinfo->opname, "lfs", 3))
|
||||
fprIsSingle[code[i].fregOut] = true;
|
||||
else
|
||||
fprIsSingle[code[i].fregOut] = false;
|
||||
}
|
||||
}
|
||||
return address;
|
||||
|
|
|
@ -51,6 +51,8 @@ struct CodeOp //16B
|
|||
// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
|
||||
// an XMM only to move it again to a GPR afterwards.
|
||||
BitSet32 fprInXmm;
|
||||
// whether an fpr is known to be an actual single-precision value at this point in the block.
|
||||
BitSet32 fprIsSingle;
|
||||
};
|
||||
|
||||
struct BlockStats
|
||||
|
|
Loading…
Reference in New Issue