Jit64: merge ps_maddXX into fmaddXX
This commit is contained in:
parent
c6147c5ed5
commit
36d6a16559
|
@ -194,7 +194,6 @@ public:
|
||||||
void ps_mr(UGeckoInstruction inst);
|
void ps_mr(UGeckoInstruction inst);
|
||||||
void ps_sign(UGeckoInstruction inst); //aggregate
|
void ps_sign(UGeckoInstruction inst); //aggregate
|
||||||
void ps_mergeXX(UGeckoInstruction inst);
|
void ps_mergeXX(UGeckoInstruction inst);
|
||||||
void ps_maddXX(UGeckoInstruction inst);
|
|
||||||
void ps_res(UGeckoInstruction inst);
|
void ps_res(UGeckoInstruction inst);
|
||||||
void ps_rsqrte(UGeckoInstruction inst);
|
void ps_rsqrte(UGeckoInstruction inst);
|
||||||
void ps_sum(UGeckoInstruction inst);
|
void ps_sum(UGeckoInstruction inst);
|
||||||
|
|
|
@ -122,8 +122,8 @@ static GekkoOPTemplate table4_2[] =
|
||||||
{11, &Jit64::ps_sum}, // ps_sum1
|
{11, &Jit64::ps_sum}, // ps_sum1
|
||||||
{12, &Jit64::ps_muls}, // ps_muls0
|
{12, &Jit64::ps_muls}, // ps_muls0
|
||||||
{13, &Jit64::ps_muls}, // ps_muls1
|
{13, &Jit64::ps_muls}, // ps_muls1
|
||||||
{14, &Jit64::ps_maddXX}, // ps_madds0
|
{14, &Jit64::fmaddXX}, // ps_madds0
|
||||||
{15, &Jit64::ps_maddXX}, // ps_madds1
|
{15, &Jit64::fmaddXX}, // ps_madds1
|
||||||
{18, &Jit64::fp_arith}, // ps_div
|
{18, &Jit64::fp_arith}, // ps_div
|
||||||
{20, &Jit64::fp_arith}, // ps_sub
|
{20, &Jit64::fp_arith}, // ps_sub
|
||||||
{21, &Jit64::fp_arith}, // ps_add
|
{21, &Jit64::fp_arith}, // ps_add
|
||||||
|
@ -131,10 +131,10 @@ static GekkoOPTemplate table4_2[] =
|
||||||
{24, &Jit64::ps_res}, // ps_res
|
{24, &Jit64::ps_res}, // ps_res
|
||||||
{25, &Jit64::fp_arith}, // ps_mul
|
{25, &Jit64::fp_arith}, // ps_mul
|
||||||
{26, &Jit64::ps_rsqrte}, // ps_rsqrte
|
{26, &Jit64::ps_rsqrte}, // ps_rsqrte
|
||||||
{28, &Jit64::ps_maddXX}, // ps_msub
|
{28, &Jit64::fmaddXX}, // ps_msub
|
||||||
{29, &Jit64::ps_maddXX}, // ps_madd
|
{29, &Jit64::fmaddXX}, // ps_madd
|
||||||
{30, &Jit64::ps_maddXX}, // ps_nmsub
|
{30, &Jit64::fmaddXX}, // ps_nmsub
|
||||||
{31, &Jit64::ps_maddXX}, // ps_nmadd
|
{31, &Jit64::fmaddXX}, // ps_nmadd
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -105,14 +105,39 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
int b = inst.FB;
|
int b = inst.FB;
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
bool single = inst.OPCD == 59;
|
bool single = inst.OPCD == 4 || inst.OPCD == 59;
|
||||||
bool round_input = single && !jit->js.op->fprIsSingle[c];
|
bool round_input = single && !jit->js.op->fprIsSingle[c];
|
||||||
bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[b] && jit->js.op->fprIsDuplicated[c];
|
bool packed = inst.OPCD == 4 ||
|
||||||
if (cpu_info.bAtom)
|
(!cpu_info.bAtom && single &&
|
||||||
packed = false;
|
jit->js.op->fprIsDuplicated[a] &&
|
||||||
|
jit->js.op->fprIsDuplicated[b] &&
|
||||||
|
jit->js.op->fprIsDuplicated[c]);
|
||||||
|
|
||||||
fpr.Lock(a, b, c, d);
|
fpr.Lock(a, b, c, d);
|
||||||
|
|
||||||
|
switch(inst.SUBOP5)
|
||||||
|
{
|
||||||
|
case 14:
|
||||||
|
MOVDDUP(XMM0, fpr.R(c));
|
||||||
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||||
|
break;
|
||||||
|
case 15:
|
||||||
|
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
|
||||||
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::g_want_determinism);
|
||||||
|
X64Reg tmp1 = special ? XMM1 : XMM0;
|
||||||
|
X64Reg tmp2 = special ? XMM0 : XMM1;
|
||||||
|
if (single && round_input)
|
||||||
|
Force25BitPrecision(tmp1, fpr.R(c), tmp2);
|
||||||
|
else
|
||||||
|
MOVAPD(tmp1, fpr.R(c));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
// While we don't know if any games are actually affected (replays seem to work with all the usual
|
// While we don't know if any games are actually affected (replays seem to work with all the usual
|
||||||
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so
|
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so
|
||||||
// be extra careful and don't use FMA, even if in theory it might be okay.
|
// be extra careful and don't use FMA, even if in theory it might be okay.
|
||||||
|
@ -121,10 +146,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
// instances on different computers giving identical results.
|
// instances on different computers giving identical results.
|
||||||
if (cpu_info.bFMA && !Core::g_want_determinism)
|
if (cpu_info.bFMA && !Core::g_want_determinism)
|
||||||
{
|
{
|
||||||
if (single && round_input)
|
|
||||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
|
||||||
else
|
|
||||||
MOVAPD(XMM0, fpr.R(c));
|
|
||||||
// Statistics suggests b is a lot less likely to be unbound in practice, so
|
// Statistics suggests b is a lot less likely to be unbound in practice, so
|
||||||
// if we have to pick one of a or b to bind, let's make it b.
|
// if we have to pick one of a or b to bind, let's make it b.
|
||||||
fpr.BindToRegister(b, true, false);
|
fpr.BindToRegister(b, true, false);
|
||||||
|
@ -136,6 +157,8 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
else
|
else
|
||||||
VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
|
VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
|
||||||
break;
|
break;
|
||||||
|
case 14: //madds0
|
||||||
|
case 15: //madds1
|
||||||
case 29: //madd
|
case 29: //madd
|
||||||
if (packed)
|
if (packed)
|
||||||
VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
|
VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
|
||||||
|
@ -162,11 +185,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
}
|
}
|
||||||
else if (inst.SUBOP5 == 30) //nmsub
|
else if (inst.SUBOP5 == 30) //nmsub
|
||||||
{
|
{
|
||||||
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
|
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately.
|
||||||
if (single && round_input)
|
|
||||||
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
|
|
||||||
else
|
|
||||||
MOVAPD(XMM1, fpr.R(c));
|
|
||||||
MOVAPD(XMM0, fpr.R(b));
|
MOVAPD(XMM0, fpr.R(b));
|
||||||
if (packed)
|
if (packed)
|
||||||
{
|
{
|
||||||
|
@ -181,16 +200,12 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (single && round_input)
|
|
||||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
|
||||||
else
|
|
||||||
MOVAPD(XMM0, fpr.R(c));
|
|
||||||
if (packed)
|
if (packed)
|
||||||
{
|
{
|
||||||
MULPD(XMM0, fpr.R(a));
|
MULPD(XMM0, fpr.R(a));
|
||||||
if (inst.SUBOP5 == 28) //msub
|
if (inst.SUBOP5 == 28) //msub
|
||||||
SUBPD(XMM0, fpr.R(b));
|
SUBPD(XMM0, fpr.R(b));
|
||||||
else //(n)madd
|
else //(n)madd(s[01])
|
||||||
ADDPD(XMM0, fpr.R(b));
|
ADDPD(XMM0, fpr.R(b));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
|
|
@ -224,100 +224,6 @@ void Jit64::ps_res(UGeckoInstruction inst)
|
||||||
gpr.UnlockAllX();
|
gpr.UnlockAllX();
|
||||||
}
|
}
|
||||||
|
|
||||||
//TODO: add optimized cases
|
|
||||||
void Jit64::ps_maddXX(UGeckoInstruction inst)
|
|
||||||
{
|
|
||||||
INSTRUCTION_START
|
|
||||||
JITDISABLE(bJITPairedOff);
|
|
||||||
FALLBACK_IF(inst.Rc);
|
|
||||||
|
|
||||||
int a = inst.FA;
|
|
||||||
int b = inst.FB;
|
|
||||||
int c = inst.FC;
|
|
||||||
int d = inst.FD;
|
|
||||||
bool fma = cpu_info.bFMA && !Core::g_want_determinism;
|
|
||||||
bool round_input = !jit->js.op->fprIsSingle[c];
|
|
||||||
fpr.Lock(a, b, c, d);
|
|
||||||
|
|
||||||
if (fma)
|
|
||||||
fpr.BindToRegister(b, true, false);
|
|
||||||
|
|
||||||
if (inst.SUBOP5 == 14)
|
|
||||||
{
|
|
||||||
MOVDDUP(XMM0, fpr.R(c));
|
|
||||||
if (round_input)
|
|
||||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
|
||||||
}
|
|
||||||
else if (inst.SUBOP5 == 15)
|
|
||||||
{
|
|
||||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
|
|
||||||
if (round_input)
|
|
||||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (round_input)
|
|
||||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
|
||||||
else
|
|
||||||
MOVAPD(XMM0, fpr.R(c));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (fma)
|
|
||||||
{
|
|
||||||
switch (inst.SUBOP5)
|
|
||||||
{
|
|
||||||
case 14: //madds0
|
|
||||||
case 15: //madds1
|
|
||||||
case 29: //madd
|
|
||||||
VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
|
|
||||||
break;
|
|
||||||
case 28: //msub
|
|
||||||
VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
|
|
||||||
break;
|
|
||||||
case 30: //nmsub
|
|
||||||
VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
|
|
||||||
break;
|
|
||||||
case 31: //nmadd
|
|
||||||
VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
switch (inst.SUBOP5)
|
|
||||||
{
|
|
||||||
case 14: //madds0
|
|
||||||
case 15: //madds1
|
|
||||||
case 29: //madd
|
|
||||||
MULPD(XMM0, fpr.R(a));
|
|
||||||
ADDPD(XMM0, fpr.R(b));
|
|
||||||
break;
|
|
||||||
case 28: //msub
|
|
||||||
MULPD(XMM0, fpr.R(a));
|
|
||||||
SUBPD(XMM0, fpr.R(b));
|
|
||||||
break;
|
|
||||||
case 30: //nmsub
|
|
||||||
MULPD(XMM0, fpr.R(a));
|
|
||||||
SUBPD(XMM0, fpr.R(b));
|
|
||||||
PXOR(XMM0, M(psSignBits));
|
|
||||||
break;
|
|
||||||
case 31: //nmadd
|
|
||||||
MULPD(XMM0, fpr.R(a));
|
|
||||||
ADDPD(XMM0, fpr.R(b));
|
|
||||||
PXOR(XMM0, M(psSignBits));
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
_assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fpr.BindToRegister(d, false);
|
|
||||||
ForceSinglePrecision(fpr.RX(d), R(XMM0));
|
|
||||||
SetFPRFIfNeeded(fpr.RX(d));
|
|
||||||
fpr.UnlockAll();
|
|
||||||
}
|
|
||||||
|
|
||||||
void Jit64::ps_cmpXX(UGeckoInstruction inst)
|
void Jit64::ps_cmpXX(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
|
|
Loading…
Reference in New Issue