Jit64: merge ps_maddXX into fmaddXX

This commit is contained in:
Tillmann Karras 2015-05-21 12:33:37 +02:00
parent c6147c5ed5
commit 36d6a16559
4 changed files with 39 additions and 119 deletions

View File

@ -194,7 +194,6 @@ public:
void ps_mr(UGeckoInstruction inst); void ps_mr(UGeckoInstruction inst);
void ps_sign(UGeckoInstruction inst); //aggregate void ps_sign(UGeckoInstruction inst); //aggregate
void ps_mergeXX(UGeckoInstruction inst); void ps_mergeXX(UGeckoInstruction inst);
void ps_maddXX(UGeckoInstruction inst);
void ps_res(UGeckoInstruction inst); void ps_res(UGeckoInstruction inst);
void ps_rsqrte(UGeckoInstruction inst); void ps_rsqrte(UGeckoInstruction inst);
void ps_sum(UGeckoInstruction inst); void ps_sum(UGeckoInstruction inst);

View File

@ -122,8 +122,8 @@ static GekkoOPTemplate table4_2[] =
{11, &Jit64::ps_sum}, // ps_sum1 {11, &Jit64::ps_sum}, // ps_sum1
{12, &Jit64::ps_muls}, // ps_muls0 {12, &Jit64::ps_muls}, // ps_muls0
{13, &Jit64::ps_muls}, // ps_muls1 {13, &Jit64::ps_muls}, // ps_muls1
{14, &Jit64::ps_maddXX}, // ps_madds0 {14, &Jit64::fmaddXX}, // ps_madds0
{15, &Jit64::ps_maddXX}, // ps_madds1 {15, &Jit64::fmaddXX}, // ps_madds1
{18, &Jit64::fp_arith}, // ps_div {18, &Jit64::fp_arith}, // ps_div
{20, &Jit64::fp_arith}, // ps_sub {20, &Jit64::fp_arith}, // ps_sub
{21, &Jit64::fp_arith}, // ps_add {21, &Jit64::fp_arith}, // ps_add
@ -131,10 +131,10 @@ static GekkoOPTemplate table4_2[] =
{24, &Jit64::ps_res}, // ps_res {24, &Jit64::ps_res}, // ps_res
{25, &Jit64::fp_arith}, // ps_mul {25, &Jit64::fp_arith}, // ps_mul
{26, &Jit64::ps_rsqrte}, // ps_rsqrte {26, &Jit64::ps_rsqrte}, // ps_rsqrte
{28, &Jit64::ps_maddXX}, // ps_msub {28, &Jit64::fmaddXX}, // ps_msub
{29, &Jit64::ps_maddXX}, // ps_madd {29, &Jit64::fmaddXX}, // ps_madd
{30, &Jit64::ps_maddXX}, // ps_nmsub {30, &Jit64::fmaddXX}, // ps_nmsub
{31, &Jit64::ps_maddXX}, // ps_nmadd {31, &Jit64::fmaddXX}, // ps_nmadd
}; };

View File

@ -105,14 +105,39 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
int b = inst.FB; int b = inst.FB;
int c = inst.FC; int c = inst.FC;
int d = inst.FD; int d = inst.FD;
bool single = inst.OPCD == 59; bool single = inst.OPCD == 4 || inst.OPCD == 59;
bool round_input = single && !jit->js.op->fprIsSingle[c]; bool round_input = single && !jit->js.op->fprIsSingle[c];
bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[b] && jit->js.op->fprIsDuplicated[c]; bool packed = inst.OPCD == 4 ||
if (cpu_info.bAtom) (!cpu_info.bAtom && single &&
packed = false; jit->js.op->fprIsDuplicated[a] &&
jit->js.op->fprIsDuplicated[b] &&
jit->js.op->fprIsDuplicated[c]);
fpr.Lock(a, b, c, d); fpr.Lock(a, b, c, d);
switch(inst.SUBOP5)
{
case 14:
MOVDDUP(XMM0, fpr.R(c));
if (round_input)
Force25BitPrecision(XMM0, R(XMM0), XMM1);
break;
case 15:
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
if (round_input)
Force25BitPrecision(XMM0, R(XMM0), XMM1);
break;
default:
bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::g_want_determinism);
X64Reg tmp1 = special ? XMM1 : XMM0;
X64Reg tmp2 = special ? XMM0 : XMM1;
if (single && round_input)
Force25BitPrecision(tmp1, fpr.R(c), tmp2);
else
MOVAPD(tmp1, fpr.R(c));
break;
}
// While we don't know if any games are actually affected (replays seem to work with all the usual // While we don't know if any games are actually affected (replays seem to work with all the usual
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
// be extra careful and don't use FMA, even if in theory it might be okay. // be extra careful and don't use FMA, even if in theory it might be okay.
@ -121,10 +146,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
// instances on different computers giving identical results. // instances on different computers giving identical results.
if (cpu_info.bFMA && !Core::g_want_determinism) if (cpu_info.bFMA && !Core::g_want_determinism)
{ {
if (single && round_input)
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
else
MOVAPD(XMM0, fpr.R(c));
// Statistics suggests b is a lot less likely to be unbound in practice, so // Statistics suggests b is a lot less likely to be unbound in practice, so
// if we have to pick one of a or b to bind, let's make it b. // if we have to pick one of a or b to bind, let's make it b.
fpr.BindToRegister(b, true, false); fpr.BindToRegister(b, true, false);
@ -136,6 +157,8 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
else else
VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
break; break;
case 14: //madds0
case 15: //madds1
case 29: //madd case 29: //madd
if (packed) if (packed)
VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
@ -162,11 +185,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
} }
else if (inst.SUBOP5 == 30) //nmsub else if (inst.SUBOP5 == 30) //nmsub
{ {
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately.
if (single && round_input)
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
else
MOVAPD(XMM1, fpr.R(c));
MOVAPD(XMM0, fpr.R(b)); MOVAPD(XMM0, fpr.R(b));
if (packed) if (packed)
{ {
@ -181,16 +200,12 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
} }
else else
{ {
if (single && round_input)
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
else
MOVAPD(XMM0, fpr.R(c));
if (packed) if (packed)
{ {
MULPD(XMM0, fpr.R(a)); MULPD(XMM0, fpr.R(a));
if (inst.SUBOP5 == 28) //msub if (inst.SUBOP5 == 28) //msub
SUBPD(XMM0, fpr.R(b)); SUBPD(XMM0, fpr.R(b));
else //(n)madd else //(n)madd(s[01])
ADDPD(XMM0, fpr.R(b)); ADDPD(XMM0, fpr.R(b));
} }
else else

View File

@ -224,100 +224,6 @@ void Jit64::ps_res(UGeckoInstruction inst)
gpr.UnlockAllX(); gpr.UnlockAllX();
} }
//TODO: add optimized cases
// Emits x86 code for the paired-single multiply-add family selected by
// inst.SUBOP5: 14 (madds0), 15 (madds1), 28 (msub), 29 (madd), 30 (nmsub)
// and 31 (nmadd). With a/b/c/d being the FA/FB/FC/FD register fields, both
// lanes of d receive:
//   madd / madds0 / madds1 :   a * c' + b
//   msub                   :   a * c' - b
//   nmsub                  : -(a * c' - b)
//   nmadd                  : -(a * c' + b)
// where c' is c itself, or one element of c copied into both lanes for
// madds0 (low element) and madds1 (high element).
// The product/sum is built in XMM0 (XMM1 is handed to Force25BitPrecision as
// a temporary) and is then stored to d through ForceSinglePrecision.
void Jit64::ps_maddXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITPairedOff);
// Record-form (Rc set) encodings are not handled here.
FALLBACK_IF(inst.Rc);
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
int d = inst.FD;
// Fused multiply-add instructions are only emitted when the host CPU reports
// FMA support and deterministic execution has not been requested.
bool fma = cpu_info.bFMA && !Core::g_want_determinism;
// c only needs the 25-bit rounding step when the analysis has not already
// flagged it as holding a single-precision value.
bool round_input = !jit->js.op->fprIsSingle[c];
fpr.Lock(a, b, c, d);
// The FMA forms below address b via fpr.RX(b), so b is bound to a host
// register up front.
// NOTE(review): the (true, false) arguments presumably mean "load, don't
// mark dirty" — confirm against the register cache API.
if (fma)
fpr.BindToRegister(b, true, false);
// Stage the multiplier (c, or a broadcast element of c) in XMM0.
if (inst.SUBOP5 == 14)
{
// madds0: MOVDDUP copies the low double of c into both lanes of XMM0.
MOVDDUP(XMM0, fpr.R(c));
if (round_input)
Force25BitPrecision(XMM0, R(XMM0), XMM1);
}
else if (inst.SUBOP5 == 15)
{
// madds1: shuffling c with itself and immediate 3 selects the high double
// for both lanes. avx_op is given both the VEX and the legacy SSE encoding;
// presumably it picks whichever the host supports.
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
if (round_input)
Force25BitPrecision(XMM0, R(XMM0), XMM1);
}
else
{
// Plain (n)madd / (n)msub: use both lanes of c as they are.
if (round_input)
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
else
MOVAPD(XMM0, fpr.R(c));
}
if (fma)
{
// The "132" operand order computes XMM0 = (+/-)(XMM0 * a) (+/-) b, i.e. the
// staged c value is multiplied by a and b is the addend.
// Note: this switch has no default case, unlike the non-FMA switch below.
switch (inst.SUBOP5)
{
case 14: //madds0
case 15: //madds1
case 29: //madd
VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
break;
case 28: //msub
VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
break;
case 30: //nmsub
// -(c * a) + b, which equals -(a * c - b)
VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
break;
case 31: //nmadd
// -(c * a) - b, which equals -(a * c + b)
VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
break;
}
}
else
{
// Non-fused path: separate multiply and add/subtract; the negated variants
// finish by XORing with psSignBits (presumably a mask holding the sign bit
// of each lane — confirm at its definition).
switch (inst.SUBOP5)
{
case 14: //madds0
case 15: //madds1
case 29: //madd
MULPD(XMM0, fpr.R(a));
ADDPD(XMM0, fpr.R(b));
break;
case 28: //msub
MULPD(XMM0, fpr.R(a));
SUBPD(XMM0, fpr.R(b));
break;
case 30: //nmsub
MULPD(XMM0, fpr.R(a));
SUBPD(XMM0, fpr.R(b));
PXOR(XMM0, M(psSignBits));
break;
case 31: //nmadd
MULPD(XMM0, fpr.R(a));
ADDPD(XMM0, fpr.R(b));
PXOR(XMM0, M(psSignBits));
break;
default:
// NOTE(review): this early return skips the fpr.UnlockAll() performed on
// the normal exit path below — confirm whether that matters for an
// unexpected sub-opcode.
_assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!");
return;
}
}
// Write the result to d. NOTE(review): the `false` argument presumably
// requests binding without loading d's old value — confirm.
fpr.BindToRegister(d, false);
ForceSinglePrecision(fpr.RX(d), R(XMM0));
SetFPRFIfNeeded(fpr.RX(d));
fpr.UnlockAll();
}
void Jit64::ps_cmpXX(UGeckoInstruction inst) void Jit64::ps_cmpXX(UGeckoInstruction inst)
{ {
INSTRUCTION_START INSTRUCTION_START