Merge pull request #2502 from Tilka/xmm_swap
Jit64: swap XMM registers for later optimization
This commit is contained in:
commit
0463f61499
|
@ -119,19 +119,19 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
switch(inst.SUBOP5)
|
switch(inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 14:
|
case 14:
|
||||||
MOVDDUP(XMM0, fpr.R(c));
|
MOVDDUP(XMM1, fpr.R(c));
|
||||||
if (round_input)
|
if (round_input)
|
||||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
||||||
break;
|
break;
|
||||||
case 15:
|
case 15:
|
||||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
|
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, fpr.R(c), fpr.R(c), 3);
|
||||||
if (round_input)
|
if (round_input)
|
||||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::g_want_determinism);
|
bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::g_want_determinism);
|
||||||
X64Reg tmp1 = special ? XMM1 : XMM0;
|
X64Reg tmp1 = special ? XMM0 : XMM1;
|
||||||
X64Reg tmp2 = special ? XMM0 : XMM1;
|
X64Reg tmp2 = special ? XMM1 : XMM0;
|
||||||
if (single && round_input)
|
if (single && round_input)
|
||||||
Force25BitPrecision(tmp1, fpr.R(c), tmp2);
|
Force25BitPrecision(tmp1, fpr.R(c), tmp2);
|
||||||
else
|
else
|
||||||
|
@ -154,17 +154,17 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
case 28: //msub
|
case 28: //msub
|
||||||
if (packed)
|
if (packed)
|
||||||
VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
|
VFMSUB132PD(XMM1, fpr.RX(b), fpr.R(a));
|
||||||
else
|
else
|
||||||
VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
|
VFMSUB132SD(XMM1, fpr.RX(b), fpr.R(a));
|
||||||
break;
|
break;
|
||||||
case 14: //madds0
|
case 14: //madds0
|
||||||
case 15: //madds1
|
case 15: //madds1
|
||||||
case 29: //madd
|
case 29: //madd
|
||||||
if (packed)
|
if (packed)
|
||||||
VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
|
VFMADD132PD(XMM1, fpr.RX(b), fpr.R(a));
|
||||||
else
|
else
|
||||||
VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
|
VFMADD132SD(XMM1, fpr.RX(b), fpr.R(a));
|
||||||
break;
|
break;
|
||||||
// PowerPC and x86 define NMADD/NMSUB differently
|
// PowerPC and x86 define NMADD/NMSUB differently
|
||||||
// x86: D = -A*C (+/-) B
|
// x86: D = -A*C (+/-) B
|
||||||
|
@ -172,61 +172,61 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
// so we have to swap them; the ADD/SUB here isn't a typo.
|
// so we have to swap them; the ADD/SUB here isn't a typo.
|
||||||
case 30: //nmsub
|
case 30: //nmsub
|
||||||
if (packed)
|
if (packed)
|
||||||
VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
|
VFNMADD132PD(XMM1, fpr.RX(b), fpr.R(a));
|
||||||
else
|
else
|
||||||
VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
|
VFNMADD132SD(XMM1, fpr.RX(b), fpr.R(a));
|
||||||
break;
|
break;
|
||||||
case 31: //nmadd
|
case 31: //nmadd
|
||||||
if (packed)
|
if (packed)
|
||||||
VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
|
VFNMSUB132PD(XMM1, fpr.RX(b), fpr.R(a));
|
||||||
else
|
else
|
||||||
VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
|
VFNMSUB132SD(XMM1, fpr.RX(b), fpr.R(a));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (inst.SUBOP5 == 30) //nmsub
|
else if (inst.SUBOP5 == 30) //nmsub
|
||||||
{
|
{
|
||||||
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately.
|
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately.
|
||||||
MOVAPD(XMM0, fpr.R(b));
|
MOVAPD(XMM1, fpr.R(b));
|
||||||
if (packed)
|
|
||||||
{
|
|
||||||
MULPD(XMM1, fpr.R(a));
|
|
||||||
SUBPD(XMM0, R(XMM1));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
MULSD(XMM1, fpr.R(a));
|
|
||||||
SUBSD(XMM0, R(XMM1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (packed)
|
if (packed)
|
||||||
{
|
{
|
||||||
MULPD(XMM0, fpr.R(a));
|
MULPD(XMM0, fpr.R(a));
|
||||||
if (inst.SUBOP5 == 28) //msub
|
SUBPD(XMM1, R(XMM0));
|
||||||
SUBPD(XMM0, fpr.R(b));
|
|
||||||
else //(n)madd(s[01])
|
|
||||||
ADDPD(XMM0, fpr.R(b));
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
MULSD(XMM0, fpr.R(a));
|
MULSD(XMM0, fpr.R(a));
|
||||||
if (inst.SUBOP5 == 28)
|
SUBSD(XMM1, R(XMM0));
|
||||||
SUBSD(XMM0, fpr.R(b));
|
}
|
||||||
|
}
|
||||||
else
|
else
|
||||||
ADDSD(XMM0, fpr.R(b));
|
{
|
||||||
|
if (packed)
|
||||||
|
{
|
||||||
|
MULPD(XMM1, fpr.R(a));
|
||||||
|
if (inst.SUBOP5 == 28) //msub
|
||||||
|
SUBPD(XMM1, fpr.R(b));
|
||||||
|
else //(n)madd(s[01])
|
||||||
|
ADDPD(XMM1, fpr.R(b));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MULSD(XMM1, fpr.R(a));
|
||||||
|
if (inst.SUBOP5 == 28)
|
||||||
|
SUBSD(XMM1, fpr.R(b));
|
||||||
|
else
|
||||||
|
ADDSD(XMM1, fpr.R(b));
|
||||||
}
|
}
|
||||||
if (inst.SUBOP5 == 31) //nmadd
|
if (inst.SUBOP5 == 31) //nmadd
|
||||||
PXOR(XMM0, M(packed ? psSignBits2 : psSignBits));
|
PXOR(XMM1, M(packed ? psSignBits2 : psSignBits));
|
||||||
}
|
}
|
||||||
|
|
||||||
fpr.BindToRegister(d, !single);
|
fpr.BindToRegister(d, !single);
|
||||||
|
|
||||||
if (single)
|
if (single)
|
||||||
ForceSinglePrecision(fpr.RX(d), R(XMM0), packed, true);
|
ForceSinglePrecision(fpr.RX(d), R(XMM1), packed, true);
|
||||||
else
|
else
|
||||||
MOVSD(fpr.RX(d), R(XMM0));
|
MOVSD(fpr.RX(d), R(XMM1));
|
||||||
SetFPRFIfNeeded(fpr.RX(d));
|
SetFPRFIfNeeded(fpr.RX(d));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue