JIT: various float optimizations

This commit is contained in:
Fiora 2014-07-27 19:19:01 -07:00
parent 34287b8042
commit 7b0f559ae1
1 changed files with 46 additions and 44 deletions

View File

@ -10,8 +10,8 @@
using namespace Gen; using namespace Gen;
static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL}; static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL}; static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000}; static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS) void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS)
@ -77,17 +77,8 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X
if (single) if (single)
{ {
ForceSinglePrecisionS(fpr.RX(d)); ForceSinglePrecisionS(fpr.RX(d));
if (cpu_info.bSSE3)
{
MOVDDUP(fpr.RX(d), fpr.R(d)); MOVDDUP(fpr.RX(d), fpr.R(d));
} }
else
{
if (!fpr.R(d).IsSimpleReg(fpr.RX(d)))
MOVQ_xmm(fpr.RX(d), fpr.R(d));
UNPCKLPD(fpr.RX(d), R(fpr.RX(d)));
}
}
SetFPRFIfNeeded(inst, fpr.RX(d)); SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll(); fpr.UnlockAll();
} }
@ -136,29 +127,29 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
int d = inst.FD; int d = inst.FD;
fpr.Lock(a, b, c, d); fpr.Lock(a, b, c, d);
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
if (inst.SUBOP5 == 30) //nmsub
{
MOVSD(XMM1, fpr.R(c));
if (single_precision)
Force25BitPrecision(XMM1, XMM0);
MULSD(XMM1, fpr.R(a));
MOVSD(XMM0, fpr.R(b));
SUBSD(XMM0, R(XMM1));
}
else
{
MOVSD(XMM0, fpr.R(c)); MOVSD(XMM0, fpr.R(c));
if (single_precision) if (single_precision)
Force25BitPrecision(XMM0, XMM1); Force25BitPrecision(XMM0, XMM1);
switch (inst.SUBOP5)
{
case 28: //msub
MULSD(XMM0, fpr.R(a)); MULSD(XMM0, fpr.R(a));
if (inst.SUBOP5 == 28) //msub
SUBSD(XMM0, fpr.R(b)); SUBSD(XMM0, fpr.R(b));
break; else //(n)madd
case 29: //madd
MULSD(XMM0, fpr.R(a));
ADDSD(XMM0, fpr.R(b)); ADDSD(XMM0, fpr.R(b));
break; if (inst.SUBOP5 == 31) //nmadd
case 30: //nmsub
MULSD(XMM0, fpr.R(a));
SUBSD(XMM0, fpr.R(b));
PXOR(XMM0, M((void*)&psSignBits2)); PXOR(XMM0, M((void*)&psSignBits2));
break;
case 31: //nmadd
MULSD(XMM0, fpr.R(a));
ADDSD(XMM0, fpr.R(b));
PXOR(XMM0, M((void*)&psSignBits2));
break;
} }
fpr.BindToRegister(d, false); fpr.BindToRegister(d, false);
//YES it is necessary to dupe the result :( //YES it is necessary to dupe the result :(
@ -186,23 +177,26 @@ void Jit64::fsign(UGeckoInstruction inst)
int b = inst.FB; int b = inst.FB;
fpr.Lock(b, d); fpr.Lock(b, d);
fpr.BindToRegister(d, true, true); fpr.BindToRegister(d, true, true);
MOVSD(XMM0, fpr.R(b));
if (d != b)
MOVSD(fpr.RX(d), fpr.R(b));
switch (inst.SUBOP10) switch (inst.SUBOP10)
{ {
case 40: // fnegx case 40: // fnegx
PXOR(XMM0, M((void*)&psSignBits2)); // We can cheat and not worry about clobbering the top half by using masks
// that don't modify the top half.
PXOR(fpr.RX(d), M((void*)&psSignBits2));
break; break;
case 264: // fabsx case 264: // fabsx
PAND(XMM0, M((void*)&psAbsMask2)); PAND(fpr.RX(d), M((void*)&psAbsMask2));
break; break;
case 136: // fnabs case 136: // fnabs
POR(XMM0, M((void*)&psSignBits2)); POR(fpr.RX(d), M((void*)&psSignBits2));
break; break;
default: default:
PanicAlert("fsign bleh"); PanicAlert("fsign bleh");
break; break;
} }
MOVSD(fpr.R(d), XMM0);
fpr.UnlockAll(); fpr.UnlockAll();
} }
@ -220,14 +214,22 @@ void Jit64::fmrx(UGeckoInstruction inst)
fpr.Lock(b, d); fpr.Lock(b, d);
// We don't need to load d, but if it is loaded, we need to mark it as dirty.
if (fpr.IsBound(d)) if (fpr.IsBound(d))
{
// We don't need to load d, but if it is loaded, we need to mark it as dirty.
fpr.BindToRegister(d); fpr.BindToRegister(d);
// We have to use MOVLPD if b isn't loaded because "MOVSD reg, mem" sets the upper bits (64+)
// b needs to be in a register because "MOVSD reg, mem" sets the upper bits (64+) to zero and we don't want that. // to zero and we don't want that.
fpr.BindToRegister(b, true, false); if (!fpr.R(b).IsSimpleReg())
MOVLPD(fpr.RX(d), fpr.R(b));
else
MOVSD(fpr.R(d), fpr.RX(b)); MOVSD(fpr.R(d), fpr.RX(b));
}
else
{
fpr.BindToRegister(b, true, false);
MOVSD(fpr.R(d), fpr.RX(b));
}
fpr.UnlockAll(); fpr.UnlockAll();
} }