JIT: some paired singles optimizations

This commit is contained in:
Fiora 2014-07-26 23:32:02 -07:00
parent 68b2d86daf
commit 34287b8042
1 changed file with 45 additions and 73 deletions

View File

@ -3,19 +3,13 @@
// Refer to the license.txt file included. // Refer to the license.txt file included.
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
#include "Common/CPUDetect.h"
#include "Core/PowerPC/Jit64/Jit.h" #include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64/JitRegCache.h" #include "Core/PowerPC/Jit64/JitRegCache.h"
using namespace Gen; using namespace Gen;
// TODO
// ps_madds0
// ps_muls0
// ps_madds1
// cmppd, andpd, andnpd, or
// lfsx, ps_merge01 etc
static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL}; static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL}; static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
@ -36,9 +30,6 @@ void Jit64::ps_mr(UGeckoInstruction inst)
void Jit64::ps_sel(UGeckoInstruction inst) void Jit64::ps_sel(UGeckoInstruction inst)
{ {
// we can't use (V)BLENDVPD here because it just looks at the sign bit
// but we need -0 = +0
INSTRUCTION_START INSTRUCTION_START
JITDISABLE(bJITPairedOff); JITDISABLE(bJITPairedOff);
FALLBACK_IF(inst.Rc); FALLBACK_IF(inst.Rc);
@ -49,16 +40,26 @@ void Jit64::ps_sel(UGeckoInstruction inst)
int c = inst.FC; int c = inst.FC;
fpr.Lock(a, b, c, d); fpr.Lock(a, b, c, d);
MOVAPD(XMM0, fpr.R(a));
PXOR(XMM1, R(XMM1)); if (cpu_info.bSSE4_1)
// XMM0 = XMM0 < 0 ? all 1s : all 0s {
CMPPD(XMM0, R(XMM1), LT); PXOR(XMM0, R(XMM0));
MOVAPD(XMM1, R(XMM0)); CMPPD(XMM0, fpr.R(a), LT); // XMM0 = 0 < a ? all 1s : all 0s (NOTE(review): ps_sel selects on a >= 0; LT excludes a == 0 - confirm)
PAND(XMM0, fpr.R(b)); MOVAPD(XMM1, fpr.R(b));
PANDN(XMM1, fpr.R(c)); BLENDVPD(XMM1, fpr.R(c));
POR(XMM0, R(XMM1)); }
else
{
MOVAPD(XMM1, fpr.R(a));
PXOR(XMM0, R(XMM0));
CMPPD(XMM1, R(XMM0), LT); // XMM1 = XMM1 < 0 ? all 1s : all 0s
MOVAPD(XMM0, R(XMM1));
PAND(XMM1, fpr.R(b));
PANDN(XMM0, fpr.R(c));
POR(XMM1, R(XMM0));
}
fpr.BindToRegister(d, false); fpr.BindToRegister(d, false);
MOVAPD(fpr.RX(d), R(XMM0)); MOVAPD(fpr.RX(d), R(XMM1));
fpr.UnlockAll(); fpr.UnlockAll();
} }
@ -98,20 +99,6 @@ void Jit64::ps_sign(UGeckoInstruction inst)
fpr.UnlockAll(); fpr.UnlockAll();
} }
//add a, b, c
//mov a, b
//add a, c
//we need:
/*
psq_l
psq_stu
*/
/*
add a,b,a
*/
//There's still a little bit more optimization that can be squeezed out of this //There's still a little bit more optimization that can be squeezed out of this
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS) void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
{ {
@ -152,7 +139,7 @@ void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X6
MOVAPD(XMM0, fpr.R(b)); MOVAPD(XMM0, fpr.R(b));
fpr.BindToRegister(d, false); fpr.BindToRegister(d, false);
MOVAPD(fpr.RX(d), fpr.R(a)); MOVAPD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), Gen::R(XMM0)); (this->*op)(fpr.RX(d), R(XMM0));
} }
} }
else else
@ -204,32 +191,26 @@ void Jit64::ps_sum(UGeckoInstruction inst)
int b = inst.FB; int b = inst.FB;
int c = inst.FC; int c = inst.FC;
fpr.Lock(a,b,c,d); fpr.Lock(a,b,c,d);
fpr.BindToRegister(d, d == a || d == b || d == c, true);
switch (inst.SUBOP5) switch (inst.SUBOP5)
{ {
case 10: case 10:
// ps_sum0, do the sum in upper subregisters, merge uppers MOVDDUP(XMM0, fpr.R(a)); // {a.ps0, a.ps0}
MOVDDUP(XMM0, fpr.R(a)); ADDPD(XMM0, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1}
MOVAPD(XMM1, fpr.R(b)); UNPCKHPD(XMM0, fpr.R(c)); // {a.ps0 + b.ps1, c.ps1}
ADDPD(XMM0, R(XMM1));
UNPCKHPD(XMM0, fpr.R(c)); //merge
MOVAPD(fpr.R(d), XMM0);
break; break;
case 11: case 11:
// ps_sum1, do the sum in lower subregisters, merge lowers MOVDDUP(XMM1, fpr.R(a)); // {a.ps0, a.ps0}
MOVAPD(XMM0, fpr.R(a)); ADDPD(XMM1, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1}
MOVAPD(XMM1, fpr.R(b)); MOVAPD(XMM0, fpr.R(c));
SHUFPD(XMM1, R(XMM1), 5); // copy higher to lower SHUFPD(XMM0, R(XMM1), 2); // {c.ps0, a.ps0 + b.ps1}
ADDPD(XMM0, R(XMM1)); // sum lowers
MOVAPD(XMM1, fpr.R(c));
UNPCKLPD(XMM1, R(XMM0)); // merge
MOVAPD(fpr.R(d), XMM1);
break; break;
default: default:
PanicAlert("ps_sum WTF!!!"); PanicAlert("ps_sum WTF!!!");
} }
ForceSinglePrecisionP(fpr.RX(d)); ForceSinglePrecisionP(XMM0);
SetFPRFIfNeeded(inst, fpr.RX(d)); SetFPRFIfNeeded(inst, XMM0);
fpr.BindToRegister(d, false);
MOVAPD(fpr.RX(d), R(XMM0));
fpr.UnlockAll(); fpr.UnlockAll();
} }
@ -244,37 +225,28 @@ void Jit64::ps_muls(UGeckoInstruction inst)
int a = inst.FA; int a = inst.FA;
int c = inst.FC; int c = inst.FC;
fpr.Lock(a, c, d); fpr.Lock(a, c, d);
fpr.BindToRegister(d, d == a || d == c, true);
switch (inst.SUBOP5) switch (inst.SUBOP5)
{ {
case 12: case 12:
// Single multiply scalar high MOVDDUP(XMM0, fpr.R(c));
// TODO - faster version for when regs are different
MOVDDUP(XMM1, fpr.R(c));
Force25BitPrecision(XMM1, XMM0);
MOVAPD(XMM0, fpr.R(a));
MULPD(XMM0, R(XMM1));
MOVAPD(fpr.R(d), XMM0);
break; break;
case 13: case 13:
// TODO - faster version for when regs are different MOVAPD(XMM0, fpr.R(c));
MOVAPD(XMM1, fpr.R(c)); SHUFPD(XMM0, R(XMM0), 3);
Force25BitPrecision(XMM1, XMM0);
MOVAPD(XMM0, fpr.R(a));
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
MULPD(XMM0, R(XMM1));
MOVAPD(fpr.R(d), XMM0);
break; break;
default: default:
PanicAlert("ps_muls WTF!!!"); PanicAlert("ps_muls WTF!!!");
} }
ForceSinglePrecisionP(fpr.RX(d)); Force25BitPrecision(XMM0, XMM1);
SetFPRFIfNeeded(inst, fpr.RX(d)); MULPD(XMM0, fpr.R(a));
ForceSinglePrecisionP(XMM0);
SetFPRFIfNeeded(inst, XMM0);
fpr.BindToRegister(d, false);
MOVAPD(fpr.RX(d), R(XMM0));
fpr.UnlockAll(); fpr.UnlockAll();
} }
//TODO: find easy cases and optimize them, do a breakout like ps_arith
void Jit64::ps_mergeXX(UGeckoInstruction inst) void Jit64::ps_mergeXX(UGeckoInstruction inst)
{ {
INSTRUCTION_START INSTRUCTION_START
@ -305,7 +277,7 @@ void Jit64::ps_mergeXX(UGeckoInstruction inst)
_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op"); _assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
} }
fpr.BindToRegister(d, false); fpr.BindToRegister(d, false);
MOVAPD(fpr.RX(d), Gen::R(XMM0)); MOVAPD(fpr.RX(d), R(XMM0));
fpr.UnlockAll(); fpr.UnlockAll();
} }
@ -373,8 +345,8 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
return; return;
} }
fpr.BindToRegister(d, false); fpr.BindToRegister(d, false);
MOVAPD(fpr.RX(d), Gen::R(XMM0)); ForceSinglePrecisionP(XMM0);
ForceSinglePrecisionP(fpr.RX(d)); SetFPRFIfNeeded(inst, XMM0);
SetFPRFIfNeeded(inst, fpr.RX(d)); MOVAPD(fpr.RX(d), R(XMM0));
fpr.UnlockAll(); fpr.UnlockAll();
} }