JIT: some paired singles optimizations
This commit is contained in:
parent
68b2d86daf
commit
34287b8042
|
@ -3,19 +3,13 @@
|
||||||
// Refer to the license.txt file included.
|
// Refer to the license.txt file included.
|
||||||
|
|
||||||
#include "Common/CommonTypes.h"
|
#include "Common/CommonTypes.h"
|
||||||
|
#include "Common/CPUDetect.h"
|
||||||
|
|
||||||
#include "Core/PowerPC/Jit64/Jit.h"
|
#include "Core/PowerPC/Jit64/Jit.h"
|
||||||
#include "Core/PowerPC/Jit64/JitRegCache.h"
|
#include "Core/PowerPC/Jit64/JitRegCache.h"
|
||||||
|
|
||||||
using namespace Gen;
|
using namespace Gen;
|
||||||
|
|
||||||
// TODO
|
|
||||||
// ps_madds0
|
|
||||||
// ps_muls0
|
|
||||||
// ps_madds1
|
|
||||||
// cmppd, andpd, andnpd, or
|
|
||||||
// lfsx, ps_merge01 etc
|
|
||||||
|
|
||||||
static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
|
static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
|
||||||
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
|
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
|
||||||
|
|
||||||
|
@ -36,9 +30,6 @@ void Jit64::ps_mr(UGeckoInstruction inst)
|
||||||
|
|
||||||
void Jit64::ps_sel(UGeckoInstruction inst)
|
void Jit64::ps_sel(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
// we can't use (V)BLENDVPD here because it just looks at the sign bit
|
|
||||||
// but we need -0 = +0
|
|
||||||
|
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
JITDISABLE(bJITPairedOff);
|
JITDISABLE(bJITPairedOff);
|
||||||
FALLBACK_IF(inst.Rc);
|
FALLBACK_IF(inst.Rc);
|
||||||
|
@ -49,16 +40,26 @@ void Jit64::ps_sel(UGeckoInstruction inst)
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
|
|
||||||
fpr.Lock(a, b, c, d);
|
fpr.Lock(a, b, c, d);
|
||||||
MOVAPD(XMM0, fpr.R(a));
|
|
||||||
PXOR(XMM1, R(XMM1));
|
if (cpu_info.bSSE4_1)
|
||||||
// XMM0 = XMM0 < 0 ? all 1s : all 0s
|
{
|
||||||
CMPPD(XMM0, R(XMM1), LT);
|
PXOR(XMM0, R(XMM0));
|
||||||
MOVAPD(XMM1, R(XMM0));
|
CMPPD(XMM0, fpr.R(a), LT); // XMM0 = (0 < a) ? all 1s : all 0s
|
||||||
PAND(XMM0, fpr.R(b));
|
MOVAPD(XMM1, fpr.R(b));
|
||||||
PANDN(XMM1, fpr.R(c));
|
BLENDVPD(XMM1, fpr.R(c));
|
||||||
POR(XMM0, R(XMM1));
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MOVAPD(XMM1, fpr.R(a));
|
||||||
|
PXOR(XMM0, R(XMM0));
|
||||||
|
CMPPD(XMM1, R(XMM0), LT); // XMM1 = (a < 0) ? all 1s : all 0s
|
||||||
|
MOVAPD(XMM0, R(XMM1));
|
||||||
|
PAND(XMM1, fpr.R(b));
|
||||||
|
PANDN(XMM0, fpr.R(c));
|
||||||
|
POR(XMM1, R(XMM0));
|
||||||
|
}
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, false);
|
||||||
MOVAPD(fpr.RX(d), R(XMM0));
|
MOVAPD(fpr.RX(d), R(XMM1));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,20 +99,6 @@ void Jit64::ps_sign(UGeckoInstruction inst)
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
//add a, b, c
|
|
||||||
|
|
||||||
//mov a, b
|
|
||||||
//add a, c
|
|
||||||
//we need:
|
|
||||||
/*
|
|
||||||
psq_l
|
|
||||||
psq_stu
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
add a,b,a
|
|
||||||
*/
|
|
||||||
|
|
||||||
//There's still a little bit more optimization that can be squeezed out of this
|
//There's still a little bit more optimization that can be squeezed out of this
|
||||||
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
|
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
|
||||||
{
|
{
|
||||||
|
@ -152,7 +139,7 @@ void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X6
|
||||||
MOVAPD(XMM0, fpr.R(b));
|
MOVAPD(XMM0, fpr.R(b));
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, false);
|
||||||
MOVAPD(fpr.RX(d), fpr.R(a));
|
MOVAPD(fpr.RX(d), fpr.R(a));
|
||||||
(this->*op)(fpr.RX(d), Gen::R(XMM0));
|
(this->*op)(fpr.RX(d), R(XMM0));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -204,32 +191,26 @@ void Jit64::ps_sum(UGeckoInstruction inst)
|
||||||
int b = inst.FB;
|
int b = inst.FB;
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
fpr.Lock(a,b,c,d);
|
fpr.Lock(a,b,c,d);
|
||||||
fpr.BindToRegister(d, d == a || d == b || d == c, true);
|
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 10:
|
case 10:
|
||||||
// ps_sum0, do the sum in upper subregisters, merge uppers
|
MOVDDUP(XMM0, fpr.R(a)); // {a.ps0, a.ps0}
|
||||||
MOVDDUP(XMM0, fpr.R(a));
|
ADDPD(XMM0, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1}
|
||||||
MOVAPD(XMM1, fpr.R(b));
|
UNPCKHPD(XMM0, fpr.R(c)); // {a.ps0 + b.ps1, c.ps1}
|
||||||
ADDPD(XMM0, R(XMM1));
|
|
||||||
UNPCKHPD(XMM0, fpr.R(c)); //merge
|
|
||||||
MOVAPD(fpr.R(d), XMM0);
|
|
||||||
break;
|
break;
|
||||||
case 11:
|
case 11:
|
||||||
// ps_sum1, do the sum in lower subregisters, merge lowers
|
MOVDDUP(XMM1, fpr.R(a)); // {a.ps0, a.ps0}
|
||||||
MOVAPD(XMM0, fpr.R(a));
|
ADDPD(XMM1, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1}
|
||||||
MOVAPD(XMM1, fpr.R(b));
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
SHUFPD(XMM1, R(XMM1), 5); // copy higher to lower
|
SHUFPD(XMM0, R(XMM1), 2); // {c.ps0, a.ps0 + b.ps1}
|
||||||
ADDPD(XMM0, R(XMM1)); // sum lowers
|
|
||||||
MOVAPD(XMM1, fpr.R(c));
|
|
||||||
UNPCKLPD(XMM1, R(XMM0)); // merge
|
|
||||||
MOVAPD(fpr.R(d), XMM1);
|
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
PanicAlert("ps_sum WTF!!!");
|
PanicAlert("ps_sum WTF!!!");
|
||||||
}
|
}
|
||||||
ForceSinglePrecisionP(fpr.RX(d));
|
ForceSinglePrecisionP(XMM0);
|
||||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
SetFPRFIfNeeded(inst, XMM0);
|
||||||
|
fpr.BindToRegister(d, false);
|
||||||
|
MOVAPD(fpr.RX(d), R(XMM0));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -244,37 +225,28 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
||||||
int a = inst.FA;
|
int a = inst.FA;
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
fpr.Lock(a, c, d);
|
fpr.Lock(a, c, d);
|
||||||
fpr.BindToRegister(d, d == a || d == c, true);
|
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 12:
|
case 12:
|
||||||
// Single multiply scalar high
|
MOVDDUP(XMM0, fpr.R(c));
|
||||||
// TODO - faster version for when regs are different
|
|
||||||
MOVDDUP(XMM1, fpr.R(c));
|
|
||||||
Force25BitPrecision(XMM1, XMM0);
|
|
||||||
MOVAPD(XMM0, fpr.R(a));
|
|
||||||
MULPD(XMM0, R(XMM1));
|
|
||||||
MOVAPD(fpr.R(d), XMM0);
|
|
||||||
break;
|
break;
|
||||||
case 13:
|
case 13:
|
||||||
// TODO - faster version for when regs are different
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
MOVAPD(XMM1, fpr.R(c));
|
SHUFPD(XMM0, R(XMM0), 3);
|
||||||
Force25BitPrecision(XMM1, XMM0);
|
|
||||||
MOVAPD(XMM0, fpr.R(a));
|
|
||||||
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
|
|
||||||
MULPD(XMM0, R(XMM1));
|
|
||||||
MOVAPD(fpr.R(d), XMM0);
|
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
PanicAlert("ps_muls WTF!!!");
|
PanicAlert("ps_muls WTF!!!");
|
||||||
}
|
}
|
||||||
ForceSinglePrecisionP(fpr.RX(d));
|
Force25BitPrecision(XMM0, XMM1);
|
||||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
MULPD(XMM0, fpr.R(a));
|
||||||
|
ForceSinglePrecisionP(XMM0);
|
||||||
|
SetFPRFIfNeeded(inst, XMM0);
|
||||||
|
fpr.BindToRegister(d, false);
|
||||||
|
MOVAPD(fpr.RX(d), R(XMM0));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//TODO: find easy cases and optimize them, do a breakout like ps_arith
|
|
||||||
void Jit64::ps_mergeXX(UGeckoInstruction inst)
|
void Jit64::ps_mergeXX(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
|
@ -305,7 +277,7 @@ void Jit64::ps_mergeXX(UGeckoInstruction inst)
|
||||||
_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
|
_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
|
||||||
}
|
}
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, false);
|
||||||
MOVAPD(fpr.RX(d), Gen::R(XMM0));
|
MOVAPD(fpr.RX(d), R(XMM0));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -373,8 +345,8 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, false);
|
||||||
MOVAPD(fpr.RX(d), Gen::R(XMM0));
|
ForceSinglePrecisionP(XMM0);
|
||||||
ForceSinglePrecisionP(fpr.RX(d));
|
SetFPRFIfNeeded(inst, XMM0);
|
||||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
MOVAPD(fpr.RX(d), R(XMM0));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue