JIT: add more AVX support, reduce redundant XMM moves
Typically reduces compiled block size by 10-20% for float-heavy JIT code.
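The savings come from AVX's three-operand VEX encodings: legacy SSE arithmetic is destructive (dst = dst op src), so preserving a source previously cost an extra MOVAPD/MOVSD before almost every op. A minimal sketch of the difference, using emitter calls as they appear in this diff (illustrative only, not part of the commit):

	// SSE: two-operand, destructive - needs a copy first
	MOVAPD(XMM1, fpr.R(a));             // XMM1 = a (extra move)
	ADDPD(XMM1, fpr.R(b));              // XMM1 = a + b
	// AVX: three-operand - separate destination, no copy needed
	VADDPD(XMM1, fpr.RX(a), fpr.R(b));  // XMM1 = a + b in one instruction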
parent 8fe730194b
commit bf014636c8
@@ -140,10 +140,13 @@ public:
 	void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
-	void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
+	void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
+	            void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
 	typedef u32 (*Operation)(u32 a, u32 b);
-	void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
-	void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
+	void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
+	              bool Rc = false, bool carry = false);
+	void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
+	               void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
 	void FloatCompare(UGeckoInstruction inst, bool upper = false);

 	// OPCODES
@@ -14,65 +14,27 @@ static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000
 static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
 static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};

-void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS)
+void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg),
+                      void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
 {
 	fpr.Lock(d, a, b);
+	fpr.BindToRegister(d, d == a || d == b || !single);
 	if (roundRHS)
 	{
 		if (d == a)
 		{
-			fpr.BindToRegister(d, true);
-			MOVSD(XMM0, fpr.R(b));
-			Force25BitPrecision(XMM0, XMM1);
-			(this->*op)(fpr.RX(d), R(XMM0));
+			Force25BitPrecision(XMM0, fpr.R(b), XMM1);
+			(this->*sseOp)(fpr.RX(d), R(XMM0));
 		}
 		else
 		{
-			fpr.BindToRegister(d, d == b);
-			if (d != b)
-				MOVSD(fpr.RX(d), fpr.R(b));
-			Force25BitPrecision(fpr.RX(d), XMM0);
-			(this->*op)(fpr.RX(d), fpr.R(a));
-		}
-	}
-	else if (d == a)
-	{
-		fpr.BindToRegister(d, true);
-		if (!single)
-		{
-			fpr.BindToRegister(b, true, false);
-		}
-		(this->*op)(fpr.RX(d), fpr.R(b));
-	}
-	else if (d == b)
-	{
-		if (reversible)
-		{
-			fpr.BindToRegister(d, true);
-			if (!single)
-			{
-				fpr.BindToRegister(a, true, false);
-			}
-			(this->*op)(fpr.RX(d), fpr.R(a));
-		}
-		else
-		{
-			MOVSD(XMM0, fpr.R(b));
-			fpr.BindToRegister(d, !single);
-			MOVSD(fpr.RX(d), fpr.R(a));
-			(this->*op)(fpr.RX(d), Gen::R(XMM0));
+			Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0);
+			(this->*sseOp)(fpr.RX(d), fpr.R(a));
 		}
 	}
 	else
 	{
-		// Sources different from d, can use rather quick solution
-		fpr.BindToRegister(d, !single);
-		if (!single)
-		{
-			fpr.BindToRegister(b, true, false);
-		}
-		MOVSD(fpr.RX(d), fpr.R(a));
-		(this->*op)(fpr.RX(d), fpr.R(b));
+		avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), false, reversible);
 	}
 	if (single)
 	{
@@ -104,10 +66,10 @@ void Jit64::fp_arith(UGeckoInstruction inst)
 	bool single = inst.OPCD == 59;
 	switch (inst.SUBOP5)
 	{
-	case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD, inst); break; //div
-	case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD, inst); break; //sub
-	case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD, inst); break; //add
-	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, inst, single); break; //mul
+	case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div
+	case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub
+	case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add
+	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, single); break; //mul
 	default:
 		_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
 	}
@@ -131,18 +93,20 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 	// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
 	if (inst.SUBOP5 == 30) //nmsub
 	{
-		MOVSD(XMM1, fpr.R(c));
 		if (single_precision)
-			Force25BitPrecision(XMM1, XMM0);
+			Force25BitPrecision(XMM1, fpr.R(c), XMM0);
+		else
+			MOVSD(XMM1, fpr.R(c));
 		MULSD(XMM1, fpr.R(a));
 		MOVSD(XMM0, fpr.R(b));
 		SUBSD(XMM0, R(XMM1));
 	}
 	else
 	{
-		MOVSD(XMM0, fpr.R(c));
 		if (single_precision)
-			Force25BitPrecision(XMM0, XMM1);
+			Force25BitPrecision(XMM0, fpr.R(c), XMM1);
+		else
+			MOVSD(XMM0, fpr.R(c));
 		MULSD(XMM0, fpr.R(a));
 		if (inst.SUBOP5 == 28) //msub
 			SUBSD(XMM0, fpr.R(b));
@@ -43,17 +43,15 @@ void Jit64::ps_sel(UGeckoInstruction inst)

 	if (cpu_info.bSSE4_1)
 	{
-		MOVAPD(XMM1, fpr.R(a));
 		PXOR(XMM0, R(XMM0));
-		CMPPD(XMM0, R(XMM1), NLE);
+		CMPPD(XMM0, fpr.R(a), NLE);
 		MOVAPD(XMM1, fpr.R(c));
 		BLENDVPD(XMM1, fpr.R(b));
 	}
 	else
 	{
-		MOVAPD(XMM0, fpr.R(a));
 		PXOR(XMM1, R(XMM1));
-		CMPPD(XMM1, R(XMM0), NLE);
+		CMPPD(XMM1, fpr.R(a), NLE);
 		MOVAPD(XMM0, R(XMM1));
 		PAND(XMM1, fpr.R(b));
 		PANDN(XMM0, fpr.R(c));
@@ -74,26 +72,18 @@ void Jit64::ps_sign(UGeckoInstruction inst)
 	int b = inst.FB;

 	fpr.Lock(d, b);
-	if (d != b)
-	{
-		fpr.BindToRegister(d, false);
-		MOVAPD(fpr.RX(d), fpr.R(b));
-	}
-	else
-	{
-		fpr.BindToRegister(d, true);
-	}
+	fpr.BindToRegister(d, d == b);

 	switch (inst.SUBOP10)
 	{
 	case 40: //neg
-		PXOR(fpr.RX(d), M((void*)&psSignBits));
+		avx_op(&XEmitter::VPXOR, &XEmitter::PXOR, fpr.RX(d), fpr.R(b), M((void*)&psSignBits));
 		break;
 	case 136: //nabs
-		POR(fpr.RX(d), M((void*)&psSignBits));
+		avx_op(&XEmitter::VPOR, &XEmitter::POR, fpr.RX(d), fpr.R(b), M((void*)&psSignBits));
 		break;
 	case 264: //abs
-		PAND(fpr.RX(d), M((void*)&psAbsMask));
+		avx_op(&XEmitter::VPAND, &XEmitter::PAND, fpr.RX(d), fpr.R(b), M((void*)&psAbsMask));
 		break;
 	}
@@ -101,56 +91,29 @@ void Jit64::ps_sign(UGeckoInstruction inst)
 }

 //There's still a little bit more optimization that can be squeezed out of this
-void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
+void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
 {
 	fpr.Lock(d, a, b);
+	fpr.BindToRegister(d, d == a || d == b);

 	if (roundRHS)
 	{
 		if (d == a)
 		{
-			fpr.BindToRegister(d, true);
-			MOVAPD(XMM0, fpr.R(b));
-			Force25BitPrecision(XMM0, XMM1);
-			(this->*op)(fpr.RX(d), R(XMM0));
+			Force25BitPrecision(XMM0, fpr.R(b), XMM1);
+			(this->*sseOp)(fpr.RX(d), R(XMM0));
 		}
 		else
 		{
-			fpr.BindToRegister(d, d == b);
-			if (d != b)
-				MOVAPD(fpr.RX(d), fpr.R(b));
-			Force25BitPrecision(fpr.RX(d), XMM0);
-			(this->*op)(fpr.RX(d), fpr.R(a));
-		}
-	}
-	else if (d == a)
-	{
-		fpr.BindToRegister(d, true);
-		(this->*op)(fpr.RX(d), fpr.R(b));
-	}
-	else if (d == b)
-	{
-		if (reversible)
-		{
-			fpr.BindToRegister(d, true);
-			(this->*op)(fpr.RX(d), fpr.R(a));
-		}
-		else
-		{
-			MOVAPD(XMM0, fpr.R(b));
-			fpr.BindToRegister(d, false);
-			MOVAPD(fpr.RX(d), fpr.R(a));
-			(this->*op)(fpr.RX(d), R(XMM0));
+			Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0);
+			(this->*sseOp)(fpr.RX(d), fpr.R(a));
 		}
 	}
 	else
 	{
-		//sources different from d, can use rather quick solution
-		fpr.BindToRegister(d, false);
-		MOVAPD(fpr.RX(d), fpr.R(a));
-		(this->*op)(fpr.RX(d), fpr.R(b));
+		avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), true, reversible);
 	}
-	ForceSinglePrecisionP(fpr.RX(d));
+	ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
 	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
 }
@@ -164,16 +127,16 @@ void Jit64::ps_arith(UGeckoInstruction inst)
 	switch (inst.SUBOP5)
 	{
 	case 18: // div
-		tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD, inst);
+		tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VDIVPD, &XEmitter::DIVPD, inst);
 		break;
 	case 20: // sub
-		tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD, inst);
+		tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VSUBPD, &XEmitter::SUBPD, inst);
 		break;
 	case 21: // add
-		tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD, inst);
+		tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst);
 		break;
 	case 25: // mul
-		tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, inst, true);
+		tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, true);
 		break;
 	default:
 		_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
@@ -208,10 +171,9 @@ void Jit64::ps_sum(UGeckoInstruction inst)
 	default:
 		PanicAlert("ps_sum WTF!!!");
 	}
-	ForceSinglePrecisionP(XMM0);
-	SetFPRFIfNeeded(inst, XMM0);
 	fpr.BindToRegister(d, false);
-	MOVAPD(fpr.RX(d), R(XMM0));
+	ForceSinglePrecisionP(fpr.RX(d), XMM0);
+	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
 }
@@ -232,18 +194,16 @@ void Jit64::ps_muls(UGeckoInstruction inst)
 		MOVDDUP(XMM0, fpr.R(c));
 		break;
 	case 13:
-		MOVAPD(XMM0, fpr.R(c));
-		SHUFPD(XMM0, R(XMM0), 3);
+		avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
 		break;
 	default:
 		PanicAlert("ps_muls WTF!!!");
 	}
-	Force25BitPrecision(XMM0, XMM1);
+	Force25BitPrecision(XMM0, R(XMM0), XMM1);
 	MULPD(XMM0, fpr.R(a));
-	ForceSinglePrecisionP(XMM0);
-	SetFPRFIfNeeded(inst, XMM0);
 	fpr.BindToRegister(d, false);
-	MOVAPD(fpr.RX(d), R(XMM0));
+	ForceSinglePrecisionP(fpr.RX(d), XMM0);
+	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
 }
@@ -258,27 +218,25 @@ void Jit64::ps_mergeXX(UGeckoInstruction inst)
 	int a = inst.FA;
 	int b = inst.FB;
 	fpr.Lock(a,b,d);
+	fpr.BindToRegister(d, d == a || d == b);

-	MOVAPD(XMM0, fpr.R(a));
 	switch (inst.SUBOP10)
 	{
 	case 528:
-		UNPCKLPD(XMM0, fpr.R(b)); //unpck is faster than shuf
+		avx_op(&XEmitter::VUNPCKLPD, &XEmitter::UNPCKLPD, fpr.RX(d), fpr.R(a), fpr.R(b));
 		break; //00
 	case 560:
-		SHUFPD(XMM0, fpr.R(b), 2); //must use shuf here
+		avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, fpr.RX(d), fpr.R(a), fpr.R(b), 2);
 		break; //01
 	case 592:
-		SHUFPD(XMM0, fpr.R(b), 1);
+		avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, fpr.RX(d), fpr.R(a), fpr.R(b), 1);
 		break; //10
 	case 624:
-		UNPCKHPD(XMM0, fpr.R(b));
+		avx_op(&XEmitter::VUNPCKHPD, &XEmitter::UNPCKHPD, fpr.RX(d), fpr.R(a), fpr.R(b));
 		break; //11
 	default:
 		_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
 	}
-	fpr.BindToRegister(d, false);
-	MOVAPD(fpr.RX(d), R(XMM0));
 	fpr.UnlockAll();
 }
@@ -303,7 +261,7 @@ void Jit64::ps_rsqrte(UGeckoInstruction inst)
 	CALL((void *)asm_routines.frsqrte);
 	MOVLHPS(fpr.RX(d), XMM0);

-	ForceSinglePrecisionP(fpr.RX(d));
+	ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
 	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
 	gpr.UnlockAllX();
@@ -330,7 +288,7 @@ void Jit64::ps_res(UGeckoInstruction inst)
 	CALL((void *)asm_routines.fres);
 	MOVLHPS(fpr.RX(d), XMM0);

-	ForceSinglePrecisionP(fpr.RX(d));
+	ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
 	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
 	gpr.UnlockAllX();
@@ -352,42 +310,35 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
 	switch (inst.SUBOP5)
 	{
 	case 14: //madds0
-		MOVDDUP(XMM1, fpr.R(c));
-		Force25BitPrecision(XMM1, XMM0);
-		MOVAPD(XMM0, fpr.R(a));
-		MULPD(XMM0, R(XMM1));
+		MOVDDUP(XMM0, fpr.R(c));
+		Force25BitPrecision(XMM0, R(XMM0), XMM1);
+		MULPD(XMM0, fpr.R(a));
 		ADDPD(XMM0, fpr.R(b));
 		break;
 	case 15: //madds1
-		MOVAPD(XMM1, fpr.R(c));
-		SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
-		Force25BitPrecision(XMM1, XMM0);
-		MOVAPD(XMM0, fpr.R(a));
-		MULPD(XMM0, R(XMM1));
+		avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
+		Force25BitPrecision(XMM0, R(XMM0), XMM1);
+		MULPD(XMM0, fpr.R(a));
 		ADDPD(XMM0, fpr.R(b));
 		break;
 	case 28: //msub
-		MOVAPD(XMM0, fpr.R(c));
-		Force25BitPrecision(XMM0, XMM1);
+		Force25BitPrecision(XMM0, fpr.R(c), XMM1);
 		MULPD(XMM0, fpr.R(a));
 		SUBPD(XMM0, fpr.R(b));
 		break;
 	case 29: //madd
-		MOVAPD(XMM0, fpr.R(c));
-		Force25BitPrecision(XMM0, XMM1);
+		Force25BitPrecision(XMM0, fpr.R(c), XMM1);
 		MULPD(XMM0, fpr.R(a));
 		ADDPD(XMM0, fpr.R(b));
 		break;
 	case 30: //nmsub
-		MOVAPD(XMM0, fpr.R(c));
-		Force25BitPrecision(XMM0, XMM1);
+		Force25BitPrecision(XMM0, fpr.R(c), XMM1);
 		MULPD(XMM0, fpr.R(a));
 		SUBPD(XMM0, fpr.R(b));
 		PXOR(XMM0, M((void*)&psSignBits));
 		break;
 	case 31: //nmadd
-		MOVAPD(XMM0, fpr.R(c));
-		Force25BitPrecision(XMM0, XMM1);
+		Force25BitPrecision(XMM0, fpr.R(c), XMM1);
 		MULPD(XMM0, fpr.R(a));
 		ADDPD(XMM0, fpr.R(b));
 		PXOR(XMM0, M((void*)&psSignBits));
@@ -399,9 +350,8 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
 		return;
 	}
 	fpr.BindToRegister(d, false);
-	ForceSinglePrecisionP(XMM0);
-	SetFPRFIfNeeded(inst, XMM0);
-	MOVAPD(fpr.RX(d), R(XMM0));
+	ForceSinglePrecisionP(fpr.RX(d), XMM0);
+	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
 }
@@ -608,13 +608,98 @@ void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm)
 	}
 }

-void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm)
+void EmuCodeBlock::ForceSinglePrecisionP(X64Reg output, X64Reg input)
 {
 	// Most games don't need these. Zelda requires it though - some platforms get stuck without them.
 	if (jit->jo.accurateSinglePrecision)
 	{
-		CVTPD2PS(xmm, R(xmm));
-		CVTPS2PD(xmm, R(xmm));
+		CVTPD2PS(input, R(input));
+		CVTPS2PD(output, R(input));
 	}
+	else if (output != input)
+	{
+		MOVAPD(output, R(input));
+	}
 }

+// Abstract between AVX and SSE: automatically handle 3-operand instructions
+void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), void (XEmitter::*sseOp)(X64Reg, OpArg),
+                          X64Reg regOp, OpArg arg1, OpArg arg2, bool packed, bool reversible)
+{
+	if (arg1.IsSimpleReg() && regOp == arg1.GetSimpleReg())
+	{
+		(this->*sseOp)(regOp, arg2);
+	}
+	else if (arg1.IsSimpleReg() && cpu_info.bAVX)
+	{
+		(this->*avxOp)(regOp, arg1.GetSimpleReg(), arg2);
+	}
+	else if (arg2.IsSimpleReg() && arg2.GetSimpleReg() == regOp)
+	{
+		if (reversible)
+		{
+			(this->*sseOp)(regOp, arg1);
+		}
+		else
+		{
+			// The ugly case: regOp == arg2 without AVX, or with arg1 == memory
+			if (!arg1.IsSimpleReg() || arg1.GetSimpleReg() != XMM0)
+				MOVAPD(XMM0, arg1);
+			if (cpu_info.bAVX)
+			{
+				(this->*avxOp)(regOp, XMM0, arg2);
+			}
+			else
+			{
+				(this->*sseOp)(XMM0, arg2);
+				if (packed)
+					MOVAPD(regOp, R(XMM0));
+				else
+					MOVSD(regOp, R(XMM0));
+			}
+		}
+	}
+	else
+	{
+		if (packed)
+			MOVAPD(regOp, arg1);
+		else
+			MOVSD(regOp, arg1);
+		(this->*sseOp)(regOp, arg1 == arg2 ? R(regOp) : arg2);
+	}
+}
+
+// Abstract between AVX and SSE: automatically handle 3-operand instructions
+void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg, u8), void (XEmitter::*sseOp)(X64Reg, OpArg, u8),
+                          X64Reg regOp, OpArg arg1, OpArg arg2, u8 imm)
+{
+	if (arg1.IsSimpleReg() && regOp == arg1.GetSimpleReg())
+	{
+		(this->*sseOp)(regOp, arg2, imm);
+	}
+	else if (arg1.IsSimpleReg() && cpu_info.bAVX)
+	{
+		(this->*avxOp)(regOp, arg1.GetSimpleReg(), arg2, imm);
+	}
+	else if (arg2.IsSimpleReg() && arg2.GetSimpleReg() == regOp)
+	{
+		// The ugly case: regOp == arg2 without AVX, or with arg1 == memory
+		if (!arg1.IsSimpleReg() || arg1.GetSimpleReg() != XMM0)
+			MOVAPD(XMM0, arg1);
+		if (cpu_info.bAVX)
+		{
+			(this->*avxOp)(regOp, XMM0, arg2, imm);
+		}
+		else
+		{
+			(this->*sseOp)(XMM0, arg2, imm);
+			MOVAPD(regOp, R(XMM0));
+		}
+	}
+	else
+	{
+		MOVAPD(regOp, arg1);
+		(this->*sseOp)(regOp, arg1 == arg2 ? R(regOp) : arg2, imm);
+	}
+}
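At call sites the helper replaces the old copy-then-operate idiom with a single call, and avx_op itself picks the cheapest encoding available. A hedged usage sketch, mirroring the ps_sign change earlier in this diff (not itself part of the commit):

	// Before: load the destination, then clobber it with a two-operand op
	//   MOVAPD(fpr.RX(d), fpr.R(b));
	//   PAND(fpr.RX(d), M((void*)&psAbsMask));
	// After: one call; emits VPAND on AVX hosts, a bare PAND when the
	// destination already holds arg1, or MOVAPD+PAND as the fallback.
	avx_op(&XEmitter::VPAND, &XEmitter::PAND, fpr.RX(d), fpr.R(b), M((void*)&psAbsMask));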
|
@ -625,15 +710,25 @@ static const u64 GC_ALIGNED16(psRoundBit[2]) = {0x8000000, 0x8000000};
|
|||
// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
|
||||
// rounding to nearest as it does.
|
||||
// It needs a temp, so let the caller pass that in.
|
||||
void EmuCodeBlock::Force25BitPrecision(X64Reg xmm, X64Reg tmp)
|
||||
void EmuCodeBlock::Force25BitPrecision(X64Reg output, OpArg input, X64Reg tmp)
|
||||
{
|
||||
if (jit->jo.accurateSinglePrecision)
|
||||
{
|
||||
// mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
|
||||
MOVAPD(tmp, R(xmm));
|
||||
PAND(xmm, M((void*)&psMantissaTruncate));
|
||||
PAND(tmp, M((void*)&psRoundBit));
|
||||
PADDQ(xmm, R(tmp));
|
||||
if (input.IsSimpleReg() && cpu_info.bAVX)
|
||||
{
|
||||
VPAND(tmp, input.GetSimpleReg(), M((void*)&psRoundBit));
|
||||
VPAND(output, input.GetSimpleReg(), M((void*)&psMantissaTruncate));
|
||||
PADDQ(output, R(tmp));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!input.IsSimpleReg() || input.GetSimpleReg() != output)
|
||||
MOVAPD(output, input);
|
||||
avx_op(&XEmitter::VPAND, &XEmitter::PAND, tmp, R(output), M((void*)&psRoundBit), true, true);
|
||||
PAND(output, M((void*)&psMantissaTruncate));
|
||||
PADDQ(output, R(tmp));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
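The mantissa trick in the comment above is worth spelling out: clearing the low 28 bits truncates a double's 53-bit mantissa to 25 bits, and adding the highest dropped bit (bit 27) shifted up by one carries into the kept bits exactly when the dropped portion was at least half, i.e. round-to-nearest. A scalar sketch of the same identity (illustrative only; the JIT applies it per 64-bit lane with PAND/PADDQ):

	#include <cstdint>

	// Round a 53-bit mantissa down to 25 bits of precision, to-nearest,
	// matching the comment in Force25BitPrecision.
	uint64_t Round25BitMantissa(uint64_t mantissa)
	{
		return (mantissa & ~0xFFFFFFFULL) + ((mantissa & (1ULL << 27)) << 1);
	}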
@@ -123,9 +123,14 @@ public:
 	void JitSetCAIf(Gen::CCFlags conditionCode);
 	void JitClearCA();

+	void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg),
+	            Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, bool packed = true, bool reversible = false);
+	void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8),
+	            Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm);
+
 	void ForceSinglePrecisionS(Gen::X64Reg xmm);
-	void ForceSinglePrecisionP(Gen::X64Reg xmm);
-	void Force25BitPrecision(Gen::X64Reg xmm, Gen::X64Reg tmp);
+	void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input);
+	void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp);

 	// RSCRATCH might get trashed
 	void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);