commit 7c04c76a26
@@ -1775,6 +1775,8 @@ void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3835, dest
 void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);}
 void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);}
 void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);}
+void XEmitter::BLENDPS(X64Reg dest, OpArg arg, u8 blend) {WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1); Write8(blend);}
+void XEmitter::BLENDPD(X64Reg dest, OpArg arg, u8 blend) {WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1); Write8(blend);}
 
 void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDB, dest, arg);}
 void XEmitter::PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDF, dest, arg);}
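
[Reviewer sketch, not part of the commit] The two new emitters encode the SSE4.1 fixed blends: unlike the BLENDV* forms above, which take their mask from the implicit xmm0, BLENDPS/BLENDPD select lanes with an immediate. A minimal C++ intrinsics analogue of what the emitted BLENDPD does (imm8 bit i set means take 64-bit lane i from the source):

    // Compile with SSE4.1 enabled (e.g. -msse4.1).
    #include <smmintrin.h>
    #include <cstdio>

    int main()
    {
        __m128d dst = _mm_set_pd(2.0, 1.0);    // lanes {1.0, 2.0}
        __m128d src = _mm_set_pd(20.0, 10.0);  // lanes {10.0, 20.0}
        __m128d r = _mm_blend_pd(dst, src, 1); // imm=1: lane 0 from src, lane 1 from dst
        double out[2];
        _mm_storeu_pd(out, r);
        printf("%g %g\n", out[0], out[1]);     // prints: 10 2
        return 0;
    }

ps_sum1 later in this commit uses exactly this shape, BLENDPD(tmp, fpr.R(c), 1), to merge c.ps0 into lane 0.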
@@ -789,10 +789,12 @@ public:
     void PMOVZXWQ(X64Reg dest, OpArg arg);
     void PMOVZXDQ(X64Reg dest, OpArg arg);
 
-    // SSE4: variable blend instructions (xmm0 implicit argument)
+    // SSE4: blend instructions
     void PBLENDVB(X64Reg dest, OpArg arg);
     void BLENDVPS(X64Reg dest, OpArg arg);
     void BLENDVPD(X64Reg dest, OpArg arg);
+    void BLENDPS(X64Reg dest, OpArg arg, u8 blend);
+    void BLENDPD(X64Reg dest, OpArg arg, u8 blend);
 
     // AVX
     void VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
@@ -330,7 +330,7 @@ static GekkoOPTemplate table63[] =
     {72, Interpreter::fmrx, {"fmrx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
     {136, Interpreter::fnabsx, {"fnabsx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
     {40, Interpreter::fnegx, {"fnegx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-    {12, Interpreter::frspx, {"frspx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+    {12, Interpreter::frspx, {"frspx", OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 
     {64, Interpreter::mcrfs, {"mcrfs", OPTYPE_SYSTEMFP, FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}},
     {583, Interpreter::mffsx, {"mffsx", OPTYPE_SYSTEMFP, FL_RC_BIT_F | FL_INOUT_FLOAT_D | FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}},
@@ -137,13 +137,11 @@ public:
 
     void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
 
-    void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
-                void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
     typedef u32 (*Operation)(u32 a, u32 b);
     void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
                   bool Rc = false, bool carry = false);
     void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
-                   void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool packed = false, bool roundRHS = false);
+                   void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), bool packed = false, bool roundRHS = false);
     void FloatCompare(UGeckoInstruction inst, bool upper = false);
 
     // OPCODES
@@ -193,12 +191,8 @@ public:
 
     void reg_imm(UGeckoInstruction inst);
 
-    void ps_sel(UGeckoInstruction inst);
     void ps_mr(UGeckoInstruction inst);
-    void ps_sign(UGeckoInstruction inst); //aggregate
-    void ps_arith(UGeckoInstruction inst); //aggregate
     void ps_mergeXX(UGeckoInstruction inst);
-    void ps_maddXX(UGeckoInstruction inst);
     void ps_res(UGeckoInstruction inst);
     void ps_rsqrte(UGeckoInstruction inst);
     void ps_sum(UGeckoInstruction inst);
@@ -102,9 +102,9 @@ static GekkoOPTemplate table4[] =
 { //SUBOP10
     {0, &Jit64::ps_cmpXX}, // ps_cmpu0
     {32, &Jit64::ps_cmpXX}, // ps_cmpo0
-    {40, &Jit64::ps_sign}, // ps_neg
-    {136, &Jit64::ps_sign}, // ps_nabs
-    {264, &Jit64::ps_sign}, // ps_abs
+    {40, &Jit64::fsign}, // ps_neg
+    {136, &Jit64::fsign}, // ps_nabs
+    {264, &Jit64::fsign}, // ps_abs
     {64, &Jit64::ps_cmpXX}, // ps_cmpu1
     {72, &Jit64::ps_mr}, // ps_mr
     {96, &Jit64::ps_cmpXX}, // ps_cmpo1
@@ -122,19 +122,19 @@ static GekkoOPTemplate table4_2[] =
     {11, &Jit64::ps_sum}, // ps_sum1
     {12, &Jit64::ps_muls}, // ps_muls0
     {13, &Jit64::ps_muls}, // ps_muls1
-    {14, &Jit64::ps_maddXX}, // ps_madds0
-    {15, &Jit64::ps_maddXX}, // ps_madds1
-    {18, &Jit64::ps_arith}, // ps_div
-    {20, &Jit64::ps_arith}, // ps_sub
-    {21, &Jit64::ps_arith}, // ps_add
-    {23, &Jit64::ps_sel}, // ps_sel
+    {14, &Jit64::fmaddXX}, // ps_madds0
+    {15, &Jit64::fmaddXX}, // ps_madds1
+    {18, &Jit64::fp_arith}, // ps_div
+    {20, &Jit64::fp_arith}, // ps_sub
+    {21, &Jit64::fp_arith}, // ps_add
+    {23, &Jit64::fselx}, // ps_sel
     {24, &Jit64::ps_res}, // ps_res
-    {25, &Jit64::ps_arith}, // ps_mul
+    {25, &Jit64::fp_arith}, // ps_mul
     {26, &Jit64::ps_rsqrte}, // ps_rsqrte
-    {28, &Jit64::ps_maddXX}, // ps_msub
-    {29, &Jit64::ps_maddXX}, // ps_madd
-    {30, &Jit64::ps_maddXX}, // ps_nmsub
-    {31, &Jit64::ps_maddXX}, // ps_nmadd
+    {28, &Jit64::fmaddXX}, // ps_msub
+    {29, &Jit64::fmaddXX}, // ps_madd
+    {30, &Jit64::fmaddXX}, // ps_nmsub
+    {31, &Jit64::fmaddXX}, // ps_nmadd
 };
 
 
@@ -13,10 +13,11 @@ using namespace Gen;
 static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
 static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
 static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
 static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
 
 void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg),
-                      void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool packed, bool roundRHS)
+                      void (XEmitter::*sseOp)(X64Reg, OpArg), bool packed, bool roundRHS)
 {
     fpr.Lock(d, a, b);
     fpr.BindToRegister(d, d == a || d == b || !single);
@@ -38,17 +39,7 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X
         avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), packed, reversible);
     }
     if (single)
-    {
-        if (packed)
-        {
-            ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
-        }
-        else
-        {
-            ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d));
-            MOVDDUP(fpr.RX(d), fpr.R(d));
-        }
-    }
+        ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true);
     SetFPRFIfNeeded(fpr.RX(d));
     fpr.UnlockAll();
 }
@@ -77,26 +68,29 @@ void Jit64::fp_arith(UGeckoInstruction inst)
     int d = inst.FD;
     int arg2 = inst.SUBOP5 == 25 ? c : b;
 
-    bool single = inst.OPCD == 59;
-    bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
+    bool single = inst.OPCD == 4 || inst.OPCD == 59;
     // If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP at the end by
     // using packed arithmetic instead.
-    bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[arg2];
+    bool packed = inst.OPCD == 4 || (inst.OPCD == 59 &&
+                                     jit->js.op->fprIsDuplicated[a] &&
+                                     jit->js.op->fprIsDuplicated[arg2]);
     // Packed divides are slower than scalar divides on basically all x86, so this optimization isn't worth it in that case.
     // Atoms (and a few really old CPUs) are also slower on packed operations than scalar ones.
-    if (inst.SUBOP5 == 18 || cpu_info.bAtom)
+    if (inst.OPCD == 59 && (inst.SUBOP5 == 18 || cpu_info.bAtom))
         packed = false;
 
+    bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
+
     switch (inst.SUBOP5)
     {
     case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
-                       packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, inst, packed); break;
+                       packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, packed); break;
     case 20: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
-                       packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, inst, packed); break;
+                       packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, packed); break;
     case 21: fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
-                       packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, inst, packed); break;
+                       packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, packed); break;
     case 25: fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
-                       packed ? &XEmitter::MULPD : &XEmitter::MULSD, inst, packed, round_input); break;
+                       packed ? &XEmitter::MULPD : &XEmitter::MULSD, packed, round_input); break;
     default:
         _assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
     }
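
[Reviewer sketch, not part of the commit] fp_arith now serves both opcode spaces: paired singles (OPCD 4) are inherently packed, while the single-precision double ops (OPCD 59) may only use packed arithmetic when both inputs are known to hold identical halves, since then the packed result also has identical halves and the trailing MOVDDUP becomes unnecessary. The decision above, restated as a standalone predicate (parameter names are mine):

    bool use_packed(int opcd, int subop5, bool dup_a, bool dup_arg2, bool is_atom)
    {
        // ps ops always operate on both lanes; double ops only go packed
        // when both inputs are duplicated across their halves.
        bool packed = opcd == 4 || (opcd == 59 && dup_a && dup_arg2);
        // Packed divides are slower than scalar divides on basically all x86,
        // and Atom is slower on packed operations generally.
        if (opcd == 59 && (subop5 == 18 || is_atom))
            packed = false;
        return packed;
    }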
@@ -112,14 +106,39 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
     int b = inst.FB;
     int c = inst.FC;
     int d = inst.FD;
-    bool single = inst.OPCD == 59;
+    bool single = inst.OPCD == 4 || inst.OPCD == 59;
     bool round_input = single && !jit->js.op->fprIsSingle[c];
-    bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[b] && jit->js.op->fprIsDuplicated[c];
-    if (cpu_info.bAtom)
-        packed = false;
+    bool packed = inst.OPCD == 4 ||
+                  (!cpu_info.bAtom && single &&
+                   jit->js.op->fprIsDuplicated[a] &&
+                   jit->js.op->fprIsDuplicated[b] &&
+                   jit->js.op->fprIsDuplicated[c]);
 
     fpr.Lock(a, b, c, d);
 
+    switch(inst.SUBOP5)
+    {
+    case 14:
+        MOVDDUP(XMM0, fpr.R(c));
+        if (round_input)
+            Force25BitPrecision(XMM0, R(XMM0), XMM1);
+        break;
+    case 15:
+        avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
+        if (round_input)
+            Force25BitPrecision(XMM0, R(XMM0), XMM1);
+        break;
+    default:
+        bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::g_want_determinism);
+        X64Reg tmp1 = special ? XMM1 : XMM0;
+        X64Reg tmp2 = special ? XMM0 : XMM1;
+        if (single && round_input)
+            Force25BitPrecision(tmp1, fpr.R(c), tmp2);
+        else
+            MOVAPD(tmp1, fpr.R(c));
+        break;
+    }
+
     // While we don't know if any games are actually affected (replays seem to work with all the usual
     // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
     // be extra careful and don't use FMA, even if in theory it might be okay.
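
[Reviewer sketch, not part of the commit] The two shuffles above prepare the multiplier for the splatted madd variants: ps_madds0 (SUBOP5 == 14) multiplies both lanes by c.ps0, so MOVDDUP broadcasts the low lane; ps_madds1 (== 15) uses c.ps1, so SHUFPD with imm 3 broadcasts the high lane. Intrinsics analogue:

    #include <pmmintrin.h> // SSE3 for _mm_movedup_pd; pulls in SSE2
    #include <cstdio>

    int main()
    {
        __m128d c  = _mm_set_pd(4.0, 3.0);    // lanes {ps0=3.0, ps1=4.0}
        __m128d c0 = _mm_movedup_pd(c);       // MOVDDUP      -> {3.0, 3.0}
        __m128d c1 = _mm_shuffle_pd(c, c, 3); // SHUFPD imm 3 -> {4.0, 4.0}
        double a[2], b[2];
        _mm_storeu_pd(a, c0);
        _mm_storeu_pd(b, c1);
        printf("%g %g | %g %g\n", a[0], a[1], b[0], b[1]); // prints: 3 3 | 4 4
        return 0;
    }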
@@ -128,10 +147,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
     // instances on different computers giving identical results.
     if (cpu_info.bFMA && !Core::g_want_determinism)
     {
-        if (single && round_input)
-            Force25BitPrecision(XMM0, fpr.R(c), XMM1);
-        else
-            MOVAPD(XMM0, fpr.R(c));
         // Statistics suggests b is a lot less likely to be unbound in practice, so
         // if we have to pick one of a or b to bind, let's make it b.
         fpr.BindToRegister(b, true, false);
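
[Reviewer sketch, not part of the commit] The determinism concern here is real arithmetic, not just instruction selection: an FMA rounds once, while a separate multiply and add rounds twice, so machines with and without FMA can produce results that differ in the last bit. A self-contained demonstration:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        double a = 1.0 + 0x1p-30;
        double c = 1.0 + 0x1p-29;
        double b = -(1.0 + 0x1p-29 + 0x1p-30);
        double fused = std::fma(a, c, b); // single rounding: 0x1p-59
        double split = a * c + b;         // product rounded first: 0
        printf("%a vs %a\n", fused, split);
        return 0;
    }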
@@ -143,6 +158,8 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
             else
                 VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
             break;
+        case 14: //madds0
+        case 15: //madds1
         case 29: //madd
             if (packed)
                 VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
@@ -169,11 +186,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
     }
     else if (inst.SUBOP5 == 30) //nmsub
     {
-        // nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
-        if (single && round_input)
-            Force25BitPrecision(XMM1, fpr.R(c), XMM0);
-        else
-            MOVAPD(XMM1, fpr.R(c));
+        // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately.
         MOVAPD(XMM0, fpr.R(b));
         if (packed)
         {
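
[Reviewer sketch, not part of the commit] For finite, nonzero results the two nmsub formulations agree algebraically: -(a*c - b) == b - a*c. One visible IEEE difference is the sign of an exact-zero result: when a*c == b, -(x - x) yields -0.0 while x - x yields +0.0 under round-to-nearest, so the two forms are not interchangeable bit-for-bit, which is worth keeping in mind when comparing against hardware.

    #include <cstdio>

    int main()
    {
        double a = 1.5, c = -2.25, b = 0.5;
        printf("%g %g\n", -(a * c - b), b - a * c); // 3.875 3.875
        // Zero-sign corner case: the forms differ when a*c == b.
        printf("%g %g\n", -(3.0 - 3.0), 3.0 - 3.0); // -0 0
        return 0;
    }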
@@ -188,16 +201,12 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
     }
     else
     {
-        if (single && round_input)
-            Force25BitPrecision(XMM0, fpr.R(c), XMM1);
-        else
-            MOVAPD(XMM0, fpr.R(c));
         if (packed)
         {
             MULPD(XMM0, fpr.R(a));
             if (inst.SUBOP5 == 28) //msub
                 SUBPD(XMM0, fpr.R(b));
-            else //(n)madd
+            else //(n)madd(s[01])
                 ADDPD(XMM0, fpr.R(b));
         }
         else
@@ -215,21 +224,9 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
     fpr.BindToRegister(d, !single);
 
     if (single)
-    {
-        if (packed)
-        {
-            ForceSinglePrecisionP(fpr.RX(d), XMM0);
-        }
-        else
-        {
-            ForceSinglePrecisionS(fpr.RX(d), XMM0);
-            MOVDDUP(fpr.RX(d), fpr.R(d));
-        }
-    }
+        ForceSinglePrecision(fpr.RX(d), R(XMM0), packed, true);
     else
-    {
         MOVSD(fpr.RX(d), R(XMM0));
-    }
     SetFPRFIfNeeded(fpr.RX(d));
     fpr.UnlockAll();
 }
@@ -242,23 +239,22 @@ void Jit64::fsign(UGeckoInstruction inst)
 
     int d = inst.FD;
     int b = inst.FB;
-    fpr.Lock(b, d);
-    fpr.BindToRegister(d);
+    bool packed = inst.OPCD == 4;
 
-    if (d != b)
-        MOVSD(fpr.RX(d), fpr.R(b));
+    fpr.Lock(b, d);
+    OpArg src = fpr.R(b);
+    fpr.BindToRegister(d, false);
 
     switch (inst.SUBOP10)
     {
-    case 40: // fnegx
-        // We can cheat and not worry about clobbering the top half by using masks
-        // that don't modify the top half.
-        PXOR(fpr.RX(d), M(psSignBits));
+    case 40: // neg
+        avx_op(&XEmitter::VPXOR, &XEmitter::PXOR, fpr.RX(d), src, M(packed ? psSignBits2 : psSignBits), packed);
         break;
-    case 264: // fabsx
-        PAND(fpr.RX(d), M(psAbsMask));
+    case 136: // nabs
+        avx_op(&XEmitter::VPOR, &XEmitter::POR, fpr.RX(d), src, M(packed ? psSignBits2 : psSignBits), packed);
         break;
-    case 136: // fnabs
-        POR(fpr.RX(d), M(psSignBits));
+    case 264: // abs
+        avx_op(&XEmitter::VPAND, &XEmitter::PAND, fpr.RX(d), src, M(packed ? psAbsMask2 : psAbsMask), packed);
         break;
     default:
         PanicAlert("fsign bleh");
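
[Reviewer sketch, not part of the commit] fsign never touches the FPU proper; it edits the IEEE-754 sign bit directly: neg is XOR with the sign bit, nabs ORs it on, abs ANDs it off. The psSignBits2/psAbsMask2 constants apply the mask to both halves for the packed (ps) forms, while the original psSignBits/psAbsMask leave the upper half alone for the scalar forms. Bit-level model:

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    static const uint64_t kSign = 0x8000000000000000ULL;

    static double with_bits(double x, uint64_t (*f)(uint64_t))
    {
        uint64_t b;
        std::memcpy(&b, &x, sizeof(b));
        b = f(b);
        double r;
        std::memcpy(&r, &b, sizeof(r));
        return r;
    }

    static uint64_t neg_(uint64_t b)  { return b ^ kSign; }  // flip sign
    static uint64_t nabs_(uint64_t b) { return b | kSign; }  // force negative
    static uint64_t abs_(uint64_t b)  { return b & ~kSign; } // force positive

    int main()
    {
        printf("%g %g %g\n", with_bits(2.5, neg_), with_bits(2.5, nabs_),
               with_bits(-2.5, abs_)); // prints: -2.5 -2.5 2.5
        return 0;
    }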
@@ -278,13 +274,18 @@ void Jit64::fselx(UGeckoInstruction inst)
     int b = inst.FB;
     int c = inst.FC;
 
+    bool packed = inst.OPCD == 4; // ps_sel
+
     fpr.Lock(a, b, c, d);
-    MOVAPD(XMM1, fpr.R(a));
     PXOR(XMM0, R(XMM0));
     // This condition is very tricky; there's only one right way to handle both the case of
     // negative/positive zero and NaN properly.
     // (a >= -0.0 ? c : b) transforms into (0 > a ? b : c), hence the NLE.
-    CMPSD(XMM0, R(XMM1), NLE);
+    if (packed)
+        CMPPD(XMM0, fpr.R(a), NLE);
+    else
+        CMPSD(XMM0, fpr.R(a), NLE);
+
     if (cpu_info.bSSE4_1)
     {
         MOVAPD(XMM1, fpr.R(c));
@@ -297,8 +298,12 @@ void Jit64::fselx(UGeckoInstruction inst)
         PANDN(XMM1, fpr.R(c));
         POR(XMM1, R(XMM0));
     }
-    fpr.BindToRegister(d);
-    MOVSD(fpr.RX(d), R(XMM1));
+    fpr.BindToRegister(d, !packed);
+    if (packed)
+        MOVAPD(fpr.RX(d), R(XMM1));
+    else
+        MOVSD(fpr.RX(d), R(XMM1));
     fpr.UnlockAll();
 }
 
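
[Reviewer sketch, not part of the commit] A scalar model of the "very tricky" condition: fsel must return c when a >= -0.0 (so -0.0 counts as true) and b when a is negative or NaN. Comparing 0 against a with NLE (not-less-or-equal, which is also true for unordered operands) produces exactly the b-selector mask:

    #include <cmath>
    #include <cstdio>

    static double fsel(double a, double b, double c)
    {
        // NLE: !(0.0 <= a) is true for 0 > a and for NaN.
        return !(0.0 <= a) ? b : c;
    }

    int main()
    {
        printf("%g\n", fsel(-0.0, 1.0, 2.0)); // 2: -0.0 >= -0.0 picks c
        printf("%g\n", fsel(-1.0, 1.0, 2.0)); // 1: negative picks b
        printf("%g\n", fsel(NAN,  1.0, 2.0)); // 1: NaN picks b
        return 0;
    }

This needs IEEE-strict compilation; something like -ffast-math would break the NaN case.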
@@ -490,13 +495,12 @@ void Jit64::frspx(UGeckoInstruction inst)
     JITDISABLE(bJITFloatingPointOff);
     int b = inst.FB;
     int d = inst.FD;
+    bool packed = jit->js.op->fprIsDuplicated[b] && !cpu_info.bAtom;
+
     fpr.Lock(b, d);
-    fpr.BindToRegister(d, d == b);
-    if (b != d)
-        MOVAPD(fpr.RX(d), fpr.R(b));
-    ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d));
-    MOVDDUP(fpr.RX(d), fpr.R(d));
+    OpArg src = fpr.R(b);
+    fpr.BindToRegister(d, false);
+    ForceSinglePrecision(fpr.RX(d), src, packed, true);
     SetFPRFIfNeeded(fpr.RX(d));
     fpr.UnlockAll();
 }
@@ -10,9 +10,6 @@
 
 using namespace Gen;
 
-static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
-static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
-
 void Jit64::ps_mr(UGeckoInstruction inst)
 {
     INSTRUCTION_START
@@ -28,123 +25,6 @@ void Jit64::ps_mr(UGeckoInstruction inst)
         MOVAPD(fpr.RX(d), fpr.R(b));
 }
 
-void Jit64::ps_sel(UGeckoInstruction inst)
-{
-    INSTRUCTION_START
-    JITDISABLE(bJITPairedOff);
-    FALLBACK_IF(inst.Rc);
-
-    int d = inst.FD;
-    int a = inst.FA;
-    int b = inst.FB;
-    int c = inst.FC;
-
-    fpr.Lock(a, b, c, d);
-
-    if (cpu_info.bSSE4_1)
-    {
-        PXOR(XMM0, R(XMM0));
-        CMPPD(XMM0, fpr.R(a), NLE);
-        MOVAPD(XMM1, fpr.R(c));
-        BLENDVPD(XMM1, fpr.R(b));
-    }
-    else
-    {
-        PXOR(XMM1, R(XMM1));
-        CMPPD(XMM1, fpr.R(a), NLE);
-        MOVAPD(XMM0, R(XMM1));
-        PAND(XMM1, fpr.R(b));
-        PANDN(XMM0, fpr.R(c));
-        POR(XMM1, R(XMM0));
-    }
-    fpr.BindToRegister(d, false);
-    MOVAPD(fpr.RX(d), R(XMM1));
-    fpr.UnlockAll();
-}
-
-void Jit64::ps_sign(UGeckoInstruction inst)
-{
-    INSTRUCTION_START
-    JITDISABLE(bJITPairedOff);
-    FALLBACK_IF(inst.Rc);
-
-    int d = inst.FD;
-    int b = inst.FB;
-
-    fpr.Lock(d, b);
-    fpr.BindToRegister(d, d == b);
-
-    switch (inst.SUBOP10)
-    {
-    case 40: //neg
-        avx_op(&XEmitter::VPXOR, &XEmitter::PXOR, fpr.RX(d), fpr.R(b), M(psSignBits));
-        break;
-    case 136: //nabs
-        avx_op(&XEmitter::VPOR, &XEmitter::POR, fpr.RX(d), fpr.R(b), M(psSignBits));
-        break;
-    case 264: //abs
-        avx_op(&XEmitter::VPAND, &XEmitter::PAND, fpr.RX(d), fpr.R(b), M(psAbsMask));
-        break;
-    }
-
-    fpr.UnlockAll();
-}
-
-//There's still a little bit more optimization that can be squeezed out of this
-void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
-{
-    fpr.Lock(d, a, b);
-    fpr.BindToRegister(d, d == a || d == b);
-
-    if (roundRHS)
-    {
-        if (d == a)
-        {
-            Force25BitPrecision(XMM0, fpr.R(b), XMM1);
-            (this->*sseOp)(fpr.RX(d), R(XMM0));
-        }
-        else
-        {
-            Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0);
-            (this->*sseOp)(fpr.RX(d), fpr.R(a));
-        }
-    }
-    else
-    {
-        avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), true, reversible);
-    }
-    ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
-    SetFPRFIfNeeded(fpr.RX(d));
-    fpr.UnlockAll();
-}
-
-void Jit64::ps_arith(UGeckoInstruction inst)
-{
-    INSTRUCTION_START
-    JITDISABLE(bJITPairedOff);
-    FALLBACK_IF(inst.Rc);
-
-    bool round_input = !jit->js.op->fprIsSingle[inst.FC];
-    switch (inst.SUBOP5)
-    {
-    case 18: // div
-        tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VDIVPD, &XEmitter::DIVPD, inst);
-        break;
-    case 20: // sub
-        tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VSUBPD, &XEmitter::SUBPD, inst);
-        break;
-    case 21: // add
-        tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst);
-        break;
-    case 25: // mul
-        tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, round_input);
-        break;
-    default:
-        _assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
-        break;
-    }
-}
-
 void Jit64::ps_sum(UGeckoInstruction inst)
 {
     INSTRUCTION_START
@@ -156,24 +36,40 @@ void Jit64::ps_sum(UGeckoInstruction inst)
     int b = inst.FB;
     int c = inst.FC;
     fpr.Lock(a, b, c, d);
+    OpArg op_a = fpr.R(a);
+    fpr.BindToRegister(d, false);
+    X64Reg tmp = d == b || d == c ? XMM0 : fpr.RX(d);
+    MOVDDUP(tmp, op_a);   // {a.ps0, a.ps0}
+    ADDPD(tmp, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1}
     switch (inst.SUBOP5)
    {
-    case 10:
-        MOVDDUP(XMM0, fpr.R(a));  // {a.ps0, a.ps0}
-        ADDPD(XMM0, fpr.R(b));    // {a.ps0 + b.ps0, a.ps0 + b.ps1}
-        UNPCKHPD(XMM0, fpr.R(c)); // {a.ps0 + b.ps1, c.ps1}
+    case 10: // ps_sum0
+        UNPCKHPD(tmp, fpr.R(c)); // {a.ps0 + b.ps1, c.ps1}
         break;
-    case 11:
-        MOVDDUP(XMM1, fpr.R(a));  // {a.ps0, a.ps0}
-        ADDPD(XMM1, fpr.R(b));    // {a.ps0 + b.ps0, a.ps0 + b.ps1}
-        MOVAPD(XMM0, fpr.R(c));
-        SHUFPD(XMM0, R(XMM1), 2); // {c.ps0, a.ps0 + b.ps1}
+    case 11: // ps_sum1
+        // {c.ps0, a.ps0 + b.ps1}
+        if (fpr.R(c).IsSimpleReg())
+        {
+            if (cpu_info.bSSE4_1)
+            {
+                BLENDPD(tmp, fpr.R(c), 1);
+            }
+            else
+            {
+                MOVAPD(XMM1, fpr.R(c));
+                SHUFPD(XMM1, R(tmp), 2);
+                tmp = XMM1;
+            }
+        }
+        else
+        {
+            MOVLPD(tmp, fpr.R(c));
+        }
         break;
     default:
         PanicAlert("ps_sum WTF!!!");
     }
-    fpr.BindToRegister(d, false);
-    ForceSinglePrecisionP(fpr.RX(d), XMM0);
+    ForceSinglePrecision(fpr.RX(d), R(tmp));
     SetFPRFIfNeeded(fpr.RX(d));
     fpr.UnlockAll();
 }
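
[Reviewer sketch, not part of the commit] The rewritten ps_sum computes the shared a.ps0 + b.ps1 term once in tmp, then differs only in how it merges c. The lane bookkeeping the comments above track, in plain C++ (single-precision rounding omitted):

    #include <cstdio>

    struct PS { double ps0, ps1; };

    static PS ps_sum0(PS a, PS b, PS c) { return { a.ps0 + b.ps1, c.ps1 }; }
    static PS ps_sum1(PS a, PS b, PS c) { return { c.ps0, a.ps0 + b.ps1 }; }

    int main()
    {
        PS a{1, 2}, b{10, 20}, c{100, 200};
        PS s0 = ps_sum0(a, b, c);
        PS s1 = ps_sum1(a, b, c);
        printf("sum0 = {%g, %g}\n", s0.ps0, s0.ps1); // {21, 200}
        printf("sum1 = {%g, %g}\n", s1.ps0, s1.ps1); // {100, 21}
        return 0;
    }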
@@ -192,10 +88,10 @@ void Jit64::ps_muls(UGeckoInstruction inst)
     fpr.Lock(a, c, d);
     switch (inst.SUBOP5)
     {
-    case 12:
+    case 12: // ps_muls0
         MOVDDUP(XMM0, fpr.R(c));
         break;
-    case 13:
+    case 13: // ps_muls1
         avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
         break;
     default:
@@ -205,7 +101,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
         Force25BitPrecision(XMM0, R(XMM0), XMM1);
     MULPD(XMM0, fpr.R(a));
     fpr.BindToRegister(d, false);
-    ForceSinglePrecisionP(fpr.RX(d), XMM0);
+    ForceSinglePrecision(fpr.RX(d), R(XMM0));
     SetFPRFIfNeeded(fpr.RX(d));
     fpr.UnlockAll();
 }
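
[Reviewer sketch, not part of the commit] Force25BitPrecision, used on the multiplier input here and in fmaddXX, models a Gekko quirk: the paired-single multiplier only examines about 25 significant bits of the c operand's mantissa, so the input is pre-rounded before the full double multiply. The actual mask constants are not part of this diff; one way to implement round-to-nearest at 25 significant bits looks like this (masks are mine, illustrative only):

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    static double round_to_25_bits(double x)
    {
        uint64_t b;
        std::memcpy(&b, &x, sizeof(b));
        // Truncate below bit 27, then add bit 27 to itself: it either stays
        // clear or carries into bit 28, rounding half-up so that 25
        // significant mantissa bits remain (implicit 1 + bits 51..28).
        b = (b & 0xFFFFFFFFF8000000ULL) + (b & 0x0000000008000000ULL);
        std::memcpy(&x, &b, sizeof(x));
        return x;
    }

    int main()
    {
        printf("%a\n", round_to_25_bits(1.0 + 0x1p-25 + 0x1p-26)); // 0x1.000001p+0
        return 0;
    }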
@@ -264,7 +160,7 @@ void Jit64::ps_rsqrte(UGeckoInstruction inst)
     CALL((void *)asm_routines.frsqrte);
     MOVLHPS(fpr.RX(d), XMM0);
 
-    ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
+    ForceSinglePrecision(fpr.RX(d), fpr.R(d));
     SetFPRFIfNeeded(fpr.RX(d));
     fpr.UnlockAll();
     gpr.UnlockAllX();
@@ -291,106 +187,12 @@ void Jit64::ps_res(UGeckoInstruction inst)
     CALL((void *)asm_routines.fres);
     MOVLHPS(fpr.RX(d), XMM0);
 
-    ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
+    ForceSinglePrecision(fpr.RX(d), fpr.R(d));
     SetFPRFIfNeeded(fpr.RX(d));
     fpr.UnlockAll();
     gpr.UnlockAllX();
 }
 
-//TODO: add optimized cases
-void Jit64::ps_maddXX(UGeckoInstruction inst)
-{
-    INSTRUCTION_START
-    JITDISABLE(bJITPairedOff);
-    FALLBACK_IF(inst.Rc);
-
-    int a = inst.FA;
-    int b = inst.FB;
-    int c = inst.FC;
-    int d = inst.FD;
-    bool fma = cpu_info.bFMA && !Core::g_want_determinism;
-    bool round_input = !jit->js.op->fprIsSingle[c];
-    fpr.Lock(a, b, c, d);
-
-    if (fma)
-        fpr.BindToRegister(b, true, false);
-
-    if (inst.SUBOP5 == 14)
-    {
-        MOVDDUP(XMM0, fpr.R(c));
-        if (round_input)
-            Force25BitPrecision(XMM0, R(XMM0), XMM1);
-    }
-    else if (inst.SUBOP5 == 15)
-    {
-        avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
-        if (round_input)
-            Force25BitPrecision(XMM0, R(XMM0), XMM1);
-    }
-    else
-    {
-        if (round_input)
-            Force25BitPrecision(XMM0, fpr.R(c), XMM1);
-        else
-            MOVAPD(XMM0, fpr.R(c));
-    }
-
-    if (fma)
-    {
-        switch (inst.SUBOP5)
-        {
-        case 14: //madds0
-        case 15: //madds1
-        case 29: //madd
-            VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
-            break;
-        case 28: //msub
-            VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
-            break;
-        case 30: //nmsub
-            VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
-            break;
-        case 31: //nmadd
-            VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
-            break;
-        }
-    }
-    else
-    {
-        switch (inst.SUBOP5)
-        {
-        case 14: //madds0
-        case 15: //madds1
-        case 29: //madd
-            MULPD(XMM0, fpr.R(a));
-            ADDPD(XMM0, fpr.R(b));
-            break;
-        case 28: //msub
-            MULPD(XMM0, fpr.R(a));
-            SUBPD(XMM0, fpr.R(b));
-            break;
-        case 30: //nmsub
-            MULPD(XMM0, fpr.R(a));
-            SUBPD(XMM0, fpr.R(b));
-            PXOR(XMM0, M(psSignBits));
-            break;
-        case 31: //nmadd
-            MULPD(XMM0, fpr.R(a));
-            ADDPD(XMM0, fpr.R(b));
-            PXOR(XMM0, M(psSignBits));
-            break;
-        default:
-            _assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!");
-            return;
-        }
-    }
-
-    fpr.BindToRegister(d, false);
-    ForceSinglePrecisionP(fpr.RX(d), XMM0);
-    SetFPRFIfNeeded(fpr.RX(d));
-    fpr.UnlockAll();
-}
-
 void Jit64::ps_cmpXX(UGeckoInstruction inst)
 {
     INSTRUCTION_START
@@ -640,31 +640,30 @@ void EmuCodeBlock::WriteToConstRamAddress(int accessSize, OpArg arg, u32 address
         MOV(accessSize, MRegSum(RMEM, RSCRATCH2), R(reg));
 }
 
-void EmuCodeBlock::ForceSinglePrecisionS(X64Reg output, X64Reg input)
-{
-    // Most games don't need these. Zelda requires it though - some platforms get stuck without them.
-    if (jit->jo.accurateSinglePrecision)
-    {
-        CVTSD2SS(input, R(input));
-        CVTSS2SD(output, R(input));
-    }
-    else if (output != input)
-    {
-        MOVAPD(output, R(input));
-    }
-}
-
-void EmuCodeBlock::ForceSinglePrecisionP(X64Reg output, X64Reg input)
-{
-    // Most games don't need these. Zelda requires it though - some platforms get stuck without them.
-    if (jit->jo.accurateSinglePrecision)
-    {
-        CVTPD2PS(input, R(input));
-        CVTPS2PD(output, R(input));
-    }
-    else if (output != input)
-    {
-        MOVAPD(output, R(input));
-    }
-}
+void EmuCodeBlock::ForceSinglePrecision(X64Reg output, OpArg input, bool packed, bool duplicate)
+{
+    // Most games don't need these. Zelda requires it though - some platforms get stuck without them.
+    if (jit->jo.accurateSinglePrecision)
+    {
+        if (packed)
+        {
+            CVTPD2PS(output, input);
+            CVTPS2PD(output, R(output));
+        }
+        else
+        {
+            CVTSD2SS(output, input);
+            CVTSS2SD(output, R(output));
+            if (duplicate)
+                MOVDDUP(output, R(output));
+        }
+    }
+    else if (!input.IsSimpleReg() || input.GetSimpleReg() != output)
+    {
+        if (duplicate)
+            MOVDDUP(output, input);
+        else
+            MOVAPD(output, input);
+    }
+}
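
[Reviewer sketch, not part of the commit] The merged ForceSinglePrecision keeps PPC single-precision semantics on values stored as doubles: narrow to float (rounding the mantissa and clamping the exponent range), then widen back. The packed path does this to both lanes with CVTPD2PS/CVTPS2PD; the scalar path optionally duplicates the result into the upper half afterwards. Scalar model:

    #include <cstdio>

    static double force_single(double x)
    {
        float f = (float)x; // CVTSD2SS: round to single
        return (double)f;   // CVTSS2SD: widen back
    }

    int main()
    {
        double x = 0.1; // not exactly representable in either width
        printf("%.17g\n", x);               // 0.10000000000000001
        printf("%.17g\n", force_single(x)); // 0.10000000149011612
        return 0;
    }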
@@ -121,8 +121,7 @@ public:
     void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8),
                 Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm);
 
-    void ForceSinglePrecisionS(Gen::X64Reg output, Gen::X64Reg input);
-    void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input);
+    void ForceSinglePrecision(Gen::X64Reg output, Gen::OpArg input, bool packed = true, bool duplicate = false);
     void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp);
 
     // RSCRATCH might get trashed