Merge pull request #1258 from FioraAeterna/avoidfmulround
JIT: optimize single-precision ops based on knowledge of their inputs
This commit is contained in:
commit
414e36d8c9
|
@ -50,10 +50,10 @@ struct CPUInfo
|
||||||
bool bMOVBE;
|
bool bMOVBE;
|
||||||
// This flag indicates that the hardware supports some mode
|
// This flag indicates that the hardware supports some mode
|
||||||
// in which denormal inputs _and_ outputs are automatically set to (signed) zero.
|
// in which denormal inputs _and_ outputs are automatically set to (signed) zero.
|
||||||
// TODO: ARM
|
|
||||||
bool bFlushToZero;
|
bool bFlushToZero;
|
||||||
bool bLAHFSAHF64;
|
bool bLAHFSAHF64;
|
||||||
bool bLongMode;
|
bool bLongMode;
|
||||||
|
bool bAtom;
|
||||||
|
|
||||||
// ARM specific CPUInfo
|
// ARM specific CPUInfo
|
||||||
bool bSwp;
|
bool bSwp;
|
||||||
|
|
|
@ -129,6 +129,12 @@ void CPUInfo::Detect()
|
||||||
if (max_std_fn >= 1)
|
if (max_std_fn >= 1)
|
||||||
{
|
{
|
||||||
__cpuid(cpu_id, 0x00000001);
|
__cpuid(cpu_id, 0x00000001);
|
||||||
|
int family = ((cpu_id[0] >> 8) & 0xf) + ((cpu_id[0] >> 20) & 0xff);
|
||||||
|
int model = ((cpu_id[0] >> 4) & 0xf) + ((cpu_id[0] >> 12) & 0xf0);
|
||||||
|
// Detect people unfortunate enough to be running Dolphin on an Atom
|
||||||
|
if (family == 6 && (model == 0x1C || model == 0x26 ||model == 0x27 || model == 0x35 || model == 0x36 ||
|
||||||
|
model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D))
|
||||||
|
bAtom = true;
|
||||||
logical_cpu_count = (cpu_id[1] >> 16) & 0xFF;
|
logical_cpu_count = (cpu_id[1] >> 16) & 0xFF;
|
||||||
ht = (cpu_id[3] >> 28) & 1;
|
ht = (cpu_id[3] >> 28) & 1;
|
||||||
|
|
||||||
|
|
|
@ -321,14 +321,14 @@ static GekkoOPTemplate table59[] =
|
||||||
|
|
||||||
static GekkoOPTemplate table63[] =
|
static GekkoOPTemplate table63[] =
|
||||||
{
|
{
|
||||||
{264, Interpreter::fabsx, {"fabsx", OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
{264, Interpreter::fabsx, {"fabsx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||||
{32, Interpreter::fcmpo, {"fcmpo", OPTYPE_DOUBLEFP, FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
{32, Interpreter::fcmpo, {"fcmpo", OPTYPE_DOUBLEFP, FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||||
{0, Interpreter::fcmpu, {"fcmpu", OPTYPE_DOUBLEFP, FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
{0, Interpreter::fcmpu, {"fcmpu", OPTYPE_DOUBLEFP, FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||||
{14, Interpreter::fctiwx, {"fctiwx", OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
{14, Interpreter::fctiwx, {"fctiwx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||||
{15, Interpreter::fctiwzx, {"fctiwzx", OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
{15, Interpreter::fctiwzx, {"fctiwzx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||||
{72, Interpreter::fmrx, {"fmrx", OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
{72, Interpreter::fmrx, {"fmrx", OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||||
{136, Interpreter::fnabsx, {"fnabsx", OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
{136, Interpreter::fnabsx, {"fnabsx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||||
{40, Interpreter::fnegx, {"fnegx", OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
{40, Interpreter::fnegx, {"fnegx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||||
{12, Interpreter::frspx, {"frspx", OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
{12, Interpreter::frspx, {"frspx", OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||||
|
|
||||||
{64, Interpreter::mcrfs, {"mcrfs", OPTYPE_SYSTEMFP, FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}},
|
{64, Interpreter::mcrfs, {"mcrfs", OPTYPE_SYSTEMFP, FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}},
|
||||||
|
|
|
@ -151,7 +151,7 @@ public:
|
||||||
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
|
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
|
||||||
bool Rc = false, bool carry = false);
|
bool Rc = false, bool carry = false);
|
||||||
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
|
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
|
||||||
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
|
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool packed = false, bool roundRHS = false);
|
||||||
void FloatCompare(UGeckoInstruction inst, bool upper = false);
|
void FloatCompare(UGeckoInstruction inst, bool upper = false);
|
||||||
|
|
||||||
// OPCODES
|
// OPCODES
|
||||||
|
|
|
@ -11,11 +11,12 @@
|
||||||
using namespace Gen;
|
using namespace Gen;
|
||||||
|
|
||||||
static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
|
static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
|
||||||
|
static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
|
||||||
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
|
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
|
||||||
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
|
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
|
||||||
|
|
||||||
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg),
|
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg),
|
||||||
void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
|
void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool packed, bool roundRHS)
|
||||||
{
|
{
|
||||||
fpr.Lock(d, a, b);
|
fpr.Lock(d, a, b);
|
||||||
fpr.BindToRegister(d, d == a || d == b || !single);
|
fpr.BindToRegister(d, d == a || d == b || !single);
|
||||||
|
@ -34,12 +35,19 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), false, reversible);
|
avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), packed, reversible);
|
||||||
}
|
}
|
||||||
if (single)
|
if (single)
|
||||||
{
|
{
|
||||||
ForceSinglePrecisionS(fpr.RX(d));
|
if (packed)
|
||||||
MOVDDUP(fpr.RX(d), fpr.R(d));
|
{
|
||||||
|
ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d));
|
||||||
|
MOVDDUP(fpr.RX(d), fpr.R(d));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
|
@ -63,13 +71,32 @@ void Jit64::fp_arith(UGeckoInstruction inst)
|
||||||
JITDISABLE(bJITFloatingPointOff);
|
JITDISABLE(bJITFloatingPointOff);
|
||||||
FALLBACK_IF(inst.Rc);
|
FALLBACK_IF(inst.Rc);
|
||||||
|
|
||||||
|
int a = inst.FA;
|
||||||
|
int b = inst.FB;
|
||||||
|
int c = inst.FC;
|
||||||
|
int d = inst.FD;
|
||||||
|
int arg2 = inst.SUBOP5 == 25 ? c : b;
|
||||||
|
|
||||||
bool single = inst.OPCD == 59;
|
bool single = inst.OPCD == 59;
|
||||||
|
bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
|
||||||
|
// If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP at the end by
|
||||||
|
// using packed arithmetic instead.
|
||||||
|
bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[arg2];
|
||||||
|
// Packed divides are slower than scalar divides on basically all x86, so this optimization isn't worth it in that case.
|
||||||
|
// Atoms (and a few really old CPUs) are also slower on packed operations than scalar ones.
|
||||||
|
if (inst.SUBOP5 == 18 || cpu_info.bAtom)
|
||||||
|
packed = false;
|
||||||
|
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div
|
case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
|
||||||
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub
|
packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, inst, packed); break;
|
||||||
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add
|
case 20: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
|
||||||
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, single); break; //mul
|
packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, inst, packed); break;
|
||||||
|
case 21: fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
|
||||||
|
packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, inst, packed); break;
|
||||||
|
case 25: fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
|
||||||
|
packed ? &XEmitter::MULPD : &XEmitter::MULSD, inst, packed, round_input); break;
|
||||||
default:
|
default:
|
||||||
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
|
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
|
||||||
}
|
}
|
||||||
|
@ -81,12 +108,15 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
JITDISABLE(bJITFloatingPointOff);
|
JITDISABLE(bJITFloatingPointOff);
|
||||||
FALLBACK_IF(inst.Rc);
|
FALLBACK_IF(inst.Rc);
|
||||||
|
|
||||||
bool single_precision = inst.OPCD == 59;
|
|
||||||
|
|
||||||
int a = inst.FA;
|
int a = inst.FA;
|
||||||
int b = inst.FB;
|
int b = inst.FB;
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
|
bool single = inst.OPCD == 59;
|
||||||
|
bool round_input = single && !jit->js.op->fprIsSingle[c];
|
||||||
|
bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[b] && jit->js.op->fprIsDuplicated[c];
|
||||||
|
if (cpu_info.bAtom)
|
||||||
|
packed = false;
|
||||||
|
|
||||||
fpr.Lock(a, b, c, d);
|
fpr.Lock(a, b, c, d);
|
||||||
|
|
||||||
|
@ -98,66 +128,103 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
// instances on different computers giving identical results.
|
// instances on different computers giving identical results.
|
||||||
if (cpu_info.bFMA && !Core::g_want_determinism)
|
if (cpu_info.bFMA && !Core::g_want_determinism)
|
||||||
{
|
{
|
||||||
if (single_precision)
|
if (single && round_input)
|
||||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||||
else
|
else
|
||||||
MOVSD(XMM0, fpr.R(c));
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
// Statistics suggests b is a lot less likely to be unbound in practice, so
|
// Statistics suggests b is a lot less likely to be unbound in practice, so
|
||||||
// if we have to pick one of a or b to bind, let's make it b.
|
// if we have to pick one of a or b to bind, let's make it b.
|
||||||
fpr.BindToRegister(b, true, false);
|
fpr.BindToRegister(b, true, false);
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 28: //msub
|
case 28: //msub
|
||||||
VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
|
if (packed)
|
||||||
|
VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
|
||||||
|
else
|
||||||
|
VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
|
||||||
break;
|
break;
|
||||||
case 29: //madd
|
case 29: //madd
|
||||||
VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
|
if (packed)
|
||||||
|
VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
|
||||||
|
else
|
||||||
|
VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
|
||||||
break;
|
break;
|
||||||
// PowerPC and x86 define NMADD/NMSUB differently
|
// PowerPC and x86 define NMADD/NMSUB differently
|
||||||
// x86: D = -A*C (+/-) B
|
// x86: D = -A*C (+/-) B
|
||||||
// PPC: D = -(A*C (+/-) B)
|
// PPC: D = -(A*C (+/-) B)
|
||||||
// so we have to swap them; the ADD/SUB here isn't a typo.
|
// so we have to swap them; the ADD/SUB here isn't a typo.
|
||||||
case 30: //nmsub
|
case 30: //nmsub
|
||||||
VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
|
if (packed)
|
||||||
|
VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
|
||||||
|
else
|
||||||
|
VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
|
||||||
break;
|
break;
|
||||||
case 31: //nmadd
|
case 31: //nmadd
|
||||||
VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
|
if (packed)
|
||||||
|
VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
|
||||||
|
else
|
||||||
|
VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (inst.SUBOP5 == 30) //nmsub
|
else if (inst.SUBOP5 == 30) //nmsub
|
||||||
{
|
{
|
||||||
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
|
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
|
||||||
if (single_precision)
|
if (single && round_input)
|
||||||
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
|
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
|
||||||
else
|
else
|
||||||
MOVSD(XMM1, fpr.R(c));
|
MOVAPD(XMM1, fpr.R(c));
|
||||||
MULSD(XMM1, fpr.R(a));
|
MOVAPD(XMM0, fpr.R(b));
|
||||||
MOVSD(XMM0, fpr.R(b));
|
if (packed)
|
||||||
SUBSD(XMM0, R(XMM1));
|
{
|
||||||
|
MULPD(XMM1, fpr.R(a));
|
||||||
|
SUBPD(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MULSD(XMM1, fpr.R(a));
|
||||||
|
SUBSD(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (single_precision)
|
if (single && round_input)
|
||||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||||
else
|
else
|
||||||
MOVSD(XMM0, fpr.R(c));
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
MULSD(XMM0, fpr.R(a));
|
if (packed)
|
||||||
if (inst.SUBOP5 == 28) //msub
|
{
|
||||||
SUBSD(XMM0, fpr.R(b));
|
MULPD(XMM0, fpr.R(a));
|
||||||
else //(n)madd
|
if (inst.SUBOP5 == 28) //msub
|
||||||
ADDSD(XMM0, fpr.R(b));
|
SUBPD(XMM0, fpr.R(b));
|
||||||
|
else //(n)madd
|
||||||
|
ADDPD(XMM0, fpr.R(b));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MULSD(XMM0, fpr.R(a));
|
||||||
|
if (inst.SUBOP5 == 28)
|
||||||
|
SUBSD(XMM0, fpr.R(b));
|
||||||
|
else
|
||||||
|
ADDSD(XMM0, fpr.R(b));
|
||||||
|
}
|
||||||
if (inst.SUBOP5 == 31) //nmadd
|
if (inst.SUBOP5 == 31) //nmadd
|
||||||
PXOR(XMM0, M((void*)&psSignBits));
|
PXOR(XMM0, M((void*)&(packed ? psSignBits2 : psSignBits)));
|
||||||
}
|
}
|
||||||
|
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, !single);
|
||||||
//YES it is necessary to dupe the result :(
|
|
||||||
//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
|
if (single)
|
||||||
if (single_precision)
|
|
||||||
{
|
{
|
||||||
ForceSinglePrecisionS(XMM0);
|
if (packed)
|
||||||
MOVDDUP(fpr.RX(d), R(XMM0));
|
{
|
||||||
|
ForceSinglePrecisionP(fpr.RX(d), XMM0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ForceSinglePrecisionS(fpr.RX(d), XMM0);
|
||||||
|
MOVDDUP(fpr.RX(d), fpr.R(d));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -176,7 +243,7 @@ void Jit64::fsign(UGeckoInstruction inst)
|
||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
int b = inst.FB;
|
int b = inst.FB;
|
||||||
fpr.Lock(b, d);
|
fpr.Lock(b, d);
|
||||||
fpr.BindToRegister(d, true, true);
|
fpr.BindToRegister(d);
|
||||||
|
|
||||||
if (d != b)
|
if (d != b)
|
||||||
MOVSD(fpr.RX(d), fpr.R(b));
|
MOVSD(fpr.RX(d), fpr.R(b));
|
||||||
|
@ -212,7 +279,7 @@ void Jit64::fselx(UGeckoInstruction inst)
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
|
|
||||||
fpr.Lock(a, b, c, d);
|
fpr.Lock(a, b, c, d);
|
||||||
MOVSD(XMM1, fpr.R(a));
|
MOVAPD(XMM1, fpr.R(a));
|
||||||
PXOR(XMM0, R(XMM0));
|
PXOR(XMM0, R(XMM0));
|
||||||
// This condition is very tricky; there's only one right way to handle both the case of
|
// This condition is very tricky; there's only one right way to handle both the case of
|
||||||
// negative/positive zero and NaN properly.
|
// negative/positive zero and NaN properly.
|
||||||
|
@ -220,17 +287,17 @@ void Jit64::fselx(UGeckoInstruction inst)
|
||||||
CMPSD(XMM0, R(XMM1), NLE);
|
CMPSD(XMM0, R(XMM1), NLE);
|
||||||
if (cpu_info.bSSE4_1)
|
if (cpu_info.bSSE4_1)
|
||||||
{
|
{
|
||||||
MOVSD(XMM1, fpr.R(c));
|
MOVAPD(XMM1, fpr.R(c));
|
||||||
BLENDVPD(XMM1, fpr.R(b));
|
BLENDVPD(XMM1, fpr.R(b));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
MOVSD(XMM1, R(XMM0));
|
MOVAPD(XMM1, R(XMM0));
|
||||||
PAND(XMM0, fpr.R(b));
|
PAND(XMM0, fpr.R(b));
|
||||||
PANDN(XMM1, fpr.R(c));
|
PANDN(XMM1, fpr.R(c));
|
||||||
POR(XMM1, R(XMM0));
|
POR(XMM1, R(XMM0));
|
||||||
}
|
}
|
||||||
fpr.BindToRegister(d, true);
|
fpr.BindToRegister(d);
|
||||||
MOVSD(fpr.RX(d), R(XMM1));
|
MOVSD(fpr.RX(d), R(XMM1));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
@ -383,7 +450,7 @@ void Jit64::fctiwx(UGeckoInstruction inst)
|
||||||
int d = inst.RD;
|
int d = inst.RD;
|
||||||
int b = inst.RB;
|
int b = inst.RB;
|
||||||
fpr.Lock(d, b);
|
fpr.Lock(d, b);
|
||||||
fpr.BindToRegister(d, d == b);
|
fpr.BindToRegister(d);
|
||||||
|
|
||||||
// Intel uses 0x80000000 as a generic error code while PowerPC uses clamping:
|
// Intel uses 0x80000000 as a generic error code while PowerPC uses clamping:
|
||||||
//
|
//
|
||||||
|
@ -426,7 +493,7 @@ void Jit64::frspx(UGeckoInstruction inst)
|
||||||
fpr.BindToRegister(d, d == b);
|
fpr.BindToRegister(d, d == b);
|
||||||
if (b != d)
|
if (b != d)
|
||||||
MOVAPD(fpr.RX(d), fpr.R(b));
|
MOVAPD(fpr.RX(d), fpr.R(b));
|
||||||
ForceSinglePrecisionS(fpr.RX(d));
|
ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d));
|
||||||
MOVDDUP(fpr.RX(d), fpr.R(d));
|
MOVDDUP(fpr.RX(d), fpr.R(d));
|
||||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
|
@ -442,8 +509,8 @@ void Jit64::frsqrtex(UGeckoInstruction inst)
|
||||||
|
|
||||||
gpr.FlushLockX(RSCRATCH_EXTRA);
|
gpr.FlushLockX(RSCRATCH_EXTRA);
|
||||||
fpr.Lock(b, d);
|
fpr.Lock(b, d);
|
||||||
fpr.BindToRegister(d, d == b);
|
fpr.BindToRegister(d);
|
||||||
MOVSD(XMM0, fpr.R(b));
|
MOVAPD(XMM0, fpr.R(b));
|
||||||
CALL((void *)asm_routines.frsqrte);
|
CALL((void *)asm_routines.frsqrte);
|
||||||
MOVSD(fpr.R(d), XMM0);
|
MOVSD(fpr.R(d), XMM0);
|
||||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||||
|
@ -461,8 +528,8 @@ void Jit64::fresx(UGeckoInstruction inst)
|
||||||
|
|
||||||
gpr.FlushLockX(RSCRATCH_EXTRA);
|
gpr.FlushLockX(RSCRATCH_EXTRA);
|
||||||
fpr.Lock(b, d);
|
fpr.Lock(b, d);
|
||||||
fpr.BindToRegister(d, d == b);
|
fpr.BindToRegister(d);
|
||||||
MOVSD(XMM0, fpr.R(b));
|
MOVAPD(XMM0, fpr.R(b));
|
||||||
CALL((void *)asm_routines.fres);
|
CALL((void *)asm_routines.fres);
|
||||||
MOVSD(fpr.R(d), XMM0);
|
MOVSD(fpr.R(d), XMM0);
|
||||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||||
|
|
|
@ -108,8 +108,15 @@ void Jit64::stfXXX(UGeckoInstruction inst)
|
||||||
|
|
||||||
if (single)
|
if (single)
|
||||||
{
|
{
|
||||||
fpr.BindToRegister(s, true, false);
|
if (jit->js.op->fprIsStoreSafe[s])
|
||||||
ConvertDoubleToSingle(XMM0, fpr.RX(s));
|
{
|
||||||
|
CVTSD2SS(XMM0, fpr.R(s));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
fpr.BindToRegister(s, true, false);
|
||||||
|
ConvertDoubleToSingle(XMM0, fpr.RX(s));
|
||||||
|
}
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
|
|
@ -124,6 +124,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
|
||||||
JITDISABLE(bJITPairedOff);
|
JITDISABLE(bJITPairedOff);
|
||||||
FALLBACK_IF(inst.Rc);
|
FALLBACK_IF(inst.Rc);
|
||||||
|
|
||||||
|
bool round_input = !jit->js.op->fprIsSingle[inst.FC];
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 18: // div
|
case 18: // div
|
||||||
|
@ -136,7 +137,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
|
||||||
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst);
|
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst);
|
||||||
break;
|
break;
|
||||||
case 25: // mul
|
case 25: // mul
|
||||||
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, true);
|
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, round_input);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
|
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
|
||||||
|
@ -187,6 +188,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
int a = inst.FA;
|
int a = inst.FA;
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
|
bool round_input = !jit->js.op->fprIsSingle[c];
|
||||||
fpr.Lock(a, c, d);
|
fpr.Lock(a, c, d);
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
|
@ -199,7 +201,8 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
||||||
default:
|
default:
|
||||||
PanicAlert("ps_muls WTF!!!");
|
PanicAlert("ps_muls WTF!!!");
|
||||||
}
|
}
|
||||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||||
MULPD(XMM0, fpr.R(a));
|
MULPD(XMM0, fpr.R(a));
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, false);
|
||||||
ForceSinglePrecisionP(fpr.RX(d), XMM0);
|
ForceSinglePrecisionP(fpr.RX(d), XMM0);
|
||||||
|
@ -306,6 +309,7 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
bool fma = cpu_info.bFMA && !Core::g_want_determinism;
|
bool fma = cpu_info.bFMA && !Core::g_want_determinism;
|
||||||
|
bool round_input = !jit->js.op->fprIsSingle[c];
|
||||||
fpr.Lock(a,b,c,d);
|
fpr.Lock(a,b,c,d);
|
||||||
|
|
||||||
if (fma)
|
if (fma)
|
||||||
|
@ -314,16 +318,21 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
|
||||||
if (inst.SUBOP5 == 14)
|
if (inst.SUBOP5 == 14)
|
||||||
{
|
{
|
||||||
MOVDDUP(XMM0, fpr.R(c));
|
MOVDDUP(XMM0, fpr.R(c));
|
||||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||||
}
|
}
|
||||||
else if (inst.SUBOP5 == 15)
|
else if (inst.SUBOP5 == 15)
|
||||||
{
|
{
|
||||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
|
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
|
||||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||||
|
else
|
||||||
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fma)
|
if (fma)
|
||||||
|
|
|
@ -667,13 +667,17 @@ void EmuCodeBlock::WriteToConstRamAddress(int accessSize, OpArg arg, u32 address
|
||||||
MOV(accessSize, MDisp(RMEM, address & 0x3FFFFFFF), R(reg));
|
MOV(accessSize, MDisp(RMEM, address & 0x3FFFFFFF), R(reg));
|
||||||
}
|
}
|
||||||
|
|
||||||
void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm)
|
void EmuCodeBlock::ForceSinglePrecisionS(X64Reg output, X64Reg input)
|
||||||
{
|
{
|
||||||
// Most games don't need these. Zelda requires it though - some platforms get stuck without them.
|
// Most games don't need these. Zelda requires it though - some platforms get stuck without them.
|
||||||
if (jit->jo.accurateSinglePrecision)
|
if (jit->jo.accurateSinglePrecision)
|
||||||
{
|
{
|
||||||
CVTSD2SS(xmm, R(xmm));
|
CVTSD2SS(input, R(input));
|
||||||
CVTSS2SD(xmm, R(xmm));
|
CVTSS2SD(output, R(input));
|
||||||
|
}
|
||||||
|
else if (output != input)
|
||||||
|
{
|
||||||
|
MOVAPD(output, R(input));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -130,7 +130,7 @@ public:
|
||||||
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8),
|
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8),
|
||||||
Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm);
|
Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm);
|
||||||
|
|
||||||
void ForceSinglePrecisionS(Gen::X64Reg xmm);
|
void ForceSinglePrecisionS(Gen::X64Reg output, Gen::X64Reg input);
|
||||||
void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input);
|
void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input);
|
||||||
void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp);
|
void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp);
|
||||||
|
|
||||||
|
|
|
@ -827,10 +827,48 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
|
||||||
// the same location later).
|
// the same location later).
|
||||||
gprInUse |= code[i].regsOut;
|
gprInUse |= code[i].regsOut;
|
||||||
if (code[i].fregOut >= 0)
|
if (code[i].fregOut >= 0)
|
||||||
{
|
|
||||||
fprInUse[code[i].fregOut] = true;
|
fprInUse[code[i].fregOut] = true;
|
||||||
if (strncmp(code[i].opinfo->opname, "stfd", 4))
|
}
|
||||||
fprInXmm[code[i].fregOut] = true;
|
|
||||||
|
// Forward scan, for flags that need the other direction for calculation.
|
||||||
|
BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe;
|
||||||
|
for (u32 i = 0; i < block->m_num_instructions; i++)
|
||||||
|
{
|
||||||
|
code[i].fprIsSingle = fprIsSingle;
|
||||||
|
code[i].fprIsDuplicated = fprIsDuplicated;
|
||||||
|
code[i].fprIsStoreSafe = fprIsStoreSafe;
|
||||||
|
if (code[i].fregOut >= 0)
|
||||||
|
{
|
||||||
|
fprIsSingle[code[i].fregOut] = false;
|
||||||
|
fprIsDuplicated[code[i].fregOut] = false;
|
||||||
|
fprIsStoreSafe[code[i].fregOut] = false;
|
||||||
|
// Single, duplicated, and doesn't need PPC_FP.
|
||||||
|
if (code[i].opinfo->type == OPTYPE_SINGLEFP)
|
||||||
|
{
|
||||||
|
fprIsSingle[code[i].fregOut] = true;
|
||||||
|
fprIsDuplicated[code[i].fregOut] = true;
|
||||||
|
fprIsStoreSafe[code[i].fregOut] = true;
|
||||||
|
}
|
||||||
|
// Single and duplicated, but might be a denormal (not safe to skip PPC_FP).
|
||||||
|
// TODO: if we go directly from a load to store, skip conversion entirely?
|
||||||
|
// TODO: if we go directly from a load to a float instruction, and the value isn't used
|
||||||
|
// for anything else, we can skip PPC_FP on a load too.
|
||||||
|
if (!strncmp(code[i].opinfo->opname, "lfs", 3))
|
||||||
|
{
|
||||||
|
fprIsSingle[code[i].fregOut] = true;
|
||||||
|
fprIsDuplicated[code[i].fregOut] = true;
|
||||||
|
}
|
||||||
|
// Paired are still floats, but the top/bottom halves may differ.
|
||||||
|
if (code[i].opinfo->type == OPTYPE_PS || code[i].opinfo->type == OPTYPE_LOADPS)
|
||||||
|
{
|
||||||
|
fprIsSingle[code[i].fregOut] = true;
|
||||||
|
fprIsStoreSafe[code[i].fregOut] = true;
|
||||||
|
}
|
||||||
|
// Careful: changing the float mode in a block breaks this optimization, since
|
||||||
|
// a previous float op might have had FTZ off while the later store has FTZ
|
||||||
|
// on. So, discard all information we have.
|
||||||
|
if (!strncmp(code[i].opinfo->opname, "mtfs", 4))
|
||||||
|
fprIsStoreSafe = BitSet32(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return address;
|
return address;
|
||||||
|
|
|
@ -51,6 +51,13 @@ struct CodeOp //16B
|
||||||
// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
|
// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
|
||||||
// an XMM only to move it again to a GPR afterwards.
|
// an XMM only to move it again to a GPR afterwards.
|
||||||
BitSet32 fprInXmm;
|
BitSet32 fprInXmm;
|
||||||
|
// whether an fpr is known to be an actual single-precision value at this point in the block.
|
||||||
|
BitSet32 fprIsSingle;
|
||||||
|
// whether an fpr is known to have identical top and bottom halves (e.g. due to a single instruction)
|
||||||
|
BitSet32 fprIsDuplicated;
|
||||||
|
// whether an fpr is the output of a single-precision arithmetic instruction, i.e. whether we can safely
|
||||||
|
// skip PPC_FP.
|
||||||
|
BitSet32 fprIsStoreSafe;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct BlockStats
|
struct BlockStats
|
||||||
|
|
Loading…
Reference in New Issue