diff --git a/Source/Core/Common/CPUDetect.h b/Source/Core/Common/CPUDetect.h
index 752d26afb2..c63076ff7b 100644
--- a/Source/Core/Common/CPUDetect.h
+++ b/Source/Core/Common/CPUDetect.h
@@ -50,10 +50,10 @@ struct CPUInfo
 	bool bMOVBE;
 	// This flag indicates that the hardware supports some mode
 	// in which denormal inputs _and_ outputs are automatically set to (signed) zero.
-	// TODO: ARM
 	bool bFlushToZero;
 	bool bLAHFSAHF64;
 	bool bLongMode;
+	bool bAtom;
 
 	// ARM specific CPUInfo
 	bool bSwp;
diff --git a/Source/Core/Common/x64CPUDetect.cpp b/Source/Core/Common/x64CPUDetect.cpp
index 31409685e8..8ad8046c8b 100644
--- a/Source/Core/Common/x64CPUDetect.cpp
+++ b/Source/Core/Common/x64CPUDetect.cpp
@@ -129,6 +129,12 @@ void CPUInfo::Detect()
 	if (max_std_fn >= 1)
 	{
 		__cpuid(cpu_id, 0x00000001);
+		int family = ((cpu_id[0] >> 8) & 0xf) + ((cpu_id[0] >> 20) & 0xff);
+		int model = ((cpu_id[0] >> 4) & 0xf) + ((cpu_id[0] >> 12) & 0xf0);
+		// Detect people unfortunate enough to be running Dolphin on an Atom
+		if (family == 6 && (model == 0x1C || model == 0x26 || model == 0x27 || model == 0x35 || model == 0x36 ||
+		    model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D))
+			bAtom = true;
 		logical_cpu_count = (cpu_id[1] >> 16) & 0xFF;
 		ht = (cpu_id[3] >> 28) & 1;
diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
index d34333761b..65d88872b4 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
@@ -321,14 +321,14 @@ static GekkoOPTemplate table59[] =
 
 static GekkoOPTemplate table63[] =
 {
-	{264, Interpreter::fabsx,   {"fabsx",   OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{264, Interpreter::fabsx,   {"fabsx",   OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
 	{32,  Interpreter::fcmpo,   {"fcmpo",   OPTYPE_DOUBLEFP, FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{0,   Interpreter::fcmpu,   {"fcmpu",   OPTYPE_DOUBLEFP, FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
-	{14,  Interpreter::fctiwx,  {"fctiwx",  OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{15,  Interpreter::fctiwzx, {"fctiwzx", OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{14,  Interpreter::fctiwx,  {"fctiwx",  OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{15,  Interpreter::fctiwzx, {"fctiwzx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
 	{72,  Interpreter::fmrx,    {"fmrx",    OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{136, Interpreter::fnabsx,  {"fnabsx",  OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{40,  Interpreter::fnegx,   {"fnegx",   OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{136, Interpreter::fnabsx,  {"fnabsx",  OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{40,  Interpreter::fnegx,   {"fnegx",   OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
 	{12,  Interpreter::frspx,   {"frspx",   OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{64,  Interpreter::mcrfs,   {"mcrfs",   OPTYPE_SYSTEMFP, FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}},
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index 5793d744c8..a2ec9f2a66 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -151,7 +151,7 @@ public:
 	void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
 	void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
-	               void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
+	               void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool packed = false, bool roundRHS = false);
 	void FloatCompare(UGeckoInstruction inst, bool upper = false);
 
 	// OPCODES
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index 6a055f7cf8..f404ccd88b 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -11,11 +11,12 @@ using namespace Gen;
 
 static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
+static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
 static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
 static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
 
 void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg),
-                      void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
+                      void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool packed, bool roundRHS)
 {
 	fpr.Lock(d, a, b);
 	fpr.BindToRegister(d, d == a || d == b || !single);
@@ -34,12 +35,19 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X
 	}
 	else
 	{
-		avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), false, reversible);
+		avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), packed, reversible);
 	}
 	if (single)
 	{
-		ForceSinglePrecisionS(fpr.RX(d));
-		MOVDDUP(fpr.RX(d), fpr.R(d));
+		if (packed)
+		{
+			ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
+		}
+		else
+		{
+			ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d));
+			MOVDDUP(fpr.RX(d), fpr.R(d));
+		}
 	}
 	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
@@ -63,13 +71,32 @@ void Jit64::fp_arith(UGeckoInstruction inst)
 	JITDISABLE(bJITFloatingPointOff);
 	FALLBACK_IF(inst.Rc);
 
+	int a = inst.FA;
+	int b = inst.FB;
+	int c = inst.FC;
+	int d = inst.FD;
+	int arg2 = inst.SUBOP5 == 25 ? c : b;
+	bool single = inst.OPCD == 59;
+	bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
+	// If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP at the end by
+	// using packed arithmetic instead.
+	bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[arg2];
+	// Packed divides are slower than scalar divides on basically all x86, so this optimization isn't worth it in that case.
+	// Atoms (and a few really old CPUs) are also slower on packed operations than scalar ones.
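+	// (A PPC single-precision result is mirrored into ps1, which the JIT normally emulates with a
+	// MOVDDUP after the scalar op; if both inputs are already (x,x) pairs, a packed op produces a
+	// (y,y) pair directly, so the duplication comes for free.)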
+	if (inst.SUBOP5 == 18 || cpu_info.bAtom)
+		packed = false;
+
 	switch (inst.SUBOP5)
 	{
-	case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div
-	case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub
-	case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add
-	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, single); break; //mul
+	case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
+	                   packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, inst, packed); break; //div
+	case 20: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
+	                   packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, inst, packed); break; //sub
+	case 21: fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
+	                   packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, inst, packed); break; //add
+	case 25: fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
+	                   packed ? &XEmitter::MULPD : &XEmitter::MULSD, inst, packed, round_input); break; //mul
 	default:
 		_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
 	}
@@ -81,12 +108,15 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 	JITDISABLE(bJITFloatingPointOff);
 	FALLBACK_IF(inst.Rc);
 
-	bool single_precision = inst.OPCD == 59;
-
 	int a = inst.FA;
 	int b = inst.FB;
 	int c = inst.FC;
 	int d = inst.FD;
+	bool single = inst.OPCD == 59;
+	bool round_input = single && !jit->js.op->fprIsSingle[c];
+	bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[b] && jit->js.op->fprIsDuplicated[c];
+	if (cpu_info.bAtom)
+		packed = false;
 
 	fpr.Lock(a, b, c, d);
 
@@ -98,66 +128,103 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 	// instances on different computers giving identical results.
 	if (cpu_info.bFMA && !Core::g_want_determinism)
 	{
-		if (single_precision)
+		if (single && round_input)
 			Force25BitPrecision(XMM0, fpr.R(c), XMM1);
 		else
-			MOVSD(XMM0, fpr.R(c));
+			MOVAPD(XMM0, fpr.R(c));
 		// Statistics suggest b is a lot less likely to be unbound in practice, so
 		// if we have to pick one of a or b to bind, let's make it b.
 		fpr.BindToRegister(b, true, false);
 		switch (inst.SUBOP5)
 		{
 		case 28: //msub
-			VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
+			if (packed)
+				VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
+			else
+				VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
 			break;
 		case 29: //madd
-			VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
+			if (packed)
+				VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
+			else
+				VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
 			break;
 		// PowerPC and x86 define NMADD/NMSUB differently
 		// x86: D = -A*C (+/-) B
 		// PPC: D = -(A*C (+/-) B)
 		// so we have to swap them; the ADD/SUB here isn't a typo.
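+		// e.g. for nmadd, PPC wants -(A*C + B); x86's VFNMSUB computes -(A*C) - B, which is the
+		// same value, while x86's VFNMADD computes -(A*C) + B = -(A*C - B), matching PPC's nmsub.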
 		case 30: //nmsub
-			VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
+			if (packed)
+				VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
+			else
+				VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
 			break;
 		case 31: //nmadd
-			VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
+			if (packed)
+				VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
+			else
+				VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
 			break;
 		}
 	}
 	else if (inst.SUBOP5 == 30) //nmsub
 	{
 		// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
-		if (single_precision)
+		if (single && round_input)
 			Force25BitPrecision(XMM1, fpr.R(c), XMM0);
 		else
-			MOVSD(XMM1, fpr.R(c));
-		MULSD(XMM1, fpr.R(a));
-		MOVSD(XMM0, fpr.R(b));
-		SUBSD(XMM0, R(XMM1));
+			MOVAPD(XMM1, fpr.R(c));
+		MOVAPD(XMM0, fpr.R(b));
+		if (packed)
+		{
+			MULPD(XMM1, fpr.R(a));
+			SUBPD(XMM0, R(XMM1));
+		}
+		else
+		{
+			MULSD(XMM1, fpr.R(a));
+			SUBSD(XMM0, R(XMM1));
+		}
 	}
 	else
 	{
-		if (single_precision)
+		if (single && round_input)
 			Force25BitPrecision(XMM0, fpr.R(c), XMM1);
 		else
-			MOVSD(XMM0, fpr.R(c));
-		MULSD(XMM0, fpr.R(a));
-		if (inst.SUBOP5 == 28) //msub
-			SUBSD(XMM0, fpr.R(b));
-		else //(n)madd
-			ADDSD(XMM0, fpr.R(b));
+			MOVAPD(XMM0, fpr.R(c));
+		if (packed)
+		{
+			MULPD(XMM0, fpr.R(a));
+			if (inst.SUBOP5 == 28) //msub
+				SUBPD(XMM0, fpr.R(b));
+			else //(n)madd
+				ADDPD(XMM0, fpr.R(b));
+		}
+		else
+		{
+			MULSD(XMM0, fpr.R(a));
+			if (inst.SUBOP5 == 28)
+				SUBSD(XMM0, fpr.R(b));
+			else
+				ADDSD(XMM0, fpr.R(b));
+		}
 		if (inst.SUBOP5 == 31) //nmadd
-			PXOR(XMM0, M((void*)&psSignBits));
+			PXOR(XMM0, M((void*)&(packed ? psSignBits2 : psSignBits)));
 	}
-	fpr.BindToRegister(d, false);
-	//YES it is necessary to dupe the result :(
-	//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
-	if (single_precision)
+	fpr.BindToRegister(d, !single);
+
+	if (single)
 	{
-		ForceSinglePrecisionS(XMM0);
-		MOVDDUP(fpr.RX(d), R(XMM0));
+		if (packed)
+		{
+			ForceSinglePrecisionP(fpr.RX(d), XMM0);
+		}
+		else
+		{
+			ForceSinglePrecisionS(fpr.RX(d), XMM0);
+			MOVDDUP(fpr.RX(d), fpr.R(d));
+		}
 	}
 	else
 	{
@@ -176,7 +243,7 @@ void Jit64::fsign(UGeckoInstruction inst)
 	int d = inst.FD;
 	int b = inst.FB;
 	fpr.Lock(b, d);
-	fpr.BindToRegister(d, true, true);
+	fpr.BindToRegister(d);
 
 	if (d != b)
 		MOVSD(fpr.RX(d), fpr.R(b));
@@ -212,7 +279,7 @@ void Jit64::fselx(UGeckoInstruction inst)
 	int c = inst.FC;
 
 	fpr.Lock(a, b, c, d);
-	MOVSD(XMM1, fpr.R(a));
+	MOVAPD(XMM1, fpr.R(a));
 	PXOR(XMM0, R(XMM0));
 	// This condition is very tricky; there's only one right way to handle both the case of
 	// negative/positive zero and NaN properly.
@@ -220,17 +287,17 @@ void Jit64::fselx(UGeckoInstruction inst)
 	CMPSD(XMM0, R(XMM1), NLE);
 	if (cpu_info.bSSE4_1)
 	{
-		MOVSD(XMM1, fpr.R(c));
+		MOVAPD(XMM1, fpr.R(c));
 		BLENDVPD(XMM1, fpr.R(b));
 	}
 	else
 	{
-		MOVSD(XMM1, R(XMM0));
+		MOVAPD(XMM1, R(XMM0));
 		PAND(XMM0, fpr.R(b));
 		PANDN(XMM1, fpr.R(c));
 		POR(XMM1, R(XMM0));
 	}
-	fpr.BindToRegister(d, true);
+	fpr.BindToRegister(d);
 	MOVSD(fpr.RX(d), R(XMM1));
 	fpr.UnlockAll();
 }
@@ -383,7 +450,7 @@ void Jit64::fctiwx(UGeckoInstruction inst)
 	int d = inst.RD;
 	int b = inst.RB;
 	fpr.Lock(d, b);
-	fpr.BindToRegister(d, d == b);
+	fpr.BindToRegister(d);
 
 	// Intel uses 0x80000000 as a generic error code while PowerPC uses clamping:
 	//
@@ -426,7 +493,7 @@ void Jit64::frspx(UGeckoInstruction inst)
 	fpr.BindToRegister(d, d == b);
 	if (b != d)
 		MOVAPD(fpr.RX(d), fpr.R(b));
-	ForceSinglePrecisionS(fpr.RX(d));
+	ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d));
 	MOVDDUP(fpr.RX(d), fpr.R(d));
 	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
@@ -442,8 +509,8 @@ void Jit64::frsqrtex(UGeckoInstruction inst)
 
 	gpr.FlushLockX(RSCRATCH_EXTRA);
 	fpr.Lock(b, d);
-	fpr.BindToRegister(d, d == b);
-	MOVSD(XMM0, fpr.R(b));
+	fpr.BindToRegister(d);
+	MOVAPD(XMM0, fpr.R(b));
 	CALL((void *)asm_routines.frsqrte);
 	MOVSD(fpr.R(d), XMM0);
 	SetFPRFIfNeeded(inst, fpr.RX(d));
@@ -461,8 +528,8 @@ void Jit64::fresx(UGeckoInstruction inst)
 
 	gpr.FlushLockX(RSCRATCH_EXTRA);
 	fpr.Lock(b, d);
-	fpr.BindToRegister(d, d == b);
-	MOVSD(XMM0, fpr.R(b));
+	fpr.BindToRegister(d);
+	MOVAPD(XMM0, fpr.R(b));
 	CALL((void *)asm_routines.fres);
 	MOVSD(fpr.R(d), XMM0);
 	SetFPRFIfNeeded(inst, fpr.RX(d));
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
index a859a53ff9..2a246b3a0b 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
@@ -108,8 +108,15 @@ void Jit64::stfXXX(UGeckoInstruction inst)
 
 	if (single)
 	{
-		fpr.BindToRegister(s, true, false);
-		ConvertDoubleToSingle(XMM0, fpr.RX(s));
+		if (jit->js.op->fprIsStoreSafe[s])
+		{
+			CVTSD2SS(XMM0, fpr.R(s));
+		}
+		else
+		{
+			fpr.BindToRegister(s, true, false);
+			ConvertDoubleToSingle(XMM0, fpr.RX(s));
+		}
 		MOVD_xmm(R(RSCRATCH), XMM0);
 	}
 	else
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
index fe36351b25..815cd77c92 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@@ -124,6 +124,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
 	JITDISABLE(bJITPairedOff);
 	FALLBACK_IF(inst.Rc);
 
+	bool round_input = !jit->js.op->fprIsSingle[inst.FC];
 	switch (inst.SUBOP5)
 	{
 	case 18: // div
@@ -136,7 +137,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
 		tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst);
 		break;
 	case 25: // mul
-		tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, true);
+		tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, round_input);
 		break;
 	default:
 		_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
@@ -187,6 +188,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
 	int d = inst.FD;
 	int a = inst.FA;
 	int c = inst.FC;
+	bool round_input = !jit->js.op->fprIsSingle[c];
 	fpr.Lock(a, c, d);
 	switch (inst.SUBOP5)
 	{
@@ -199,7 +201,8 @@ void Jit64::ps_muls(UGeckoInstruction inst)
 	default:
 		PanicAlert("ps_muls WTF!!!");
 	}
-	Force25BitPrecision(XMM0, R(XMM0), XMM1);
+	if (round_input)
+		Force25BitPrecision(XMM0, R(XMM0), XMM1);
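+	// (a value that is already an exact single fits in 24 mantissa bits, so rounding it to 25 bits
+	// would be a no-op)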
 	MULPD(XMM0, fpr.R(a));
 	fpr.BindToRegister(d, false);
 	ForceSinglePrecisionP(fpr.RX(d), XMM0);
@@ -306,6 +309,7 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
 	int c = inst.FC;
 	int d = inst.FD;
 	bool fma = cpu_info.bFMA && !Core::g_want_determinism;
+	bool round_input = !jit->js.op->fprIsSingle[c];
 	fpr.Lock(a,b,c,d);
 
 	if (fma)
@@ -314,16 +318,21 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
 	if (inst.SUBOP5 == 14)
 	{
 		MOVDDUP(XMM0, fpr.R(c));
-		Force25BitPrecision(XMM0, R(XMM0), XMM1);
+		if (round_input)
+			Force25BitPrecision(XMM0, R(XMM0), XMM1);
 	}
 	else if (inst.SUBOP5 == 15)
 	{
 		avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
-		Force25BitPrecision(XMM0, R(XMM0), XMM1);
+		if (round_input)
+			Force25BitPrecision(XMM0, R(XMM0), XMM1);
 	}
 	else
 	{
-		Force25BitPrecision(XMM0, fpr.R(c), XMM1);
+		if (round_input)
+			Force25BitPrecision(XMM0, fpr.R(c), XMM1);
+		else
+			MOVAPD(XMM0, fpr.R(c));
 	}
 
 	if (fma)
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index 30573246c0..a9808f7d07 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -667,13 +667,17 @@ void EmuCodeBlock::WriteToConstRamAddress(int accessSize, OpArg arg, u32 address
 		MOV(accessSize, MDisp(RMEM, address & 0x3FFFFFFF), R(reg));
 }
 
-void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm)
+void EmuCodeBlock::ForceSinglePrecisionS(X64Reg output, X64Reg input)
 {
 	// Most games don't need these. Zelda requires it though - some platforms get stuck without them.
 	if (jit->jo.accurateSinglePrecision)
 	{
-		CVTSD2SS(xmm, R(xmm));
-		CVTSS2SD(xmm, R(xmm));
+		CVTSD2SS(input, R(input));
+		CVTSS2SD(output, R(input));
 	}
+	else if (output != input)
+	{
+		MOVAPD(output, R(input));
+	}
 }
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index 3487fb374f..67a01249f2 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -130,7 +130,7 @@ public:
 	void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8),
 	            Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm);
 
-	void ForceSinglePrecisionS(Gen::X64Reg xmm);
+	void ForceSinglePrecisionS(Gen::X64Reg output, Gen::X64Reg input);
 	void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input);
 	void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp);
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index cefba76d93..b5a5c22716 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -827,10 +827,48 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 		// the same location later).
 		gprInUse |= code[i].regsOut;
 		if (code[i].fregOut >= 0)
-		{
 			fprInUse[code[i].fregOut] = true;
-			if (strncmp(code[i].opinfo->opname, "stfd", 4))
-				fprInXmm[code[i].fregOut] = true;
-		}
 	}
+
+	// Forward scan, for flags that need the other direction for calculation.
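+	// Each CodeOp gets a snapshot of what is known about every fpr *before* it executes, so the
+	// JIT can query e.g. fprIsDuplicated[a] while compiling that instruction.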
+	BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe;
+	for (u32 i = 0; i < block->m_num_instructions; i++)
+	{
+		code[i].fprIsSingle = fprIsSingle;
+		code[i].fprIsDuplicated = fprIsDuplicated;
+		code[i].fprIsStoreSafe = fprIsStoreSafe;
+		if (code[i].fregOut >= 0)
+		{
+			fprIsSingle[code[i].fregOut] = false;
+			fprIsDuplicated[code[i].fregOut] = false;
+			fprIsStoreSafe[code[i].fregOut] = false;
+			// Single, duplicated, and doesn't need PPC_FP.
+			if (code[i].opinfo->type == OPTYPE_SINGLEFP)
+			{
+				fprIsSingle[code[i].fregOut] = true;
+				fprIsDuplicated[code[i].fregOut] = true;
+				fprIsStoreSafe[code[i].fregOut] = true;
+			}
+			// Single and duplicated, but might be a denormal (not safe to skip PPC_FP).
+			// TODO: if we go directly from a load to store, skip conversion entirely?
+			// TODO: if we go directly from a load to a float instruction, and the value isn't used
+			// for anything else, we can skip PPC_FP on a load too.
+			if (!strncmp(code[i].opinfo->opname, "lfs", 3))
+			{
+				fprIsSingle[code[i].fregOut] = true;
+				fprIsDuplicated[code[i].fregOut] = true;
+			}
+			// Paired are still floats, but the top/bottom halves may differ.
+			if (code[i].opinfo->type == OPTYPE_PS || code[i].opinfo->type == OPTYPE_LOADPS)
+			{
+				fprIsSingle[code[i].fregOut] = true;
+				fprIsStoreSafe[code[i].fregOut] = true;
+			}
+			// Careful: changing the float mode in a block breaks this optimization, since
+			// a previous float op might have had FTZ off while the later store has FTZ
+			// on. So, discard all information we have.
+			if (!strncmp(code[i].opinfo->opname, "mtfs", 4))
+				fprIsStoreSafe = BitSet32(0);
+		}
+	}
 	return address;
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index 8abf4bbdfe..59c637e5b2 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -51,6 +51,13 @@ struct CodeOp //16B
 	// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
 	// an XMM only to move it again to a GPR afterwards.
 	BitSet32 fprInXmm;
+	// whether an fpr is known to be an actual single-precision value at this point in the block.
+	BitSet32 fprIsSingle;
+	// whether an fpr is known to have identical top and bottom halves (e.g. due to a single instruction)
+	BitSet32 fprIsDuplicated;
+	// whether an fpr is the output of a single-precision arithmetic instruction, i.e. whether we can safely
+	// skip PPC_FP.
+	BitSet32 fprIsStoreSafe;
 };
 
 struct BlockStats
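
For reference, a minimal standalone sketch of the forward dataflow pass added to PPCAnalyst.cpp above. The Op, OpKind, and ForwardScan names here are illustrative stand-ins rather than Dolphin types, and the real pass tracks more state (fprIsStoreSafe, mtfs invalidation) than shown:

#include <cstdint>
#include <vector>

enum class OpKind { SingleArith, LoadSingle, Paired, Other };

struct Op
{
	OpKind kind;
	int fregOut;              // -1 if the op writes no fpr
	uint32_t fprIsSingle;     // state *before* this op, one bit per fpr
	uint32_t fprIsDuplicated;
};

static void ForwardScan(std::vector<Op>& ops)
{
	uint32_t isSingle = 0, isDuplicated = 0;
	for (Op& op : ops)
	{
		// Snapshot the state as seen by this instruction's inputs.
		op.fprIsSingle = isSingle;
		op.fprIsDuplicated = isDuplicated;
		if (op.fregOut < 0)
			continue;
		const uint32_t bit = 1u << op.fregOut;
		// The output's old properties no longer hold...
		isSingle &= ~bit;
		isDuplicated &= ~bit;
		// ...and the new ones depend on what kind of op wrote it.
		switch (op.kind)
		{
		case OpKind::SingleArith: // e.g. fadds: a single, mirrored into ps1
		case OpKind::LoadSingle:  // e.g. lfs: single and duplicated (maybe denormal)
			isSingle |= bit;
			isDuplicated |= bit;
			break;
		case OpKind::Paired:      // halves are singles but may differ
			isSingle |= bit;
			break;
		case OpKind::Other:       // double-precision result: both bits stay clear
			break;
		}
	}
}

The key design point mirrored from the commit is that each op records the state before its own update, so the JIT compiling that op queries properties of its inputs, not of its output.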