diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 6a055f7cf8..6ef87ccac9 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -64,12 +64,13 @@ void Jit64::fp_arith(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); bool single = inst.OPCD == 59; + bool round_input = single && !jit->js.op->fprIsSingle[inst.FC]; switch (inst.SUBOP5) { case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add - case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, single); break; //mul + case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, round_input); break; //mul default: _assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!"); } @@ -81,12 +82,12 @@ void Jit64::fmaddXX(UGeckoInstruction inst) JITDISABLE(bJITFloatingPointOff); FALLBACK_IF(inst.Rc); - bool single_precision = inst.OPCD == 59; - int a = inst.FA; int b = inst.FB; int c = inst.FC; int d = inst.FD; + bool single_precision = inst.OPCD == 59; + bool round_input = single_precision && !jit->js.op->fprIsSingle[c]; fpr.Lock(a, b, c, d); @@ -98,10 +99,10 @@ void Jit64::fmaddXX(UGeckoInstruction inst) // instances on different computers giving identical results. if (cpu_info.bFMA && !Core::g_want_determinism) { - if (single_precision) + if (single_precision && round_input) Force25BitPrecision(XMM0, fpr.R(c), XMM1); else - MOVSD(XMM0, fpr.R(c)); + MOVAPD(XMM0, fpr.R(c)); // Statistics suggests b is a lot less likely to be unbound in practice, so // if we have to pick one of a or b to bind, let's make it b. 
fpr.BindToRegister(b, true, false); @@ -128,20 +129,20 @@ void Jit64::fmaddXX(UGeckoInstruction inst) else if (inst.SUBOP5 == 30) //nmsub { // nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately - if (single_precision) + if (single_precision && round_input) Force25BitPrecision(XMM1, fpr.R(c), XMM0); else - MOVSD(XMM1, fpr.R(c)); + MOVAPD(XMM1, fpr.R(c)); MULSD(XMM1, fpr.R(a)); MOVSD(XMM0, fpr.R(b)); SUBSD(XMM0, R(XMM1)); } else { - if (single_precision) + if (single_precision && round_input) Force25BitPrecision(XMM0, fpr.R(c), XMM1); else - MOVSD(XMM0, fpr.R(c)); + MOVAPD(XMM0, fpr.R(c)); MULSD(XMM0, fpr.R(a)); if (inst.SUBOP5 == 28) //msub SUBSD(XMM0, fpr.R(b)); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index fe36351b25..815cd77c92 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -124,6 +124,7 @@ void Jit64::ps_arith(UGeckoInstruction inst) JITDISABLE(bJITPairedOff); FALLBACK_IF(inst.Rc); + bool round_input = !jit->js.op->fprIsSingle[inst.FC]; switch (inst.SUBOP5) { case 18: // div @@ -136,7 +137,7 @@ void Jit64::ps_arith(UGeckoInstruction inst) tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst); break; case 25: // mul - tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, true); + tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, round_input); break; default: _assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!"); @@ -187,6 +188,7 @@ void Jit64::ps_muls(UGeckoInstruction inst) int d = inst.FD; int a = inst.FA; int c = inst.FC; + bool round_input = !jit->js.op->fprIsSingle[c]; fpr.Lock(a, c, d); switch (inst.SUBOP5) { @@ -199,7 +201,8 @@ void Jit64::ps_muls(UGeckoInstruction inst) default: PanicAlert("ps_muls WTF!!!"); } - Force25BitPrecision(XMM0, R(XMM0), XMM1); + if (round_input) + 
Force25BitPrecision(XMM0, R(XMM0), XMM1); MULPD(XMM0, fpr.R(a)); fpr.BindToRegister(d, false); ForceSinglePrecisionP(fpr.RX(d), XMM0); @@ -306,6 +309,7 @@ void Jit64::ps_maddXX(UGeckoInstruction inst) int c = inst.FC; int d = inst.FD; bool fma = cpu_info.bFMA && !Core::g_want_determinism; + bool round_input = !jit->js.op->fprIsSingle[c]; fpr.Lock(a,b,c,d); if (fma) @@ -314,16 +318,21 @@ void Jit64::ps_maddXX(UGeckoInstruction inst) if (inst.SUBOP5 == 14) { MOVDDUP(XMM0, fpr.R(c)); - Force25BitPrecision(XMM0, R(XMM0), XMM1); + if (round_input) + Force25BitPrecision(XMM0, R(XMM0), XMM1); } else if (inst.SUBOP5 == 15) { avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3); - Force25BitPrecision(XMM0, R(XMM0), XMM1); + if (round_input) + Force25BitPrecision(XMM0, R(XMM0), XMM1); } else { - Force25BitPrecision(XMM0, fpr.R(c), XMM1); + if (round_input) + Force25BitPrecision(XMM0, fpr.R(c), XMM1); + else + MOVAPD(XMM0, fpr.R(c)); } if (fma) diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index cefba76d93..0d72e8a5a4 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -827,10 +827,21 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 // the same location later). 
gprInUse |= code[i].regsOut; if (code[i].fregOut >= 0) - { fprInUse[code[i].fregOut] = true; - if (strncmp(code[i].opinfo->opname, "stfd", 4)) - fprInXmm[code[i].fregOut] = true; + } + + // Forward scan, for flags that need the other direction for calculation + BitSet32 fprIsSingle; + for (u32 i = 0; i < block->m_num_instructions; i++) + { + code[i].fprIsSingle = fprIsSingle; + if (code[i].fregOut >= 0) + { + // Single-precision FP ops, paired-single ops and lfs* loads output a float, so the special input rounding done in fmuls/fmadds can be omitted for this register (strncmp returns 0 on a match, hence the negation); any other writer clears the flag + if (code[i].opinfo->type == OPTYPE_SINGLEFP || code[i].opinfo->type == OPTYPE_PS || !strncmp(code[i].opinfo->opname, "lfs", 3)) + fprIsSingle[code[i].fregOut] = true; + else + fprIsSingle[code[i].fregOut] = false; } } return address; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 8abf4bbdfe..e68be7a5ee 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -51,6 +51,8 @@ struct CodeOp //16B // we do double stores from GPRs, so we don't want to load a PowerPC floating point register into // an XMM only to move it again to a GPR afterwards. BitSet32 fprInXmm; + // whether an fpr is known to be an actual single-precision value at this point in the block. + BitSet32 fprIsSingle; }; struct BlockStats