diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 04d30e46df..525f50e552 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -17,6 +17,8 @@ // ---------- #pragma once +#include + #include #include "Common/CommonTypes.h" @@ -127,7 +129,8 @@ public: bool duplicate = false); void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input); void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm, Gen::X64Reg clobber, - std::vector inputs); + std::optional Ra, std::optional Rb, + std::optional Rc); void MultiplyImmediate(u32 imm, int a, int d, bool overflow); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 918d6260d2..5daaf9e9b5 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "Common/Assert.h" @@ -92,7 +93,8 @@ void Jit64::FinalizeDoubleResult(X64Reg output, const OpArg& input) SetFPRFIfNeeded(input, false); } -void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::vector inputs) +void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::optional Ra, + std::optional Rb, std::optional Rc) { // | PowerPC | x86 // ---------------------+----------+--------- @@ -107,15 +109,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: ASSERT(xmm != clobber); - // Remove duplicates from inputs - for (auto it = inputs.begin(); it != inputs.end();) - { - if (std::find(inputs.begin(), it, *it) != it) - it = inputs.erase(it); - else - ++it; - } - if (inst.OPCD != 4) { // not paired-single @@ -127,14 +120,17 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: // If any inputs are NaNs, pick the first NaN of them std::vector fixups; - for (int x : inputs) - { - RCOpArg Rx = fpr.Use(x, RCMode::Read); - RegCache::Realize(Rx); + const auto check_input = [&](const OpArg& Rx) { MOVDDUP(xmm, Rx); UCOMISD(xmm, R(xmm)); fixups.push_back(J_CC(CC_P)); - } + }; + if (Ra) + check_input(*Ra); + if (Rb && Ra != Rb) + check_input(*Rb); + if (Rc && Ra != Rc && Rb != Rc) + check_input(*Rc); // Otherwise, pick the PPC default NaN (will be finished below) XORPD(xmm, R(xmm)); @@ -152,8 +148,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: { // paired-single - std::reverse(inputs.begin(), inputs.end()); - if (cpu_info.bSSE4_1) { avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, R(xmm), R(xmm), CMP_UNORD); @@ -167,13 +161,16 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: BLENDVPD(xmm, MConst(psGeneratedQNaN)); // If any inputs are NaNs, use those instead - for (int x : inputs) - { - RCOpArg Rx = fpr.Use(x, RCMode::Read); - RegCache::Realize(Rx); + const auto check_input = [&](const OpArg& Rx) { avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, Rx, Rx, CMP_UNORD); BLENDVPD(xmm, Rx); - } + }; + if (Rc) + check_input(*Rc); + if (Rb && Rb != Rc) + check_input(*Rb); + if (Ra && Ra != Rb && Ra != Rc) + check_input(*Ra); } else { @@ -197,17 +194,20 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: MOVAPD(xmm, tmp); // If any inputs are NaNs, use those instead - for (int x : inputs) - { - RCOpArg Rx = fpr.Use(x, RCMode::Read); - RegCache::Realize(Rx); + const auto check_input = [&](const OpArg& Rx) { MOVAPD(clobber, Rx); CMPPD(clobber, R(clobber), CMP_ORD); MOVAPD(tmp, R(clobber)); ANDNPD(clobber, Rx); ANDPD(xmm, tmp); ORPD(xmm, R(clobber)); - } + }; + if (Rc) + check_input(*Rc); + if (Rb && Rb != Rc) + check_input(*Rb); + if (Ra && Ra != Rb && Ra != Rc) + check_input(*Ra); } // Turn SNaNs into QNaNs @@ -246,64 +246,69 @@ void Jit64::fp_arith(UGeckoInstruction inst) if (inst.OPCD == 59 && (inst.SUBOP5 == 18 || cpu_info.bAtom)) packed = false; - bool round_input = single && !js.op->fprIsSingle[inst.FC]; - bool preserve_inputs = m_accurate_nans; - - const auto fp_tri_op = [&](int op1, int op2, bool reversible, - void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&), - void (XEmitter::*sseOp)(X64Reg, const OpArg&), bool roundRHS = false) { - RCX64Reg Rd = fpr.Bind(d, !single ? RCMode::ReadWrite : RCMode::Write); - RCOpArg Rop1 = fpr.Use(op1, RCMode::Read); - RCOpArg Rop2 = fpr.Use(op2, RCMode::Read); - RegCache::Realize(Rd, Rop1, Rop2); - - X64Reg dest = preserve_inputs ? XMM1 : static_cast(Rd); - if (roundRHS) - { - if (d == op1 && !preserve_inputs) - { - Force25BitPrecision(XMM0, Rop2, XMM1); - (this->*sseOp)(Rd, R(XMM0)); - } - else - { - Force25BitPrecision(dest, Rop2, XMM0); - (this->*sseOp)(dest, Rop1); - } - } - else - { - avx_op(avxOp, sseOp, dest, Rop1, Rop2, packed, reversible); - } - - HandleNaNs(inst, dest, XMM0, {op1, op2}); - if (single) - FinalizeSingleResult(Rd, R(dest), packed, true); - else - FinalizeDoubleResult(Rd, R(dest)); - }; - + void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&) = nullptr; + void (XEmitter::*sseOp)(X64Reg, const OpArg&) = nullptr; + bool reversible = false; + bool roundRHS = false; switch (inst.SUBOP5) { case 18: - fp_tri_op(a, b, false, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD, - packed ? &XEmitter::DIVPD : &XEmitter::DIVSD); + avxOp = packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD; + sseOp = packed ? &XEmitter::DIVPD : &XEmitter::DIVSD; break; case 20: - fp_tri_op(a, b, false, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD, - packed ? &XEmitter::SUBPD : &XEmitter::SUBSD); + avxOp = packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD; + sseOp = packed ? &XEmitter::SUBPD : &XEmitter::SUBSD; break; case 21: - fp_tri_op(a, b, true, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD, - packed ? &XEmitter::ADDPD : &XEmitter::ADDSD); + reversible = true; + avxOp = packed ? &XEmitter::VADDPD : &XEmitter::VADDSD; + sseOp = packed ? &XEmitter::ADDPD : &XEmitter::ADDSD; break; case 25: - fp_tri_op(a, c, true, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD, - packed ? &XEmitter::MULPD : &XEmitter::MULSD, round_input); + reversible = true; + roundRHS = single && !js.op->fprIsSingle[c]; + avxOp = packed ? &XEmitter::VMULPD : &XEmitter::VMULSD; + sseOp = packed ? &XEmitter::MULPD : &XEmitter::MULSD; break; default: ASSERT_MSG(DYNA_REC, 0, "fp_arith WTF!!!"); } + + RCX64Reg Rd = fpr.Bind(d, !single ? RCMode::ReadWrite : RCMode::Write); + RCOpArg Ra = fpr.Use(a, RCMode::Read); + RCOpArg Rarg2 = fpr.Use(arg2, RCMode::Read); + RegCache::Realize(Rd, Ra, Rarg2); + + bool preserve_inputs = m_accurate_nans; + X64Reg dest = preserve_inputs ? XMM1 : static_cast(Rd); + if (roundRHS) + { + if (a == d && !preserve_inputs) + { + Force25BitPrecision(XMM0, Rarg2, XMM1); + (this->*sseOp)(Rd, R(XMM0)); + } + else + { + Force25BitPrecision(dest, Rarg2, XMM0); + (this->*sseOp)(dest, Ra); + } + } + else + { + avx_op(avxOp, sseOp, dest, Ra, Rarg2, packed, reversible); + } + + if (inst.SUBOP5 != 25) + HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt); + else + HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2); + + if (single) + FinalizeSingleResult(Rd, R(dest), packed, true); + else + FinalizeDoubleResult(Rd, R(dest)); } void Jit64::fmaddXX(UGeckoInstruction inst) @@ -499,7 +504,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst) result_xmm = Rd; } - HandleNaNs(inst, result_xmm, XMM0, {a, b, c}); + HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, Rc); if (single) FinalizeSingleResult(Rd, R(result_xmm), packed, true); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index bea3829e0a..fe42929ce1 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -3,6 +3,8 @@ #include "Core/PowerPC/Jit64/Jit.h" +#include + #include "Common/CPUDetect.h" #include "Common/CommonTypes.h" #include "Common/MsgHandler.h" @@ -77,7 +79,7 @@ void Jit64::ps_sum(UGeckoInstruction inst) default: PanicAlertFmt("ps_sum WTF!!!"); } - HandleNaNs(inst, tmp, tmp == XMM1 ? XMM0 : XMM1, {a, b, c}); + HandleNaNs(inst, tmp, tmp == XMM1 ? XMM0 : XMM1, Ra, Rb, Rc); FinalizeSingleResult(Rd, R(tmp)); } @@ -112,7 +114,7 @@ void Jit64::ps_muls(UGeckoInstruction inst) if (round_input) Force25BitPrecision(XMM1, R(XMM1), XMM0); MULPD(XMM1, Ra); - HandleNaNs(inst, XMM1, XMM0, {a, c}); + HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc); FinalizeSingleResult(Rd, R(XMM1)); }