From d3180e35165f172575790efff8bcdbf607798523 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Fri, 25 Nov 2022 18:59:20 +0100 Subject: [PATCH 1/3] Jit64: Refactor HandleNaNs operand passing --- Source/Core/Core/PowerPC/Jit64/Jit.h | 5 +- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 151 +++++++++--------- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 6 +- 3 files changed, 86 insertions(+), 76 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 04d30e46df..525f50e552 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -17,6 +17,8 @@ // ---------- #pragma once +#include + #include #include "Common/CommonTypes.h" @@ -127,7 +129,8 @@ public: bool duplicate = false); void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input); void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm, Gen::X64Reg clobber, - std::vector inputs); + std::optional Ra, std::optional Rb, + std::optional Rc); void MultiplyImmediate(u32 imm, int a, int d, bool overflow); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 918d6260d2..5daaf9e9b5 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "Common/Assert.h" @@ -92,7 +93,8 @@ void Jit64::FinalizeDoubleResult(X64Reg output, const OpArg& input) SetFPRFIfNeeded(input, false); } -void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::vector inputs) +void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::optional Ra, + std::optional Rb, std::optional Rc) { // | PowerPC | x86 // ---------------------+----------+--------- @@ -107,15 +109,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: ASSERT(xmm != clobber); - // Remove duplicates from inputs - for (auto it = inputs.begin(); it != inputs.end();) - { - if (std::find(inputs.begin(), it, *it) != it) - it = inputs.erase(it); - else - ++it; - } - if (inst.OPCD != 4) { // not paired-single @@ -127,14 +120,17 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: // If any inputs are NaNs, pick the first NaN of them std::vector fixups; - for (int x : inputs) - { - RCOpArg Rx = fpr.Use(x, RCMode::Read); - RegCache::Realize(Rx); + const auto check_input = [&](const OpArg& Rx) { MOVDDUP(xmm, Rx); UCOMISD(xmm, R(xmm)); fixups.push_back(J_CC(CC_P)); - } + }; + if (Ra) + check_input(*Ra); + if (Rb && Ra != Rb) + check_input(*Rb); + if (Rc && Ra != Rc && Rb != Rc) + check_input(*Rc); // Otherwise, pick the PPC default NaN (will be finished below) XORPD(xmm, R(xmm)); @@ -152,8 +148,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: { // paired-single - std::reverse(inputs.begin(), inputs.end()); - if (cpu_info.bSSE4_1) { avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, R(xmm), R(xmm), CMP_UNORD); @@ -167,13 +161,16 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: BLENDVPD(xmm, MConst(psGeneratedQNaN)); // If any inputs are NaNs, use those instead - for (int x : inputs) - { - RCOpArg Rx = fpr.Use(x, RCMode::Read); - RegCache::Realize(Rx); + const auto check_input = [&](const OpArg& Rx) { avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, Rx, Rx, CMP_UNORD); BLENDVPD(xmm, Rx); - } + }; + if (Rc) + check_input(*Rc); + if (Rb && Rb != Rc) + check_input(*Rb); + if (Ra && Ra != Rb && Ra != Rc) + check_input(*Ra); } else { @@ -197,17 +194,20 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: MOVAPD(xmm, tmp); // If any inputs are NaNs, use those instead - for (int x : inputs) - { - RCOpArg Rx = fpr.Use(x, RCMode::Read); - RegCache::Realize(Rx); + const auto check_input = [&](const OpArg& Rx) { MOVAPD(clobber, Rx); CMPPD(clobber, R(clobber), CMP_ORD); MOVAPD(tmp, R(clobber)); ANDNPD(clobber, Rx); ANDPD(xmm, tmp); ORPD(xmm, R(clobber)); - } + }; + if (Rc) + check_input(*Rc); + if (Rb && Rb != Rc) + check_input(*Rb); + if (Ra && Ra != Rb && Ra != Rc) + check_input(*Ra); } // Turn SNaNs into QNaNs @@ -246,64 +246,69 @@ void Jit64::fp_arith(UGeckoInstruction inst) if (inst.OPCD == 59 && (inst.SUBOP5 == 18 || cpu_info.bAtom)) packed = false; - bool round_input = single && !js.op->fprIsSingle[inst.FC]; - bool preserve_inputs = m_accurate_nans; - - const auto fp_tri_op = [&](int op1, int op2, bool reversible, - void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&), - void (XEmitter::*sseOp)(X64Reg, const OpArg&), bool roundRHS = false) { - RCX64Reg Rd = fpr.Bind(d, !single ? RCMode::ReadWrite : RCMode::Write); - RCOpArg Rop1 = fpr.Use(op1, RCMode::Read); - RCOpArg Rop2 = fpr.Use(op2, RCMode::Read); - RegCache::Realize(Rd, Rop1, Rop2); - - X64Reg dest = preserve_inputs ? XMM1 : static_cast(Rd); - if (roundRHS) - { - if (d == op1 && !preserve_inputs) - { - Force25BitPrecision(XMM0, Rop2, XMM1); - (this->*sseOp)(Rd, R(XMM0)); - } - else - { - Force25BitPrecision(dest, Rop2, XMM0); - (this->*sseOp)(dest, Rop1); - } - } - else - { - avx_op(avxOp, sseOp, dest, Rop1, Rop2, packed, reversible); - } - - HandleNaNs(inst, dest, XMM0, {op1, op2}); - if (single) - FinalizeSingleResult(Rd, R(dest), packed, true); - else - FinalizeDoubleResult(Rd, R(dest)); - }; - + void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&) = nullptr; + void (XEmitter::*sseOp)(X64Reg, const OpArg&) = nullptr; + bool reversible = false; + bool roundRHS = false; switch (inst.SUBOP5) { case 18: - fp_tri_op(a, b, false, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD, - packed ? &XEmitter::DIVPD : &XEmitter::DIVSD); + avxOp = packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD; + sseOp = packed ? &XEmitter::DIVPD : &XEmitter::DIVSD; break; case 20: - fp_tri_op(a, b, false, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD, - packed ? &XEmitter::SUBPD : &XEmitter::SUBSD); + avxOp = packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD; + sseOp = packed ? &XEmitter::SUBPD : &XEmitter::SUBSD; break; case 21: - fp_tri_op(a, b, true, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD, - packed ? &XEmitter::ADDPD : &XEmitter::ADDSD); + reversible = true; + avxOp = packed ? &XEmitter::VADDPD : &XEmitter::VADDSD; + sseOp = packed ? &XEmitter::ADDPD : &XEmitter::ADDSD; break; case 25: - fp_tri_op(a, c, true, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD, - packed ? &XEmitter::MULPD : &XEmitter::MULSD, round_input); + reversible = true; + roundRHS = single && !js.op->fprIsSingle[c]; + avxOp = packed ? &XEmitter::VMULPD : &XEmitter::VMULSD; + sseOp = packed ? &XEmitter::MULPD : &XEmitter::MULSD; break; default: ASSERT_MSG(DYNA_REC, 0, "fp_arith WTF!!!"); } + + RCX64Reg Rd = fpr.Bind(d, !single ? RCMode::ReadWrite : RCMode::Write); + RCOpArg Ra = fpr.Use(a, RCMode::Read); + RCOpArg Rarg2 = fpr.Use(arg2, RCMode::Read); + RegCache::Realize(Rd, Ra, Rarg2); + + bool preserve_inputs = m_accurate_nans; + X64Reg dest = preserve_inputs ? XMM1 : static_cast(Rd); + if (roundRHS) + { + if (a == d && !preserve_inputs) + { + Force25BitPrecision(XMM0, Rarg2, XMM1); + (this->*sseOp)(Rd, R(XMM0)); + } + else + { + Force25BitPrecision(dest, Rarg2, XMM0); + (this->*sseOp)(dest, Ra); + } + } + else + { + avx_op(avxOp, sseOp, dest, Ra, Rarg2, packed, reversible); + } + + if (inst.SUBOP5 != 25) + HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt); + else + HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2); + + if (single) + FinalizeSingleResult(Rd, R(dest), packed, true); + else + FinalizeDoubleResult(Rd, R(dest)); } void Jit64::fmaddXX(UGeckoInstruction inst) @@ -499,7 +504,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst) result_xmm = Rd; } - HandleNaNs(inst, result_xmm, XMM0, {a, b, c}); + HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, Rc); if (single) FinalizeSingleResult(Rd, R(result_xmm), packed, true); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index bea3829e0a..fe42929ce1 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -3,6 +3,8 @@ #include "Core/PowerPC/Jit64/Jit.h" +#include + #include "Common/CPUDetect.h" #include "Common/CommonTypes.h" #include "Common/MsgHandler.h" @@ -77,7 +79,7 @@ void Jit64::ps_sum(UGeckoInstruction inst) default: PanicAlertFmt("ps_sum WTF!!!"); } - HandleNaNs(inst, tmp, tmp == XMM1 ? XMM0 : XMM1, {a, b, c}); + HandleNaNs(inst, tmp, tmp == XMM1 ? XMM0 : XMM1, Ra, Rb, Rc); FinalizeSingleResult(Rd, R(tmp)); } @@ -112,7 +114,7 @@ void Jit64::ps_muls(UGeckoInstruction inst) if (round_input) Force25BitPrecision(XMM1, R(XMM1), XMM0); MULPD(XMM1, Ra); - HandleNaNs(inst, XMM1, XMM0, {a, c}); + HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc); FinalizeSingleResult(Rd, R(XMM1)); } From cbceae917663d0387deae0280b1e196eb7520cb2 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Fri, 25 Nov 2022 22:36:06 +0100 Subject: [PATCH 2/3] Jit64: Correctly handle NaNs for ps_mulsX --- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index fe42929ce1..160cd77497 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -98,23 +98,26 @@ void Jit64::ps_muls(UGeckoInstruction inst) RCOpArg Ra = fpr.Use(a, RCMode::Read); RCOpArg Rc = fpr.Use(c, RCMode::Read); RCX64Reg Rd = fpr.Bind(d, RCMode::Write); - RegCache::Realize(Ra, Rc, Rd); + RCX64Reg Rc_duplicated = m_accurate_nans ? fpr.Scratch() : fpr.Scratch(XMM1); + RegCache::Realize(Ra, Rc, Rd, Rc_duplicated); switch (inst.SUBOP5) { case 12: // ps_muls0 - MOVDDUP(XMM1, Rc); + MOVDDUP(Rc_duplicated, Rc); break; case 13: // ps_muls1 - avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, Rc, Rc, 3); + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, Rc_duplicated, Rc, Rc, 3); break; default: PanicAlertFmt("ps_muls WTF!!!"); } if (round_input) - Force25BitPrecision(XMM1, R(XMM1), XMM0); + Force25BitPrecision(XMM1, R(Rc_duplicated), XMM0); + else if (XMM1 != Rc_duplicated) + MOVAPD(XMM1, Rc_duplicated); MULPD(XMM1, Ra); - HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc); + HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc_duplicated); FinalizeSingleResult(Rd, R(XMM1)); } From 2f1a8ee1b9cb2c7e78688c2bdf7e08b5a533441f Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 26 Nov 2022 14:32:42 +0100 Subject: [PATCH 3/3] Jit64: Skip HandleNaNs for operations that can't generate NaN Operations that have two operands and can't generate a default NaN, i.e. addition and subtraction, already have the desired NaN handling on x86. We just need to make sure to not reverse the operands. This fixes ps_sum0/ps_sum1 outputting NaNs in cases where they shouldn't. (HandleNaNs assumes that a NaN in a ps0 input always results in a NaN in the ps0 output, and correspondingly for ps1.) --- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 24 ++++++++++++------- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 3 ++- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 5daaf9e9b5..fc27ad7bf1 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -236,8 +236,7 @@ void Jit64::fp_arith(UGeckoInstruction inst) bool single = inst.OPCD == 4 || inst.OPCD == 59; // If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP - // at the end by - // using packed arithmetic instead. + // at the end by using packed arithmetic instead. bool packed = inst.OPCD == 4 || (inst.OPCD == 59 && js.op->fprIsDuplicated[a] && js.op->fprIsDuplicated[arg2]); // Packed divides are slower than scalar divides on basically all x86, so this optimization isn't @@ -249,10 +248,12 @@ void Jit64::fp_arith(UGeckoInstruction inst) void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&) = nullptr; void (XEmitter::*sseOp)(X64Reg, const OpArg&) = nullptr; bool reversible = false; - bool roundRHS = false; + bool round_rhs = false; + bool preserve_inputs = false; switch (inst.SUBOP5) { case 18: + preserve_inputs = m_accurate_nans; avxOp = packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD; sseOp = packed ? &XEmitter::DIVPD : &XEmitter::DIVSD; break; @@ -261,13 +262,14 @@ void Jit64::fp_arith(UGeckoInstruction inst) sseOp = packed ? &XEmitter::SUBPD : &XEmitter::SUBSD; break; case 21: - reversible = true; + reversible = !m_accurate_nans; avxOp = packed ? &XEmitter::VADDPD : &XEmitter::VADDSD; sseOp = packed ? &XEmitter::ADDPD : &XEmitter::ADDSD; break; case 25: reversible = true; - roundRHS = single && !js.op->fprIsSingle[c]; + round_rhs = single && !js.op->fprIsSingle[c]; + preserve_inputs = m_accurate_nans; avxOp = packed ? &XEmitter::VMULPD : &XEmitter::VMULSD; sseOp = packed ? &XEmitter::MULPD : &XEmitter::MULSD; break; @@ -280,9 +282,8 @@ void Jit64::fp_arith(UGeckoInstruction inst) RCOpArg Rarg2 = fpr.Use(arg2, RCMode::Read); RegCache::Realize(Rd, Ra, Rarg2); - bool preserve_inputs = m_accurate_nans; X64Reg dest = preserve_inputs ? XMM1 : static_cast(Rd); - if (roundRHS) + if (round_rhs) { if (a == d && !preserve_inputs) { @@ -300,10 +301,15 @@ void Jit64::fp_arith(UGeckoInstruction inst) avx_op(avxOp, sseOp, dest, Ra, Rarg2, packed, reversible); } - if (inst.SUBOP5 != 25) + switch (inst.SUBOP5) + { + case 18: HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt); - else + break; + case 25: HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2); + break; + } if (single) FinalizeSingleResult(Rd, R(dest), packed, true); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index 160cd77497..ab6a5b7638 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -79,7 +79,8 @@ void Jit64::ps_sum(UGeckoInstruction inst) default: PanicAlertFmt("ps_sum WTF!!!"); } - HandleNaNs(inst, tmp, tmp == XMM1 ? XMM0 : XMM1, Ra, Rb, Rc); + // We're intentionally not calling HandleNaNs here. + // For addition and subtraction specifically, x86's NaN behavior matches PPC's. FinalizeSingleResult(Rd, R(tmp)); }