From 554a2fd33228503a5848cda4039e35993f7985c1 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 9 Oct 2022 12:14:47 +0200 Subject: [PATCH 1/2] JitArm64: Merge ps_mulsX and ps_maddXX They have a lot of shared code, most notably the code for rounding c. No behavior change. --- Source/Core/Core/PowerPC/JitArm64/Jit.h | 3 +- .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 72 +++++-------------- .../Core/PowerPC/JitArm64/JitArm64_Tables.cpp | 16 ++--- 3 files changed, 27 insertions(+), 64 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index ec65997685..e9abdbbc4b 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -152,9 +152,8 @@ public: void frsqrtex(UGeckoInstruction inst); // Paired - void ps_maddXX(UGeckoInstruction inst); void ps_mergeXX(UGeckoInstruction inst); - void ps_mulsX(UGeckoInstruction inst); + void ps_arith(UGeckoInstruction inst); void ps_sel(UGeckoInstruction inst); void ps_sumX(UGeckoInstruction inst); void ps_res(UGeckoInstruction inst); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index 0d786f80a0..1afb5e1683 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -73,55 +73,7 @@ void JitArm64::ps_mergeXX(UGeckoInstruction inst) "Register allocation turned singles into doubles in the middle of ps_mergeXX"); } -void JitArm64::ps_mulsX(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITPairedOff); - FALLBACK_IF(inst.Rc); - FALLBACK_IF(jo.fp_exceptions); - - const u32 a = inst.FA; - const u32 c = inst.FC; - const u32 d = inst.FD; - - const bool upper = inst.SUBOP5 == 13; - - const bool singles = fpr.IsSingle(a) && fpr.IsSingle(c); - const bool round_c = !js.op->fprIsSingle[inst.FC]; - const RegType type = singles ? RegType::Single : RegType::Register; - const u8 size = singles ? 32 : 64; - const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad; - - const ARM64Reg VA = fpr.R(a, type); - ARM64Reg VC = fpr.R(c, type); - const ARM64Reg VD = fpr.RW(d, type); - - ARM64Reg V0Q = ARM64Reg::INVALID_REG; - - if (round_c) - { - ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single"); - - V0Q = fpr.GetReg(); - - Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC)); - VC = reg_encoder(V0Q); - } - - m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0); - - if (V0Q != ARM64Reg::INVALID_REG) - fpr.Unlock(V0Q); - - ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(c)), - "Register allocation turned singles into doubles in the middle of ps_mulsX"); - - fpr.FixSinglePrecision(d); - - SetFPRFIfNeeded(true, VD); -} - -void JitArm64::ps_maddXX(UGeckoInstruction inst) +void JitArm64::ps_arith(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITPairedOff); @@ -134,15 +86,21 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) const u32 d = inst.FD; const u32 op5 = inst.SUBOP5; + const bool use_b = (op5 & ~0x1) != 12; // muls uses no B + + const auto singles_func = [&] { + return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && fpr.IsSingle(c); + }; + const bool singles = singles_func(); + const bool inaccurate_fma = !Config::Get(Config::SESSION_USE_FMA); - const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c); const bool round_c = !js.op->fprIsSingle[inst.FC]; const RegType type = singles ? RegType::Single : RegType::Register; const u8 size = singles ? 32 : 64; const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad; const ARM64Reg VA = reg_encoder(fpr.R(a, type)); - const ARM64Reg VB = reg_encoder(fpr.R(b, type)); + const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG; ARM64Reg VC = reg_encoder(fpr.R(c, type)); const ARM64Reg VD = reg_encoder(fpr.RW(d, type)); @@ -178,6 +136,12 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) ARM64Reg result_reg = VD; switch (op5) { + case 12: // ps_muls0: d = a * c.ps0 + m_float_emit.FMUL(size, VD, VA, VC, 0); + break; + case 13: // ps_muls1: d = a * c.ps1 + m_float_emit.FMUL(size, VD, VA, VC, 1); + break; case 14: // ps_madds0: d = a * c.ps0 + b if (inaccurate_fma) { @@ -269,7 +233,7 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) } break; default: - ASSERT_MSG(DYNA_REC, 0, "ps_madd - invalid op"); + ASSERT_MSG(DYNA_REC, 0, "ps_arith - invalid op"); break; } @@ -292,8 +256,8 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) if (V1Q != ARM64Reg::INVALID_REG) fpr.Unlock(V1Q); - ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)), - "Register allocation turned singles into doubles in the middle of ps_maddXX"); + ASSERT_MSG(DYNA_REC, singles == singles_func(), + "Register allocation turned singles into doubles in the middle of ps_arith"); fpr.FixSinglePrecision(d); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp index ccac60efb3..c3f7a87fbb 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp @@ -108,10 +108,10 @@ constexpr std::array table4{{ constexpr std::array table4_2{{ {10, &JitArm64::ps_sumX}, // ps_sum0 {11, &JitArm64::ps_sumX}, // ps_sum1 - {12, &JitArm64::ps_mulsX}, // ps_muls0 - {13, &JitArm64::ps_mulsX}, // ps_muls1 - {14, &JitArm64::ps_maddXX}, // ps_madds0 - {15, &JitArm64::ps_maddXX}, // ps_madds1 + {12, &JitArm64::ps_arith}, // ps_muls0 + {13, &JitArm64::ps_arith}, // ps_muls1 + {14, &JitArm64::ps_arith}, // ps_madds0 + {15, &JitArm64::ps_arith}, // ps_madds1 {18, &JitArm64::fp_arith}, // ps_div {20, &JitArm64::fp_arith}, // ps_sub {21, &JitArm64::fp_arith}, // ps_add @@ -119,10 +119,10 @@ constexpr std::array table4_2{{ {24, &JitArm64::ps_res}, // ps_res {25, &JitArm64::fp_arith}, // ps_mul {26, &JitArm64::ps_rsqrte}, // ps_rsqrte - {28, &JitArm64::ps_maddXX}, // ps_msub - {29, &JitArm64::ps_maddXX}, // ps_madd - {30, &JitArm64::ps_maddXX}, // ps_nmsub - {31, &JitArm64::ps_maddXX}, // ps_nmadd + {28, &JitArm64::ps_arith}, // ps_msub + {29, &JitArm64::ps_arith}, // ps_madd + {30, &JitArm64::ps_arith}, // ps_nmsub + {31, &JitArm64::ps_arith}, // ps_nmadd }}; constexpr std::array table4_3{{ From 812067ab7cdd244a9144ae59049b2fd6647e6606 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 9 Oct 2022 12:31:29 +0200 Subject: [PATCH 2/2] JitArm64: Move ps instructions from fp_arith to ps_arith This lets us simplify fp_arith without making ps_arith much more complicated. No behavior change. --- .../JitArm64/JitArm64_FloatingPoint.cpp | 198 +++++++----------- .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 21 +- .../Core/PowerPC/JitArm64/JitArm64_Tables.cpp | 8 +- 3 files changed, 94 insertions(+), 133 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 260da2d900..26c6dfd1b7 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -69,154 +69,102 @@ void JitArm64::fp_arith(UGeckoInstruction inst) u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; u32 op5 = inst.SUBOP5; - bool single = inst.OPCD == 59; - bool packed = inst.OPCD == 4; - const bool use_c = op5 >= 25; // fmul and all kind of fmaddXX const bool use_b = op5 != 25; // fmul uses no B - const bool outputs_are_singles = single || packed; - const bool round_c = use_c && outputs_are_singles && !js.op->fprIsSingle[inst.FC]; + const bool output_is_single = inst.OPCD == 59; + const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA); + const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC]; const auto inputs_are_singles_func = [&] { - return fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) && - (!use_c || fpr.IsSingle(c, !packed)); + return fpr.IsSingle(a, true) && (!use_b || fpr.IsSingle(b, true)) && + (!use_c || fpr.IsSingle(c, true)); }; const bool inputs_are_singles = inputs_are_singles_func(); - ARM64Reg VA{}, VB{}, VC{}, VD{}; + const RegType type = + (inputs_are_singles && output_is_single) ? RegType::LowerPairSingle : RegType::LowerPair; + const RegType type_out = + output_is_single ? (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) : + RegType::LowerPair; + const auto reg_encoder = + (inputs_are_singles && output_is_single) ? EncodeRegToSingle : EncodeRegToDouble; + + const ARM64Reg VA = reg_encoder(fpr.R(a, type)); + const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG; + ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG; + const ARM64Reg VD = reg_encoder(fpr.RW(d, type_out)); ARM64Reg V0Q = ARM64Reg::INVALID_REG; ARM64Reg V1Q = ARM64Reg::INVALID_REG; - if (packed) + if (round_c) { - const RegType type = inputs_are_singles ? RegType::Single : RegType::Register; - const u8 size = inputs_are_singles ? 32 : 64; - const auto reg_encoder = inputs_are_singles ? EncodeRegToDouble : EncodeRegToQuad; + ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single"); - VA = reg_encoder(fpr.R(a, type)); - if (use_b) - VB = reg_encoder(fpr.R(b, type)); - if (use_c) - VC = reg_encoder(fpr.R(c, type)); - VD = reg_encoder(fpr.RW(d, type)); + V1Q = fpr.GetReg(); - if (round_c) - { - ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single"); - - V0Q = fpr.GetReg(); - - Force25BitPrecision(reg_encoder(V0Q), VC); - VC = reg_encoder(V0Q); - } - - switch (op5) - { - case 18: - m_float_emit.FDIV(size, VD, VA, VB); - break; - case 20: - m_float_emit.FSUB(size, VD, VA, VB); - break; - case 21: - m_float_emit.FADD(size, VD, VA, VB); - break; - case 25: - m_float_emit.FMUL(size, VD, VA, VC); - break; - default: - ASSERT_MSG(DYNA_REC, 0, "fp_arith"); - break; - } + Force25BitPrecision(reg_encoder(V1Q), VC); + VC = reg_encoder(V1Q); } - else + + ARM64Reg inaccurate_fma_temp_reg = VD; + if (inaccurate_fma && d == b) { - const RegType type = - (inputs_are_singles && single) ? RegType::LowerPairSingle : RegType::LowerPair; - const RegType type_out = - single ? (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) : - RegType::LowerPair; - const auto reg_encoder = (inputs_are_singles && single) ? EncodeRegToSingle : EncodeRegToDouble; + V0Q = fpr.GetReg(); - VA = reg_encoder(fpr.R(a, type)); - if (use_b) - VB = reg_encoder(fpr.R(b, type)); - if (use_c) - VC = reg_encoder(fpr.R(c, type)); - VD = reg_encoder(fpr.RW(d, type_out)); + inaccurate_fma_temp_reg = reg_encoder(V0Q); + } - const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA); - - if (round_c) + switch (op5) + { + case 18: + m_float_emit.FDIV(VD, VA, VB); + break; + case 20: + m_float_emit.FSUB(VD, VA, VB); + break; + case 21: + m_float_emit.FADD(VD, VA, VB); + break; + case 25: + m_float_emit.FMUL(VD, VA, VC); + break; + // While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic], + // the subtly different definitions affect how signed zeroes are handled. + // Also, PowerPC's nmadd/nmsub perform rounding before the final negation. + // So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub. + case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm" + case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)" + if (inaccurate_fma) { - ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single"); - - V1Q = fpr.GetReg(); - - Force25BitPrecision(reg_encoder(V1Q), VC); - VC = reg_encoder(V1Q); + m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC); + m_float_emit.FSUB(VD, inaccurate_fma_temp_reg, VB); } - - ARM64Reg inaccurate_fma_temp_reg = VD; - if (inaccurate_fma && d == b) + else { - V0Q = fpr.GetReg(); - - inaccurate_fma_temp_reg = reg_encoder(V0Q); + m_float_emit.FNMSUB(VD, VA, VC, VB); } - - switch (op5) + if (op5 == 30) + m_float_emit.FNEG(VD, VD); + break; + case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm" + case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)" + if (inaccurate_fma) { - case 18: - m_float_emit.FDIV(VD, VA, VB); - break; - case 20: - m_float_emit.FSUB(VD, VA, VB); - break; - case 21: - m_float_emit.FADD(VD, VA, VB); - break; - case 25: - m_float_emit.FMUL(VD, VA, VC); - break; - // While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic], - // the subtly different definitions affect how signed zeroes are handled. - // Also, PowerPC's nmadd/nmsub perform rounding before the final negation. - // So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub. - case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm" - case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)" - if (inaccurate_fma) - { - m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC); - m_float_emit.FSUB(VD, inaccurate_fma_temp_reg, VB); - } - else - { - m_float_emit.FNMSUB(VD, VA, VC, VB); - } - if (op5 == 30) - m_float_emit.FNEG(VD, VD); - break; - case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm" - case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)" - if (inaccurate_fma) - { - m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC); - m_float_emit.FADD(VD, inaccurate_fma_temp_reg, VB); - } - else - { - m_float_emit.FMADD(VD, VA, VC, VB); - } - if (op5 == 31) - m_float_emit.FNEG(VD, VD); - break; - default: - ASSERT_MSG(DYNA_REC, 0, "fp_arith"); - break; + m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC); + m_float_emit.FADD(VD, inaccurate_fma_temp_reg, VB); } + else + { + m_float_emit.FMADD(VD, VA, VC, VB); + } + if (op5 == 31) + m_float_emit.FNEG(VD, VD); + break; + default: + ASSERT_MSG(DYNA_REC, 0, "fp_arith"); + break; } if (V0Q != ARM64Reg::INVALID_REG) @@ -224,7 +172,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst) if (V1Q != ARM64Reg::INVALID_REG) fpr.Unlock(V1Q); - if (outputs_are_singles) + if (output_is_single) { ASSERT_MSG(DYNA_REC, inputs_are_singles == inputs_are_singles_func(), "Register allocation turned singles into doubles in the middle of fp_arith"); @@ -232,7 +180,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst) fpr.FixSinglePrecision(d); } - SetFPRFIfNeeded(outputs_are_singles, VD); + SetFPRFIfNeeded(output_is_single, VD); } void JitArm64::fp_logic(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index 1afb5e1683..85d22f6183 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -86,22 +86,23 @@ void JitArm64::ps_arith(UGeckoInstruction inst) const u32 d = inst.FD; const u32 op5 = inst.SUBOP5; - const bool use_b = (op5 & ~0x1) != 12; // muls uses no B + const bool use_c = op5 == 25 || (op5 & ~0x13) == 12; // mul, muls, and all kinds of maddXX + const bool use_b = op5 != 25 && (op5 & ~0x1) != 12; // mul and muls don't use B const auto singles_func = [&] { - return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && fpr.IsSingle(c); + return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c)); }; const bool singles = singles_func(); const bool inaccurate_fma = !Config::Get(Config::SESSION_USE_FMA); - const bool round_c = !js.op->fprIsSingle[inst.FC]; + const bool round_c = use_c && !js.op->fprIsSingle[inst.FC]; const RegType type = singles ? RegType::Single : RegType::Register; const u8 size = singles ? 32 : 64; const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad; const ARM64Reg VA = reg_encoder(fpr.R(a, type)); const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG; - ARM64Reg VC = reg_encoder(fpr.R(c, type)); + ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG; const ARM64Reg VD = reg_encoder(fpr.RW(d, type)); ARM64Reg V0Q = ARM64Reg::INVALID_REG; @@ -188,6 +189,18 @@ void JitArm64::ps_arith(UGeckoInstruction inst) result_reg = V0; } break; + case 18: // ps_div + m_float_emit.FDIV(size, VD, VA, VB); + break; + case 20: // ps_sub + m_float_emit.FSUB(size, VD, VA, VB); + break; + case 21: // ps_add + m_float_emit.FADD(size, VD, VA, VB); + break; + case 25: // ps_mul + m_float_emit.FMUL(size, VD, VA, VC); + break; case 28: // ps_msub: d = a * c - b case 30: // ps_nmsub: d = -(a * c - b) if (inaccurate_fma) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp index c3f7a87fbb..2e4c72f4f6 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp @@ -112,12 +112,12 @@ constexpr std::array table4_2{{ {13, &JitArm64::ps_arith}, // ps_muls1 {14, &JitArm64::ps_arith}, // ps_madds0 {15, &JitArm64::ps_arith}, // ps_madds1 - {18, &JitArm64::fp_arith}, // ps_div - {20, &JitArm64::fp_arith}, // ps_sub - {21, &JitArm64::fp_arith}, // ps_add + {18, &JitArm64::ps_arith}, // ps_div + {20, &JitArm64::ps_arith}, // ps_sub + {21, &JitArm64::ps_arith}, // ps_add {23, &JitArm64::ps_sel}, // ps_sel {24, &JitArm64::ps_res}, // ps_res - {25, &JitArm64::fp_arith}, // ps_mul + {25, &JitArm64::ps_arith}, // ps_mul {26, &JitArm64::ps_rsqrte}, // ps_rsqrte {28, &JitArm64::ps_arith}, // ps_msub {29, &JitArm64::ps_arith}, // ps_madd