From 2c243895c877d2d041f85d6ea725a9733c373e46 Mon Sep 17 00:00:00 2001 From: RipleyTom Date: Wed, 14 Feb 2024 18:41:58 +0100 Subject: [PATCH 1/8] Fix minus accuracy difference in spu_re_acc --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 578f15d0c9..e2d2fcd2dd 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -6016,15 +6016,17 @@ public: const auto div_result = the_one / div; - // from ps3 hardware testing: Inf => NaN and NaN => Zero + // From ps3 hardware testing: Inf => NaN and NaN => Zero, Signed Zero => Zero + // This results in full accuracy within 1ulp(Currently x86 seems to be rounding up?) const auto result_and = bitcast(div_result) & 0x7fffffffu; const auto result_cmp_inf = sext(result_and == splat(0x7F800000u)); const auto result_cmp_nan = sext(result_and <= splat(0x7F800000u)); + const auto and_mask_zero = bitcast(sext(result_and != splat(0u))); const auto and_mask = bitcast(result_cmp_nan) & splat(0xFFFFFFFFu); const auto or_mask = bitcast(result_cmp_inf) & splat(0xFFFFFFFu); - return bitcast((bitcast(div_result) & and_mask) | or_mask); + return bitcast(((bitcast(div_result) & and_mask) & and_mask_zero) | or_mask); }); const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); From f1399f65cee35a0e6d28657cf0510e4864aeeb83 Mon Sep 17 00:00:00 2001 From: RipleyTom Date: Thu, 15 Feb 2024 01:12:05 +0100 Subject: [PATCH 2/8] Simplify spu patterns and fix sqrt pattern --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 123 ++++++++------------------- 1 file changed, 36 insertions(+), 87 deletions(-) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index e2d2fcd2dd..59198d6017 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -5624,9 +5624,9 @@ public: // Presumably 1/x might result in Zero/NaN when a/x doesn't if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed) { - auto full_fm_accurate = [&](const auto& a, const auto& div) + auto full_fm_accurate = [&](const auto& num, const auto& div) { - const auto div_result = a / div; + const auto div_result = num / div; const auto result_and = bitcast(div_result) & 0x7fffffffu; const auto result_cmp_inf = sext(result_and == splat(0x7F800000u)); const auto result_cmp_nan = sext(result_and <= splat(0x7F800000u)); @@ -6034,78 +6034,39 @@ public: auto check_sqrt_pattern_for_float = [&](f32 float_value) -> bool { - auto match_fnms = [&](f32 float_value) + auto match_fnms = [&](f32 float_value, const auto& eval_sqrt) { - auto res = match_expr(a, fnms(MT, MT, fsplat(float_value))); + const auto res = match_expr(a, fnms(eval_sqrt, spu_rsqrte(MT), fsplat(float_value))); if (std::get<0>(res)) return res; - return match_expr(b, fnms(MT, MT, fsplat(float_value))); + return match_expr(b, fnms(eval_sqrt, spu_rsqrte(MT), fsplat(float_value))); }; - auto match_fm_half = [&]() + auto match_fm_half = [&](const auto& eval_sqrt) { - auto res = match_expr(a, fm(MT, fsplat(0.5))); + const auto res = match_expr(a, fm(fsplat(0.5f), eval_sqrt)); if (std::get<0>(res)) return res; - res = match_expr(a, fm(fsplat(0.5), MT)); - if (std::get<0>(res)) - return res; - - res = match_expr(b, fm(MT, fsplat(0.5))); - if (std::get<0>(res)) - return res; - - return match_expr(b, fm(fsplat(0.5), MT)); + return match_expr(b, fm(fsplat(0.5f), eval_sqrt)); }; - - if (auto [ok_fnma, a1, b1] = match_fnms(float_value); ok_fnma) + // eval_sqrt = x * spu_resqrt(x) + // eval_sqrt + (1 - (spu_resqrt(x) * eval_sqrt)) * (0.5 * eval_sqrt) + // FMA(FNMS(spu_resqrt(x) <*> eval_sqrt, float_value) <*> FM(0.5f, eval_sqrt), eval_sqrt) + if (auto [ok_fm_c, x, maybe_x] = match_expr(c, fm(MT, spu_rsqrte(MT))); ok_fm_c && x.eq(maybe_x)) { - if (auto [ok_fm2, fm_half_mul] = match_fm_half(); ok_fm2 && fm_half_mul.eq(b1)) + if (auto [ok_fnms, maybe_x_2] = match_fnms(float_value, c); ok_fnms && x.eq(maybe_x_2)) { - if (fm_half_mul.eq(b1)) + if (auto [ok_fm] = match_fm_half(c); ok_fm) { - if (auto [ok_fm1, a3, b3] = match_expr(c, fm(MT, MT)); ok_fm1 && a3.eq(a1)) - { - if (auto [ok_sqrte, src] = match_expr(a3, spu_rsqrte(MT)); ok_sqrte && src.eq(b3)) - { - erase_stores(a, b, c, a3); - set_vr(op.rt4, fsqrt(fabs(src))); - return true; - } - } - else if (auto [ok_fm1, a3, b3] = match_expr(c, fm(MT, MT)); ok_fm1 && b3.eq(a1)) - { - if (auto [ok_sqrte, src] = match_expr(b3, spu_rsqrte(MT)); ok_sqrte && src.eq(a3)) - { - erase_stores(a, b, c, b3); - set_vr(op.rt4, fsqrt(fabs(src))); - return true; - } - } - } - else if (fm_half_mul.eq(a1)) - { - if (auto [ok_fm1, a3, b3] = match_expr(c, fm(MT, MT)); ok_fm1 && a3.eq(b1)) - { - if (auto [ok_sqrte, src] = match_expr(a3, spu_rsqrte(MT)); ok_sqrte && src.eq(b3)) - { - erase_stores(a, b, c, a3); - set_vr(op.rt4, fsqrt(fabs(src))); - return true; - } - } - else if (auto [ok_fm1, a3, b3] = match_expr(c, fm(MT, MT)); ok_fm1 && b3.eq(b1)) - { - if (auto [ok_sqrte, src] = match_expr(b3, spu_rsqrte(MT)); ok_sqrte && src.eq(a3)) - { - erase_stores(a, b, c, b3); - set_vr(op.rt4, fsqrt(fabs(src))); - return true; - } - } + // Try to delete spu_rsqrte as it's expensive + auto [ok_final_fm, to_del] = match_expr(c, fm(x, MT)); + ensure(ok_final_fm); + erase_stores(a, b, c, to_del); + set_vr(op.rt4, fsqrt(fabs(x))); + return true; } } } @@ -6121,7 +6082,7 @@ public: auto check_accurate_reciprocal_pattern_for_float = [&](f32 float_value) -> bool { - // FMA(FNMS(div, spu_re(div), float_value), spu_re(div), spu_re(div)) + // FMA(FNMS(div <*> spu_re(div), float_value), spu_re(div), spu_re(div)) if (auto [ok_fnms, div] = match_expr(a, fnms(MT, b, fsplat(float_value))); ok_fnms && op.rb == op.rc) { if (auto [ok_re] = match_expr(b, spu_re(div)); ok_re) @@ -6132,18 +6093,7 @@ public: } } - // FMA(FNMS(spu_re(div), div, float_value), spu_re(div), spu_re(div)) - if (auto [ok_fnms, div] = match_expr(a, fnms(b, MT, fsplat(float_value))); ok_fnms && op.rb == op.rc) - { - if (auto [ok_re] = match_expr(b, spu_re(div)); ok_re) - { - erase_stores(a, b, c); - set_vr(op.rt4, re_accurate(div, fsplat(float_value))); - return true; - } - } - - // FMA(spu_re(div), FNMS(div, spu_re(div), float_value), spu_re(div)) + // FMA(spu_re(div), FNMS(div <*> spu_re(div), float_value), spu_re(div)) if (auto [ok_fnms, div] = match_expr(b, fnms(MT, a, fsplat(float_value))); ok_fnms && op.ra == op.rc) { if (auto [ok_re] = match_expr(a, spu_re(div)); ok_re) @@ -6154,17 +6104,6 @@ public: } } - // FMA(spu_re(div), FNMS(spu_re(div), div, float_value), spu_re(div)) - if (auto [ok_fnms, div] = match_expr(b, fnms(a, MT, fsplat(float_value))); ok_fnms && op.ra == op.rc) - { - if (auto [ok_re] = match_expr(a, spu_re(div)); ok_re) - { - erase_stores(a, b, c); - set_vr(op.rt4, re_accurate(div, fsplat(float_value))); - return true; - } - } - return false; }; @@ -6216,13 +6155,23 @@ public: { spu_log.todo("[%s:0x%05x] Unmatched spu_re(a) found in FMA", m_hash, m_pos); } - - if (auto [ok_re, mystery] = match_expr(b, spu_re(MT)); ok_re) + else if (auto [ok_re, mystery] = match_expr(b, spu_re(MT)); ok_re) { spu_log.todo("[%s:0x%05x] Unmatched spu_re(b) found in FMA", m_hash, m_pos); } - - if (auto [ok_resq, mystery] = match_expr(c, spu_rsqrte(MT)); ok_resq) + else if (auto [ok_re, mystery] = match_expr(c, spu_re(MT)); ok_re) + { + spu_log.todo("[%s:0x%05x] Unmatched spu_re(c) found in FMA", m_hash, m_pos); + } + else if (auto [ok_resq, mystery] = match_expr(a, spu_rsqrte(MT)); ok_resq) + { + spu_log.todo("[%s:0x%05x] Unmatched spu_rsqrte(a) found in FMA", m_hash, m_pos); + } + else if (auto [ok_resq, mystery] = match_expr(b, spu_rsqrte(MT)); ok_resq) + { + spu_log.todo("[%s:0x%05x] Unmatched spu_rsqrte(b) found in FMA", m_hash, m_pos); + } + else if (auto [ok_resq, mystery] = match_expr(c, spu_rsqrte(MT)); ok_resq) { spu_log.todo("[%s:0x%05x] Unmatched spu_rsqrte(c) found in FMA", m_hash, m_pos); } From 1578d938ebefe9bcf63d2f0fc599e35efa12355e Mon Sep 17 00:00:00 2001 From: RipleyTom Date: Thu, 15 Feb 2024 01:12:36 +0100 Subject: [PATCH 3/8] Add alternative reciprocal pattern --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 59198d6017..4c5399ac7d 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -6018,7 +6018,7 @@ public: // From ps3 hardware testing: Inf => NaN and NaN => Zero, Signed Zero => Zero // This results in full accuracy within 1ulp(Currently x86 seems to be rounding up?) - const auto result_and = bitcast(div_result) & 0x7fffffffu; + const auto result_and = bitcast(div_result) & 0x7FFFFFFFu; const auto result_cmp_inf = sext(result_and == splat(0x7F800000u)); const auto result_cmp_nan = sext(result_and <= splat(0x7F800000u)); @@ -6113,6 +6113,27 @@ public: if (check_accurate_reciprocal_pattern_for_float(std::bit_cast(std::bit_cast(1.0f) + 1))) return; + // GOW 3(uses 1.0f * spu_re(div) instead of just spu_re(div) in the pattern) + auto check_alternative_reciprocal_pattern_for_float = [&](f32 float_value) -> bool + { + if (auto [ok_fm, div] = match_expr(c, fm(spu_re(MT), fsplat(1.0f))); ok_fm) + { + if (auto [ok_fnms] = match_expr(a, fnms(c, div, fsplat(1.0f))); ok_fnms) + { + if (auto [ok_spure] = match_expr(b, spu_re(div)); ok_spure) + { + erase_stores(a, b, c); + set_vr(op.rt4, re_accurate(div, fsplat(float_value))); + return true; + } + } + } + return false; + }; + + if (check_alternative_reciprocal_pattern_for_float()) + return; + // NFS Most Wanted doesn't like this if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed) { From a4fae6885ea98ae8bde37d02732097cb9c9a0d84 Mon Sep 17 00:00:00 2001 From: RipleyTom Date: Thu, 15 Feb 2024 10:09:47 +0100 Subject: [PATCH 4/8] Further simplifications --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 76 +++++++--------------------- 1 file changed, 18 insertions(+), 58 deletions(-) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 4c5399ac7d..3714705748 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -6034,40 +6034,19 @@ public: auto check_sqrt_pattern_for_float = [&](f32 float_value) -> bool { - auto match_fnms = [&](f32 float_value, const auto& eval_sqrt) - { - const auto res = match_expr(a, fnms(eval_sqrt, spu_rsqrte(MT), fsplat(float_value))); - if (std::get<0>(res)) - return res; - - return match_expr(b, fnms(eval_sqrt, spu_rsqrte(MT), fsplat(float_value))); - }; - - auto match_fm_half = [&](const auto& eval_sqrt) - { - const auto res = match_expr(a, fm(fsplat(0.5f), eval_sqrt)); - if (std::get<0>(res)) - return res; - - return match_expr(b, fm(fsplat(0.5f), eval_sqrt)); - }; - // eval_sqrt = x * spu_resqrt(x) // eval_sqrt + (1 - (spu_resqrt(x) * eval_sqrt)) * (0.5 * eval_sqrt) // FMA(FNMS(spu_resqrt(x) <*> eval_sqrt, float_value) <*> FM(0.5f, eval_sqrt), eval_sqrt) if (auto [ok_fm_c, x, maybe_x] = match_expr(c, fm(MT, spu_rsqrte(MT))); ok_fm_c && x.eq(maybe_x)) { - if (auto [ok_fnms, maybe_x_2] = match_fnms(float_value, c); ok_fnms && x.eq(maybe_x_2)) + const auto full_expr = fma(a, b, c); + if (auto [ok_fma] = match_expr(full_expr, fma(fnms(spu_rsqrte(x), c, fsplat(float_value)), fm(fsplat(0.5f), c), c)); ok_fma) { - if (auto [ok_fm] = match_fm_half(c); ok_fm) - { - // Try to delete spu_rsqrte as it's expensive - auto [ok_final_fm, to_del] = match_expr(c, fm(x, MT)); - ensure(ok_final_fm); - erase_stores(a, b, c, to_del); - set_vr(op.rt4, fsqrt(fabs(x))); - return true; - } + auto [ok_final_fm, to_del] = match_expr(c, fm(x, MT)); + ensure(ok_final_fm); + erase_stores(a, b, c, to_del); + set_vr(op.rt4, fsqrt(fabs(x))); + return true; } } @@ -6082,21 +6061,11 @@ public: auto check_accurate_reciprocal_pattern_for_float = [&](f32 float_value) -> bool { - // FMA(FNMS(div <*> spu_re(div), float_value), spu_re(div), spu_re(div)) - if (auto [ok_fnms, div] = match_expr(a, fnms(MT, b, fsplat(float_value))); ok_fnms && op.rb == op.rc) + // FMA(FNMS(div <*> spu_re(div), float_value) <*> spu_re(div), spu_re(div)) + if (auto [ok_c, div] = match_expr(c, spu_re(MT)); ok_c) { - if (auto [ok_re] = match_expr(b, spu_re(div)); ok_re) - { - erase_stores(a, b, c); - set_vr(op.rt4, re_accurate(div, fsplat(float_value))); - return true; - } - } - - // FMA(spu_re(div), FNMS(div <*> spu_re(div), float_value), spu_re(div)) - if (auto [ok_fnms, div] = match_expr(b, fnms(MT, a, fsplat(float_value))); ok_fnms && op.ra == op.rc) - { - if (auto [ok_re] = match_expr(a, spu_re(div)); ok_re) + const auto full_expr = fma(a, b, c); + if (auto [ok_fma] = match_expr(full_expr, fma(fnms(div, c, fsplat(float_value)), c, c)); ok_fma) { erase_stores(a, b, c); set_vr(op.rt4, re_accurate(div, fsplat(float_value))); @@ -6114,25 +6083,16 @@ public: return; // GOW 3(uses 1.0f * spu_re(div) instead of just spu_re(div) in the pattern) - auto check_alternative_reciprocal_pattern_for_float = [&](f32 float_value) -> bool + if (auto [ok_fm, div] = match_expr(c, fm(spu_re(MT), fsplat(1.0f))); ok_fm) { - if (auto [ok_fm, div] = match_expr(c, fm(spu_re(MT), fsplat(1.0f))); ok_fm) + const auto full_expr = fma(a, b, c); + if (auto [ok_fma] = match_expr(full_expr, fma(fnms(c, div, fsplat(1.0f)), spu_re(div), c)); ok_fma) { - if (auto [ok_fnms] = match_expr(a, fnms(c, div, fsplat(1.0f))); ok_fnms) - { - if (auto [ok_spure] = match_expr(b, spu_re(div)); ok_spure) - { - erase_stores(a, b, c); - set_vr(op.rt4, re_accurate(div, fsplat(float_value))); - return true; - } - } + erase_stores(a, b, c); + set_vr(op.rt4, re_accurate(div, fsplat(1.0f))); + return; } - return false; - }; - - if (check_alternative_reciprocal_pattern_for_float()) - return; + } // NFS Most Wanted doesn't like this if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed) From 2979c618dec504fd51330cecf080633c1b8a4dcc Mon Sep 17 00:00:00 2001 From: RipleyTom Date: Fri, 16 Feb 2024 22:09:55 +0100 Subject: [PATCH 5/8] Change full_expr's scope --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 3714705748..ff9554c368 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -6031,6 +6031,7 @@ public: const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); static const auto MT = match(); + const auto full_expr = fma(a, b, c); auto check_sqrt_pattern_for_float = [&](f32 float_value) -> bool { @@ -6039,7 +6040,6 @@ public: // FMA(FNMS(spu_resqrt(x) <*> eval_sqrt, float_value) <*> FM(0.5f, eval_sqrt), eval_sqrt) if (auto [ok_fm_c, x, maybe_x] = match_expr(c, fm(MT, spu_rsqrte(MT))); ok_fm_c && x.eq(maybe_x)) { - const auto full_expr = fma(a, b, c); if (auto [ok_fma] = match_expr(full_expr, fma(fnms(spu_rsqrte(x), c, fsplat(float_value)), fm(fsplat(0.5f), c), c)); ok_fma) { auto [ok_final_fm, to_del] = match_expr(c, fm(x, MT)); @@ -6064,7 +6064,6 @@ public: // FMA(FNMS(div <*> spu_re(div), float_value) <*> spu_re(div), spu_re(div)) if (auto [ok_c, div] = match_expr(c, spu_re(MT)); ok_c) { - const auto full_expr = fma(a, b, c); if (auto [ok_fma] = match_expr(full_expr, fma(fnms(div, c, fsplat(float_value)), c, c)); ok_fma) { erase_stores(a, b, c); @@ -6085,7 +6084,6 @@ public: // GOW 3(uses 1.0f * spu_re(div) instead of just spu_re(div) in the pattern) if (auto [ok_fm, div] = match_expr(c, fm(spu_re(MT), fsplat(1.0f))); ok_fm) { - const auto full_expr = fma(a, b, c); if (auto [ok_fma] = match_expr(full_expr, fma(fnms(c, div, fsplat(1.0f)), spu_re(div), c)); ok_fma) { erase_stores(a, b, c); @@ -6157,7 +6155,7 @@ public: spu_log.todo("[%s:0x%05x] Unmatched spu_rsqrte(c) found in FMA", m_hash, m_pos); } - set_vr(op.rt4, fma(a, b, c)); + set_vr(op.rt4, full_expr); } template From ef5cffdead8cc94e0af2ffdab6af9b71715008c8 Mon Sep 17 00:00:00 2001 From: RipleyTom Date: Fri, 16 Feb 2024 22:59:07 +0100 Subject: [PATCH 6/8] Fixes --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 100 ++++++++++++++------------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index ff9554c368..ff2705d69c 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -5637,18 +5637,18 @@ public: }; // FM(a, re_accurate(div)) - if (const auto [ok_re_acc, div, one] = match_expr(b, re_accurate(match(), match())); ok_re_acc) + if (const auto [ok_re_acc, div] = match_expr(b, re_accurate(match())); ok_re_acc) { full_fm_accurate(a, div); - erase_stores(one, b); + erase_stores(b); return; } // FM(re_accurate(div), b) - if (const auto [ok_re_acc, div, one] = match_expr(a, re_accurate(match(), match())); ok_re_acc) + if (const auto [ok_re_acc, div] = match_expr(a, re_accurate(match())); ok_re_acc) { full_fm_accurate(b, div); - erase_stores(one, a); + erase_stores(a); return; } } @@ -5973,10 +5973,10 @@ public: return llvm_calli{"spu_fma", {std::forward(a), std::forward(b), std::forward(c)}}.set_order_equality_hint(1, 1, 0); } - template - static llvm_calli re_accurate(T&& a, U&& b) + template + static llvm_calli re_accurate(T&& a) { - return {"spu_re_acc", {std::forward(a), std::forward(b)}}; + return {"spu_re_acc", {std::forward(a)}}; } void FMA(spu_opcode_t op) @@ -5995,26 +5995,18 @@ public: const auto b = value(ci->getOperand(1)); const auto c = value(ci->getOperand(2)); - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) - { - const auto ma = sext(fcmp_uno(a != fsplat(0.))); - const auto mb = sext(fcmp_uno(b != fsplat(0.))); - const auto ca = bitcast(bitcast(a) & mb); - const auto cb = bitcast(bitcast(b) & ma); - return fma32x4(eval(ca), eval(cb), c); - } - else - { - return fma32x4(a, b, c); - } + const auto ma = sext(fcmp_uno(a != fsplat(0.))); + const auto mb = sext(fcmp_uno(b != fsplat(0.))); + const auto ca = bitcast(bitcast(a) & mb); + const auto cb = bitcast(bitcast(b) & ma); + + return fma32x4(eval(ca), eval(cb), c); }); register_intrinsic("spu_re_acc", [&](llvm::CallInst* ci) { const auto div = value(ci->getOperand(0)); - const auto the_one = value(ci->getOperand(1)); - - const auto div_result = the_one / div; + const auto div_result = fsplat(1.0f) / div; // From ps3 hardware testing: Inf => NaN and NaN => Zero, Signed Zero => Zero // This results in full accuracy within 1ulp(Currently x86 seems to be rounding up?) @@ -6029,6 +6021,8 @@ public: return bitcast(((bitcast(div_result) & and_mask) & and_mask_zero) | or_mask); }); + constexpr f32 ONEISH = std::bit_cast(std::bit_cast(1.0f) + 1); + const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); static const auto MT = match(); const auto full_expr = fma(a, b, c); @@ -6056,57 +6050,67 @@ public: if (check_sqrt_pattern_for_float(1.0f)) return; - if (check_sqrt_pattern_for_float(std::bit_cast(std::bit_cast(1.0f) + 1))) + if (check_sqrt_pattern_for_float(ONEISH)) return; - auto check_accurate_reciprocal_pattern_for_float = [&](f32 float_value) -> bool + // Full reciprocal patterns + // FMA(FNMS(div <*> spu_re(div), float_value) <*> spu_re(div), spu_re(div)) + if (auto [ok_c, div] = match_expr(c, spu_re(MT)); ok_c) { - // FMA(FNMS(div <*> spu_re(div), float_value) <*> spu_re(div), spu_re(div)) - if (auto [ok_c, div] = match_expr(c, spu_re(MT)); ok_c) + auto check_accurate_reciprocal_pattern_for_float = [&](f32 float_value) -> bool { if (auto [ok_fma] = match_expr(full_expr, fma(fnms(div, c, fsplat(float_value)), c, c)); ok_fma) { erase_stores(a, b, c); - set_vr(op.rt4, re_accurate(div, fsplat(float_value))); + set_vr(op.rt4, re_accurate(div)); return true; } - } + return false; + }; - return false; - }; + if (check_accurate_reciprocal_pattern_for_float(1.0f)) + return; - if (check_accurate_reciprocal_pattern_for_float(1.0f)) - return; + if (check_accurate_reciprocal_pattern_for_float(ONEISH)) + return; - if (check_accurate_reciprocal_pattern_for_float(std::bit_cast(std::bit_cast(1.0f) + 1))) - return; - - // GOW 3(uses 1.0f * spu_re(div) instead of just spu_re(div) in the pattern) - if (auto [ok_fm, div] = match_expr(c, fm(spu_re(MT), fsplat(1.0f))); ok_fm) - { - if (auto [ok_fma] = match_expr(full_expr, fma(fnms(c, div, fsplat(1.0f)), spu_re(div), c)); ok_fma) + // Generate dynamic pattern for when float is unknown because of scope + if (auto [ok_fma, cursed_float] = match_expr(full_expr, fma(fnms(div, c, MT), c, c)); ok_fma) { erase_stores(a, b, c); - set_vr(op.rt4, re_accurate(div, fsplat(1.0f))); + const auto bitcast_float = bitcast(cursed_float); + set_vr(op.rt4, select(bitcast_float == splat(0x3F800000) | bitcast_float == splat(0x3F800001), re_accurate(div), fma(fnms(spu_re(div), div, cursed_float), spu_re(div), spu_re(div)))); return; } } - // NFS Most Wanted doesn't like this if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed) { - // Those patterns are not safe vs non optimization as inaccuracy from spu_re will spread with early fm before the accuracy is improved - // Match division (fast) - // FMA(FNMS(fm(diva<*> spu_re(divb)), divb, diva), spu_re(divb), fm(diva<*> spu_re(divb))) + // FMA(FNMS(fm(diva <*> spu_re(divb)), divb, diva), spu_re(divb), fm(diva <*> spu_re(divb))) + // NFS: Most Wanted doesn't like this pattern + + auto full_fast_div = [&](const auto& diva, const auto& divb) + { + const auto div_result = diva / divb; + const auto result_and = bitcast(div_result) & 0x7FFFFFFFu; + const auto result_cmp_inf = sext(result_and == splat(0x7F800000u)); + const auto result_cmp_nan = sext(result_and <= splat(0x7F800000u)); + const auto and_mask_zero = bitcast(sext(result_and != splat(0u))); + const auto and_mask = bitcast(result_cmp_nan) & splat(0xFFFFFFFFu); + const auto or_mask = bitcast(result_cmp_inf) & splat(0xFFFFFFFu); + const auto final_result = bitcast(((bitcast(div_result) & and_mask) & and_mask_zero) | or_mask); + set_vr(op.rt4, final_result); + }; + if (auto [ok_fnma, divb, diva] = match_expr(a, fnms(c, MT, MT)); ok_fnma) { if (auto [ok_fm, fm1, fm2] = match_expr(c, fm(MT, MT)); ok_fm && ((fm1.eq(diva) && fm2.eq(b)) || (fm1.eq(b) && fm2.eq(diva)))) { if (auto [ok_re] = match_expr(b, spu_re(divb)); ok_re) { - erase_stores(b, c); - set_vr(op.rt4, diva / divb); + erase_stores(a, b, c); + full_fast_div(diva, divb); return; } } @@ -6119,8 +6123,8 @@ public: { if (auto [ok_re] = match_expr(a, spu_re(divb)); ok_re) { - erase_stores(a, c); - set_vr(op.rt4, diva / divb); + erase_stores(a, b, c); + full_fast_div(diva, divb); return; } } From 54ce1343cf0310fa93dea087344c1bd166747a02 Mon Sep 17 00:00:00 2001 From: RipleyTom Date: Sat, 17 Feb 2024 06:25:02 +0100 Subject: [PATCH 7/8] Dynamic sqrt pattern --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 36 +++++++++++++++++----------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index ff2705d69c..2fd8b4bba1 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -6027,12 +6027,12 @@ public: static const auto MT = match(); const auto full_expr = fma(a, b, c); - auto check_sqrt_pattern_for_float = [&](f32 float_value) -> bool + // eval_sqrt = x * spu_resqrt(x) + // eval_sqrt + (1 - (spu_resqrt(x) * eval_sqrt)) * (0.5 * eval_sqrt) + // FMA(FNMS(spu_resqrt(x) <*> eval_sqrt, float_value) <*> FM(0.5f, eval_sqrt), eval_sqrt) + if (auto [ok_fm_c, x, maybe_x] = match_expr(c, fm(MT, spu_rsqrte(MT))); ok_fm_c && x.eq(maybe_x)) { - // eval_sqrt = x * spu_resqrt(x) - // eval_sqrt + (1 - (spu_resqrt(x) * eval_sqrt)) * (0.5 * eval_sqrt) - // FMA(FNMS(spu_resqrt(x) <*> eval_sqrt, float_value) <*> FM(0.5f, eval_sqrt), eval_sqrt) - if (auto [ok_fm_c, x, maybe_x] = match_expr(c, fm(MT, spu_rsqrte(MT))); ok_fm_c && x.eq(maybe_x)) + auto check_sqrt_pattern_for_float = [&](f32 float_value) -> bool { if (auto [ok_fma] = match_expr(full_expr, fma(fnms(spu_rsqrte(x), c, fsplat(float_value)), fm(fsplat(0.5f), c), c)); ok_fma) { @@ -6042,16 +6042,24 @@ public: set_vr(op.rt4, fsqrt(fabs(x))); return true; } + return false; + }; + + if (check_sqrt_pattern_for_float(1.0f)) + return; + + if (check_sqrt_pattern_for_float(ONEISH)) + return; + + // Generate dynamic pattern for when float is unknown because of scope + if (auto [ok_fma, cursed_float] = match_expr(full_expr, fma(fnms(spu_rsqrte(x), c, MT), fm(fsplat(0.5f), c), c)); ok_fma) + { + erase_stores(a, b, c); + const auto bitcast_float = bitcast(cursed_float); + set_vr(op.rt4, select(bitcast_float == splat(0x3F800000) | bitcast_float == splat(0x3F800001), fsqrt(fabs(x)), fma(fnms(spu_rsqrte(x), c, cursed_float), fm(fsplat(0.5f), fm(x, spu_rsqrte(x))), fm(x, spu_rsqrte(x))))); + return; } - - return false; - }; - - if (check_sqrt_pattern_for_float(1.0f)) - return; - - if (check_sqrt_pattern_for_float(ONEISH)) - return; + } // Full reciprocal patterns // FMA(FNMS(div <*> spu_re(div), float_value) <*> spu_re(div), spu_re(div)) From 2b96a3e4efc3d343cb082bd764d766c6b47188a2 Mon Sep 17 00:00:00 2001 From: RipleyTom Date: Sat, 17 Feb 2024 09:20:00 +0100 Subject: [PATCH 8/8] More patterns --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 45 +++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 2fd8b4bba1..597987cade 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -6051,12 +6051,41 @@ public: if (check_sqrt_pattern_for_float(ONEISH)) return; - // Generate dynamic pattern for when float is unknown because of scope - if (auto [ok_fma, cursed_float] = match_expr(full_expr, fma(fnms(spu_rsqrte(x), c, MT), fm(fsplat(0.5f), c), c)); ok_fma) + // Generate dynamic pattern for when floats are unknown because of scope + if (auto [ok_fma, fnms_float, fm_float] = match_expr(full_expr, fma(fnms(spu_rsqrte(x), c, MT), fm(MT, c), c)); ok_fma) { erase_stores(a, b, c); - const auto bitcast_float = bitcast(cursed_float); - set_vr(op.rt4, select(bitcast_float == splat(0x3F800000) | bitcast_float == splat(0x3F800001), fsqrt(fabs(x)), fma(fnms(spu_rsqrte(x), c, cursed_float), fm(fsplat(0.5f), fm(x, spu_rsqrte(x))), fm(x, spu_rsqrte(x))))); + const auto bitcast_float = bitcast(fnms_float); + set_vr(op.rt4, select(fcmp_uno(fm_float == fsplat(0.5f)) & (bitcast_float == splat(0x3F800000) | bitcast_float == splat(0x3F800001)), fsqrt(fabs(x)), fma(fnms(spu_rsqrte(x), c, fnms_float), fm(fm_float, fm(x, spu_rsqrte(x))), fm(x, spu_rsqrte(x))))); + return; + } + } + + if (auto [ok_c, x] = match_expr(c, spu_rsqrte(MT)); ok_c) + { + auto check_accurate_resqrt_for_float = [&](f32 float_value) -> bool + { + if (auto [ok_fma] = match_expr(full_expr, fma(fnms(fm(c, c), x, fsplat(float_value)), fm(fsplat(0.5f), c), c)); ok_fma) + { + erase_stores(a, b, c); + set_vr(op.rt4, fsplat(1.0f)/fsqrt(fabs(x))); + return true; + } + return false; + }; + + if (check_accurate_resqrt_for_float(1.0f)) + return; + + if (check_accurate_resqrt_for_float(ONEISH)) + return; + + // Generate dynamic pattern for when floats are unknown because of scope + if (auto [ok_fma, fnms_float, fm_float] = match_expr(full_expr, fma(fnms(fm(c, c), x, MT), fm(MT, c),c)); ok_fma) + { + erase_stores(a, b, c); + const auto bitcast_float = bitcast(fnms_float); + set_vr(op.rt4, select(fcmp_uno(fm_float == fsplat(0.5f)) & (bitcast_float == splat(0x3F800000) | bitcast_float == splat(0x3F800001)), fsplat(1.0f)/fsqrt(fabs(x)), fma(fnms(fm(spu_rsqrte(x), spu_rsqrte(x)), x, fnms_float), fm(fm_float, spu_rsqrte(x)), spu_rsqrte(x)))); return; } } @@ -6082,12 +6111,12 @@ public: if (check_accurate_reciprocal_pattern_for_float(ONEISH)) return; - // Generate dynamic pattern for when float is unknown because of scope - if (auto [ok_fma, cursed_float] = match_expr(full_expr, fma(fnms(div, c, MT), c, c)); ok_fma) + // Generate dynamic pattern for when floats are unknown because of scope + if (auto [ok_fma, fnms_float] = match_expr(full_expr, fma(fnms(div, c, MT), c, c)); ok_fma) { erase_stores(a, b, c); - const auto bitcast_float = bitcast(cursed_float); - set_vr(op.rt4, select(bitcast_float == splat(0x3F800000) | bitcast_float == splat(0x3F800001), re_accurate(div), fma(fnms(spu_re(div), div, cursed_float), spu_re(div), spu_re(div)))); + const auto bitcast_float = bitcast(fnms_float); + set_vr(op.rt4, select(bitcast_float == splat(0x3F800000) | bitcast_float == splat(0x3F800001), re_accurate(div), fma(fnms(spu_re(div), div, fnms_float), spu_re(div), spu_re(div)))); return; } }