From 2acb6ed60d3dfd5875982609ade52e5d20ee7505 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Wed, 1 Sep 2021 19:49:27 +0300 Subject: [PATCH] SPU LLVM: optimize SHUFB for permutation-only shuffles Drop constant generation when unused. --- rpcs3/Emu/Cell/SPURecompiler.cpp | 46 +++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 7aadb2d94d..63425b79cb 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -7589,6 +7589,9 @@ public: return; } + // (TODO: implement via known-bits-lookup) Check whether shuffle mask doesn't contain fixed value selectors + const auto [perm_only, dummy1] = match_expr(c, match() & 31); + const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); @@ -7598,9 +7601,14 @@ public: if (auto [ok, bs] = match_expr(b, byteswap(match())); ok) { // Undo endian swapping, and rely on pshufb/vperm2b to re-reverse endianness - if (m_use_avx512_icl && (op.ra != op.rb)) { + if (perm_only) + { + set_vr(op.rt4, vperm2b(as, bs, c)); + return; + } + const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); const auto mm = select(noncast(m) >= 0, splat(0), m); const auto ab = vperm2b(as, bs, c); @@ -7611,19 +7619,26 @@ public: const auto x = avg(noncast(sext((c & 0xc0) == 0xc0)), noncast(sext((c & 0xe0) == 0xc0))); const auto ax = pshufb(as, c); const auto bx = pshufb(bs, c); - set_vr(op.rt4, select(noncast(c << 3) >= 0, ax, bx) | x); + + if (perm_only) + set_vr(op.rt4, select(noncast(c << 3) >= 0, ax, bx)); + else + set_vr(op.rt4, select(noncast(c << 3) >= 0, ax, bx) | x); return; } if (auto [ok, data] = get_const_vector(b.value, m_pos); ok) { - const bool all_bytes_equiv = data == v128::from8p(data._u8[0]); - if (all_bytes_equiv) + if (data == v128::from8p(data._u8[0])) { // See above const auto x = avg(noncast(sext((c & 0xc0) == 0xc0)), noncast(sext((c & 0xe0) == 0xc0))); const auto ax = pshufb(as, c); - set_vr(op.rt4, select(noncast(c << 3) >= 0, ax, b) | x); + + if (perm_only) + set_vr(op.rt4, select(noncast(c << 3) >= 0, ax, b)); + else + set_vr(op.rt4, select(noncast(c << 3) >= 0, ax, b) | x); return; } } @@ -7633,13 +7648,16 @@ public: { if (auto [ok, data] = get_const_vector(a.value, m_pos); ok) { - const bool all_bytes_equiv = data == v128::from8p(data._u8[0]); - if (all_bytes_equiv) + if (data == v128::from8p(data._u8[0])) { // See above const auto x = avg(noncast(sext((c & 0xc0) == 0xc0)), noncast(sext((c & 0xe0) == 0xc0))); const auto bx = pshufb(bs, c); - set_vr(op.rt4, select(noncast(c << 3) >= 0, a, bx) | x); + + if (perm_only) + set_vr(op.rt4, select(noncast(c << 3) >= 0, a, bx)); + else + set_vr(op.rt4, select(noncast(c << 3) >= 0, a, bx) | x); return; } } @@ -7647,6 +7665,12 @@ public: if (m_use_avx512_icl && (op.ra != op.rb || m_interp_magn)) { + if (perm_only) + { + set_vr(op.rt4, vperm2b(b, a, eval(~c))); + return; + } + const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); const auto mm = select(noncast(m) >= 0, splat(0), m); const auto cr = eval(~c); @@ -7659,7 +7683,11 @@ public: const auto cr = eval(c ^ 0xf); const auto ax = pshufb(a, cr); const auto bx = pshufb(b, cr); - set_vr(op.rt4, select(noncast(cr << 3) >= 0, ax, bx) | x); + + if (perm_only) + set_vr(op.rt4, select(noncast(cr << 3) >= 0, ax, bx)); + else + set_vr(op.rt4, select(noncast(cr << 3) >= 0, ax, bx) | x); } void MPYA(spu_opcode_t op)