mirror of https://github.com/RPCS3/rpcs3.git
PPU/SPU LLVM: Emulate VPERM2B with a 256 bit wide VPERMB
- Save 1 uop by using a 256 bit wide VPERMB instead of VPERM2B (compiles down to a vinserti128 and a vpermb).
This commit is contained in:
parent
68fdc49528
commit
f06c8b22e8
|
@ -3442,6 +3442,70 @@ public:
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emulate the behavior of VPERM2B by using a 256 bit wide VPERMB
|
||||||
|
template <typename T1, typename T2, typename T3>
|
||||||
|
value_t<u8[16]> vperm2b256to128(T1 a, T2 b, T3 c)
|
||||||
|
{
|
||||||
|
value_t<u8[16]> result;
|
||||||
|
|
||||||
|
const auto data0 = a.eval(m_ir);
|
||||||
|
const auto data1 = b.eval(m_ir);
|
||||||
|
const auto index = c.eval(m_ir);
|
||||||
|
|
||||||
|
// May be slower than non constant path?
|
||||||
|
if (auto c = llvm::dyn_cast<llvm::Constant>(index))
|
||||||
|
{
|
||||||
|
// Convert VPERM2B index back to LLVM vector shuffle mask
|
||||||
|
v128 mask{};
|
||||||
|
|
||||||
|
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
|
||||||
|
|
||||||
|
if (cv)
|
||||||
|
{
|
||||||
|
for (u32 i = 0; i < 16; i++)
|
||||||
|
{
|
||||||
|
const u64 b = cv->getElementAsInteger(i);
|
||||||
|
mask._u8[i] = b & 0x1f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
|
||||||
|
{
|
||||||
|
result.value = llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef(reinterpret_cast<const u8*>(&mask), 16));
|
||||||
|
result.value = m_ir->CreateZExt(result.value, get_type<u32[16]>());
|
||||||
|
result.value = m_ir->CreateShuffleVector(data0, data1, result.value);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto zeroes = llvm::ConstantAggregateZero::get(get_type<u8[16]>());
|
||||||
|
const auto zeroes32 = llvm::ConstantAggregateZero::get(get_type<u8[32]>());
|
||||||
|
|
||||||
|
value_t<u8[32]> intermediate;
|
||||||
|
value_t<u8[32]> shuffle;
|
||||||
|
value_t<u8[32]> shuffleindex;
|
||||||
|
|
||||||
|
u8 mask32[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||||
|
u8 mask16[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||||
|
|
||||||
|
// insert the second source operand into the same vector as the first source operand and expand to 256 bit width
|
||||||
|
shuffle.value = llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef(reinterpret_cast<const u8*>(&mask32), 32));
|
||||||
|
shuffle.value = m_ir->CreateZExt(shuffle.value, get_type<u32[32]>());
|
||||||
|
intermediate.value = m_ir->CreateShuffleVector(data0, data1, shuffle.value);
|
||||||
|
|
||||||
|
// expand the shuffle index to 256 bits with zeroes
|
||||||
|
shuffleindex.value = m_ir->CreateShuffleVector(index, zeroes, shuffle.value);
|
||||||
|
|
||||||
|
// permute
|
||||||
|
intermediate.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_avx512_permvar_qi_256), {intermediate.value, shuffleindex.value});
|
||||||
|
|
||||||
|
// convert the 256 bit vector back to 128 bits
|
||||||
|
result.value = llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef(reinterpret_cast<const u8*>(&mask16), 16));
|
||||||
|
result.value = m_ir->CreateZExt(result.value, get_type<u32[16]>());
|
||||||
|
result.value = m_ir->CreateShuffleVector(intermediate.value, zeroes32, result.value);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
llvm::Value* load_const(llvm::GlobalVariable* g, llvm::Value* i)
|
llvm::Value* load_const(llvm::GlobalVariable* g, llvm::Value* i)
|
||||||
{
|
{
|
||||||
return m_ir->CreateLoad(m_ir->CreateGEP(g, {m_ir->getInt64(0), m_ir->CreateZExtOrTrunc(i, get_type<u64>())}));
|
return m_ir->CreateLoad(m_ir->CreateGEP(g, {m_ir->getInt64(0), m_ir->CreateZExtOrTrunc(i, get_type<u64>())}));
|
||||||
|
|
|
@ -1374,7 +1374,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
|
||||||
if (m_use_avx512_icl && op.ra != op.rb)
|
if (m_use_avx512_icl && op.ra != op.rb)
|
||||||
{
|
{
|
||||||
const auto i = eval(~c);
|
const auto i = eval(~c);
|
||||||
set_vr(op.vd, vperm2b(b, a, i));
|
set_vr(op.vd, vperm2b256to128(b, a, i));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -7645,13 +7645,13 @@ public:
|
||||||
{
|
{
|
||||||
if (perm_only)
|
if (perm_only)
|
||||||
{
|
{
|
||||||
set_vr(op.rt4, vperm2b(as, bs, c));
|
set_vr(op.rt4, vperm2b256to128(as, bs, c));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
|
const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
|
||||||
const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
|
const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
|
||||||
const auto ab = vperm2b(as, bs, c);
|
const auto ab = vperm2b256to128(as, bs, c);
|
||||||
set_vr(op.rt4, select(noncast<s8[16]>(c) >= 0, ab, mm));
|
set_vr(op.rt4, select(noncast<s8[16]>(c) >= 0, ab, mm));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -7707,14 +7707,14 @@ public:
|
||||||
{
|
{
|
||||||
if (perm_only)
|
if (perm_only)
|
||||||
{
|
{
|
||||||
set_vr(op.rt4, vperm2b(b, a, eval(~c)));
|
set_vr(op.rt4, vperm2b256to128(b, a, eval(~c)));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
|
const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
|
||||||
const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
|
const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
|
||||||
const auto cr = eval(~c);
|
const auto cr = eval(~c);
|
||||||
const auto ab = vperm2b(b, a, cr);
|
const auto ab = vperm2b256to128(b, a, cr);
|
||||||
set_vr(op.rt4, select(noncast<s8[16]>(cr) >= 0, mm, ab));
|
set_vr(op.rt4, select(noncast<s8[16]>(cr) >= 0, mm, ab));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue