Rewrite cpu_translator::rol, add fshl and fshr

Use new funnel shift intrinsics
This commit is contained in:
Nekotekina 2019-04-19 14:28:27 +03:00
parent 42448cf3e5
commit ac473eb400
3 changed files with 153 additions and 29 deletions

View File

@ -667,6 +667,120 @@ inline llvm_shr<T1, llvm_const_int<typename is_llvm_expr<T1>::type>> operator >>
return {a1, {c}};
}
template <typename A1, typename A2, typename A3, typename T = llvm_common_t<A1, A2, A3>>
struct llvm_fshl
{
using type = T;
llvm_expr_t<A1> a1;
llvm_expr_t<A2> a2;
llvm_expr_t<A3> a3;
static_assert(llvm_value_t<T>::is_sint || llvm_value_t<T>::is_uint, "llvm_fshl<>: invalid type");
static constexpr bool is_ok = llvm_value_t<T>::is_sint || llvm_value_t<T>::is_uint;
static llvm::Function* get_fshl(llvm::IRBuilder<>* ir)
{
const auto module = ir->GetInsertBlock()->getParent()->getParent();
return llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::fshl, {llvm_value_t<T>::get_type(ir->getContext())});
}
static llvm::Value* fold(llvm::IRBuilder<>* ir, llvm::Value* v1, llvm::Value* v2, llvm::Value* v3)
{
// Compute constant result.
const u64 size = v3->getType()->getScalarSizeInBits();
const auto val = ir->CreateURem(v3, llvm::ConstantInt::get(v3->getType(), size));
const auto shl = ir->CreateShl(v1, val);
const auto shr = ir->CreateLShr(v2, ir->CreateSub(llvm::ConstantInt::get(v3->getType(), size - 1), val));
return ir->CreateOr(shl, ir->CreateLShr(shr, 1));
}
llvm::Value* eval(llvm::IRBuilder<>* ir) const
{
const auto v1 = a1.eval(ir);
const auto v2 = a2.eval(ir);
const auto v3 = a3.eval(ir);
if (llvm::isa<llvm::Constant>(v1) && llvm::isa<llvm::Constant>(v2) && llvm::isa<llvm::Constant>(v3))
{
return fold(ir, v1, v2, v3);
}
return ir->CreateCall(get_fshl(ir), {v1, v2, v3});
}
};
template <typename A1, typename A2, typename A3, typename T = llvm_common_t<A1, A2, A3>>
struct llvm_fshr
{
using type = T;
llvm_expr_t<A1> a1;
llvm_expr_t<A2> a2;
llvm_expr_t<A3> a3;
static_assert(llvm_value_t<T>::is_sint || llvm_value_t<T>::is_uint, "llvm_fshr<>: invalid type");
static constexpr bool is_ok = llvm_value_t<T>::is_sint || llvm_value_t<T>::is_uint;
static llvm::Function* get_fshr(llvm::IRBuilder<>* ir)
{
const auto module = ir->GetInsertBlock()->getParent()->getParent();
return llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::fshr, {llvm_value_t<T>::get_type(ir->getContext())});
}
static llvm::Value* fold(llvm::IRBuilder<>* ir, llvm::Value* v1, llvm::Value* v2, llvm::Value* v3)
{
// Compute constant result.
const u64 size = v3->getType()->getScalarSizeInBits();
const auto val = ir->CreateURem(v3, llvm::ConstantInt::get(v3->getType(), size));
const auto shr = ir->CreateLShr(v2, val);
const auto shl = ir->CreateShl(v1, ir->CreateSub(llvm::ConstantInt::get(v3->getType(), size - 1), val));
return ir->CreateOr(shr, ir->CreateShl(shl, 1));
}
llvm::Value* eval(llvm::IRBuilder<>* ir) const
{
const auto v1 = a1.eval(ir);
const auto v2 = a2.eval(ir);
const auto v3 = a3.eval(ir);
if (llvm::isa<llvm::Constant>(v1) && llvm::isa<llvm::Constant>(v2) && llvm::isa<llvm::Constant>(v3))
{
return fold(ir, v1, v2, v3);
}
return ir->CreateCall(get_fshr(ir), {v1, v2, v3});
}
};
template <typename A1, typename A2, typename T = llvm_common_t<A1, A2>>
struct llvm_rol
{
using type = T;
llvm_expr_t<A1> a1;
llvm_expr_t<A2> a2;
static_assert(llvm_value_t<T>::is_sint || llvm_value_t<T>::is_uint, "llvm_rol<>: invalid type");
static constexpr bool is_ok = llvm_value_t<T>::is_sint || llvm_value_t<T>::is_uint;
llvm::Value* eval(llvm::IRBuilder<>* ir) const
{
const auto v1 = a1.eval(ir);
const auto v2 = a2.eval(ir);
if (llvm::isa<llvm::Constant>(v1) && llvm::isa<llvm::Constant>(v2))
{
return llvm_fshl<A1, A1, A2>::fold(ir, v1, v1, v2);
}
return ir->CreateCall(llvm_fshl<A1, A1, A2>::get_fshl(ir), {v1, v1, v2});
}
};
template <typename A1, typename A2, typename T = llvm_common_t<A1, A2>>
struct llvm_and
{
@ -1296,12 +1410,22 @@ public:
return llvm_max<T, U>{std::forward<T>(a), std::forward<U>(b)};
}
// Rotate left
// NOTE(review): the next two lines look like the signature of the removed rol(T, T)
// overload, shown here because this text is a diff rendering without +/- markers — confirm.
template <typename T>
static inline auto rol(T a, T b)
// Funnel shift left: wraps the three forwarded operands in an llvm_fshl expression node.
// Constrained through enable_if on llvm_fshl<T, U, V>::is_ok.
template <typename T, typename U, typename V, typename = std::enable_if_t<llvm_fshl<T, U, V>::is_ok>>
static auto fshl(T&& a, U&& b, V&& c)
{
// NOTE(review): the mask/return pair below appears to be the old shift-and-or rotate body
// (a << (b & mask) | a >> (-b & mask)); only the llvm_fshl return belongs to fshl — confirm.
static constexpr u64 mask = value_t<typename T::type>::esize - 1;
return a << (b & mask) | a >> (-b & mask);
return llvm_fshl<T, U, V>{std::forward<T>(a), std::forward<U>(b), std::forward<V>(c)};
}
template <typename T, typename U, typename V, typename = std::enable_if_t<llvm_fshr<T, U, V>::is_ok>>
static auto fshr(T&& a, U&& b, V&& c)
{
return llvm_fshr<T, U, V>{std::forward<T>(a), std::forward<U>(b), std::forward<V>(c)};
}
template <typename T, typename U, typename = std::enable_if_t<llvm_rol<T, U>::is_ok>>
static auto rol(T&& a, U&& b)
{
return llvm_rol<T, U>{std::forward<T>(a), std::forward<U>(b)};
}
// Add with saturation

View File

@ -1301,20 +1301,20 @@ void PPUTranslator::VRFIZ(ppu_opcode_t op)
// VRLB: rotates the va operand left by the vb operand and stores the result in vd.
// NOTE(review): two equivalent-looking sequences are present (GetVrs/RotateLeft/SetVr and
// get_vrs/rol/set_vr); this looks like removed and added diff lines shown together — confirm.
void PPUTranslator::VRLB(ppu_opcode_t op)
{
// Older form: operands fetched as VrType::vi8 and rotated with RotateLeft.
const auto ab = GetVrs(VrType::vi8, op.va, op.vb);
SetVr(op.vd, RotateLeft(ab[0], ab[1]));
// Newer form: operands fetched as u8[16] and rotated with the rol() expression.
const auto [a, b] = get_vrs<u8[16]>(op.va, op.vb);
set_vr(op.vd, rol(a, b));
}
// VRLH: rotates the va operand left by the vb operand and stores the result in vd.
// NOTE(review): two equivalent-looking sequences are present (GetVrs/RotateLeft/SetVr and
// get_vrs/rol/set_vr); this looks like removed and added diff lines shown together — confirm.
void PPUTranslator::VRLH(ppu_opcode_t op)
{
// Older form: operands fetched as VrType::vi16 and rotated with RotateLeft.
const auto ab = GetVrs(VrType::vi16, op.va, op.vb);
SetVr(op.vd, RotateLeft(ab[0], ab[1]));
// Newer form: operands fetched as u16[8] and rotated with the rol() expression.
const auto [a, b] = get_vrs<u16[8]>(op.va, op.vb);
set_vr(op.vd, rol(a, b));
}
// VRLW: rotates the va operand left by the vb operand and stores the result in vd.
// NOTE(review): two equivalent-looking sequences are present (GetVrs/RotateLeft/SetVr and
// get_vrs/rol/set_vr); this looks like removed and added diff lines shown together — confirm.
void PPUTranslator::VRLW(ppu_opcode_t op)
{
// Older form: operands fetched as VrType::vi32 and rotated with RotateLeft.
const auto ab = GetVrs(VrType::vi32, op.va, op.vb);
SetVr(op.vd, RotateLeft(ab[0], ab[1]));
// Newer form: operands fetched as u32[4] and rotated with the rol() expression.
const auto [a, b] = get_vrs<u32[4]>(op.va, op.vb);
set_vr(op.vd, rol(a, b));
}
void PPUTranslator::VRSQRTEFP(ppu_opcode_t op)

View File

@ -5141,23 +5141,23 @@ public:
// ROTQBI: combines register ra with a lane-shuffled copy of itself, using a bit count
// taken from rb masked with 0x7, and writes the result to rt.
// NOTE(review): `a` and `b` are each declared twice below; the first three statements look
// like the removed u64[2] shift-and-or version and the last three like its fshl-based
// replacement (diff lines without +/- markers) — confirm.
void ROTQBI(spu_opcode_t op)
{
// Older form: u64[2] lanes, count taken from (rb >> 32) & 0x7.
const auto a = get_vr<u64[2]>(op.ra);
const auto b = eval((get_vr<u64[2]>(op.rb) >> 32) & 0x7);
set_vr(op.rt, a << zshuffle<u64[2]>(b, 1, 1) | zshuffle<u64[2]>(a, 1, 0) >> 56 >> zshuffle<u64[2]>(8 - b, 1, 1));
// Newer form: the masked count is shuffled with indices 3,3,3,3 (presumably a broadcast
// of element 3 — verify zshuffle semantics) and fed to fshl together with a and a
// copy of a shuffled with indices 3,0,1,2.
const auto a = get_vr(op.ra);
const auto b = zshuffle<u32[4]>(get_vr(op.rb) & 0x7, 3, 3, 3, 3);
set_vr(op.rt, fshl(a, zshuffle<u32[4]>(a, 3, 0, 1, 2), b));
}
// ROTQMBI: combines register ra with a lane-shuffled copy of itself, using a bit count
// taken from the negated rb masked with 0x7, and writes the result to rt.
// NOTE(review): `a` and `b` are each declared twice below; the first three statements look
// like the removed u64[2] shift-and-or version and the last three like its fshr-based
// replacement (diff lines without +/- markers) — confirm.
void ROTQMBI(spu_opcode_t op)
{
// Older form: u64[2] lanes, count taken from -(rb >> 32) & 0x7.
const auto a = get_vr<u64[2]>(op.ra);
const auto b = eval(-(get_vr<u64[2]>(op.rb) >> 32) & 0x7);
set_vr(op.rt, a >> zshuffle<u64[2]>(b, 1, 1) | zshuffle<u64[2]>(a, 1, 2) << 56 << zshuffle<u64[2]>(8 - b, 1, 1));
// Newer form: the masked negated count is shuffled with indices 3,3,3,3 and fed to fshr
// together with a and a copy of a shuffled with indices 1,2,3,4 (index 4 is outside the
// four source lanes; presumably zshuffle yields zero there — verify).
const auto a = get_vr(op.ra);
const auto b = zshuffle<u32[4]>(-get_vr(op.rb) & 0x7, 3, 3, 3, 3);
set_vr(op.rt, fshr(zshuffle<u32[4]>(a, 1, 2, 3, 4), a, b));
}
// SHLQBI: combines register ra with a lane-shuffled copy of itself, using a bit count
// taken from rb masked with 0x7, and writes the result to rt.
// NOTE(review): `a` and `b` are each declared twice below; the first three statements look
// like the removed u64[2] shift-and-or version and the last three like its fshl-based
// replacement (diff lines without +/- markers) — confirm.
void SHLQBI(spu_opcode_t op)
{
// Older form: u64[2] lanes, count taken from (rb >> 32) & 0x7.
const auto a = get_vr<u64[2]>(op.ra);
const auto b = eval((get_vr<u64[2]>(op.rb) >> 32) & 0x7);
set_vr(op.rt, a << zshuffle<u64[2]>(b, 1, 1) | zshuffle<u64[2]>(a, 2, 0) >> 56 >> zshuffle<u64[2]>(8 - b, 1, 1));
// Newer form: the masked count is shuffled with indices 3,3,3,3 and fed to fshl together
// with a and a copy of a shuffled with indices 4,0,1,2 (index 4 is outside the four
// source lanes; presumably zshuffle yields zero there — verify).
const auto a = get_vr(op.ra);
const auto b = zshuffle<u32[4]>(get_vr(op.rb) & 0x7, 3, 3, 3, 3);
set_vr(op.rt, fshl(a, zshuffle<u32[4]>(a, 4, 0, 1, 2), b));
}
void ROTQBY(spu_opcode_t op)
@ -5233,23 +5233,23 @@ public:
// ROTQBII: immediate-count variant; combines register ra with a lane-shuffled copy of
// itself, using the i7 immediate masked with 0x7, and writes the result to rt.
// NOTE(review): `a` and `b` are each declared twice below; the first three statements look
// like the removed u64[2] shift-and-or version and the last three like its fshl-based
// replacement (diff lines without +/- markers) — confirm.
void ROTQBII(spu_opcode_t op)
{
// Older form: u64[2] lanes with an explicit shift-and-or.
const auto a = get_vr<u64[2]>(op.ra);
const auto b = eval(get_imm<u64[2]>(op.i7, false) & 0x7);
set_vr(op.rt, a << b | zshuffle<u64[2]>(a, 1, 0) >> 56 >> (8 - b));
// Newer form: fshl of a with a copy of a shuffled with indices 3,0,1,2.
const auto a = get_vr(op.ra);
const auto b = eval(get_imm(op.i7, false) & 0x7);
set_vr(op.rt, fshl(a, zshuffle<u32[4]>(a, 3, 0, 1, 2), b));
}
// ROTQMBII: immediate-count variant; combines register ra with a lane-shuffled copy of
// itself, using the negated i7 immediate masked with 0x7, and writes the result to rt.
// NOTE(review): `a` and `b` are each declared twice below; the first three statements look
// like the removed u64[2] shift-and-or version and the last three like its fshr-based
// replacement (diff lines without +/- markers) — confirm.
void ROTQMBII(spu_opcode_t op)
{
// Older form: u64[2] lanes with an explicit shift-and-or.
const auto a = get_vr<u64[2]>(op.ra);
const auto b = eval(-get_imm<u64[2]>(op.i7, false) & 0x7);
set_vr(op.rt, a >> b | zshuffle<u64[2]>(a, 1, 2) << 56 << (8 - b));
// Newer form: fshr of a copy of a shuffled with indices 1,2,3,4 and a itself
// (index 4 is outside the four source lanes; presumably zshuffle yields zero there — verify).
const auto a = get_vr(op.ra);
const auto b = eval(-get_imm(op.i7, false) & 0x7);
set_vr(op.rt, fshr(zshuffle<u32[4]>(a, 1, 2, 3, 4), a, b));
}
// SHLQBII: immediate-count variant; combines register ra with a lane-shuffled copy of
// itself, using the i7 immediate masked with 0x7, and writes the result to rt.
// NOTE(review): `a` and `b` are each declared twice below; the first three statements look
// like the removed u64[2] shift-and-or version and the last three like its fshl-based
// replacement (diff lines without +/- markers) — confirm.
void SHLQBII(spu_opcode_t op)
{
// Older form: u64[2] lanes with an explicit shift-and-or.
const auto a = get_vr<u64[2]>(op.ra);
const auto b = eval(get_imm<u64[2]>(op.i7, false) & 0x7);
set_vr(op.rt, a << b | zshuffle<u64[2]>(a, 2, 0) >> 56 >> (8 - b));
// Newer form: fshl of a with a copy of a shuffled with indices 4,0,1,2
// (index 4 is outside the four source lanes; presumably zshuffle yields zero there — verify).
const auto a = get_vr(op.ra);
const auto b = eval(get_imm(op.i7, false) & 0x7);
set_vr(op.rt, fshl(a, zshuffle<u32[4]>(a, 4, 0, 1, 2), b));
}
void ROTQBYI(spu_opcode_t op)