PPU Precise/LLVM: Support NJ modes (#8617)

This commit is contained in:
Eladash 2020-07-25 09:41:41 +03:00 committed by GitHub
parent 3354c800d7
commit 917069e31a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 98 additions and 55 deletions

View File

@ -359,6 +359,8 @@ public:
} }
const g_ppu_scale_table; const g_ppu_scale_table;
constexpr u32 ppu_inf_u32 = 0x7F800000u;
static const f32 ppu_inf_f32 = std::bit_cast<f32>(ppu_inf_u32);
constexpr u32 ppu_nan_u32 = 0x7FC00000u; constexpr u32 ppu_nan_u32 = 0x7FC00000u;
static const f32 ppu_nan_f32 = std::bit_cast<f32>(ppu_nan_u32); static const f32 ppu_nan_f32 = std::bit_cast<f32>(ppu_nan_u32);
static const v128 ppu_vec_nans = v128::from32p(ppu_nan_u32); static const v128 ppu_vec_nans = v128::from32p(ppu_nan_u32);
@ -403,6 +405,14 @@ v128 vec_handle_nan(__m128 result, Args... args)
return vec_handle_nan(v128::fromF(result), v128::fromF(args)...); return vec_handle_nan(v128::fromF(result), v128::fromF(args)...);
} }
// Reduce every 32-bit lane of `a` whose bits under ppu.jm_mask are all zero to
// just its sign bit; all other lanes pass through untouched.
// MTVSCR sets jm_mask to the exponent bits (0x7F800000) when NJ is 1, so zeros
// and denormals become signed zero. Otherwise it is 0x7FFFFFFF, which only
// matches +/-0 and therefore leaves every lane as it was.
inline v128 vec_handle_denormal(ppu_thread& ppu, v128 a)
{
	const auto jm_bits = v128::from32p(ppu.jm_mask);

	// All-ones in each lane where the masked bits are zero
	const auto hit = v128::eq32(jm_bits & a, v128{});

	// Shift the sign bit out of the selector so it survives the and-not below
	const auto clear_bits = v128::fromV(_mm_srli_epi32(hit.vi, 1));

	return v128::andnot(clear_bits, a);
}
bool ppu_interpreter::MFVSCR(ppu_thread& ppu, ppu_opcode_t op) bool ppu_interpreter::MFVSCR(ppu_thread& ppu, ppu_opcode_t op)
{ {
ppu.vr[op.vd] = v128::from32(0, 0, 0, u32{ppu.sat} | (u32{ppu.nj} << 16)); ppu.vr[op.vd] = v128::from32(0, 0, 0, u32{ppu.sat} | (u32{ppu.nj} << 16));
@ -414,6 +424,7 @@ bool ppu_interpreter::MTVSCR(ppu_thread& ppu, ppu_opcode_t op)
const u32 vscr = ppu.vr[op.vb]._u32[3]; const u32 vscr = ppu.vr[op.vb]._u32[3];
ppu.sat = (vscr & 1) != 0; ppu.sat = (vscr & 1) != 0;
ppu.nj = (vscr & 0x10000) != 0; ppu.nj = (vscr & 0x10000) != 0;
ppu.jm_mask = ppu.nj ? ppu_inf_u32 : 0x7fff'ffff;
return true; return true;
} }
@ -427,10 +438,10 @@ bool ppu_interpreter::VADDCUW(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter::VADDFP(ppu_thread& ppu, ppu_opcode_t op) bool ppu_interpreter::VADDFP(ppu_thread& ppu, ppu_opcode_t op)
{ {
const auto a = ppu.vr[op.va]; const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]);
const auto b = ppu.vr[op.vb]; const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
const auto result = v128::addfs(a, b); const auto result = v128::addfs(a, b);
ppu.vr[op.vd] = vec_handle_nan(result, a, b); ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
return true; return true;
} }
@ -958,26 +969,26 @@ bool ppu_interpreter::VLOGEFP(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter_fast::VMADDFP(ppu_thread& ppu, ppu_opcode_t op) bool ppu_interpreter_fast::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
{ {
const auto a = ppu.vr[op.va].vf; const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]).vf;
const auto b = ppu.vr[op.vb].vf; const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]).vf;
const auto c = ppu.vr[op.vc].vf; const auto c = vec_handle_denormal(ppu, ppu.vr[op.vc]).vf;
const auto result = _mm_add_ps(_mm_mul_ps(a, c), b); const auto result = _mm_add_ps(_mm_mul_ps(a, c), b);
ppu.vr[op.vd] = vec_handle_nan(result); ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result));
return true; return true;
} }
bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op) bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
{ {
const auto a = ppu.vr[op.va]; const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]);
const auto b = ppu.vr[op.vb]; const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
const auto c = ppu.vr[op.vc]; const auto c = vec_handle_denormal(ppu, ppu.vr[op.vc]);
ppu.vr[op.rd] = vec_handle_nan(v128::fma32f(a, c, b), a, b, c); ppu.vr[op.rd] = vec_handle_denormal(ppu, vec_handle_nan(v128::fma32f(a, c, b), a, b, c));
return true; return true;
} }
bool ppu_interpreter::VMAXFP(ppu_thread& ppu, ppu_opcode_t op) bool ppu_interpreter::VMAXFP(ppu_thread& ppu, ppu_opcode_t op)
{ {
ppu.vr[op.vd] = vec_handle_nan(_mm_max_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf)); ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(_mm_max_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf)));
return true; return true;
} }
@ -1123,7 +1134,7 @@ bool ppu_interpreter::VMINFP(ppu_thread& ppu, ppu_opcode_t op)
const auto a = ppu.vr[op.va].vf; const auto a = ppu.vr[op.va].vf;
const auto b = ppu.vr[op.vb].vf; const auto b = ppu.vr[op.vb].vf;
const auto result = _mm_or_ps(_mm_min_ps(a, b), _mm_min_ps(b, a)); const auto result = _mm_or_ps(_mm_min_ps(a, b), _mm_min_ps(b, a));
ppu.vr[op.vd] = vec_handle_nan(result, a, b); ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
return true; return true;
} }
@ -1463,18 +1474,18 @@ bool ppu_interpreter_fast::VNMSUBFP(ppu_thread& ppu, ppu_opcode_t op)
const auto a = _mm_sub_ps(_mm_mul_ps(ppu.vr[op.va].vf, ppu.vr[op.vc].vf), ppu.vr[op.vb].vf); const auto a = _mm_sub_ps(_mm_mul_ps(ppu.vr[op.va].vf, ppu.vr[op.vc].vf), ppu.vr[op.vb].vf);
const auto b = _mm_set1_ps(-0.0f); const auto b = _mm_set1_ps(-0.0f);
const auto result = _mm_xor_ps(a, b); const auto result = _mm_xor_ps(a, b);
ppu.vr[op.vd] = vec_handle_nan(result, a, b); ppu.vr[op.vd] = vec_handle_nan(result);
return true; return true;
} }
bool ppu_interpreter_precise::VNMSUBFP(ppu_thread& ppu, ppu_opcode_t op) bool ppu_interpreter_precise::VNMSUBFP(ppu_thread& ppu, ppu_opcode_t op)
{ {
const auto m = _mm_set1_ps(-0.0f); const auto m = _mm_set1_ps(-0.0f);
const auto a = ppu.vr[op.va]; const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]);
const auto c = ppu.vr[op.vc]; const auto c = vec_handle_denormal(ppu, ppu.vr[op.vc]);
const auto b = v128::fromF(_mm_xor_ps(ppu.vr[op.vb].vf, m)); const auto b = v128::fromF(_mm_xor_ps(ppu.vr[op.vb].vf, m));
const auto r = v128::fromF(_mm_xor_ps(v128::fma32f(a, c, b).vf, m)); const auto r = v128::fromF(_mm_xor_ps(v128::fma32f(a, c, b).vf, m));
ppu.vr[op.rd] = vec_handle_nan(r, a, b, c); ppu.vr[op.rd] = vec_handle_denormal(ppu, vec_handle_nan(r, a, b, c));
return true; return true;
} }
@ -1874,15 +1885,15 @@ bool ppu_interpreter_precise::VPKUWUS(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter::VREFP(ppu_thread& ppu, ppu_opcode_t op) bool ppu_interpreter::VREFP(ppu_thread& ppu, ppu_opcode_t op)
{ {
const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f); const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
const auto b = ppu.vr[op.vb].vf; const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]).vf;
const auto result = _mm_div_ps(a, b); const auto result = _mm_div_ps(a, b);
ppu.vr[op.vd] = vec_handle_nan(result, a, b); ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
return true; return true;
} }
bool ppu_interpreter::VRFIM(ppu_thread& ppu, ppu_opcode_t op) bool ppu_interpreter::VRFIM(ppu_thread& ppu, ppu_opcode_t op)
{ {
const auto b = ppu.vr[op.vb]; const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
v128 d; v128 d;
for (uint w = 0; w < 4; w++) for (uint w = 0; w < 4; w++)
@ -1890,7 +1901,7 @@ bool ppu_interpreter::VRFIM(ppu_thread& ppu, ppu_opcode_t op)
d._f[w] = std::floor(b._f[w]); d._f[w] = std::floor(b._f[w]);
} }
ppu.vr[op.vd] = vec_handle_nan(d, b); ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b));
return true; return true;
} }
@ -1904,13 +1915,13 @@ bool ppu_interpreter::VRFIN(ppu_thread& ppu, ppu_opcode_t op)
d._f[w] = std::nearbyint(b._f[w]); d._f[w] = std::nearbyint(b._f[w]);
} }
ppu.vr[op.vd] = vec_handle_nan(d, b); ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b));
return true; return true;
} }
bool ppu_interpreter::VRFIP(ppu_thread& ppu, ppu_opcode_t op) bool ppu_interpreter::VRFIP(ppu_thread& ppu, ppu_opcode_t op)
{ {
const auto b = ppu.vr[op.vb]; const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
v128 d; v128 d;
for (uint w = 0; w < 4; w++) for (uint w = 0; w < 4; w++)
@ -1918,7 +1929,7 @@ bool ppu_interpreter::VRFIP(ppu_thread& ppu, ppu_opcode_t op)
d._f[w] = std::ceil(b._f[w]); d._f[w] = std::ceil(b._f[w]);
} }
ppu.vr[op.vd] = vec_handle_nan(d, b); ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b));
return true; return true;
} }
@ -1932,7 +1943,7 @@ bool ppu_interpreter::VRFIZ(ppu_thread& ppu, ppu_opcode_t op)
d._f[w] = std::truncf(b._f[w]); d._f[w] = std::truncf(b._f[w]);
} }
ppu.vr[op.vd] = vec_handle_nan(d, b); ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b));
return true; return true;
} }
@ -1978,9 +1989,9 @@ bool ppu_interpreter::VRLW(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter::VRSQRTEFP(ppu_thread& ppu, ppu_opcode_t op) bool ppu_interpreter::VRSQRTEFP(ppu_thread& ppu, ppu_opcode_t op)
{ {
const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f); const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
const auto b = ppu.vr[op.vb].vf; const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]).vf;
const auto result = _mm_div_ps(a, _mm_sqrt_ps(b)); const auto result = _mm_div_ps(a, _mm_sqrt_ps(b));
ppu.vr[op.vd] = vec_handle_nan(result, a, b); ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
return true; return true;
} }
@ -2277,10 +2288,10 @@ bool ppu_interpreter::VSUBCUW(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter::VSUBFP(ppu_thread& ppu, ppu_opcode_t op) bool ppu_interpreter::VSUBFP(ppu_thread& ppu, ppu_opcode_t op)
{ {
const auto a = ppu.vr[op.va]; const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]);
const auto b = ppu.vr[op.vb]; const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
const auto result = v128::subfs(a, b); const auto result = v128::subfs(a, b);
ppu.vr[op.vd] = vec_handle_nan(result, a, b); ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
return true; return true;
} }

View File

@ -1613,6 +1613,7 @@ extern void ppu_initialize(const ppu_module& info)
non_win32, non_win32,
accurate_fma, accurate_fma,
accurate_ppu_vector_nan, accurate_ppu_vector_nan,
java_mode_handling,
__bitset_enum_max __bitset_enum_max
}; };
@ -1630,6 +1631,10 @@ extern void ppu_initialize(const ppu_module& info)
{ {
settings += ppu_settings::accurate_ppu_vector_nan; settings += ppu_settings::accurate_ppu_vector_nan;
} }
if (g_cfg.core.llvm_ppu_jm_handling)
{
settings += ppu_settings::java_mode_handling;
}
// Write version, hash, CPU, settings // Write version, hash, CPU, settings
fmt::append(obj_name, "v3-tane-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu)); fmt::append(obj_name, "v3-tane-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));

View File

@ -186,7 +186,10 @@ public:
exception, the corresponding element in the target vr is cleared to '0'. In both cases, the '0' exception, the corresponding element in the target vr is cleared to '0'. In both cases, the '0'
has the same sign as the denormalized or underflowing value. has the same sign as the denormalized or underflowing value.
*/ */
bool nj = false; bool nj = true;
// Optimization: precomputed java-mode mask for handling denormals
u32 jm_mask = 0x7f80'0000;
u32 raddr{0}; // Reservation addr u32 raddr{0}; // Reservation addr
u64 rtime{0}; u64 rtime{0};

View File

@ -46,6 +46,8 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo
thread_struct.insert(thread_struct.end(), 3, GetType<bool>()); // so, ov, ca thread_struct.insert(thread_struct.end(), 3, GetType<bool>()); // so, ov, ca
thread_struct.insert(thread_struct.end(), 1, GetType<u8>()); // cnt thread_struct.insert(thread_struct.end(), 1, GetType<u8>()); // cnt
thread_struct.insert(thread_struct.end(), 2, GetType<bool>()); // sat, nj thread_struct.insert(thread_struct.end(), 2, GetType<bool>()); // sat, nj
thread_struct.emplace_back(ArrayType::get(GetType<char>(), 2)); // Padding
thread_struct.insert(thread_struct.end(), 1, GetType<u32>()); // jm_mask
m_thread_type = StructType::create(m_context, thread_struct, "context_t"); m_thread_type = StructType::create(m_context, thread_struct, "context_t");
@ -231,6 +233,25 @@ Value* PPUTranslator::VecHandleNan(Value* val)
return val; return val;
} }
// Emit IR that applies the loaded jm_mask to `val`, lane by lane: a 32-bit lane
// whose masked bits are all zero keeps only its sign bit, any other lane is
// left unchanged. The input is reinterpreted as u32[4] if needed and the
// result is cast back to the input's original type.
Value* PPUTranslator::VecHandleDenormal(Value* val)
{
	const auto in_type = val->getType();
	const bool is_u32x4 = in_type == GetType<u32[4]>();

	// Operate on the raw 32-bit lanes
	const auto bits = is_u32x4 ? val : m_ir->CreateBitCast(val, GetType<u32[4]>());

	// Per-lane test: (bits & jm_mask) == 0, widened to an all-ones/all-zeros lane
	const auto jm_bits = Broadcast(RegLoad(m_jm_mask), 4);
	const auto masked = m_ir->CreateAnd(bits, jm_bits);
	const auto is_zero = m_ir->CreateICmpEQ(masked, ConstantVector::getSplat(4, m_ir->getInt32(0)));
	const auto selector = SExt(is_zero, GetType<s32[4]>());

	// Drop the sign bit from the selector, then clear the selected bits
	const auto clear_bits = m_ir->CreateLShr(selector, 1);
	const auto flushed = m_ir->CreateAnd(m_ir->CreateNot(clear_bits), bits);

	return is_u32x4 ? flushed : m_ir->CreateBitCast(flushed, in_type);
}
// Post-process a vector float result according to the core settings:
// NaN handling first (if llvm_ppu_accurate_vector_nan is enabled), then
// denormal handling (if llvm_ppu_jm_handling is enabled). With both settings
// disabled the value is returned as-is.
Value* PPUTranslator::VecHandleResult(Value* val)
{
	if (g_cfg.core.llvm_ppu_accurate_vector_nan)
	{
		val = VecHandleNan(val);
	}

	if (g_cfg.core.llvm_ppu_jm_handling)
	{
		val = VecHandleDenormal(val);
	}

	return val;
}
Value* PPUTranslator::GetAddr(u64 _add) Value* PPUTranslator::GetAddr(u64 _add)
{ {
if (m_reloc) if (m_reloc)
@ -609,7 +630,9 @@ void PPUTranslator::MFVSCR(ppu_opcode_t op)
void PPUTranslator::MTVSCR(ppu_opcode_t op) void PPUTranslator::MTVSCR(ppu_opcode_t op)
{ {
const auto vscr = m_ir->CreateExtractElement(GetVr(op.vb, VrType::vi32), m_ir->getInt32(m_is_be ? 3 : 0)); const auto vscr = m_ir->CreateExtractElement(GetVr(op.vb, VrType::vi32), m_ir->getInt32(m_is_be ? 3 : 0));
RegStore(Trunc(m_ir->CreateLShr(vscr, 16), GetType<bool>()), m_nj); const auto nj = Trunc(m_ir->CreateLShr(vscr, 16), GetType<bool>());
RegStore(nj, m_nj);
if (g_cfg.core.llvm_ppu_jm_handling) RegStore(m_ir->CreateSelect(nj, m_ir->getInt32(0x7f80'0000), m_ir->getInt32(0x7fff'ffff)), m_jm_mask);
RegStore(Trunc(vscr, GetType<bool>()), m_sat); RegStore(Trunc(vscr, GetType<bool>()), m_sat);
} }
@ -625,7 +648,7 @@ void PPUTranslator::VADDFP(ppu_opcode_t op)
const auto a = get_vr<f32[4]>(op.va); const auto a = get_vr<f32[4]>(op.va);
const auto b = get_vr<f32[4]>(op.vb); const auto b = get_vr<f32[4]>(op.vb);
set_vr(op.vd, vec_handle_nan(a + b)); set_vr(op.vd, vec_handle_result(a + b));
} }
void PPUTranslator::VADDSBS(ppu_opcode_t op) void PPUTranslator::VADDSBS(ppu_opcode_t op)
@ -930,7 +953,7 @@ void PPUTranslator::VMADDFP(ppu_opcode_t op)
if (data == v128{}) if (data == v128{})
{ {
set_vr(op.vd, vec_handle_nan(a * c)); set_vr(op.vd, vec_handle_result(a * c));
ppu_log.notice("LLVM: VMADDFP with 0 addend at [0x%08x]", m_addr + (m_reloc ? m_reloc->addr : 0)); ppu_log.notice("LLVM: VMADDFP with 0 addend at [0x%08x]", m_addr + (m_reloc ? m_reloc->addr : 0));
return; return;
} }
@ -938,7 +961,7 @@ void PPUTranslator::VMADDFP(ppu_opcode_t op)
if (m_use_fma) if (m_use_fma)
{ {
SetVr(op.vd, VecHandleNan(m_ir->CreateCall(get_intrinsic<f32[4]>(llvm::Intrinsic::fma), { a.value, c.value, b.value }))); SetVr(op.vd, VecHandleResult(m_ir->CreateCall(get_intrinsic<f32[4]>(llvm::Intrinsic::fma), { a.value, c.value, b.value })));
return; return;
} }
@ -948,13 +971,13 @@ void PPUTranslator::VMADDFP(ppu_opcode_t op)
const auto xc = m_ir->CreateFPExt(c.value, get_type<f64[4]>()); const auto xc = m_ir->CreateFPExt(c.value, get_type<f64[4]>());
const auto xr = m_ir->CreateCall(get_intrinsic<f64[4]>(llvm::Intrinsic::fmuladd), {xa, xc, xb}); const auto xr = m_ir->CreateCall(get_intrinsic<f64[4]>(llvm::Intrinsic::fmuladd), {xa, xc, xb});
SetVr(op.vd, VecHandleNan(m_ir->CreateFPTrunc(xr, get_type<f32[4]>()))); SetVr(op.vd, VecHandleResult(m_ir->CreateFPTrunc(xr, get_type<f32[4]>())));
} }
void PPUTranslator::VMAXFP(ppu_opcode_t op) void PPUTranslator::VMAXFP(ppu_opcode_t op)
{ {
const auto ab = GetVrs(VrType::vf, op.va, op.vb); const auto ab = GetVrs(VrType::vf, op.va, op.vb);
SetVr(op.vd, VecHandleNan(m_ir->CreateSelect(m_ir->CreateFCmpOGT(ab[0], ab[1]), ab[0], ab[1]))); SetVr(op.vd, VecHandleResult(m_ir->CreateSelect(m_ir->CreateFCmpOGT(ab[0], ab[1]), ab[0], ab[1])));
} }
void PPUTranslator::VMAXSB(ppu_opcode_t op) void PPUTranslator::VMAXSB(ppu_opcode_t op)
@ -1026,7 +1049,7 @@ void PPUTranslator::VMHRADDSHS(ppu_opcode_t op)
void PPUTranslator::VMINFP(ppu_opcode_t op) void PPUTranslator::VMINFP(ppu_opcode_t op)
{ {
const auto ab = GetVrs(VrType::vf, op.va, op.vb); const auto ab = GetVrs(VrType::vf, op.va, op.vb);
SetVr(op.vd, VecHandleNan(m_ir->CreateSelect(m_ir->CreateFCmpOLT(ab[0], ab[1]), ab[0], ab[1]))); SetVr(op.vd, VecHandleResult(m_ir->CreateSelect(m_ir->CreateFCmpOLT(ab[0], ab[1]), ab[0], ab[1])));
} }
void PPUTranslator::VMINSB(ppu_opcode_t op) void PPUTranslator::VMINSB(ppu_opcode_t op)
@ -1236,7 +1259,7 @@ void PPUTranslator::VNMSUBFP(ppu_opcode_t op)
if (data == v128{}) if (data == v128{})
{ {
set_vr(op.vd, vec_handle_nan(-a * c)); set_vr(op.vd, vec_handle_result(-a * c));
ppu_log.notice("LLVM: VNMSUBFP with 0 addend at [0x%08x]", m_addr + (m_reloc ? m_reloc->addr : 0)); ppu_log.notice("LLVM: VNMSUBFP with 0 addend at [0x%08x]", m_addr + (m_reloc ? m_reloc->addr : 0));
return; return;
} }
@ -1245,7 +1268,7 @@ void PPUTranslator::VNMSUBFP(ppu_opcode_t op)
// Differs from the emulated path with regards to negative zero // Differs from the emulated path with regards to negative zero
if (m_use_fma) if (m_use_fma)
{ {
SetVr(op.vd, VecHandleNan(m_ir->CreateFNeg(m_ir->CreateCall(get_intrinsic<f32[4]>(llvm::Intrinsic::fma), { a.value, c.value, m_ir->CreateFNeg(b.value) })))); SetVr(op.vd, VecHandleResult(m_ir->CreateFNeg(m_ir->CreateCall(get_intrinsic<f32[4]>(llvm::Intrinsic::fma), { a.value, c.value, m_ir->CreateFNeg(b.value) }))));
return; return;
} }
@ -1255,7 +1278,7 @@ void PPUTranslator::VNMSUBFP(ppu_opcode_t op)
const auto xc = m_ir->CreateFPExt(c.value, get_type<f64[4]>()); const auto xc = m_ir->CreateFPExt(c.value, get_type<f64[4]>());
const auto xr = m_ir->CreateFNeg(m_ir->CreateFSub(m_ir->CreateFMul(xa, xc), xb)); const auto xr = m_ir->CreateFNeg(m_ir->CreateFSub(m_ir->CreateFMul(xa, xc), xb));
SetVr(op.vd, VecHandleNan(m_ir->CreateFPTrunc(xr, get_type<f32[4]>()))); SetVr(op.vd, VecHandleResult(m_ir->CreateFPTrunc(xr, get_type<f32[4]>())));
} }
void PPUTranslator::VNOR(ppu_opcode_t op) void PPUTranslator::VNOR(ppu_opcode_t op)
@ -1361,28 +1384,28 @@ void PPUTranslator::VPKUWUS(ppu_opcode_t op)
void PPUTranslator::VREFP(ppu_opcode_t op) void PPUTranslator::VREFP(ppu_opcode_t op)
{ {
const auto result = VecHandleNan(m_ir->CreateFDiv(ConstantVector::getSplat(4, ConstantFP::get(GetType<f32>(), 1.0)), GetVr(op.vb, VrType::vf))); const auto result = VecHandleResult(m_ir->CreateFDiv(ConstantVector::getSplat(4, ConstantFP::get(GetType<f32>(), 1.0)), GetVr(op.vb, VrType::vf)));
SetVr(op.vd, result); SetVr(op.vd, result);
} }
void PPUTranslator::VRFIM(ppu_opcode_t op) void PPUTranslator::VRFIM(ppu_opcode_t op)
{ {
SetVr(op.vd, VecHandleNan(Call(GetType<f32[4]>(), "llvm.floor.v4f32", GetVr(op.vb, VrType::vf)))); SetVr(op.vd, VecHandleResult(Call(GetType<f32[4]>(), "llvm.floor.v4f32", GetVr(op.vb, VrType::vf))));
} }
void PPUTranslator::VRFIN(ppu_opcode_t op) void PPUTranslator::VRFIN(ppu_opcode_t op)
{ {
SetVr(op.vd, VecHandleNan(Call(GetType<f32[4]>(), "llvm.nearbyint.v4f32", GetVr(op.vb, VrType::vf)))); SetVr(op.vd, VecHandleResult(Call(GetType<f32[4]>(), "llvm.nearbyint.v4f32", GetVr(op.vb, VrType::vf))));
} }
void PPUTranslator::VRFIP(ppu_opcode_t op) void PPUTranslator::VRFIP(ppu_opcode_t op)
{ {
SetVr(op.vd, VecHandleNan(Call(GetType<f32[4]>(), "llvm.ceil.v4f32", GetVr(op.vb, VrType::vf)))); SetVr(op.vd, VecHandleResult(Call(GetType<f32[4]>(), "llvm.ceil.v4f32", GetVr(op.vb, VrType::vf))));
} }
void PPUTranslator::VRFIZ(ppu_opcode_t op) void PPUTranslator::VRFIZ(ppu_opcode_t op)
{ {
SetVr(op.vd, VecHandleNan(Call(GetType<f32[4]>(), "llvm.trunc.v4f32", GetVr(op.vb, VrType::vf)))); SetVr(op.vd, VecHandleResult(Call(GetType<f32[4]>(), "llvm.trunc.v4f32", GetVr(op.vb, VrType::vf))));
} }
void PPUTranslator::VRLB(ppu_opcode_t op) void PPUTranslator::VRLB(ppu_opcode_t op)
@ -1407,7 +1430,7 @@ void PPUTranslator::VRSQRTEFP(ppu_opcode_t op)
{ {
const auto result = m_ir->CreateFDiv(ConstantVector::getSplat(4, ConstantFP::get(GetType<f32>(), 1.0)), Call(GetType<f32[4]>(), "llvm.sqrt.v4f32", GetVr(op.vb, VrType::vf))); const auto result = m_ir->CreateFDiv(ConstantVector::getSplat(4, ConstantFP::get(GetType<f32>(), 1.0)), Call(GetType<f32[4]>(), "llvm.sqrt.v4f32", GetVr(op.vb, VrType::vf)));
SetVr(op.vd, VecHandleNan(result)); SetVr(op.vd, VecHandleResult(result));
} }
void PPUTranslator::VSEL(ppu_opcode_t op) void PPUTranslator::VSEL(ppu_opcode_t op)
@ -1565,7 +1588,7 @@ void PPUTranslator::VSUBFP(ppu_opcode_t op)
{ {
const auto a = get_vr<f32[4]>(op.va); const auto a = get_vr<f32[4]>(op.va);
const auto b = get_vr<f32[4]>(op.vb); const auto b = get_vr<f32[4]>(op.vb);
SetVr(op.vd, VecHandleNan(eval(a - b).eval(m_ir))); SetVr(op.vd, VecHandleResult(eval(a - b).eval(m_ir)));
} }
void PPUTranslator::VSUBSBS(ppu_opcode_t op) void PPUTranslator::VSUBSBS(ppu_opcode_t op)

View File

@ -52,9 +52,9 @@ class PPUTranslator final : public cpu_translator
llvm::Value* m_mtocr_table{}; llvm::Value* m_mtocr_table{};
llvm::Value* m_globals[173]; llvm::Value* m_globals[175];
llvm::Value** const m_g_cr = m_globals + 99; llvm::Value** const m_g_cr = m_globals + 99;
llvm::Value* m_locals[173]; llvm::Value* m_locals[175];
llvm::Value** const m_gpr = m_locals + 3; llvm::Value** const m_gpr = m_locals + 3;
llvm::Value** const m_fpr = m_locals + 35; llvm::Value** const m_fpr = m_locals + 35;
llvm::Value** const m_vr = m_locals + 67; llvm::Value** const m_vr = m_locals + 67;
@ -77,6 +77,7 @@ class PPUTranslator final : public cpu_translator
DEF_VALUE(m_cnt, m_g_cnt, 170) // XER.CNT DEF_VALUE(m_cnt, m_g_cnt, 170) // XER.CNT
DEF_VALUE(m_sat, m_g_sat, 171) // VSCR.SAT bit, sticky saturation flag DEF_VALUE(m_sat, m_g_sat, 171) // VSCR.SAT bit, sticky saturation flag
DEF_VALUE(m_nj, m_g_nj, 172) // VSCR.NJ bit, non-Java mode DEF_VALUE(m_nj, m_g_nj, 172) // VSCR.NJ bit, non-Java mode
DEF_VALUE(m_jm_mask, m_g_jm_mask, 174) // Java-Mode helper mask
#undef DEF_VALUE #undef DEF_VALUE
public: public:
@ -102,15 +103,14 @@ public:
} }
llvm::Value* VecHandleNan(llvm::Value* val); llvm::Value* VecHandleNan(llvm::Value* val);
llvm::Value* VecHandleDenormal(llvm::Value* val);
llvm::Value* VecHandleResult(llvm::Value* val);
template <typename T> template <typename T>
auto vec_handle_nan(T&& expr) auto vec_handle_result(T&& expr)
{ {
value_t<typename T::type> result; value_t<typename T::type> result;
if (g_cfg.core.llvm_ppu_accurate_vector_nan) result.value = VecHandleResult(expr.eval(m_ir));
result.value = VecHandleNan(expr.eval(m_ir));
else
result.value = expr.eval(m_ir);
return result; return result;
} }

View File

@ -52,6 +52,7 @@ struct cfg_root : cfg::node
cfg::_bool spu_accurate_xfloat{ this, "Accurate xfloat", false }; cfg::_bool spu_accurate_xfloat{ this, "Accurate xfloat", false };
cfg::_bool spu_approx_xfloat{ this, "Approximate xfloat", true }; cfg::_bool spu_approx_xfloat{ this, "Approximate xfloat", true };
cfg::_bool llvm_accurate_dfma{ this, "LLVM Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively cfg::_bool llvm_accurate_dfma{ this, "LLVM Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively
cfg::_bool llvm_ppu_jm_handling{ this, "PPU LLVM Java Mode Handling", false }; // Respect current Java Mode for alti-vec ops by PPU LLVM
cfg::_bool llvm_ppu_accurate_vector_nan{ this, "PPU LLVM Accurate Vector NaN values", false }; cfg::_bool llvm_ppu_accurate_vector_nan{ this, "PPU LLVM Accurate Vector NaN values", false };
cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip) cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)