PPU Precise: Fix fused float ops inaccuracy by using fma

Also clean up add64_flags and remove the redundant copy of s_ppu_itype.
This commit is contained in:
Eladash 2019-09-14 12:00:05 +03:00 committed by Ivan
parent 4d46e66833
commit c16319f959
3 changed files with 23 additions and 14 deletions

View File

@ -288,8 +288,6 @@ struct add_flags_result_t
{
T result;
bool carry;
bool zero;
bool sign;
add_flags_result_t() = default;
@ -297,8 +295,6 @@ struct add_flags_result_t
// Two-operand constructor: stores a + b in `result` and derives the flags from that sum.
add_flags_result_t(T a, T b)
: result(a + b)
// Carry is reported when the stored sum compares less than the first operand
// (i.e. the addition wrapped around; presumably T is an unsigned integer type - TODO confirm).
, carry(result < a)
// NOTE(review): the hunk header for this excerpt lists 8 lines before and 6 after, which
// suggests the `zero` and `sign` initializers below are the two lines this commit removes -
// confirm against the raw diff.
, zero(result == 0)
// Sign is taken from the most significant bit of `result`.
, sign(result >> (sizeof(T) * 8 - 1) != 0)
{
}
@ -309,8 +305,6 @@ struct add_flags_result_t
add_flags_result_t r(result, c);
result = r.result;
carry |= r.carry;
zero = r.zero;
sign = r.sign;
}
};
@ -904,12 +898,28 @@ bool ppu_interpreter::VLOGEFP(ppu_thread& ppu, ppu_opcode_t op)
return true;
}
bool ppu_interpreter::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
// Fast VMADDFP: vd = (va * vc) + vb on the packed-float view of the vector registers,
// using a separate SSE multiply followed by an SSE add (not a fused operation).
bool ppu_interpreter_fast::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
{
    // Multiply first, keeping the intermediate product in a named value.
    const auto product = _mm_mul_ps(ppu.vr[op.va].vf, ppu.vr[op.vc].vf);

    // Then add the addend and store into the destination register.
    ppu.vr[op.vd].vf = _mm_add_ps(product, ppu.vr[op.vb].vf);

    return true;
}
// Precise VMADDFP: for each of the four float lanes, vd[i] = va[i] * vc[i] + vb[i].
// The multiply-add is evaluated in f64 and narrowed back to f32 once per lane, instead of
// rounding the f32 product before the add as the fast variant's separate mul/add does.
bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
{
    const auto& a = ppu.vr[op.va]._f;
    const auto& b = ppu.vr[op.vb]._f;
    const auto& c = ppu.vr[op.vc]._f;

    // Select the destination with the vector field `vd`, consistent with the fast
    // variant and the other vector instructions that write ppu.vr[op.vd].
    auto& d = ppu.vr[op.vd]._f;

    // TODO: Optimize
    for (u32 i = 0; i < 4; i++)
    {
        // All three inputs of lane i are read before d[i] is written, so the destination
        // register may be the same register as any of the sources.
        // NOTE(review): assuming f32/f64 are IEEE single/double, the f64 product is exact but
        // the sum is rounded to f64 and then to f32 (two roundings) - confirm this is
        // acceptable versus a true single-rounding fused multiply-add.
        d[i] = f32(f64{a[i]} * f64{c[i]} + f64{b[i]});
    }

    return true;
}
bool ppu_interpreter::VMAXFP(ppu_thread& ppu, ppu_opcode_t op)
{
ppu.vr[op.vd].vf = _mm_max_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf);
@ -4942,7 +4952,7 @@ bool ppu_interpreter_fast::FMSUB(ppu_thread& ppu, ppu_opcode_t op)
// Precise FMSUB: writes fra * frc - frb to fpr[frd], then hands the result to ppu_fpcc_set.
bool ppu_interpreter_precise::FMSUB(ppu_thread& ppu, ppu_opcode_t op)
{
// NOTE(review): this excerpt contains two declarations of `res`. Judging by the commit title,
// the first is the pre-change line (multiply and subtract as separate operations) and the
// second is its replacement - only one of them can remain in compilable code.
const f64 res = ppu.fpr[op.frd] = ppu.fpr[op.fra] * ppu.fpr[op.frc] - ppu.fpr[op.frb];
// Fused form: std::fma evaluates fra * frc + (-frb) with a single rounding of the final result.
const f64 res = ppu.fpr[op.frd] = std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], -ppu.fpr[op.frb]);
// Passes the stored result, the constant 0. and the opcode's rc field to ppu_fpcc_set.
ppu_fpcc_set(ppu, res, 0., op.rc);
return true;
}
@ -4955,7 +4965,7 @@ bool ppu_interpreter_fast::FMADD(ppu_thread& ppu, ppu_opcode_t op)
// Precise FMADD: writes fra * frc + frb to fpr[frd], then hands the result to ppu_fpcc_set.
bool ppu_interpreter_precise::FMADD(ppu_thread& ppu, ppu_opcode_t op)
{
// NOTE(review): this excerpt contains two declarations of `res`. Judging by the commit title,
// the first is the pre-change line (multiply and add as separate operations) and the
// second is its replacement - only one of them can remain in compilable code.
const f64 res = ppu.fpr[op.frd] = ppu.fpr[op.fra] * ppu.fpr[op.frc] + ppu.fpr[op.frb];
// Fused form: std::fma evaluates fra * frc + frb with a single rounding of the final result.
const f64 res = ppu.fpr[op.frd] = std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], ppu.fpr[op.frb]);
// Passes the stored result, the constant 0. and the opcode's rc field to ppu_fpcc_set.
ppu_fpcc_set(ppu, res, 0., op.rc);
return true;
}
@ -4968,7 +4978,7 @@ bool ppu_interpreter_fast::FNMSUB(ppu_thread& ppu, ppu_opcode_t op)
// Precise FNMSUB: writes -(fra * frc - frb) to fpr[frd], then hands the result to ppu_fpcc_set.
bool ppu_interpreter_precise::FNMSUB(ppu_thread& ppu, ppu_opcode_t op)
{
// NOTE(review): this excerpt contains two declarations of `res`. Judging by the commit title,
// the first is the pre-change line (multiply and subtract as separate operations) and the
// second is its replacement - only one of them can remain in compilable code.
const f64 res = ppu.fpr[op.frd] = -(ppu.fpr[op.fra] * ppu.fpr[op.frc] - ppu.fpr[op.frb]);
// Fused form: std::fma evaluates fra * frc + (-frb) with a single rounding; the negation is
// applied to the already-rounded value.
const f64 res = ppu.fpr[op.frd] = -std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], -ppu.fpr[op.frb]);
// Passes the stored result, the constant 0. and the opcode's rc field to ppu_fpcc_set.
ppu_fpcc_set(ppu, res, 0., op.rc);
return true;
}
@ -4981,7 +4991,7 @@ bool ppu_interpreter_fast::FNMADD(ppu_thread& ppu, ppu_opcode_t op)
// Precise FNMADD: writes -(fra * frc + frb) to fpr[frd], then hands the result to ppu_fpcc_set.
bool ppu_interpreter_precise::FNMADD(ppu_thread& ppu, ppu_opcode_t op)
{
// NOTE(review): this excerpt contains two declarations of `res`. Judging by the commit title,
// the first is the pre-change line (multiply and add as separate operations) and the
// second is its replacement - only one of them can remain in compilable code.
const f64 res = ppu.fpr[op.frd] = -(ppu.fpr[op.fra] * ppu.fpr[op.frc] + ppu.fpr[op.frb]);
// Fused form: std::fma evaluates fra * frc + frb with a single rounding; the negation is
// applied to the already-rounded value.
const f64 res = ppu.fpr[op.frd] = -std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], ppu.fpr[op.frb]);
// Passes the stored result, the constant 0. and the opcode's rc field to ppu_fpcc_set.
ppu_fpcc_set(ppu, res, 0., op.rc);
return true;
}

View File

@ -40,7 +40,6 @@ struct ppu_interpreter
static bool VCMPGTUW(ppu_thread&, ppu_opcode_t);
static bool VEXPTEFP(ppu_thread&, ppu_opcode_t);
static bool VLOGEFP(ppu_thread&, ppu_opcode_t);
static bool VMADDFP(ppu_thread&, ppu_opcode_t);
static bool VMAXFP(ppu_thread&, ppu_opcode_t);
static bool VMAXSB(ppu_thread&, ppu_opcode_t);
static bool VMAXSH(ppu_thread&, ppu_opcode_t);
@ -373,6 +372,7 @@ struct ppu_interpreter_precise final : ppu_interpreter
static bool VSUM4UBS(ppu_thread&, ppu_opcode_t);
static bool VCTSXS(ppu_thread&, ppu_opcode_t);
static bool VCTUXS(ppu_thread&, ppu_opcode_t);
static bool VMADDFP(ppu_thread&, ppu_opcode_t);
static bool FDIVS(ppu_thread&, ppu_opcode_t);
static bool FSUBS(ppu_thread&, ppu_opcode_t);
@ -439,6 +439,7 @@ struct ppu_interpreter_fast final : ppu_interpreter
static bool VSUM4UBS(ppu_thread&, ppu_opcode_t);
static bool VCTSXS(ppu_thread&, ppu_opcode_t);
static bool VCTUXS(ppu_thread&, ppu_opcode_t);
static bool VMADDFP(ppu_thread&, ppu_opcode_t);
static bool FDIVS(ppu_thread&, ppu_opcode_t);
static bool FSUBS(ppu_thread&, ppu_opcode_t);

View File

@ -928,8 +928,6 @@ void ppu_thread::stack_pop_verbose(u32 addr, u32 size) noexcept
LOG_ERROR(PPU, "Invalid thread" HERE);
}
const ppu_decoder<ppu_itype> s_ppu_itype;
extern u64 get_timebased_time();
extern ppu_function_t ppu_get_syscall(u64 code);