PPU Precise: Fix fused float ops misaccuracy by using fma

Also cleanup add64_flags, remove redundent copy of s_ppu_itype.
2019-09-14 12:00:05 +03:00 · 2019-09-14 12:00:05 +03:00 · c16319f959
parent 4d46e66833
commit c16319f959
3 changed files with 23 additions and 14 deletions
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@ -288,8 +288,6 @@ struct add_flags_result_t
 {
 	T result;
 	bool carry;
-	bool zero;
-	bool sign;

 	add_flags_result_t() = default;

@ -297,8 +295,6 @@ struct add_flags_result_t
 	add_flags_result_t(T a, T b)
 		: result(a + b)
 		, carry(result < a)
-		, zero(result == 0)
-		, sign(result >> (sizeof(T) * 8 - 1) != 0)
 	{
 	}

@ -309,8 +305,6 @@ struct add_flags_result_t
 		add_flags_result_t r(result, c);
 		result = r.result;
 		carry |= r.carry;
-		zero = r.zero;
-		sign = r.sign;
 	}
 };

@ -904,12 +898,28 @@ bool ppu_interpreter::VLOGEFP(ppu_thread& ppu, ppu_opcode_t op)
 	return true;
 }

-bool ppu_interpreter::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
+bool ppu_interpreter_fast::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
 {
 	ppu.vr[op.vd].vf = _mm_add_ps(_mm_mul_ps(ppu.vr[op.va].vf, ppu.vr[op.vc].vf), ppu.vr[op.vb].vf);
 	return true;
 }

+bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
+{
+	const auto& a = ppu.vr[op.va]._f;
+	const auto& b = ppu.vr[op.vb]._f;
+	const auto& c = ppu.vr[op.vc]._f;
+	auto& d = ppu.vr[op.rd]._f;
+
+	// TODO: Optimize
+	for (u32 i = 0; i < 4; i++)
+	{
+		d[i] = f32(f64{a[i]} * f64{c[i]} + f64{b[i]});
+	}
+
+	return true;
+}
+
 bool ppu_interpreter::VMAXFP(ppu_thread& ppu, ppu_opcode_t op)
 {
 	ppu.vr[op.vd].vf = _mm_max_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf);
@ -4942,7 +4952,7 @@ bool ppu_interpreter_fast::FMSUB(ppu_thread& ppu, ppu_opcode_t op)

 bool ppu_interpreter_precise::FMSUB(ppu_thread& ppu, ppu_opcode_t op)
 {
-	const f64 res = ppu.fpr[op.frd] = ppu.fpr[op.fra] * ppu.fpr[op.frc] - ppu.fpr[op.frb];
+	const f64 res = ppu.fpr[op.frd] = std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], -ppu.fpr[op.frb]);
 	ppu_fpcc_set(ppu, res, 0., op.rc);
 	return true;
 }
@ -4955,7 +4965,7 @@ bool ppu_interpreter_fast::FMADD(ppu_thread& ppu, ppu_opcode_t op)

 bool ppu_interpreter_precise::FMADD(ppu_thread& ppu, ppu_opcode_t op)
 {
-	const f64 res = ppu.fpr[op.frd] = ppu.fpr[op.fra] * ppu.fpr[op.frc] + ppu.fpr[op.frb];
+	const f64 res = ppu.fpr[op.frd] = std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], ppu.fpr[op.frb]);
 	ppu_fpcc_set(ppu, res, 0., op.rc);
 	return true;
 }
@ -4968,7 +4978,7 @@ bool ppu_interpreter_fast::FNMSUB(ppu_thread& ppu, ppu_opcode_t op)

 bool ppu_interpreter_precise::FNMSUB(ppu_thread& ppu, ppu_opcode_t op)
 {
-	const f64 res = ppu.fpr[op.frd] = -(ppu.fpr[op.fra] * ppu.fpr[op.frc] - ppu.fpr[op.frb]);
+	const f64 res = ppu.fpr[op.frd] = -std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], -ppu.fpr[op.frb]);
 	ppu_fpcc_set(ppu, res, 0., op.rc);
 	return true;
 }
@ -4981,7 +4991,7 @@ bool ppu_interpreter_fast::FNMADD(ppu_thread& ppu, ppu_opcode_t op)

 bool ppu_interpreter_precise::FNMADD(ppu_thread& ppu, ppu_opcode_t op)
 {
-	const f64 res = ppu.fpr[op.frd] = -(ppu.fpr[op.fra] * ppu.fpr[op.frc] + ppu.fpr[op.frb]);
+	const f64 res = ppu.fpr[op.frd] = -std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], ppu.fpr[op.frb]);
 	ppu_fpcc_set(ppu, res, 0., op.rc);
 	return true;
 }
--- a/rpcs3/Emu/Cell/PPUInterpreter.h
+++ b/rpcs3/Emu/Cell/PPUInterpreter.h
@ -40,7 +40,6 @@ struct ppu_interpreter
 	static bool VCMPGTUW(ppu_thread&, ppu_opcode_t);
 	static bool VEXPTEFP(ppu_thread&, ppu_opcode_t);
 	static bool VLOGEFP(ppu_thread&, ppu_opcode_t);
-	static bool VMADDFP(ppu_thread&, ppu_opcode_t);
 	static bool VMAXFP(ppu_thread&, ppu_opcode_t);
 	static bool VMAXSB(ppu_thread&, ppu_opcode_t);
 	static bool VMAXSH(ppu_thread&, ppu_opcode_t);
@ -373,6 +372,7 @@ struct ppu_interpreter_precise final : ppu_interpreter
 	static bool VSUM4UBS(ppu_thread&, ppu_opcode_t);
 	static bool VCTSXS(ppu_thread&, ppu_opcode_t);
 	static bool VCTUXS(ppu_thread&, ppu_opcode_t);
+	static bool VMADDFP(ppu_thread&, ppu_opcode_t);

 	static bool FDIVS(ppu_thread&, ppu_opcode_t);
 	static bool FSUBS(ppu_thread&, ppu_opcode_t);
@ -439,6 +439,7 @@ struct ppu_interpreter_fast final : ppu_interpreter
 	static bool VSUM4UBS(ppu_thread&, ppu_opcode_t);
 	static bool VCTSXS(ppu_thread&, ppu_opcode_t);
 	static bool VCTUXS(ppu_thread&, ppu_opcode_t);
+	static bool VMADDFP(ppu_thread&, ppu_opcode_t);

 	static bool FDIVS(ppu_thread&, ppu_opcode_t);
 	static bool FSUBS(ppu_thread&, ppu_opcode_t);
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@ -928,8 +928,6 @@ void ppu_thread::stack_pop_verbose(u32 addr, u32 size) noexcept
 	LOG_ERROR(PPU, "Invalid thread" HERE);
 }

-const ppu_decoder<ppu_itype> s_ppu_itype;
-
 extern u64 get_timebased_time();
 extern ppu_function_t ppu_get_syscall(u64 code);