JIT64: Fix fmul rounding issues

Thanks to magumagu's softfp experiments, we know a lot more about the Wii's strange floating point unit than we used to. In particular, when doing a single-precision floating point multiply (fmulsx), it rounds the right hand side's mantissa so as to lose the low 28 bits (of the 53-bit mantissa). Emulating this behavior in Dolphin fixes a bunch of issues with games that require extremely precise emulation of floating point hardware, especially game replays. Fortunately, we can do this with rather little CPU cost; just ~5 extra instructions per multiply, instead of the vast load of a pure-software float implementation. This doesn't make floating-point behavior at all perfect. I still suspect fmadd rounding might not be quite right, since the Wii uses fused instructions and Dolphin doesn't, and NaN/infinity/exception handling is probably off in various ways... but it's definitely way better than before. This appears to fix replays in Mario Kart Wii, Mario Kart Double Dash, and Super Smash Brothers Brawl. I wouldn't be surprised if it fixes a bunch of other stuff too. The changes to instructions other than fmulsx may not be strictly necessary, but I included them for completeness, since it feels wrong to fix some instructions but not others, since some games we didn't test might rely on them.
2014-08-18 19:58:00 -07:00 · 2014-08-18 19:58:00 -07:00 · 4f18f6078f
parent b749a74d33
commit 4f18f6078f
5 changed files with 90 additions and 20 deletions
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@ -117,10 +117,10 @@ public:
 	// is set or not.
 	Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);

-	void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
+	void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
 	typedef u32 (*Operation)(u32 a, u32 b);
 	void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
-	void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
+	void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);

 	// OPCODES
 	void unknown_instruction(UGeckoInstruction _inst);
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@ -14,10 +14,28 @@ static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x800000
 static const u64 GC_ALIGNED16(psAbsMask2[2])  = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
 static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};

-void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
+void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS)
 {
 	fpr.Lock(d, a, b);
-	if (d == a)
+	if (roundRHS)
+	{
+		if (d == a)
+		{
+			fpr.BindToRegister(d, true);
+			MOVSD(XMM0, fpr.R(b));
+			Force25BitPrecision(XMM0, XMM1);
+			(this->*op)(fpr.RX(d), R(XMM0));
+		}
+		else
+		{
+			fpr.BindToRegister(d, d == b);
+			if (d != b)
+				MOVSD(fpr.RX(d), fpr.R(b));
+			Force25BitPrecision(fpr.RX(d), XMM0);
+			(this->*op)(fpr.RX(d), fpr.R(a));
+		}
+	}
+	else if (d == a)
 	{
 		fpr.BindToRegister(d, true);
 		if (!single)
@ -88,7 +106,7 @@ void Jit64::fp_arith(UGeckoInstruction inst)
 	case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD); break; //div
 	case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD); break; //sub
 	case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true,  single, &XEmitter::ADDSD); break; //add
-	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD); break; //mul
+	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, single); break; //mul
 	default:
 		_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
 	}
@ -111,24 +129,25 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 	int d = inst.FD;

 	fpr.Lock(a, b, c, d);
-	MOVSD(XMM0, fpr.R(a));
+	MOVSD(XMM0, fpr.R(c));
+	Force25BitPrecision(XMM0, XMM1);
 	switch (inst.SUBOP5)
 	{
 	case 28: //msub
-		MULSD(XMM0, fpr.R(c));
+		MULSD(XMM0, fpr.R(a));
 		SUBSD(XMM0, fpr.R(b));
 		break;
 	case 29: //madd
-		MULSD(XMM0, fpr.R(c));
+		MULSD(XMM0, fpr.R(a));
 		ADDSD(XMM0, fpr.R(b));
 		break;
 	case 30: //nmsub
-		MULSD(XMM0, fpr.R(c));
+		MULSD(XMM0, fpr.R(a));
 		SUBSD(XMM0, fpr.R(b));
 		PXOR(XMM0, M((void*)&psSignBits2));
 		break;
 	case 31: //nmadd
-		MULSD(XMM0, fpr.R(c));
+		MULSD(XMM0, fpr.R(a));
 		ADDSD(XMM0, fpr.R(b));
 		PXOR(XMM0, M((void*)&psSignBits2));
 		break;
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@ -113,11 +113,29 @@ add a,b,a
 */

 //There's still a little bit more optimization that can be squeezed out of this
-void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg))
+void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), bool roundRHS)
 {
 	fpr.Lock(d, a, b);

-	if (d == a)
+	if (roundRHS)
+	{
+		if (d == a)
+		{
+			fpr.BindToRegister(d, true);
+			MOVAPD(XMM0, fpr.R(b));
+			Force25BitPrecision(XMM0, XMM1);
+			(this->*op)(fpr.RX(d), R(XMM0));
+		}
+		else
+		{
+			fpr.BindToRegister(d, d == b);
+			if (d != b)
+				MOVAPD(fpr.RX(d), fpr.R(b));
+			Force25BitPrecision(fpr.RX(d), XMM0);
+			(this->*op)(fpr.RX(d), fpr.R(a));
+		}
+	}
+	else if (d == a)
 	{
 		fpr.BindToRegister(d, true);
 		(this->*op)(fpr.RX(d), fpr.R(b));
@ -166,7 +184,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
 		tri_op(inst.FD, inst.FA, inst.FB, true,  &XEmitter::ADDPD);
 		break;
 	case 25: // mul
-		tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD);
+		tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, true);
 		break;
 	default:
 		_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
@ -230,15 +248,17 @@ void Jit64::ps_muls(UGeckoInstruction inst)
 	case 12:
 		// Single multiply scalar high
 		// TODO - faster version for when regs are different
-		MOVAPD(XMM0, fpr.R(a));
 		MOVDDUP(XMM1, fpr.R(c));
+		Force25BitPrecision(XMM1, XMM0);
+		MOVAPD(XMM0, fpr.R(a));
 		MULPD(XMM0, R(XMM1));
 		MOVAPD(fpr.R(d), XMM0);
 		break;
 	case 13:
 		// TODO - faster version for when regs are different
-		MOVAPD(XMM0, fpr.R(a));
 		MOVAPD(XMM1, fpr.R(c));
+		Force25BitPrecision(XMM1, XMM0);
+		MOVAPD(XMM0, fpr.R(a));
 		SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
 		MULPD(XMM0, R(XMM1));
 		MOVAPD(fpr.R(d), XMM0);
@ -300,35 +320,46 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
 	int d = inst.FD;
 	fpr.Lock(a,b,c,d);

-	MOVAPD(XMM0, fpr.R(a));
 	switch (inst.SUBOP5)
 	{
 	case 14: //madds0
 		MOVDDUP(XMM1, fpr.R(c));
+		Force25BitPrecision(XMM1, XMM0);
+		MOVAPD(XMM0, fpr.R(a));
 		MULPD(XMM0, R(XMM1));
 		ADDPD(XMM0, fpr.R(b));
 		break;
 	case 15: //madds1
 		MOVAPD(XMM1, fpr.R(c));
 		SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
+		Force25BitPrecision(XMM1, XMM0);
+		MOVAPD(XMM0, fpr.R(a));
 		MULPD(XMM0, R(XMM1));
 		ADDPD(XMM0, fpr.R(b));
 		break;
 	case 28: //msub
-		MULPD(XMM0, fpr.R(c));
+		MOVAPD(XMM0, fpr.R(c));
+		Force25BitPrecision(XMM0, XMM1);
+		MULPD(XMM0, fpr.R(a));
 		SUBPD(XMM0, fpr.R(b));
 		break;
 	case 29: //madd
-		MULPD(XMM0, fpr.R(c));
+		MOVAPD(XMM0, fpr.R(c));
+		Force25BitPrecision(XMM0, XMM1);
+		MULPD(XMM0, fpr.R(a));
 		ADDPD(XMM0, fpr.R(b));
 		break;
 	case 30: //nmsub
-		MULPD(XMM0, fpr.R(c));
+		MOVAPD(XMM0, fpr.R(c));
+		Force25BitPrecision(XMM0, XMM1);
+		MULPD(XMM0, fpr.R(a));
 		SUBPD(XMM0, fpr.R(b));
 		PXOR(XMM0, M((void*)&psSignBits));
 		break;
 	case 31: //nmadd
-		MULPD(XMM0, fpr.R(c));
+		MOVAPD(XMM0, fpr.R(c));
+		Force25BitPrecision(XMM0, XMM1);
+		MULPD(XMM0, fpr.R(a));
 		ADDPD(XMM0, fpr.R(b));
 		PXOR(XMM0, M((void*)&psSignBits));
 		break;
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@ -519,6 +519,25 @@ void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm)
 	}
 }

+static const u64 GC_ALIGNED16(psMantissaTruncate[2]) = {0xFFFFFFFFF8000000ULL, 0xFFFFFFFFF8000000ULL};
+static const u64 GC_ALIGNED16(psRoundBit[2]) = {0x8000000, 0x8000000};
+
+// Emulate the odd truncation/rounding that the PowerPC does on the RHS operand before
+// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
+// rounding to nearest as it does.
+// It needs a temp, so let the caller pass that in.
+void EmuCodeBlock::Force25BitPrecision(X64Reg xmm, X64Reg tmp)
+{
+	if (jit->jo.accurateSinglePrecision)
+	{
+		// mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
+		MOVAPD(tmp, R(xmm));
+		PAND(xmm, M((void*)&psMantissaTruncate));
+		PAND(tmp, M((void*)&psRoundBit));
+		PADDQ(xmm, R(tmp));
+	}
+}
+
 static u32 GC_ALIGNED16(temp32);
 static u64 GC_ALIGNED16(temp64);

--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@ -56,6 +56,7 @@ public:

 	void ForceSinglePrecisionS(Gen::X64Reg xmm);
 	void ForceSinglePrecisionP(Gen::X64Reg xmm);
+	void Force25BitPrecision(Gen::X64Reg xmm, Gen::X64Reg tmp);

 	// EAX might get trashed
 	void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);