JIT64: use FMA instructions

2014-09-06 03:41:17 -07:00 · 2014-09-06 03:41:17 -07:00 · 9734f0c834
parent 4289221584
commit 9734f0c834
3 changed files with 104 additions and 38 deletions
--- a/Source/Core/Core/Core.cpp
+++ b/Source/Core/Core/Core.cpp
@ -49,6 +49,7 @@
 #include "Core/HW/Wiimote.h"
 #include "Core/IPC_HLE/WII_IPC_HLE_Device_usb.h"
 #include "Core/IPC_HLE/WII_Socket.h"
+#include "Core/PowerPC/JitInterface.h"
 #include "Core/PowerPC/PowerPC.h"

 #ifdef USE_GDBSTUB
@ -728,6 +729,8 @@ void UpdateWantDeterminism(bool initial)
 		g_want_determinism = new_want_determinism;
 		WiiSockMan::GetInstance().UpdateWantDeterminism(new_want_determinism);
 		g_video_backend->UpdateWantDeterminism(new_want_determinism);
+		// We need to clear the cache because some parts of the JIT depend on want_determinism, e.g. use of FMA.
+		JitInterface::ClearCache();

 		Core::PauseAndLock(false, was_unpaused);
 	}
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@ -90,9 +90,44 @@ void Jit64::fmaddXX(UGeckoInstruction inst)

 	fpr.Lock(a, b, c, d);

-	// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
-	if (inst.SUBOP5 == 30) //nmsub
+	// While we don't know if any games are actually affected (replays seem to work with all the usual
+	// suspects for desyncing), netplay and other applications need absolutely perfect determinism, so
+	// be extra careful and don't use FMA, even if in theory it might be okay.
+	// Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
+	// to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
+	// instances on different computers giving identical results.
+	if (cpu_info.bFMA && !Core::g_want_determinism)
 	{
+		if (single_precision)
+			Force25BitPrecision(XMM0, fpr.R(c), XMM1);
+		else
+			MOVSD(XMM0, fpr.R(c));
+		// Statistics suggest b is a lot less likely to be unbound in practice, so
+		// if we have to pick one of a or b to bind, let's make it b.
+		fpr.BindToRegister(b, true, false);
+		switch (inst.SUBOP5)
+		{
+		case 28: //msub
+			VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
+			break;
+		case 29: //madd
+			VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
+			break;
+			// PowerPC and x86 define NMADD/NMSUB differently
+			// x86: D = -A*C (+/-) B
+			// PPC: D = -(A*C (+/-) B)
+			// so we have to swap them; the ADD/SUB here isn't a typo.
+		case 30: //nmsub
+			VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
+			break;
+		case 31: //nmadd
+			VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
+			break;
+		}
+	}
+	else if (inst.SUBOP5 == 30) //nmsub
+	{
+		// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
 		if (single_precision)
 			Force25BitPrecision(XMM1, fpr.R(c), XMM0);
 		else
@ -115,6 +150,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 		if (inst.SUBOP5 == 31) //nmadd
 			PXOR(XMM0, M((void*)&psSignBits));
 	}
+
 	fpr.BindToRegister(d, false);
 	//YES it is necessary to dupe the result :(
 	//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@ -305,50 +305,77 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
 	int b = inst.FB;
 	int c = inst.FC;
 	int d = inst.FD;
+	bool fma = cpu_info.bFMA && !Core::g_want_determinism;
 	fpr.Lock(a,b,c,d);

-	switch (inst.SUBOP5)
+	if (fma)
+		fpr.BindToRegister(b, true, false);
+
+	if (inst.SUBOP5 == 14)
 	{
-	case 14: //madds0
 		MOVDDUP(XMM0, fpr.R(c));
 		Force25BitPrecision(XMM0, R(XMM0), XMM1);
-		MULPD(XMM0, fpr.R(a));
-		ADDPD(XMM0, fpr.R(b));
-		break;
-	case 15: //madds1
+	}
+	else if (inst.SUBOP5 == 15)
+	{
 		avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
 		Force25BitPrecision(XMM0, R(XMM0), XMM1);
-		MULPD(XMM0, fpr.R(a));
-		ADDPD(XMM0, fpr.R(b));
-		break;
-	case 28: //msub
-		Force25BitPrecision(XMM0, fpr.R(c), XMM1);
-		MULPD(XMM0, fpr.R(a));
-		SUBPD(XMM0, fpr.R(b));
-		break;
-	case 29: //madd
-		Force25BitPrecision(XMM0, fpr.R(c), XMM1);
-		MULPD(XMM0, fpr.R(a));
-		ADDPD(XMM0, fpr.R(b));
-		break;
-	case 30: //nmsub
-		Force25BitPrecision(XMM0, fpr.R(c), XMM1);
-		MULPD(XMM0, fpr.R(a));
-		SUBPD(XMM0, fpr.R(b));
-		PXOR(XMM0, M((void*)&psSignBits));
-		break;
-	case 31: //nmadd
-		Force25BitPrecision(XMM0, fpr.R(c), XMM1);
-		MULPD(XMM0, fpr.R(a));
-		ADDPD(XMM0, fpr.R(b));
-		PXOR(XMM0, M((void*)&psSignBits));
-		break;
-	default:
-		_assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!");
-		//FallBackToInterpreter(inst);
-		//fpr.UnlockAll();
-		return;
 	}
+	else
+	{
+		Force25BitPrecision(XMM0, fpr.R(c), XMM1);
+	}
+
+	if (fma)
+	{
+		switch (inst.SUBOP5)
+		{
+		case 14: //madds0
+		case 15: //madds1
+		case 29: //madd
+			VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
+			break;
+		case 28: //msub
+			VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
+			break;
+		case 30: //nmsub
+			VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
+			break;
+		case 31: //nmadd
+			VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
+			break;
+		}
+	}
+	else
+	{
+		switch (inst.SUBOP5)
+		{
+		case 14: //madds0
+		case 15: //madds1
+		case 29: //madd
+			MULPD(XMM0, fpr.R(a));
+			ADDPD(XMM0, fpr.R(b));
+			break;
+		case 28: //msub
+			MULPD(XMM0, fpr.R(a));
+			SUBPD(XMM0, fpr.R(b));
+			break;
+		case 30: //nmsub
+			MULPD(XMM0, fpr.R(a));
+			SUBPD(XMM0, fpr.R(b));
+			PXOR(XMM0, M((void*)&psSignBits));
+			break;
+		case 31: //nmadd
+			MULPD(XMM0, fpr.R(a));
+			ADDPD(XMM0, fpr.R(b));
+			PXOR(XMM0, M((void*)&psSignBits));
+			break;
+		default:
+			_assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!");
+			return;
+		}
+	}
+
 	fpr.BindToRegister(d, false);
 	ForceSinglePrecisionP(fpr.RX(d), XMM0);
 	SetFPRFIfNeeded(inst, fpr.RX(d));