From f423a9c41d78bba3389b5923808b064695e234d4 Mon Sep 17 00:00:00 2001
From: cottonvibes <cottonvibes@96395faa-99c1-11dd-bbfe-3dabce05a288>
Date: Mon, 7 Nov 2011 10:20:56 +0000
Subject: [PATCH] microVU: - Rewrote and simplified the TriAce gamefix VU
 interpreter: - Implemented a TriAce gamefix for vu0 interpreter

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4960 96395faa-99c1-11dd-bbfe-3dabce05a288
---
 pcsx2/VUops.cpp            |  50 +++++++++--
 pcsx2/x86/microVU_Misc.inl | 180 +++++++++++++++++++++----------------
 2 files changed, 150 insertions(+), 80 deletions(-)

diff --git a/pcsx2/VUops.cpp b/pcsx2/VUops.cpp
index bcf40df61e..c1debb8fa7 100644
--- a/pcsx2/VUops.cpp
+++ b/pcsx2/VUops.cpp
@@ -339,6 +339,36 @@ static __fi float vuDouble(u32 f)
 }
 #endif
 
+static __fi float vuADD_TriAceHack(u32 a, u32 b) {
+	// On VU0, TriAce games use ADDi and expect these bit-perfect results:
+	//if (a == 0xb3e2a619 && b == 0x42546666) return vuDouble(0x42546666);
+	//if (a == 0x8b5b19e9 && b == 0xc7f079b3) return vuDouble(0xc7f079b3);
+	if (a == 0x4b1ed4a8 && b == 0x43a02666) return vuDouble(0x4b1ed5e7);
+	//if (a == 0x7d1ca47b && b == 0x42f23333) return vuDouble(0x7d1ca47b);
+
+	// In the 3rd case, some other rounding error is giving us incorrect
+	// operands ('a' is wrong), and therefore an incorrect result.
+	// We're getting:        0x4b1ed4a8 + 0x43a02666 = 0x4b1ed5e8
+	// We should be getting: 0x4b1ed4a7 + 0x43a02666 = 0x4b1ed5e7
+	// microVU gets the correct operands and result. The interps likely
+	// don't get it due to rounding towards nearest in other calculations.
+
+	if (0) {
+		// microVU uses something like this to get TriAce games working,
+		// but VU interpreters don't seem to need it currently:
+		s32 aExp = (a >> 23) & 0xff;
+		s32 bExp = (b >> 23) & 0xff;
+		if (aExp - bExp >= 25) b &= 0x80000000;
+		if (aExp - bExp <=-25) a &= 0x80000000;
+		float ret = vuDouble(a) + vuDouble(b);
+		DevCon.WriteLn("aExp = %d, bExp = %d", aExp, bExp);
+		DevCon.WriteLn("0x%08x + 0x%08x = 0x%08x", a, b, (u32&)ret);
+		DevCon.WriteLn("%f + %f = %f", vuDouble(a), vuDouble(b), ret);
+		return ret;
+	}
+	return vuDouble(a) + vuDouble(b);
+}
+
 void _vuABS(VURegs * VU) {
 	if (_Ft_ == 0) return;
 
@@ -367,11 +397,21 @@ static __fi void _vuADDi(VURegs * VU) {
 	if (_Fd_ == 0) dst = &RDzero;
 	else dst = &VU->VF[_Fd_];
 
-	if (_X){ dst->i.x = VU_MACx_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.x) + vuDouble(VU->VI[REG_I].UL));} else VU_MACx_CLEAR(VU);
-	if (_Y){ dst->i.y = VU_MACy_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.y) + vuDouble(VU->VI[REG_I].UL));} else VU_MACy_CLEAR(VU);
-	if (_Z){ dst->i.z = VU_MACz_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.z) + vuDouble(VU->VI[REG_I].UL));} else VU_MACz_CLEAR(VU);
-	if (_W){ dst->i.w = VU_MACw_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.w) + vuDouble(VU->VI[REG_I].UL));} else VU_MACw_CLEAR(VU);
-	VU_STAT_UPDATE(VU);
+	if (!CHECK_VUADDSUBHACK) {
+		if (_X){ dst->i.x = VU_MACx_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.x) + vuDouble(VU->VI[REG_I].UL));} else VU_MACx_CLEAR(VU);
+		if (_Y){ dst->i.y = VU_MACy_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.y) + vuDouble(VU->VI[REG_I].UL));} else VU_MACy_CLEAR(VU);
+		if (_Z){ dst->i.z = VU_MACz_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.z) + vuDouble(VU->VI[REG_I].UL));} else VU_MACz_CLEAR(VU);
+		if (_W){ dst->i.w = VU_MACw_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.w) + vuDouble(VU->VI[REG_I].UL));} else VU_MACw_CLEAR(VU);
+		VU_STAT_UPDATE(VU);
+	}
+	else {
+		if (_X){ dst->i.x = VU_MACx_UPDATE(VU, vuADD_TriAceHack(VU->VF[_Fs_].i.x, VU->VI[REG_I].UL));} else VU_MACx_CLEAR(VU);
+		if (_Y){ dst->i.y = VU_MACy_UPDATE(VU, vuADD_TriAceHack(VU->VF[_Fs_].i.y, VU->VI[REG_I].UL));} else VU_MACy_CLEAR(VU);
+		if (_Z){ dst->i.z = VU_MACz_UPDATE(VU, vuADD_TriAceHack(VU->VF[_Fs_].i.z, VU->VI[REG_I].UL));} else VU_MACz_CLEAR(VU);
+		if (_W){ dst->i.w = VU_MACw_UPDATE(VU, vuADD_TriAceHack(VU->VF[_Fs_].i.w, VU->VI[REG_I].UL));} else VU_MACw_CLEAR(VU);
+		VU_STAT_UPDATE(VU);
+	}
+	
 }/*Reworked from define to function. asadr*/
 
 static __fi void _vuADDq(VURegs * VU) {
diff --git a/pcsx2/x86/microVU_Misc.inl b/pcsx2/x86/microVU_Misc.inl
index b1c6aad445..73d1a8cf78 100644
--- a/pcsx2/x86/microVU_Misc.inl
+++ b/pcsx2/x86/microVU_Misc.inl
@@ -214,8 +214,7 @@ void mVUmergeRegs(const xmm& dest, const xmm& src, int xyzw, bool modXYZW)
 //------------------------------------------------------------------
 
 // Backup Volatile Regs (EAX, ECX, EDX, MM0~7, XMM0~7, are all volatile according to 32bit Win/Linux ABI)
-__fi void mVUbackupRegs(microVU& mVU, bool toMemory = false)
-{
+__fi void mVUbackupRegs(microVU& mVU, bool toMemory = false) {
 	if (toMemory) {
 		for(int i = 0; i < 8; i++) {
 			xMOVAPS(ptr128[&mVU.xmmBackup[i][0]], xmm(i));
@@ -228,8 +227,7 @@ __fi void mVUbackupRegs(microVU& mVU, bool toMemory = false)
 }
 
 // Restore Volatile Regs
-__fi void mVUrestoreRegs(microVU& mVU, bool fromMemory = false)
-{
+__fi void mVUrestoreRegs(microVU& mVU, bool fromMemory = false) {
 	if (fromMemory) {
 		for(int i = 0; i < 8; i++) {
 			xMOVAPS(xmm(i), ptr128[&mVU.xmmBackup[i][0]]);
@@ -238,6 +236,20 @@ __fi void mVUrestoreRegs(microVU& mVU, bool fromMemory = false)
 	else xMOVAPS(xmmPQ, ptr128[&mVU.xmmBackup[xmmPQ.Id][0]]);
 }
 
+_mVUt void __fc mVUprintRegs() {
+	microVU& mVU = mVUx;
+	for(int i = 0; i < 8; i++) {
+		Console.WriteLn("xmm%d = [0x%08x,0x%08x,0x%08x,0x%08x]", i,
+			mVU.xmmBackup[i][0], mVU.xmmBackup[i][1],
+			mVU.xmmBackup[i][2], mVU.xmmBackup[i][3]);
+	}
+	for(int i = 0; i < 8; i++) {
+		Console.WriteLn("xmm%d = [%f,%f,%f,%f]", i,
+			(float&)mVU.xmmBackup[i][0], (float&)mVU.xmmBackup[i][1],
+			(float&)mVU.xmmBackup[i][2], (float&)mVU.xmmBackup[i][3]);
+	}
+}
+
 // Gets called by mVUaddrFix at execution-time
 static void __fc mVUwarningRegAccess(u32 prog, u32 pc) {
 	Console.Error("microVU0 Warning: Accessing VU1 Regs! [%04x] [%x]", pc, prog);
@@ -288,12 +300,13 @@ __fi void mVUaddrFix(mV, const x32& gprReg)
 // Micro VU - Custom SSE Instructions
 //------------------------------------------------------------------
 
-struct SSEMaskPair { u32 mask1[4], mask2[4]; };
+struct SSEMasks { u32 MIN_MAX_1[4], MIN_MAX_2[4], ADD_SS[4]; };
 
-static const __aligned16 SSEMaskPair MIN_MAX =
+static const __aligned16 SSEMasks sseMasks =
 {
 	{0xffffffff, 0x80000000, 0xffffffff, 0x80000000},
-	{0x00000000, 0x40000000, 0x00000000, 0x40000000}
+	{0x00000000, 0x40000000, 0x00000000, 0x40000000},
+	{0x80000000, 0xffffffff, 0xffffffff, 0xffffffff}
 };
 
 
@@ -306,21 +319,21 @@ void MIN_MAX_PS(microVU& mVU, const xmm& to, const xmm& from, const xmm& t1in, c
 	if (0) { // use double comparison
 		// ZW
 		xPSHUF.D(t1, to, 0xfa);
-		xPAND   (t1, ptr128[MIN_MAX.mask1]);
-		xPOR    (t1, ptr128[MIN_MAX.mask2]);
+		xPAND   (t1, ptr128[sseMasks.MIN_MAX_1]);
+		xPOR    (t1, ptr128[sseMasks.MIN_MAX_2]);
 		xPSHUF.D(t2, from, 0xfa);
-		xPAND   (t2, ptr128[MIN_MAX.mask1]);
-		xPOR    (t2, ptr128[MIN_MAX.mask2]);
+		xPAND   (t2, ptr128[sseMasks.MIN_MAX_1]);
+		xPOR    (t2, ptr128[sseMasks.MIN_MAX_2]);
 		if (min) xMIN.PD(t1, t2);
 		else     xMAX.PD(t1, t2);
 
 		// XY
 		xPSHUF.D(t2, from, 0x50);
-		xPAND   (t2, ptr128[MIN_MAX.mask1]);
-		xPOR    (t2, ptr128[MIN_MAX.mask2]);
+		xPAND   (t2, ptr128[sseMasks.MIN_MAX_1]);
+		xPOR    (t2, ptr128[sseMasks.MIN_MAX_2]);
 		xPSHUF.D(to, to, 0x50);
-		xPAND   (to, ptr128[MIN_MAX.mask1]);
-		xPOR    (to, ptr128[MIN_MAX.mask2]);
+		xPAND   (to, ptr128[sseMasks.MIN_MAX_1]);
+		xPOR    (to, ptr128[sseMasks.MIN_MAX_2]);
 		if (min) xMIN.PD(to, t2);
 		else     xMAX.PD(to, t2);
 
@@ -355,83 +368,100 @@ void MIN_MAX_SS(mV, const xmm& to, const xmm& from, const xmm& t1in, bool min)
 {
 	const xmm& t1 = t1in.IsEmpty() ? mVU.regAlloc->allocReg() : t1in;
 	xSHUF.PS(to, from, 0);
-	xPAND	(to, ptr128[MIN_MAX.mask1]);
-	xPOR	(to, ptr128[MIN_MAX.mask2]);
+	xPAND	(to, ptr128[sseMasks.MIN_MAX_1]);
+	xPOR	(to, ptr128[sseMasks.MIN_MAX_2]);
 	xPSHUF.D(t1, to, 0xee);
 	if (min) xMIN.PD(to, t1);
 	else	 xMAX.PD(to, t1);
 	if (t1 != t1in) mVU.regAlloc->clearNeeded(t1);
 }
 
-// Warning: Modifies all vectors in 'to' and 'from', and Modifies xmmT1 and xmmT2
-void ADD_SS(microVU& mVU, const xmm& to, const xmm& from, const xmm& t1in, const xmm& t2in)
+// Not Used! - TriAce games only need a portion of this code to boot (see function below)
+// This code attempts to perform a floating point ADD with only 1 guard bit,
+// whereas FPU calculations that follow the IEEE standard have 3 guard bits (guard|round|sticky)
+// Warning: Modifies all vectors in 'to' and 'from', and Modifies t1in
+void ADD_SS_Single_Guard_Bit(microVU& mVU, const xmm& to, const xmm& from, const xmm& t1in)
 {
 	const xmm& t1 = t1in.IsEmpty() ? mVU.regAlloc->allocReg() : t1in;
-	const xmm& t2 = t2in.IsEmpty() ? mVU.regAlloc->allocReg() : t2in;
 
-	xMOVAPS(t1, to);
-	xMOVAPS(t2, from);
-	xMOVD(ecx, to);
-	xSHR(ecx, 23);
-	xMOVD(eax, from);
-	xSHR(eax, 23);
-	xAND(ecx, 0xff);
-	xAND(eax, 0xff);
-	xSUB(ecx, eax); //ecx = exponent difference
+	xMOVD(eax, to);
+	xMOVD(ecx, from);
+	xSHR (eax, 23);
+	xSHR (ecx, 23);
+	xAND (eax, 0xff);
+	xAND (ecx, 0xff);
+	xSUB (ecx, eax); // Exponent Difference
 
-	xCMP(ecx, 25);
-	xForwardJGE8 case2;
-	xCMP(ecx, 0);
-	xForwardJG8 case3;
-	xForwardJE8 toend1;
-	xCMP(ecx, -25);
-	xForwardJLE8 case4;
+	xForwardJL8 case_neg;
+	xForwardJE8 case_end1;
 
-	// negative small
-		xNOT(ecx); // -ecx - 1
-		xMOV(eax, 0xffffffff);
-		xSHL(eax, cl);
-		xPCMP.EQB(to, to);
-		xMOVDZX(from, eax);
-		xMOVSS(to, from);
-		xPCMP.EQB(from, from);
-	xForwardJump8 toend2;
+	xCMP (ecx, 24);
+	xForwardJLE8 case_pos_small;
 
-	case2.SetTarget(); // positive large
-		xMOV(eax, 0x80000000);
-		xPCMP.EQB(from, from);
-		xMOVDZX(to, eax);
-		xMOVSS(from, to);
-		xPCMP.EQB(to, to);
-	xForwardJump8 toend3;
+	// case_pos_big:
+	xPAND(to, ptr128[sseMasks.ADD_SS]);
+	xForwardJump8 case_end2;
 
-	case3.SetTarget(); // positive small
-		xDEC(ecx);
-		xMOV(eax, 0xffffffff);
-		xSHL(eax, cl);
-		xPCMP.EQB(from, from);
-		xMOVDZX(to, eax);
-		xMOVSS(from, to);
-		xPCMP.EQB(to, to);
-	xForwardJump8 toend4;
+	case_pos_small.SetTarget();
+	xDEC   (ecx);
+	xMOV   (eax, 0xffffffff);
+	xSHL   (eax, cl);
+	xMOVDZX(t1, eax);
+	xPAND  (to, t1);
+	xForwardJump8 case_end3;
 
-	case4.SetTarget(); // negative large
-		xMOV(eax, 0x80000000);
-		xPCMP.EQB(to, to);
-		xMOVDZX(from, eax);
-		xMOVSS(to, from);
-		xPCMP.EQB(from, from);
+	case_neg.SetTarget();
+	xCMP (ecx, -24);
+	xForwardJGE8 case_neg_small;
 
-	toend1.SetTarget();
-	toend2.SetTarget();
-	toend3.SetTarget();
-	toend4.SetTarget();
+	// case_neg_big:
+	xPAND(from, ptr128[sseMasks.ADD_SS]);
+	xForwardJump8 case_end4;
+
+	case_neg_small.SetTarget();
+	xNOT   (ecx); // -ecx - 1
+	xMOV   (eax, 0xffffffff);
+	xSHL   (eax, cl);
+	xMOVDZX(t1, eax);
+	xPAND  (from, t1);
+
+	case_end1.SetTarget();
+	case_end2.SetTarget();
+	case_end3.SetTarget();
+	case_end4.SetTarget();
 
-	xAND.PS(to,   t1); // to   contains mask
-	xAND.PS(from, t2); // from contains mask
 	xADD.SS(to, from);
 	if (t1 != t1in) mVU.regAlloc->clearNeeded(t1);
-	if (t2 != t2in) mVU.regAlloc->clearNeeded(t2);
+}
+
+// Turns out only this is needed to get TriAce games booting with mVU
+// Modifies from's lower vector
+void ADD_SS_TriAceHack(microVU& mVU, const xmm& to, const xmm& from)
+{
+	xMOVD(eax, to);
+	xMOVD(ecx, from);
+	xSHR (eax, 23);
+	xSHR (ecx, 23);
+	xAND (eax, 0xff);
+	xAND (ecx, 0xff);
+	xSUB (ecx, eax); // Exponent Difference
+
+	xCMP (ecx, -25);
+	xForwardJLE8 case_neg_big;
+	xCMP (ecx,  25);
+	xForwardJL8  case_end1;
+
+	// case_pos_big:
+	xPAND(to, ptr128[sseMasks.ADD_SS]);
+	xForwardJump8 case_end2;
+
+	case_neg_big.SetTarget();
+	xPAND(from, ptr128[sseMasks.ADD_SS]);
+
+	case_end1.SetTarget();
+	case_end2.SetTarget();
+
+	xADD.SS(to, from);
 }
 
 #define clampOp(opX, isPS) {					\
@@ -464,7 +494,7 @@ void SSE_MINSS(mV, const xmm& to, const xmm& from, const xmm& t1 = xEmptyReg, co
 void SSE_ADD2SS(mV, const xmm& to, const xmm& from, const xmm& t1 = xEmptyReg, const xmm& t2 = xEmptyReg)
 {
 	if (!CHECK_VUADDSUBHACK) { clampOp(xADD.SS, 0); }
-	else					 { ADD_SS(mVU, to, from, t1, t2); }
+	else					 { ADD_SS_TriAceHack(mVU, to, from); }
 }
 
 // Does same as SSE_ADDPS since tri-ace games only need SS implementation of VUADDSUBHACK...