From 857ab07f1cc7ab77c3449d1a4cb25220876c2223 Mon Sep 17 00:00:00 2001
From: refractionpcsx2 <refraction@gmail.com>
Date: Mon, 6 Sep 2021 17:47:34 +0100
Subject: [PATCH] VUInt: Fix macro flags and implement EFU ops correctly.

Also use the microVU-style Tri-Ace hack, as the interpreters now seem to need it.
---
 pcsx2/VUops.cpp | 99 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 66 insertions(+), 33 deletions(-)

diff --git a/pcsx2/VUops.cpp b/pcsx2/VUops.cpp
index e15c85e469..47131495eb 100644
--- a/pcsx2/VUops.cpp
+++ b/pcsx2/VUops.cpp
@@ -477,7 +477,7 @@ static __fi float vuADD_TriAceHack(u32 a, u32 b) {
 	// On VU0 TriAce Games use ADDi and expects these bit-perfect results:
 	//if (a == 0xb3e2a619 && b == 0x42546666) return vuDouble(0x42546666);
 	//if (a == 0x8b5b19e9 && b == 0xc7f079b3) return vuDouble(0xc7f079b3);
-	if (a == 0x4b1ed4a8 && b == 0x43a02666) return vuDouble(0x4b1ed5e7);
+	//if (a == 0x4b1ed4a8 && b == 0x43a02666) return vuDouble(0x4b1ed5e7);
 	//if (a == 0x7d1ca47b && b == 0x42f23333) return vuDouble(0x7d1ca47b);
 
 	// In the 3rd case, some other rounding error is giving us incorrect
@@ -487,20 +487,19 @@ static __fi float vuADD_TriAceHack(u32 a, u32 b) {
 	// microVU gets the correct operands and result. The interps likely
 	// don't get it due to rounding towards nearest in other calculations.
 
-	if (0) {
-		// microVU uses something like this to get TriAce games working,
-		// but VU interpreters don't seem to need it currently:
-		s32 aExp = (a >> 23) & 0xff;
-		s32 bExp = (b >> 23) & 0xff;
-		if (aExp - bExp >= 25) b &= 0x80000000;
-		if (aExp - bExp <=-25) a &= 0x80000000;
-		float ret = vuDouble(a) + vuDouble(b);
-		DevCon.WriteLn("aExp = %d, bExp = %d", aExp, bExp);
-		DevCon.WriteLn("0x%08x + 0x%08x = 0x%08x", a, b, (u32&)ret);
-		DevCon.WriteLn("%f + %f = %f", vuDouble(a), vuDouble(b), ret);
-		return ret;
-	}
-	return vuDouble(a) + vuDouble(b);
+	// microVU uses something like this to get TriAce games working.
+	// The VU interpreters previously did not seem to need it, but as of
+	// Sept 2021, now that the interpreters don't suck, they do need it.
+	// - Refraction
+	s32 aExp = (a >> 23) & 0xff;
+	s32 bExp = (b >> 23) & 0xff;
+	if (aExp - bExp >= 25) b &= 0x80000000;
+	if (aExp - bExp <=-25) a &= 0x80000000;
+	float ret = vuDouble(a) + vuDouble(b);
+	//DevCon.WriteLn("aExp = %d, bExp = %d", aExp, bExp);
+	//DevCon.WriteLn("0x%08x + 0x%08x = 0x%08x", a, b, (u32&)ret);
+	//DevCon.WriteLn("%f + %f = %f", vuDouble(a), vuDouble(b), ret);
+	return ret;
 }
 
 void _vuABS(VURegs * VU) {
@@ -2231,20 +2230,40 @@ static __ri void _vuERLENG(VURegs * VU) {
 	VU->p.F = p;
 }
 
+
+static __ri float _vuCalculateEATAN(float inputvalue) {
+	float eatanconst[9] = { 0.999999344348907f, -0.333298563957214f, 0.199465364217758f, -0.13085337519646f,
+						0.096420042216778f, -0.055909886956215f, 0.021861229091883f, -0.004054057877511f,
+						0.785398185253143f };
+
+	float result = (eatanconst[0] * inputvalue) + (eatanconst[1] * pow(inputvalue, 3)) + (eatanconst[2] * pow(inputvalue, 5))
+		+ (eatanconst[3] * pow(inputvalue, 7)) + (eatanconst[4] * pow(inputvalue, 9)) + (eatanconst[5] * pow(inputvalue, 11))
+		+ (eatanconst[6] * pow(inputvalue, 13)) + (eatanconst[7] * pow(inputvalue, 15));
+
+	result += eatanconst[8];
+
+	result = vuDouble(*(u32*)&result);
+
+	return result;
+}
+
+static __ri void _vuEATAN(VURegs* VU) {
+	float p = _vuCalculateEATAN(vuDouble(VU->VF[_Fs_].UL[_Fsf_]));
+	VU->p.F = p;
+}
+
 static __ri void _vuEATANxy(VURegs * VU) {
-	DevCon.Warning("EATANxy");
 	float p = 0;
 	if(vuDouble(VU->VF[_Fs_].i.x) != 0) {
-		p = atan2(vuDouble(VU->VF[_Fs_].i.y), vuDouble(VU->VF[_Fs_].i.x));
+		p = _vuCalculateEATAN(vuDouble(VU->VF[_Fs_].i.y) / vuDouble(VU->VF[_Fs_].i.x));
 	}
 	VU->p.F = p;
 }
 
 static __ri void _vuEATANxz(VURegs * VU) {
-	DevCon.Warning("EATANxz");
 	float p = 0;
 	if(vuDouble(VU->VF[_Fs_].i.x) != 0) {
-		p = atan2(vuDouble(VU->VF[_Fs_].i.z), vuDouble(VU->VF[_Fs_].i.x));
+		p = _vuCalculateEATAN(vuDouble(VU->VF[_Fs_].i.z) / vuDouble(VU->VF[_Fs_].i.x));
 	}
 	VU->p.F = p;
 }
@@ -2282,19 +2301,23 @@ static __ri void _vuERSQRT(VURegs * VU) {
 }
 
 static __ri void _vuESIN(VURegs * VU) {
-	DevCon.Warning("ESIN");
-	float p = sin(vuDouble(VU->VF[_Fs_].UL[_Fsf_]));
-	VU->p.F = p;
-}
+	float sinconsts[5] = { 1.0f, -0.166666567325592f, 0.008333025500178f, -0.000198074136279f, 0.000002601886990f };
+	float p = vuDouble(VU->VF[_Fs_].UL[_Fsf_]);
 
-static __ri void _vuEATAN(VURegs * VU) {
-	DevCon.Warning("EATAN");
-	float p = atan(vuDouble(VU->VF[_Fs_].UL[_Fsf_]));
-	VU->p.F = p;
+	p = (sinconsts[0] * p) + (sinconsts[1] * pow(p, 3)) + (sinconsts[2] * pow(p, 5)) + (sinconsts[3] * pow(p, 7)) + (sinconsts[4] * pow(p, 9));
+	VU->p.F = vuDouble(*(u32*)&p);
 }
 
 static __ri void _vuEEXP(VURegs * VU) {
-	float p = exp(-(vuDouble(VU->VF[_Fs_].UL[_Fsf_])));
+	float consts[6] = { 0.249998688697815f, 0.031257584691048f, 0.002591371303424f,
+						0.000171562001924f, 0.000005430199963f, 0.000000690600018f };
+	float p = vuDouble(VU->VF[_Fs_].UL[_Fsf_]);
+
+	p = 1.0f + (consts[0] * p) + (consts[1] * pow(p, 2)) + (consts[2] * pow(p, 3)) + (consts[3] * pow(p, 4)) + (consts[4] * pow(p, 5)) + (consts[5] * pow(p, 6));
+	p = pow(p, 4);
+	p = vuDouble(*(u32*)&p);
+	p = 1 / p;
+
 	VU->p.F = p;
 }
 
@@ -4266,14 +4289,24 @@ _vuRegsTables(VU1, VU1regs, Fnptr_VuRegsN)
 
 static __fi void SYNCMSFLAGS()
 {
-	VU0.VI[REG_STATUS_FLAG].UL = VU0.statusflag;
+	VU0.VI[REG_STATUS_FLAG].UL = (VU0.VI[REG_STATUS_FLAG].UL & 0xC30) | (VU0.statusflag & 0xF) | ((VU0.statusflag & 0xF) << 6);
 	VU0.VI[REG_MAC_FLAG].UL = VU0.macflag;
 }
 
+static __fi void SYNCCLIPFLAG()
+{
+	VU0.VI[REG_CLIP_FLAG].UL = VU0.clipflag;
+}
+
+static __fi void SYNCSTATUSFLAG()
+{
+	VU0.VI[REG_STATUS_FLAG].UL = (VU0.VI[REG_STATUS_FLAG].UL & 0x3F) | (VU0.statusflag & 0xFC0);
+}
+
 static __fi void SYNCFDIV()
 {
 	VU0.VI[REG_Q].UL = VU0.q.UL;
-	VU0.VI[REG_STATUS_FLAG].UL = VU0.statusflag;
+	VU0.VI[REG_STATUS_FLAG].UL = (VU0.VI[REG_STATUS_FLAG].UL & 0x3CF) | (VU0.statusflag & 0x30) | ((VU0.statusflag & 0x30) << 6);
 }
 
 void VABS()  { VU0.code = cpuRegs.code; _vuABS(&VU0); }
@@ -4370,7 +4403,7 @@ void VITOF0()  { VU0.code = cpuRegs.code; _vuITOF0(&VU0); }
 void VITOF4()  { VU0.code = cpuRegs.code; _vuITOF4(&VU0); }
 void VITOF12() { VU0.code = cpuRegs.code; _vuITOF12(&VU0); }
 void VITOF15() { VU0.code = cpuRegs.code; _vuITOF15(&VU0); }
-void VCLIPw()  { VU0.code = cpuRegs.code; _vuCLIP(&VU0); VU0.VI[REG_CLIP_FLAG].UL = VU0.clipflag; }
+void VCLIPw()  { VU0.code = cpuRegs.code; _vuCLIP(&VU0); SYNCCLIPFLAG(); }
 
 void VDIV()    { VU0.code = cpuRegs.code; _vuDIV(&VU0); SYNCFDIV(); }
 void VSQRT()   { VU0.code = cpuRegs.code; _vuSQRT(&VU0); SYNCFDIV(); }
@@ -4404,14 +4437,14 @@ void VWAITQ()  { VU0.code = cpuRegs.code; _vuWAITQ(&VU0); }
 void VFSAND()  { VU0.code = cpuRegs.code; _vuFSAND(&VU0); }
 void VFSEQ()   { VU0.code = cpuRegs.code; _vuFSEQ(&VU0); }
 void VFSOR()   { VU0.code = cpuRegs.code; _vuFSOR(&VU0); }
-void VFSSET()  { VU0.code = cpuRegs.code; _vuFSSET(&VU0); }
+void VFSSET()  { VU0.code = cpuRegs.code; _vuFSSET(&VU0); SYNCSTATUSFLAG(); }
 void VFMAND()  { VU0.code = cpuRegs.code; _vuFMAND(&VU0); }
 void VFMEQ()   { VU0.code = cpuRegs.code; _vuFMEQ(&VU0); }
 void VFMOR()   { VU0.code = cpuRegs.code; _vuFMOR(&VU0); }
 void VFCAND()  { VU0.code = cpuRegs.code; _vuFCAND(&VU0); }
 void VFCEQ()   { VU0.code = cpuRegs.code; _vuFCEQ(&VU0); }
 void VFCOR()   { VU0.code = cpuRegs.code; _vuFCOR(&VU0); }
-void VFCSET()  { VU0.code = cpuRegs.code; _vuFCSET(&VU0); }
+void VFCSET()  { VU0.code = cpuRegs.code; _vuFCSET(&VU0); SYNCCLIPFLAG(); }
 void VFCGET()  { VU0.code = cpuRegs.code; _vuFCGET(&VU0); }
 void VXITOP()  { VU0.code = cpuRegs.code; _vuXITOP(&VU0); }