From f423a9c41d78bba3389b5923808b064695e234d4 Mon Sep 17 00:00:00 2001 From: cottonvibes Date: Mon, 7 Nov 2011 10:20:56 +0000 Subject: [PATCH] microVU: - Rewrote and simplified the TriAce gamefix VU interpreter: - Implemented a TriAce gamefix for vu0 interpreter git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4960 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/VUops.cpp | 50 +++++++++-- pcsx2/x86/microVU_Misc.inl | 180 +++++++++++++++++++++---------------- 2 files changed, 150 insertions(+), 80 deletions(-) diff --git a/pcsx2/VUops.cpp b/pcsx2/VUops.cpp index bcf40df61e..c1debb8fa7 100644 --- a/pcsx2/VUops.cpp +++ b/pcsx2/VUops.cpp @@ -339,6 +339,36 @@ static __fi float vuDouble(u32 f) } #endif +static __fi float vuADD_TriAceHack(u32 a, u32 b) { + // On VU0 TriAce Games use ADDi and expect these bit-perfect results: + //if (a == 0xb3e2a619 && b == 0x42546666) return vuDouble(0x42546666); + //if (a == 0x8b5b19e9 && b == 0xc7f079b3) return vuDouble(0xc7f079b3); + if (a == 0x4b1ed4a8 && b == 0x43a02666) return vuDouble(0x4b1ed5e7); + //if (a == 0x7d1ca47b && b == 0x42f23333) return vuDouble(0x7d1ca47b); + + // In the 3rd case, some other rounding error is giving us incorrect + // operands ('a' is wrong); and therefore an incorrect result. + // We're getting: 0x4b1ed4a8 + 0x43a02666 = 0x4b1ed5e8 + // We should be getting: 0x4b1ed4a7 + 0x43a02666 = 0x4b1ed5e7 + // microVU gets the correct operands and result. The interps likely + // don't get it due to rounding towards nearest in other calculations. 
+ + if (0) { + // microVU uses something like this to get TriAce games working, + // but VU interpreters don't seem to need it currently: + s32 aExp = (a >> 23) & 0xff; + s32 bExp = (b >> 23) & 0xff; + if (aExp - bExp >= 25) b &= 0x80000000; + if (aExp - bExp <=-25) a &= 0x80000000; + float ret = vuDouble(a) + vuDouble(b); + DevCon.WriteLn("aExp = %d, bExp = %d", aExp, bExp); + DevCon.WriteLn("0x%08x + 0x%08x = 0x%08x", a, b, (u32&)ret); + DevCon.WriteLn("%f + %f = %f", vuDouble(a), vuDouble(b), ret); + return ret; + } + return vuDouble(a) + vuDouble(b); +} + void _vuABS(VURegs * VU) { if (_Ft_ == 0) return; @@ -367,11 +397,21 @@ static __fi void _vuADDi(VURegs * VU) { if (_Fd_ == 0) dst = &RDzero; else dst = &VU->VF[_Fd_]; - if (_X){ dst->i.x = VU_MACx_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.x) + vuDouble(VU->VI[REG_I].UL));} else VU_MACx_CLEAR(VU); - if (_Y){ dst->i.y = VU_MACy_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.y) + vuDouble(VU->VI[REG_I].UL));} else VU_MACy_CLEAR(VU); - if (_Z){ dst->i.z = VU_MACz_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.z) + vuDouble(VU->VI[REG_I].UL));} else VU_MACz_CLEAR(VU); - if (_W){ dst->i.w = VU_MACw_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.w) + vuDouble(VU->VI[REG_I].UL));} else VU_MACw_CLEAR(VU); - VU_STAT_UPDATE(VU); + if (!CHECK_VUADDSUBHACK) { + if (_X){ dst->i.x = VU_MACx_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.x) + vuDouble(VU->VI[REG_I].UL));} else VU_MACx_CLEAR(VU); + if (_Y){ dst->i.y = VU_MACy_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.y) + vuDouble(VU->VI[REG_I].UL));} else VU_MACy_CLEAR(VU); + if (_Z){ dst->i.z = VU_MACz_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.z) + vuDouble(VU->VI[REG_I].UL));} else VU_MACz_CLEAR(VU); + if (_W){ dst->i.w = VU_MACw_UPDATE(VU, vuDouble(VU->VF[_Fs_].i.w) + vuDouble(VU->VI[REG_I].UL));} else VU_MACw_CLEAR(VU); + VU_STAT_UPDATE(VU); + } + else { + if (_X){ dst->i.x = VU_MACx_UPDATE(VU, vuADD_TriAceHack(VU->VF[_Fs_].i.x, VU->VI[REG_I].UL));} else VU_MACx_CLEAR(VU); + if (_Y){ dst->i.y = VU_MACy_UPDATE(VU, 
vuADD_TriAceHack(VU->VF[_Fs_].i.y, VU->VI[REG_I].UL));} else VU_MACy_CLEAR(VU); + if (_Z){ dst->i.z = VU_MACz_UPDATE(VU, vuADD_TriAceHack(VU->VF[_Fs_].i.z, VU->VI[REG_I].UL));} else VU_MACz_CLEAR(VU); + if (_W){ dst->i.w = VU_MACw_UPDATE(VU, vuADD_TriAceHack(VU->VF[_Fs_].i.w, VU->VI[REG_I].UL));} else VU_MACw_CLEAR(VU); + VU_STAT_UPDATE(VU); + } + }/*Reworked from define to function. asadr*/ static __fi void _vuADDq(VURegs * VU) { diff --git a/pcsx2/x86/microVU_Misc.inl b/pcsx2/x86/microVU_Misc.inl index b1c6aad445..73d1a8cf78 100644 --- a/pcsx2/x86/microVU_Misc.inl +++ b/pcsx2/x86/microVU_Misc.inl @@ -214,8 +214,7 @@ void mVUmergeRegs(const xmm& dest, const xmm& src, int xyzw, bool modXYZW) //------------------------------------------------------------------ // Backup Volatile Regs (EAX, ECX, EDX, MM0~7, XMM0~7, are all volatile according to 32bit Win/Linux ABI) -__fi void mVUbackupRegs(microVU& mVU, bool toMemory = false) -{ +__fi void mVUbackupRegs(microVU& mVU, bool toMemory = false) { if (toMemory) { for(int i = 0; i < 8; i++) { xMOVAPS(ptr128[&mVU.xmmBackup[i][0]], xmm(i)); @@ -228,8 +227,7 @@ __fi void mVUbackupRegs(microVU& mVU, bool toMemory = false) } // Restore Volatile Regs -__fi void mVUrestoreRegs(microVU& mVU, bool fromMemory = false) -{ +__fi void mVUrestoreRegs(microVU& mVU, bool fromMemory = false) { if (fromMemory) { for(int i = 0; i < 8; i++) { xMOVAPS(xmm(i), ptr128[&mVU.xmmBackup[i][0]]); @@ -238,6 +236,20 @@ __fi void mVUrestoreRegs(microVU& mVU, bool fromMemory = false) else xMOVAPS(xmmPQ, ptr128[&mVU.xmmBackup[xmmPQ.Id][0]]); } +_mVUt void __fc mVUprintRegs() { + microVU& mVU = mVUx; + for(int i = 0; i < 8; i++) { + Console.WriteLn("xmm%d = [0x%08x,0x%08x,0x%08x,0x%08x]", i, + mVU.xmmBackup[i][0], mVU.xmmBackup[i][1], + mVU.xmmBackup[i][2], mVU.xmmBackup[i][3]); + } + for(int i = 0; i < 8; i++) { + Console.WriteLn("xmm%d = [%f,%f,%f,%f]", i, + (float&)mVU.xmmBackup[i][0], (float&)mVU.xmmBackup[i][1], + (float&)mVU.xmmBackup[i][2], 
(float&)mVU.xmmBackup[i][3]); + } +} + // Gets called by mVUaddrFix at execution-time static void __fc mVUwarningRegAccess(u32 prog, u32 pc) { Console.Error("microVU0 Warning: Accessing VU1 Regs! [%04x] [%x]", pc, prog); @@ -288,12 +300,13 @@ __fi void mVUaddrFix(mV, const x32& gprReg) // Micro VU - Custom SSE Instructions //------------------------------------------------------------------ -struct SSEMaskPair { u32 mask1[4], mask2[4]; }; +struct SSEMasks { u32 MIN_MAX_1[4], MIN_MAX_2[4], ADD_SS[4]; }; -static const __aligned16 SSEMaskPair MIN_MAX = +static const __aligned16 SSEMasks sseMasks = { {0xffffffff, 0x80000000, 0xffffffff, 0x80000000}, - {0x00000000, 0x40000000, 0x00000000, 0x40000000} + {0x00000000, 0x40000000, 0x00000000, 0x40000000}, + {0x80000000, 0xffffffff, 0xffffffff, 0xffffffff} }; @@ -306,21 +319,21 @@ void MIN_MAX_PS(microVU& mVU, const xmm& to, const xmm& from, const xmm& t1in, c if (0) { // use double comparison // ZW xPSHUF.D(t1, to, 0xfa); - xPAND (t1, ptr128[MIN_MAX.mask1]); - xPOR (t1, ptr128[MIN_MAX.mask2]); + xPAND (t1, ptr128[sseMasks.MIN_MAX_1]); + xPOR (t1, ptr128[sseMasks.MIN_MAX_2]); xPSHUF.D(t2, from, 0xfa); - xPAND (t2, ptr128[MIN_MAX.mask1]); - xPOR (t2, ptr128[MIN_MAX.mask2]); + xPAND (t2, ptr128[sseMasks.MIN_MAX_1]); + xPOR (t2, ptr128[sseMasks.MIN_MAX_2]); if (min) xMIN.PD(t1, t2); else xMAX.PD(t1, t2); // XY xPSHUF.D(t2, from, 0x50); - xPAND (t2, ptr128[MIN_MAX.mask1]); - xPOR (t2, ptr128[MIN_MAX.mask2]); + xPAND (t2, ptr128[sseMasks.MIN_MAX_1]); + xPOR (t2, ptr128[sseMasks.MIN_MAX_2]); xPSHUF.D(to, to, 0x50); - xPAND (to, ptr128[MIN_MAX.mask1]); - xPOR (to, ptr128[MIN_MAX.mask2]); + xPAND (to, ptr128[sseMasks.MIN_MAX_1]); + xPOR (to, ptr128[sseMasks.MIN_MAX_2]); if (min) xMIN.PD(to, t2); else xMAX.PD(to, t2); @@ -355,83 +368,100 @@ void MIN_MAX_SS(mV, const xmm& to, const xmm& from, const xmm& t1in, bool min) { const xmm& t1 = t1in.IsEmpty() ? 
mVU.regAlloc->allocReg() : t1in; xSHUF.PS(to, from, 0); - xPAND (to, ptr128[MIN_MAX.mask1]); - xPOR (to, ptr128[MIN_MAX.mask2]); + xPAND (to, ptr128[sseMasks.MIN_MAX_1]); + xPOR (to, ptr128[sseMasks.MIN_MAX_2]); xPSHUF.D(t1, to, 0xee); if (min) xMIN.PD(to, t1); else xMAX.PD(to, t1); if (t1 != t1in) mVU.regAlloc->clearNeeded(t1); } -// Warning: Modifies all vectors in 'to' and 'from', and Modifies xmmT1 and xmmT2 -void ADD_SS(microVU& mVU, const xmm& to, const xmm& from, const xmm& t1in, const xmm& t2in) +// Not Used! - TriAce games only need a portion of this code to boot (see function below) +// What this code attempts to do is do a floating point ADD with only 1 guard bit, +// whereas FPU calculations that follow the IEEE standard have 3 guard bits (guard|round|sticky) +// Warning: Modifies all vectors in 'to' and 'from', and Modifies t1in +void ADD_SS_Single_Guard_Bit(microVU& mVU, const xmm& to, const xmm& from, const xmm& t1in) { const xmm& t1 = t1in.IsEmpty() ? mVU.regAlloc->allocReg() : t1in; - const xmm& t2 = t2in.IsEmpty() ? 
mVU.regAlloc->allocReg() : t2in; - xMOVAPS(t1, to); - xMOVAPS(t2, from); - xMOVD(ecx, to); - xSHR(ecx, 23); - xMOVD(eax, from); - xSHR(eax, 23); - xAND(ecx, 0xff); - xAND(eax, 0xff); - xSUB(ecx, eax); //ecx = exponent difference + xMOVD(eax, to); + xMOVD(ecx, from); + xSHR (eax, 23); + xSHR (ecx, 23); + xAND (eax, 0xff); + xAND (ecx, 0xff); + xSUB (ecx, eax); // Exponent Difference - xCMP(ecx, 25); - xForwardJGE8 case2; - xCMP(ecx, 0); - xForwardJG8 case3; - xForwardJE8 toend1; - xCMP(ecx, -25); - xForwardJLE8 case4; + xForwardJL8 case_neg; + xForwardJE8 case_end1; - // negative small - xNOT(ecx); // -ecx - 1 - xMOV(eax, 0xffffffff); - xSHL(eax, cl); - xPCMP.EQB(to, to); - xMOVDZX(from, eax); - xMOVSS(to, from); - xPCMP.EQB(from, from); - xForwardJump8 toend2; + xCMP (ecx, 24); + xForwardJLE8 case_pos_small; - case2.SetTarget(); // positive large - xMOV(eax, 0x80000000); - xPCMP.EQB(from, from); - xMOVDZX(to, eax); - xMOVSS(from, to); - xPCMP.EQB(to, to); - xForwardJump8 toend3; + // case_pos_big: + xPAND(to, ptr128[sseMasks.ADD_SS]); + xForwardJump8 case_end2; - case3.SetTarget(); // positive small - xDEC(ecx); - xMOV(eax, 0xffffffff); - xSHL(eax, cl); - xPCMP.EQB(from, from); - xMOVDZX(to, eax); - xMOVSS(from, to); - xPCMP.EQB(to, to); - xForwardJump8 toend4; + case_pos_small.SetTarget(); + xDEC (ecx); + xMOV (eax, 0xffffffff); + xSHL (eax, cl); + xMOVDZX(t1, eax); + xPAND (to, t1); + xForwardJump8 case_end3; - case4.SetTarget(); // negative large - xMOV(eax, 0x80000000); - xPCMP.EQB(to, to); - xMOVDZX(from, eax); - xMOVSS(to, from); - xPCMP.EQB(from, from); + case_neg.SetTarget(); + xCMP (ecx, -24); + xForwardJGE8 case_neg_small; - toend1.SetTarget(); - toend2.SetTarget(); - toend3.SetTarget(); - toend4.SetTarget(); + // case_neg_big: + xPAND(from, ptr128[sseMasks.ADD_SS]); + xForwardJump8 case_end4; + + case_neg_small.SetTarget(); + xNOT (ecx); // -ecx - 1 + xMOV (eax, 0xffffffff); + xSHL (eax, cl); + xMOVDZX(t1, eax); + xPAND (from, t1); + + 
case_end1.SetTarget(); + case_end2.SetTarget(); + case_end3.SetTarget(); + case_end4.SetTarget(); - xAND.PS(to, t1); // to contains mask - xAND.PS(from, t2); // from contains mask xADD.SS(to, from); if (t1 != t1in) mVU.regAlloc->clearNeeded(t1); - if (t2 != t2in) mVU.regAlloc->clearNeeded(t2); +} + +// Turns out only this is needed to get TriAce games booting with mVU +// Modifies from's lower vector +void ADD_SS_TriAceHack(microVU& mVU, const xmm& to, const xmm& from) +{ + xMOVD(eax, to); + xMOVD(ecx, from); + xSHR (eax, 23); + xSHR (ecx, 23); + xAND (eax, 0xff); + xAND (ecx, 0xff); + xSUB (ecx, eax); // Exponent Difference + + xCMP (ecx, -25); + xForwardJLE8 case_neg_big; + xCMP (ecx, 25); + xForwardJL8 case_end1; + + // case_pos_big: + xPAND(to, ptr128[sseMasks.ADD_SS]); + xForwardJump8 case_end2; + + case_neg_big.SetTarget(); + xPAND(from, ptr128[sseMasks.ADD_SS]); + + case_end1.SetTarget(); + case_end2.SetTarget(); + + xADD.SS(to, from); } #define clampOp(opX, isPS) { \ @@ -464,7 +494,7 @@ void SSE_MINSS(mV, const xmm& to, const xmm& from, const xmm& t1 = xEmptyReg, co void SSE_ADD2SS(mV, const xmm& to, const xmm& from, const xmm& t1 = xEmptyReg, const xmm& t2 = xEmptyReg) { if (!CHECK_VUADDSUBHACK) { clampOp(xADD.SS, 0); } - else { ADD_SS(mVU, to, from, t1, t2); } + else { ADD_SS_TriAceHack(mVU, to, from); } } // Does same as SSE_ADDPS since tri-ace games only need SS implementation of VUADDSUBHACK...