diff --git a/pcsx2/VU0.cpp b/pcsx2/VU0.cpp index d616bdb854..1218ad84dd 100644 --- a/pcsx2/VU0.cpp +++ b/pcsx2/VU0.cpp @@ -167,7 +167,7 @@ void CTC2() { break; case REG_CMSAR1: // REG_CMSAR1 if (!(VU0.VI[REG_VPU_STAT].UL & 0x100) ) { - vu1ExecMicro(cpuRegs.GPR.r[_Rt_].US[0]); // Execute VU1 Micro SubRoutine + vu1ExecMicro(cpuRegs.GPR.r[_Rt_].US[0] * 8); // Execute VU1 Micro SubRoutine vif1VUFinish(); } break; diff --git a/pcsx2/x86/microVU_Lower.inl b/pcsx2/x86/microVU_Lower.inl index 01af2ce01c..5b54e63ffe 100644 --- a/pcsx2/x86/microVU_Lower.inl +++ b/pcsx2/x86/microVU_Lower.inl @@ -153,8 +153,8 @@ mVUop(mVU_RSQRT) { #define EATANhelper(addr) { \ SSE_MULSS(mVU, t2, Fs); \ SSE_MULSS(mVU, t2, Fs); \ - xMOVAPS (t1, t2); \ - xMUL.SS (t1, ptr32[addr]); \ + xMOVAPS (t1, t2); \ + xMUL.SS (t1, ptr32[addr]); \ SSE_ADDSS(mVU, PQ, t1); \ } @@ -182,7 +182,7 @@ mVUop(mVU_EATAN) { const xmm& t2 = mVU.regAlloc->allocReg(); xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance xMOVSS (xmmPQ, Fs); - xSUB.SS(Fs, ptr32[mVUglob.one]); + xSUB.SS(Fs, ptr32[mVUglob.one]); xADD.SS(xmmPQ, ptr32[mVUglob.one]); SSE_DIVSS(mVU, Fs, xmmPQ); mVU_EATAN_(mVU, xmmPQ, Fs, t1, t2); @@ -238,8 +238,8 @@ mVUop(mVU_EATANxz) { #define eexpHelper(addr) { \ SSE_MULSS(mVU, t2, Fs); \ - xMOVAPS (t1, t2); \ - xMUL.SS (t1, ptr32[addr]); \ + xMOVAPS (t1, t2); \ + xMUL.SS (t1, ptr32[addr]); \ SSE_ADDSS(mVU, xmmPQ, t1); \ } @@ -253,22 +253,22 @@ mVUop(mVU_EEXP) { xMOVSS (xmmPQ, Fs); xMUL.SS (xmmPQ, ptr32[mVUglob.E1]); xADD.SS (xmmPQ, ptr32[mVUglob.one]); - xMOVAPS (t1, Fs); + xMOVAPS (t1, Fs); SSE_MULSS(mVU, t1, Fs); - xMOVAPS (t2, t1); - xMUL.SS (t1, ptr32[mVUglob.E2]); + xMOVAPS (t2, t1); + xMUL.SS (t1, ptr32[mVUglob.E2]); SSE_ADDSS(mVU, xmmPQ, t1); eexpHelper(&mVUglob.E3); eexpHelper(&mVUglob.E4); eexpHelper(&mVUglob.E5); SSE_MULSS(mVU, t2, Fs); - xMUL.SS (t2, ptr32[mVUglob.E6]); + xMUL.SS (t2, ptr32[mVUglob.E6]); SSE_ADDSS(mVU, xmmPQ, t2); SSE_MULSS(mVU, xmmPQ, xmmPQ); 
SSE_MULSS(mVU, xmmPQ, xmmPQ); - xMOVSSZX (t2, ptr32[mVUglob.one]); + xMOVSSZX (t2, ptr32[mVUglob.one]); SSE_DIVSS(mVU, t2, xmmPQ); - xMOVSS (xmmPQ, t2); + xMOVSS (xmmPQ, t2); xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back mVU.regAlloc->clearNeeded(Fs); mVU.regAlloc->clearNeeded(t1); @@ -285,12 +285,12 @@ static __fi void mVU_sumXYZ(mV, const xmm& PQ, const xmm& Fs) { xMOVSS(PQ, Fs); } else { - SSE_MULPS(mVU, Fs, Fs); // wzyx ^ 2 - xMOVSS (PQ, Fs); // x ^ 2 - xPSHUF.D (Fs, Fs, 0xe1); // wzyx -> wzxy - SSE_ADDSS(mVU, PQ, Fs); // x ^ 2 + y ^ 2 - xPSHUF.D (Fs, Fs, 0xd2); // wzxy -> wxyz - SSE_ADDSS(mVU, PQ, Fs); // x ^ 2 + y ^ 2 + z ^ 2 + SSE_MULPS(mVU, Fs, Fs); // wzyx ^ 2 + xMOVSS (PQ, Fs); // x ^ 2 + xPSHUF.D (Fs, Fs, 0xe1); // wzyx -> wzxy + SSE_ADDSS(mVU, PQ, Fs); // x ^ 2 + y ^ 2 + xPSHUF.D (Fs, Fs, 0xd2); // wzxy -> wxyz + SSE_ADDSS(mVU, PQ, Fs); // x ^ 2 + y ^ 2 + z ^ 2 } } @@ -298,10 +298,10 @@ mVUop(mVU_ELENG) { pass1 { mVUanalyzeEFU2(mVU, _Fs_, 18); } pass2 { const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W); - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance mVU_sumXYZ(mVU, xmmPQ, Fs); - xSQRT.SS (xmmPQ, xmmPQ); - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back + xSQRT.SS (xmmPQ, xmmPQ); + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back mVU.regAlloc->clearNeeded(Fs); mVU.profiler.EmitOp(opELENG); } @@ -312,12 +312,12 @@ mVUop(mVU_ERCPR) { pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 12); } pass2 { const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_))); - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance - xMOVSS (xmmPQ, Fs); - xMOVSSZX (Fs, ptr32[mVUglob.one]); + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 
0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + xMOVSS (xmmPQ, Fs); + xMOVSSZX (Fs, ptr32[mVUglob.one]); SSE_DIVSS(mVU, Fs, xmmPQ); - xMOVSS (xmmPQ, Fs); - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back + xMOVSS (xmmPQ, Fs); + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back mVU.regAlloc->clearNeeded(Fs); mVU.profiler.EmitOp(opERCPR); } @@ -328,13 +328,13 @@ mVUop(mVU_ERLENG) { pass1 { mVUanalyzeEFU2(mVU, _Fs_, 24); } pass2 { const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W); - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance mVU_sumXYZ(mVU, xmmPQ, Fs); - xSQRT.SS (xmmPQ, xmmPQ); - xMOVSSZX (Fs, ptr32[mVUglob.one]); + xSQRT.SS (xmmPQ, xmmPQ); + xMOVSSZX (Fs, ptr32[mVUglob.one]); SSE_DIVSS (mVU, Fs, xmmPQ); - xMOVSS (xmmPQ, Fs); - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back + xMOVSS (xmmPQ, Fs); + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back mVU.regAlloc->clearNeeded(Fs); mVU.profiler.EmitOp(opERLENG); } @@ -345,12 +345,12 @@ mVUop(mVU_ERSADD) { pass1 { mVUanalyzeEFU2(mVU, _Fs_, 18); } pass2 { const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W); - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance mVU_sumXYZ(mVU, xmmPQ, Fs); - xMOVSSZX (Fs, ptr32[mVUglob.one]); + xMOVSSZX (Fs, ptr32[mVUglob.one]); SSE_DIVSS (mVU, Fs, xmmPQ); - xMOVSS (xmmPQ, Fs); - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back + xMOVSS (xmmPQ, Fs); + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 
0x27 : 0xC6); // Flip back mVU.regAlloc->clearNeeded(Fs); mVU.profiler.EmitOp(opERSADD); } @@ -361,13 +361,13 @@ mVUop(mVU_ERSQRT) { pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 18); } pass2 { const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_))); - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance - xAND.PS (Fs, ptr128[mVUglob.absclip]); - xSQRT.SS (xmmPQ, Fs); - xMOVSSZX (Fs, ptr32[mVUglob.one]); + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + xAND.PS (Fs, ptr128[mVUglob.absclip]); + xSQRT.SS (xmmPQ, Fs); + xMOVSSZX (Fs, ptr32[mVUglob.one]); SSE_DIVSS(mVU, Fs, xmmPQ); - xMOVSS (xmmPQ, Fs); - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back + xMOVSS (xmmPQ, Fs); + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back mVU.regAlloc->clearNeeded(Fs); mVU.profiler.EmitOp(opERSQRT); } @@ -393,29 +393,29 @@ mVUop(mVU_ESIN) { const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_))); const xmm& t1 = mVU.regAlloc->allocReg(); const xmm& t2 = mVU.regAlloc->allocReg(); - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance - xMOVSS (xmmPQ, Fs); // pq = X - SSE_MULSS(mVU, Fs, Fs); // fs = X^2 - xMOVAPS (t1, Fs); // t1 = X^2 + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 
0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + xMOVSS (xmmPQ, Fs); // pq = X + SSE_MULSS(mVU, Fs, Fs); // fs = X^2 + xMOVAPS (t1, Fs); // t1 = X^2 SSE_MULSS(mVU, Fs, xmmPQ); // fs = X^3 - xMOVAPS (t2, Fs); // t2 = X^3 - xMUL.SS (Fs, ptr32[mVUglob.S2]); // fs = s2 * X^3 + xMOVAPS (t2, Fs); // t2 = X^3 + xMUL.SS (Fs, ptr32[mVUglob.S2]); // fs = s2 * X^3 SSE_ADDSS(mVU, xmmPQ, Fs); // pq = X + s2 * X^3 - SSE_MULSS(mVU, t2, t1); // t2 = X^3 * X^2 - xMOVAPS (Fs, t2); // fs = X^5 - xMUL.SS (Fs, ptr32[mVUglob.S3]); // ps = s3 * X^5 + SSE_MULSS(mVU, t2, t1); // t2 = X^3 * X^2 + xMOVAPS (Fs, t2); // fs = X^5 + xMUL.SS (Fs, ptr32[mVUglob.S3]); // fs = s3 * X^5 SSE_ADDSS(mVU, xmmPQ, Fs); // pq = X + s2 * X^3 + s3 * X^5 - SSE_MULSS(mVU, t2, t1); // t2 = X^5 * X^2 - xMOVAPS (Fs, t2); // fs = X^7 - xMUL.SS (Fs, ptr32[mVUglob.S4]); // fs = s4 * X^7 + SSE_MULSS(mVU, t2, t1); // t2 = X^5 * X^2 + xMOVAPS (Fs, t2); // fs = X^7 + xMUL.SS (Fs, ptr32[mVUglob.S4]); // fs = s4 * X^7 SSE_ADDSS(mVU, xmmPQ, Fs); // pq = X + s2 * X^3 + s3 * X^5 + s4 * X^7 - SSE_MULSS(mVU, t2, t1); // t2 = X^7 * X^2 - xMUL.SS (t2, ptr32[mVUglob.S5]); // t2 = s5 * X^9 + SSE_MULSS(mVU, t2, t1); // t2 = X^7 * X^2 + xMUL.SS (t2, ptr32[mVUglob.S5]); // t2 = s5 * X^9 SSE_ADDSS(mVU, xmmPQ, t2); // pq = X + s2 * X^3 + s3 * X^5 + s4 * X^7 + s5 * X^9 - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back mVU.regAlloc->clearNeeded(Fs); mVU.regAlloc->clearNeeded(t1); mVU.regAlloc->clearNeeded(t2); @@ -443,13 +443,13 @@ mVUop(mVU_ESUM) { pass2 { const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W); const xmm& t1 = mVU.regAlloc->allocReg(); - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance - xPSHUF.D (t1, Fs, 0x1b); + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 
0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + xPSHUF.D (t1, Fs, 0x1b); SSE_ADDPS(mVU, Fs, t1); - xPSHUF.D (t1, Fs, 0x01); + xPSHUF.D (t1, Fs, 0x01); SSE_ADDSS(mVU, Fs, t1); - xMOVSS (xmmPQ, Fs); - xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back + xMOVSS (xmmPQ, Fs); + xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back mVU.regAlloc->clearNeeded(Fs); mVU.regAlloc->clearNeeded(t1); mVU.profiler.EmitOp(opESUM); @@ -842,15 +842,14 @@ mVUop(mVU_ILW) { } pass2 { xAddressVoid ptr(mVU.regs().Mem + offsetSS); - if (_Is_) { - mVUallocVIa(mVU, gprT2, _Is_); - xADD(gprT2, _Imm11_); - mVUaddrFix (mVU, gprT2); - ptr += gprT2; - } - else { - ptr += getVUmem(_Imm11_); - } + + mVUallocVIa(mVU, gprT2, _Is_); + if (!_Is_) + xXOR(gprT2, gprT2); + xADD(gprT2, _Imm11_); + mVUaddrFix (mVU, gprT2); + ptr += gprT2; + xMOVZX(gprT1, ptr16[ptr]); mVUallocVIb(mVU, gprT1, _It_); mVU.profiler.EmitOp(opILW); @@ -891,14 +890,14 @@ mVUop(mVU_ISW) { } pass2 { xAddressVoid ptr(mVU.regs().Mem); - if (_Is_) { - mVUallocVIa(mVU, gprT2, _Is_); - xADD(gprT2, _Imm11_); - mVUaddrFix (mVU, gprT2); - ptr += gprT2; - } - else - ptr += getVUmem(_Imm11_); + + mVUallocVIa(mVU, gprT2, _Is_); + if (!_Is_) + xXOR(gprT2, gprT2); + xADD(gprT2, _Imm11_); + mVUaddrFix (mVU, gprT2); + ptr += gprT2; + mVUallocVIa(mVU, gprT1, _It_); if (_X) xMOV(ptr32[ptr], gprT1); if (_Y) xMOV(ptr32[ptr+4], gprT1); @@ -938,14 +937,13 @@ mVUop(mVU_LQ) { pass1 { mVUanalyzeLQ(mVU, _Ft_, _Is_, false); } pass2 { xAddressVoid ptr(mVU.regs().Mem); - if (_Is_) { - mVUallocVIa(mVU, gprT2, _Is_); - xADD(gprT2, _Imm11_); - mVUaddrFix(mVU, gprT2); - ptr += gprT2; - } - else - ptr += getVUmem(_Imm11_); + mVUallocVIa(mVU, gprT2, _Is_); + if (!_Is_) + xXOR(gprT2, gprT2); + xADD(gprT2, _Imm11_); + mVUaddrFix(mVU, gprT2); + ptr += gprT2; + const xmm& Ft = mVU.regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W); mVUloadReg(Ft, ptr, _X_Y_Z_W); mVU.regAlloc->clearNeeded(Ft); @@ -1006,14 +1004,14 @@ mVUop(mVU_SQ) { pass1 { 
mVUanalyzeSQ(mVU, _Fs_, _It_, false); } pass2 { xAddressVoid ptr(mVU.regs().Mem); - if (_It_) { - mVUallocVIa(mVU, gprT2, _It_); - xADD(gprT2, _Imm11_); - mVUaddrFix(mVU, gprT2); - ptr += gprT2; - } - else - ptr += getVUmem(_Imm11_); + + mVUallocVIa(mVU, gprT2, _It_); + if (!_It_) + xXOR(gprT2, gprT2); + xADD(gprT2, _Imm11_); + mVUaddrFix(mVU, gprT2); + ptr += gprT2; + const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W); mVUsaveReg(Fs, ptr, _X_Y_Z_W, 1); mVU.regAlloc->clearNeeded(Fs); diff --git a/pcsx2/x86/microVU_Macro.inl b/pcsx2/x86/microVU_Macro.inl index 4e136ccea1..6b33e9750c 100644 --- a/pcsx2/x86/microVU_Macro.inl +++ b/pcsx2/x86/microVU_Macro.inl @@ -347,6 +347,7 @@ static void recCTC2() { case REG_CMSAR1: // Execute VU1 Micro SubRoutine if (_Rt_) { xMOV(ecx, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]); + xSHL(ecx, 3); } else xXOR(ecx, ecx); xFastCall((void*)vu1ExecMicro, ecx);