From b894a7a21792a094b8227b504ad8f29b19057af2 Mon Sep 17 00:00:00 2001 From: cottonvibes Date: Thu, 12 Mar 2009 05:32:27 +0000 Subject: [PATCH] Some tweaks based on optimization tips Jake found. The idea is that with SSE, operations that don't modify all four elements of a vector register can create false dependencies and in some situations stall the CPU's out-of-order execution (which slows things down). So the solution is to use instructions like "movss" only when the upper three elements of the destination need to be preserved; otherwise it is always better (faster) to use movaps. (An illustrative snippet is appended after the patch.) git-svn-id: http://pcsx2.googlecode.com/svn/trunk@751 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/x86/microVU_Alloc.inl | 8 +- pcsx2/x86/microVU_Lower.inl | 93 +++++++++++++---------- pcsx2/x86/microVU_Misc.inl | 144 +++++++++++++++++++----------------- 3 files changed, 134 insertions(+), 111 deletions(-) diff --git a/pcsx2/x86/microVU_Alloc.inl b/pcsx2/x86/microVU_Alloc.inl index 32b53fbb97..4441c8e3c9 100644 --- a/pcsx2/x86/microVU_Alloc.inl +++ b/pcsx2/x86/microVU_Alloc.inl @@ -520,8 +520,8 @@ microVUt(void) mVUallocFMAC18a(int& ACC, int& Fs, int& Ft) { if (!_Ft_) { getZero4(Ft); } else { getReg4(Ft, _Ft_); } - SSE_SHUFPS_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY - SSE_SHUFPS_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ + SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY + SSE2_PSHUFD_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ } microVUt(void) mVUallocFMAC18b(int& ACC, int& Fs) { @@ -546,8 +546,8 @@ microVUt(void) mVUallocFMAC19a(int& Fd, int&ACC, int& Fs, int& Ft) { if (!_Ft_) { getZero4(Ft); } else { getReg4(Ft, _Ft_); } - SSE_SHUFPS_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY - SSE_SHUFPS_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ + SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY + SSE2_PSHUFD_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ } microVUt(void) mVUallocFMAC19b(int& Fd) { diff --git a/pcsx2/x86/microVU_Lower.inl b/pcsx2/x86/microVU_Lower.inl index 3a9dc54aed..c3f3035c64 100644 --- a/pcsx2/x86/microVU_Lower.inl +++ b/pcsx2/x86/microVU_Lower.inl @@ -137,7 +137,7 @@ microVUf(void) mVU_RSQRT() { #define EATANhelper(addr) { \ SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \ SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \ - SSE_MOVSS_XMM_to_XMM(xmmFt, xmmT1); \ + SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmT1); \ SSE_MULSS_M32_to_XMM(xmmFt, (uptr)addr); \ SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFt); \ } @@ -147,7 +147,7 @@ microVUt(void) mVU_EATAN_() { // ToDo: Can Be Optimized Further? (takes approximately (~115 cycles + mem access time) on a c2d) SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MULSS_M32_to_XMM(xmmPQ, (uptr)mVU_T1); - SSE_MOVSS_XMM_to_XMM(xmmT1, xmmFs); + SSE_MOVAPS_XMM_to_XMM(xmmT1, xmmFs); EATANhelper(mVU_T2); EATANhelper(mVU_T3); @@ -158,14 +158,14 @@ microVUt(void) mVU_EATAN_() { EATANhelper(mVU_T8); SSE_ADDSS_M32_to_XMM(xmmPQ, (uptr)mVU_Pi4); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); } microVUf(void) mVU_EATAN() { microVU* mVU = mVUx; if (recPass == 0) {} else { getReg5(xmmFs, _Fs_, _Fsf_); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_SUBSS_M32_to_XMM(xmmFs, (uptr)mVU_one); @@ -181,7 +181,7 @@ microVUf(void) mVU_EATANxy() { else { getReg6(xmmFt, _Fs_); SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFt, 0x01); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ?
0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_SUBSS_M32_to_XMM(xmmFs, (uptr)mVU_one); @@ -197,7 +197,7 @@ microVUf(void) mVU_EATANxz() { else { getReg6(xmmFt, _Fs_); SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFt, 0x02); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_SUBSS_XMM_to_XMM(xmmFs, xmmFt); @@ -209,7 +209,7 @@ microVUf(void) mVU_EATANxz() { } #define eexpHelper(addr) { \ SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \ - SSE_MOVSS_XMM_to_XMM(xmmFt, xmmT1); \ + SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmT1); \ SSE_MULSS_M32_to_XMM(xmmFt, (uptr)addr); \ SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFt); \ } @@ -218,14 +218,14 @@ microVUf(void) mVU_EEXP() { if (recPass == 0) {} else { getReg5(xmmFs, _Fs_, _Fsf_); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MULSS_M32_to_XMM(xmmPQ, (uptr)mVU_E1); SSE_ADDSS_M32_to_XMM(xmmPQ, (uptr)mVU_one); - SSE_MOVSS_XMM_to_XMM(xmmFt, xmmFs); + SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmFs); SSE_MULSS_XMM_to_XMM(xmmFt, xmmFs); - SSE_MOVSS_XMM_to_XMM(xmmT1, xmmFt); + SSE_MOVAPS_XMM_to_XMM(xmmT1, xmmFt); SSE_MULSS_M32_to_XMM(xmmFt, (uptr)mVU_E2); SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFt); @@ -241,7 +241,7 @@ microVUf(void) mVU_EEXP() { SSE_MOVSS_M32_to_XMM(xmmT1, (uptr)mVU_one); SSE_DIVSS_XMM_to_XMM(xmmT1, xmmPQ); SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmT1); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back } } microVUt(void) mVU_sumXYZ() { @@ -253,9 +253,9 @@ microVUt(void) mVU_sumXYZ() { else { SSE_MULPS_XMM_to_XMM(xmmFs, xmmFs); // wzyx ^ 2 SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); - SSE_SHUFPS_XMM_to_XMM(xmmFs, xmmFs, 0xe1); // wzyx -> wzxy + SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFs, 0xe1); // wzyx -> wzxy SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs); // x ^ 2 + y ^ 2 - SSE_SHUFPS_XMM_to_XMM(xmmFs, xmmFs, 0xD2); // wzxy -> wxyz + SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFs, 0xD2); // wzxy -> wxyz SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs); // x ^ 2 + y ^ 2 + z ^ 2 } } @@ -264,10 +264,10 @@ microVUf(void) mVU_ELENG() { if (recPass == 0) {} else { getReg6(xmmFs, _Fs_); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance mVU_sumXYZ(); SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmPQ); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back } } microVUf(void) mVU_ERCPR() { @@ -275,12 +275,12 @@ microVUf(void) mVU_ERCPR() { if (recPass == 0) {} else { getReg5(xmmFs, _Fs_, _Fsf_); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one); SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ); SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 
0x27 : 0xC6); // Flip back } } microVUf(void) mVU_ERLENG() { @@ -288,13 +288,13 @@ microVUf(void) mVU_ERLENG() { if (recPass == 0) {} else { getReg6(xmmFs, _Fs_); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance mVU_sumXYZ(); SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmPQ); SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one); SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ); SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back } } microVUf(void) mVU_ERSADD() { @@ -302,13 +302,13 @@ microVUf(void) mVU_ERSADD() { if (recPass == 0) {} else { getReg6(xmmFs, _Fs_); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance mVU_sumXYZ(); //SSE_RCPSS_XMM_to_XMM(xmmPQ, xmmPQ); // Lower Precision is bad? SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one); SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ); SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back } } microVUf(void) mVU_ERSQRT() { @@ -316,12 +316,12 @@ microVUf(void) mVU_ERSQRT() { if (recPass == 0) {} else { getReg5(xmmFs, _Fs_, _Fsf_); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one); SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ); SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back } } microVUf(void) mVU_ESADD() { @@ -329,14 +329,14 @@ microVUf(void) mVU_ESADD() { if (recPass == 0) {} else { getReg6(xmmFs, _Fs_); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance mVU_sumXYZ(); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back } } #define esinHelper(addr) { \ SSE_MULSS_XMM_to_XMM(xmmT1, xmmFt); \ - SSE_MOVSS_XMM_to_XMM(xmmFs, xmmT1); \ + SSE_MOVAPS_XMM_to_XMM(xmmFs, xmmT1); \ SSE_MULSS_M32_to_XMM(xmmFs, (uptr)addr); \ SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs); \ } @@ -345,14 +345,14 @@ microVUf(void) mVU_ESIN() { if (recPass == 0) {} else { getReg5(xmmFs, _Fs_, _Fsf_); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); //SSE_MULSS_M32_to_XMM(xmmPQ, (uptr)mVU_one); // Multiplying by 1 is redundant? 
- SSE_MOVSS_XMM_to_XMM(xmmFt, xmmFs); + SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmFs); SSE_MULSS_XMM_to_XMM(xmmFs, xmmFt); - SSE_MOVSS_XMM_to_XMM(xmmT1, xmmFs); + SSE_MOVAPS_XMM_to_XMM(xmmT1, xmmFs); SSE_MULSS_XMM_to_XMM(xmmFs, xmmFt); - SSE_MOVSS_XMM_to_XMM(xmmFt, xmmFs); + SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmFs); SSE_MULSS_M32_to_XMM(xmmFs, (uptr)mVU_S2); SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs); @@ -362,7 +362,7 @@ microVUf(void) mVU_ESIN() { SSE_MULSS_XMM_to_XMM(xmmT1, xmmFt); SSE_MULSS_M32_to_XMM(xmmT1, (uptr)mVU_S5); SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmT1); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back } } microVUf(void) mVU_ESQRT() { @@ -370,9 +370,9 @@ microVUf(void) mVU_ESQRT() { if (recPass == 0) {} else { getReg5(xmmFs, _Fs_, _Fsf_); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmFs); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back } } microVUf(void) mVU_ESUM() { @@ -380,13 +380,13 @@ microVUf(void) mVU_ESUM() { if (recPass == 0) {} else { getReg6(xmmFs, _Fs_); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmFt, xmmFs, 0x1b); SSE_ADDPS_XMM_to_XMM(xmmFs, xmmFt); SSE2_PSHUFD_XMM_to_XMM(xmmFt, xmmFs, 0x01); SSE_ADDSS_XMM_to_XMM(xmmFs, xmmFt); SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); - SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back + SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back } } @@ -605,12 +605,29 @@ microVUf(void) mVU_MR32() { if (recPass == 0) {} else { mVUloadReg(xmmT1, (uptr)&mVU->regs->VF[_Fs_].UL[0], (_X_Y_Z_W == 8) ? 
4 : 15); - if (_X_Y_Z_W != 8) { SSE_SHUFPS_XMM_to_XMM(xmmT1, xmmT1, 0x39); } + if (_X_Y_Z_W != 8) { SSE2_PSHUFD_XMM_to_XMM(xmmT1, xmmT1, 0x39); } mVUsaveReg(xmmT1, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W); } } - -microVUf(void) mVU_LQ() {} +microVUf(void) mVU_LQ() { + microVU* mVU = mVUx; + if (recPass == 0) {} + else { + if (!_Fs_) { + MOV32ItoR(gprT1, _Imm11_); + mVUaddrFix(gprT1); + mVUloadReg(xmmFt, (uptr)mVU->regs->Mem, _X_Y_Z_W); + mVUsaveReg(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W); + } + else { + mVUallocVIa(gprT1, _Fs_); + ADD32ItoR(gprT1, _Imm11_); + mVUaddrFix(gprT1); + mVUloadReg2(xmmFt, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W); + mVUsaveReg(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W); + } + } +} microVUf(void) mVU_LQD() {} microVUf(void) mVU_LQI() {} microVUf(void) mVU_SQ() {} diff --git a/pcsx2/x86/microVU_Misc.inl b/pcsx2/x86/microVU_Misc.inl index 3b1fe47866..6674a2d46d 100644 --- a/pcsx2/x86/microVU_Misc.inl +++ b/pcsx2/x86/microVU_Misc.inl @@ -73,82 +73,68 @@ microVUx(void) mVUunpack_xyzw(int dstreg, int srcreg, int xyzw) { } } -microVUx(void) mVUloadReg(int reg, u32 offset, int xyzw) { +microVUx(void) mVUloadReg(int reg, uptr offset, int xyzw) { switch( xyzw ) { case 8: SSE_MOVSS_M32_to_XMM(reg, offset); break; // X case 4: SSE_MOVSS_M32_to_XMM(reg, offset+4); break; // Y case 2: SSE_MOVSS_M32_to_XMM(reg, offset+8); break; // Z case 1: SSE_MOVSS_M32_to_XMM(reg, offset+12); break; // W - //case 3: SSE_MOVHPS_M64_to_XMM(reg, offset+8); break; // ZW (not sure if this is faster than default) - //case 12: SSE_MOVLPS_M64_to_XMM(reg, offset); break; // XY (not sure if this is faster than default) default: SSE_MOVAPS_M128_to_XMM(reg, offset); break; } } +microVUx(void) mVUloadReg2(int reg, int gprReg, uptr offset, int xyzw) { + switch( xyzw ) { + case 8: SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset); break; // X + case 4: SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset+4); break; // Y + case 2: SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset+8); break; // Z + case 1: SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset+12); break; // W + default: SSE_MOVAPSRmtoROffset(reg, gprReg, offset); break; + } +} + microVUx(void) mVUsaveReg(int reg, u32 offset, int xyzw) { switch ( xyzw ) { - case 1: // W - SSE_MOVSS_XMM_to_M32(offset+12, reg); - break; - case 2: // Z - SSE_MOVSS_XMM_to_M32(offset+8, reg); - break; - case 3: // ZW - SSE_MOVHPS_XMM_to_M64(offset+8, reg); - break; - case 4: // Y - SSE_MOVSS_XMM_to_M32(offset+4, reg); - break; - case 5: // YW - SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xB1); - SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg); - SSE_MOVSS_XMM_to_M32(offset+4, reg); - SSE_MOVSS_XMM_to_M32(offset+12, xmmT1); - break; - case 6: // YZ - SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0xc9); - SSE_MOVLPS_XMM_to_M64(offset+4, xmmT1); - break; - case 7: // YZW - SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x93); //ZYXW - SSE_MOVHPS_XMM_to_M64(offset+4, xmmT1); - SSE_MOVSS_XMM_to_M32(offset+12, xmmT1); - break; - case 8: // X - SSE_MOVSS_XMM_to_M32(offset, reg); - break; - case 9: // XW - SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg); - SSE_MOVSS_XMM_to_M32(offset, reg); - if ( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSLDUP_XMM_to_XMM(xmmT1, xmmT1); - else SSE_SHUFPS_XMM_to_XMM(xmmT1, xmmT1, 0x55); - SSE_MOVSS_XMM_to_M32(offset+12, xmmT1); - break; - case 10: //XZ - SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg); - SSE_MOVSS_XMM_to_M32(offset, reg); - SSE_MOVSS_XMM_to_M32(offset+8, xmmT1); - break; - case 11: //XZW - SSE_MOVSS_XMM_to_M32(offset, reg); - SSE_MOVHPS_XMM_to_M64(offset+8, reg); - break; - case 12: // XY - 
SSE_MOVLPS_XMM_to_M64(offset, reg); - break; - case 13: // XYW - SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x4b); //YXZW - SSE_MOVHPS_XMM_to_M64(offset, xmmT1); - SSE_MOVSS_XMM_to_M32(offset+12, xmmT1); - break; - case 14: // XYZ - SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg); - SSE_MOVLPS_XMM_to_M64(offset, reg); - SSE_MOVSS_XMM_to_M32(offset+8, xmmT1); - break; - case 15: // XYZW - SSE_MOVAPS_XMM_to_M128(offset, reg); - break; + case 5: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xB1); + SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg); + SSE_MOVSS_XMM_to_M32(offset+4, reg); + SSE_MOVSS_XMM_to_M32(offset+12, xmmT1); + break; // YW + case 6: SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0xc9); + SSE_MOVLPS_XMM_to_M64(offset+4, xmmT1); + break; // YZ + case 7: SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x93); //ZYXW + SSE_MOVHPS_XMM_to_M64(offset+4, xmmT1); + SSE_MOVSS_XMM_to_M32(offset+12, xmmT1); + break; // YZW + case 9: SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg); + SSE_MOVSS_XMM_to_M32(offset, reg); + if ( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSLDUP_XMM_to_XMM(xmmT1, xmmT1); + else SSE2_PSHUFD_XMM_to_XMM(xmmT1, xmmT1, 0x55); + SSE_MOVSS_XMM_to_M32(offset+12, xmmT1); + break; // XW + case 10: SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg); + SSE_MOVSS_XMM_to_M32(offset, reg); + SSE_MOVSS_XMM_to_M32(offset+8, xmmT1); + break; //XZ + case 11: SSE_MOVSS_XMM_to_M32(offset, reg); + SSE_MOVHPS_XMM_to_M64(offset+8, reg); + break; //XZW + case 13: SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x4b); //YXZW + SSE_MOVHPS_XMM_to_M64(offset, xmmT1); + SSE_MOVSS_XMM_to_M32(offset+12, xmmT1); + break; // XYW + case 14: SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg); + SSE_MOVLPS_XMM_to_M64(offset, reg); + SSE_MOVSS_XMM_to_M32(offset+8, xmmT1); + break; // XYZ + case 8: SSE_MOVSS_XMM_to_M32(offset, reg); break; // X + case 4: SSE_MOVSS_XMM_to_M32(offset+4, reg); break; // Y + case 2: SSE_MOVSS_XMM_to_M32(offset+8, reg); break; // Z + case 1: SSE_MOVSS_XMM_to_M32(offset+12, reg); break; // W + case 12: SSE_MOVLPS_XMM_to_M64(offset, reg); break; // XY + case 3: SSE_MOVHPS_XMM_to_M64(offset+8, reg); break; // ZW + default: SSE_MOVAPS_XMM_to_M128(offset, reg); break; // XYZW } } @@ -174,10 +160,10 @@ microVUx(void) mVUmergeRegs(int dest, int src, int xyzw) { SSE2_MOVSD_XMM_to_XMM(dest, src); break; case 5: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd8); + SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0xd8); break; case 6: SSE_SHUFPS_XMM_to_XMM(dest, src, 0x9c); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x78); + SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0x78); break; case 7: SSE_MOVSS_XMM_to_XMM(src, dest); SSE_MOVAPS_XMM_to_XMM(dest, src); @@ -185,10 +171,10 @@ microVUx(void) mVUmergeRegs(int dest, int src, int xyzw) { case 8: SSE_MOVSS_XMM_to_XMM(dest, src); break; case 9: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc9); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd2); + SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0xd2); break; case 10: SSE_SHUFPS_XMM_to_XMM(dest, src, 0x8d); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x72); + SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0x72); break; case 11: SSE_MOVSS_XMM_to_XMM(dest, src); SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); @@ -210,4 +196,24 @@ microVUx(void) mVUmergeRegs(int dest, int src, int xyzw) { } } +// Transforms the Address in gprReg to valid VU0/VU1 Address +microVUt(void) mVUaddrFix(int gprReg) { + if ( vuIndex == 1 ) { + AND32ItoR(EAX, 0x3ff); // wrap around + SHL32ItoR(EAX, 4); + } + else { + u8 *jmpA, *jmpB; + CMP32ItoR(EAX, 0x400); + jmpA = JL8(0); // if addr >= 0x4000, reads VU1's VF regs and VI regs + AND32ItoR(EAX, 0x43f); + jmpB = JMP8(0); + x86SetJ8(jmpA); + 
AND32ItoR(EAX, 0xff); // if addr < 0x4000, wrap around + x86SetJ8(jmpB); + + SHL32ItoR(EAX, 4); // multiply by 16 (shift left by 4) + } +} + #endif //PCSX2_MICROVU
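Note on the rationale (illustration only, not part of the patch): a register-to-register MOVSS writes just the low element of its destination, so the CPU has to merge the result with the destination's old upper elements; the copy therefore stays dependent on whatever instruction last wrote that register, even when those elements are never read again. MOVAPS writes all four elements, so the destination's old value is irrelevant and the dependency chain is broken. A related point applies to the SHUFPS->PSHUFD changes: PSHUFD writes its destination entirely from the source operand, while SHUFPS also reads the destination. Below is a minimal sketch using plain SSE intrinsics, with hypothetical helper names rather than PCSX2's emitter API:

// Minimal illustrative sketch -- assumes nothing from the patch beyond the
// movss/movaps distinction described in the commit message.
#include <xmmintrin.h>

// movss xmm_dst, xmm_src: only the low element is written, so the result
// still depends on dst's previous contents (a false dependency if the
// upper elements are dead).
static inline __m128 copyLowOnly(__m128 dst, __m128 src) {
    return _mm_move_ss(dst, src);    // { src[0], dst[1], dst[2], dst[3] }
}

// movaps xmm_dst, xmm_src: all four elements are written, so dst's old
// value does not matter and no dependency on it is carried.
static inline __m128 copyAll(__m128 src) {
    return src;                      // full 128-bit register copy
}

Hence the rule the patch applies by hand: keep MOVSS (and SHUFPS) only where the destination's other elements genuinely must survive, and switch to MOVAPS or PSHUFD everywhere else.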