some tweaks based on some optimization tips jake found.

the idea is that with SSE, instructions that don't write all four elements of a vector register can create false dependencies on the register's old contents, and in some situations that stalls the CPU's out-of-order execution (slows things down).
so the solution is to only use stuff like "movss" when you actually need to preserve the upper 3 elements of the destination; if not, it's always better (faster) to use movaps, which overwrites the whole register.
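As an illustration only (not part of this commit), here is a minimal C++ sketch of that partial-write behaviour using SSE intrinsics: _mm_move_ss typically compiles to "movss xmm, xmm", which writes just element 0 and therefore still merges with (and waits on) the destination's previous value, while a plain full-register copy compiles to "movaps" (or nothing at all) and carries no such dependency. The function names below are made up for the example.

#include <xmmintrin.h>

// movss xmm_dst, xmm_src writes only element 0, so the result still depends on
// the previous contents of xmm_dst. Only worth it when the upper three
// elements must be preserved.
__m128 keep_upper_elements(__m128 dst, __m128 src) {
	return _mm_move_ss(dst, src); // dst = { src[0], dst[1], dst[2], dst[3] }
}

// A full-register copy (movaps) overwrites all four elements, so the CPU never
// has to wait for the old value of the destination register.
__m128 overwrite_all_elements(__m128 src) {
	__m128 dst = src; // full copy, no false dependency on dst's old contents
	return dst;
}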


git-svn-id: http://pcsx2.googlecode.com/svn/trunk@751 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
cottonvibes 2009-03-12 05:32:27 +00:00
parent 3870debc22
commit b894a7a217
3 changed files with 134 additions and 111 deletions

View File

@@ -520,8 +520,8 @@ microVUt(void) mVUallocFMAC18a(int& ACC, int& Fs, int& Ft) {
if (!_Ft_) { getZero4(Ft); }
else { getReg4(Ft, _Ft_); }
SSE_SHUFPS_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY
SSE_SHUFPS_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ
SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY
SSE2_PSHUFD_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ
}
microVUt(void) mVUallocFMAC18b(int& ACC, int& Fs) {
@@ -546,8 +546,8 @@ microVUt(void) mVUallocFMAC19a(int& Fd, int&ACC, int& Fs, int& Ft) {
if (!_Ft_) { getZero4(Ft); }
else { getReg4(Ft, _Ft_); }
SSE_SHUFPS_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY
SSE_SHUFPS_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ
SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY
SSE2_PSHUFD_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ
}
microVUt(void) mVUallocFMAC19b(int& Fd) {

View File

@@ -137,7 +137,7 @@ microVUf(void) mVU_RSQRT() {
#define EATANhelper(addr) { \
SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \
SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \
SSE_MOVSS_XMM_to_XMM(xmmFt, xmmT1); \
SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmT1); \
SSE_MULSS_M32_to_XMM(xmmFt, (uptr)addr); \
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFt); \
}
@@ -147,7 +147,7 @@ microVUt(void) mVU_EATAN_() {
// ToDo: Can Be Optimized Further? (takes approximately (~115 cycles + mem access time) on a c2d)
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_MULSS_M32_to_XMM(xmmPQ, (uptr)mVU_T1);
SSE_MOVSS_XMM_to_XMM(xmmT1, xmmFs);
SSE_MOVAPS_XMM_to_XMM(xmmT1, xmmFs);
EATANhelper(mVU_T2);
EATANhelper(mVU_T3);
@@ -158,14 +158,14 @@ microVUt(void) mVU_EATAN_() {
EATANhelper(mVU_T8);
SSE_ADDSS_M32_to_XMM(xmmPQ, (uptr)mVU_Pi4);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6);
}
microVUf(void) mVU_EATAN() {
microVU* mVU = mVUx;
if (recPass == 0) {}
else {
getReg5(xmmFs, _Fs_, _Fsf_);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_SUBSS_M32_to_XMM(xmmFs, (uptr)mVU_one);
@@ -181,7 +181,7 @@ microVUf(void) mVU_EATANxy() {
else {
getReg6(xmmFt, _Fs_);
SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFt, 0x01);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_SUBSS_M32_to_XMM(xmmFs, (uptr)mVU_one);
@@ -197,7 +197,7 @@ microVUf(void) mVU_EATANxz() {
else {
getReg6(xmmFt, _Fs_);
SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFt, 0x02);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_SUBSS_XMM_to_XMM(xmmFs, xmmFt);
@@ -209,7 +209,7 @@ microVUf(void) mVU_EATANxz() {
}
#define eexpHelper(addr) { \
SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \
SSE_MOVSS_XMM_to_XMM(xmmFt, xmmT1); \
SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmT1); \
SSE_MULSS_M32_to_XMM(xmmFt, (uptr)addr); \
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFt); \
}
@@ -218,14 +218,14 @@ microVUf(void) mVU_EEXP() {
if (recPass == 0) {}
else {
getReg5(xmmFs, _Fs_, _Fsf_);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_MULSS_M32_to_XMM(xmmPQ, (uptr)mVU_E1);
SSE_ADDSS_M32_to_XMM(xmmPQ, (uptr)mVU_one);
SSE_MOVSS_XMM_to_XMM(xmmFt, xmmFs);
SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmFs);
SSE_MULSS_XMM_to_XMM(xmmFt, xmmFs);
SSE_MOVSS_XMM_to_XMM(xmmT1, xmmFt);
SSE_MOVAPS_XMM_to_XMM(xmmT1, xmmFt);
SSE_MULSS_M32_to_XMM(xmmFt, (uptr)mVU_E2);
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFt);
@@ -241,7 +241,7 @@ microVUf(void) mVU_EEXP() {
SSE_MOVSS_M32_to_XMM(xmmT1, (uptr)mVU_one);
SSE_DIVSS_XMM_to_XMM(xmmT1, xmmPQ);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmT1);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
}
}
microVUt(void) mVU_sumXYZ() {
@@ -253,9 +253,9 @@ microVUt(void) mVU_sumXYZ() {
else {
SSE_MULPS_XMM_to_XMM(xmmFs, xmmFs); // wzyx ^ 2
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_SHUFPS_XMM_to_XMM(xmmFs, xmmFs, 0xe1); // wzyx -> wzxy
SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFs, 0xe1); // wzyx -> wzxy
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs); // x ^ 2 + y ^ 2
SSE_SHUFPS_XMM_to_XMM(xmmFs, xmmFs, 0xD2); // wzxy -> wxyz
SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFs, 0xD2); // wzxy -> wxyz
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs); // x ^ 2 + y ^ 2 + z ^ 2
}
}
@@ -264,10 +264,10 @@ microVUf(void) mVU_ELENG() {
if (recPass == 0) {}
else {
getReg6(xmmFs, _Fs_);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
mVU_sumXYZ<vuIndex>();
SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmPQ);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
}
}
microVUf(void) mVU_ERCPR() {
@@ -275,12 +275,12 @@ microVUf(void) mVU_ERCPR() {
if (recPass == 0) {}
else {
getReg5(xmmFs, _Fs_, _Fsf_);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one);
SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
}
}
microVUf(void) mVU_ERLENG() {
@@ -288,13 +288,13 @@ microVUf(void) mVU_ERLENG() {
if (recPass == 0) {}
else {
getReg6(xmmFs, _Fs_);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
mVU_sumXYZ<vuIndex>();
SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmPQ);
SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one);
SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
}
}
microVUf(void) mVU_ERSADD() {
@@ -302,13 +302,13 @@ microVUf(void) mVU_ERSADD() {
if (recPass == 0) {}
else {
getReg6(xmmFs, _Fs_);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
mVU_sumXYZ<vuIndex>();
//SSE_RCPSS_XMM_to_XMM(xmmPQ, xmmPQ); // Lower Precision is bad?
SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one);
SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
}
}
microVUf(void) mVU_ERSQRT() {
@@ -316,12 +316,12 @@ microVUf(void) mVU_ERSQRT() {
if (recPass == 0) {}
else {
getReg5(xmmFs, _Fs_, _Fsf_);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one);
SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
}
}
microVUf(void) mVU_ESADD() {
@@ -329,14 +329,14 @@ microVUf(void) mVU_ESADD() {
if (recPass == 0) {}
else {
getReg6(xmmFs, _Fs_);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
mVU_sumXYZ<vuIndex>();
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
}
}
#define esinHelper(addr) { \
SSE_MULSS_XMM_to_XMM(xmmT1, xmmFt); \
SSE_MOVSS_XMM_to_XMM(xmmFs, xmmT1); \
SSE_MOVAPS_XMM_to_XMM(xmmFs, xmmT1); \
SSE_MULSS_M32_to_XMM(xmmFs, (uptr)addr); \
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs); \
}
@@ -345,14 +345,14 @@ microVUf(void) mVU_ESIN() {
if (recPass == 0) {}
else {
getReg5(xmmFs, _Fs_, _Fsf_);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
//SSE_MULSS_M32_to_XMM(xmmPQ, (uptr)mVU_one); // Multiplying by 1 is redundant?
SSE_MOVSS_XMM_to_XMM(xmmFt, xmmFs);
SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmFs);
SSE_MULSS_XMM_to_XMM(xmmFs, xmmFt);
SSE_MOVSS_XMM_to_XMM(xmmT1, xmmFs);
SSE_MOVAPS_XMM_to_XMM(xmmT1, xmmFs);
SSE_MULSS_XMM_to_XMM(xmmFs, xmmFt);
SSE_MOVSS_XMM_to_XMM(xmmFt, xmmFs);
SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmFs);
SSE_MULSS_M32_to_XMM(xmmFs, (uptr)mVU_S2);
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs);
@@ -362,7 +362,7 @@ microVUf(void) mVU_ESIN() {
SSE_MULSS_XMM_to_XMM(xmmT1, xmmFt);
SSE_MULSS_M32_to_XMM(xmmT1, (uptr)mVU_S5);
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmT1);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
}
}
microVUf(void) mVU_ESQRT() {
@@ -370,9 +370,9 @@ microVUf(void) mVU_ESQRT() {
if (recPass == 0) {}
else {
getReg5(xmmFs, _Fs_, _Fsf_);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
}
}
microVUf(void) mVU_ESUM() {
@@ -380,13 +380,13 @@ microVUf(void) mVU_ESUM() {
if (recPass == 0) {}
else {
getReg6(xmmFs, _Fs_);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmFt, xmmFs, 0x1b);
SSE_ADDPS_XMM_to_XMM(xmmFs, xmmFt);
SSE2_PSHUFD_XMM_to_XMM(xmmFt, xmmFs, 0x01);
SSE_ADDSS_XMM_to_XMM(xmmFs, xmmFt);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
}
}
@@ -605,12 +605,29 @@ microVUf(void) mVU_MR32() {
if (recPass == 0) {}
else {
mVUloadReg<vuIndex>(xmmT1, (uptr)&mVU->regs->VF[_Fs_].UL[0], (_X_Y_Z_W == 8) ? 4 : 15);
if (_X_Y_Z_W != 8) { SSE_SHUFPS_XMM_to_XMM(xmmT1, xmmT1, 0x39); }
if (_X_Y_Z_W != 8) { SSE2_PSHUFD_XMM_to_XMM(xmmT1, xmmT1, 0x39); }
mVUsaveReg<vuIndex>(xmmT1, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W);
}
}
microVUf(void) mVU_LQ() {}
microVUf(void) mVU_LQ() {
microVU* mVU = mVUx;
if (recPass == 0) {}
else {
if (!_Fs_) {
MOV32ItoR(gprT1, _Imm11_);
mVUaddrFix<vuIndex>(gprT1);
mVUloadReg<vuIndex>(xmmFt, (uptr)mVU->regs->Mem, _X_Y_Z_W);
mVUsaveReg<vuIndex>(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W);
}
else {
mVUallocVIa<vuIndex>(gprT1, _Fs_);
ADD32ItoR(gprT1, _Imm11_);
mVUaddrFix<vuIndex>(gprT1);
mVUloadReg2<vuIndex>(xmmFt, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W);
mVUsaveReg<vuIndex>(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W);
}
}
}
microVUf(void) mVU_LQD() {}
microVUf(void) mVU_LQI() {}
microVUf(void) mVU_SQ() {}

View File

@@ -73,82 +73,68 @@ microVUx(void) mVUunpack_xyzw(int dstreg, int srcreg, int xyzw) {
}
}
microVUx(void) mVUloadReg(int reg, u32 offset, int xyzw) {
microVUx(void) mVUloadReg(int reg, uptr offset, int xyzw) {
switch( xyzw ) {
case 8: SSE_MOVSS_M32_to_XMM(reg, offset); break; // X
case 4: SSE_MOVSS_M32_to_XMM(reg, offset+4); break; // Y
case 2: SSE_MOVSS_M32_to_XMM(reg, offset+8); break; // Z
case 1: SSE_MOVSS_M32_to_XMM(reg, offset+12); break; // W
//case 3: SSE_MOVHPS_M64_to_XMM(reg, offset+8); break; // ZW (not sure if this is faster than default)
//case 12: SSE_MOVLPS_M64_to_XMM(reg, offset); break; // XY (not sure if this is faster than default)
default: SSE_MOVAPS_M128_to_XMM(reg, offset); break;
}
}
microVUx(void) mVUloadReg2(int reg, int gprReg, uptr offset, int xyzw) {
switch( xyzw ) {
case 8: SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset); break; // X
case 4: SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset+4); break; // Y
case 2: SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset+8); break; // Z
case 1: SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset+12); break; // W
default: SSE_MOVAPSRmtoROffset(reg, gprReg, offset); break;
}
}
microVUx(void) mVUsaveReg(int reg, u32 offset, int xyzw) {
switch ( xyzw ) {
case 1: // W
SSE_MOVSS_XMM_to_M32(offset+12, reg);
break;
case 2: // Z
SSE_MOVSS_XMM_to_M32(offset+8, reg);
break;
case 3: // ZW
SSE_MOVHPS_XMM_to_M64(offset+8, reg);
break;
case 4: // Y
SSE_MOVSS_XMM_to_M32(offset+4, reg);
break;
case 5: // YW
SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xB1);
SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
SSE_MOVSS_XMM_to_M32(offset+4, reg);
SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
break;
case 6: // YZ
SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0xc9);
SSE_MOVLPS_XMM_to_M64(offset+4, xmmT1);
break;
case 7: // YZW
SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x93); //ZYXW
SSE_MOVHPS_XMM_to_M64(offset+4, xmmT1);
SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
break;
case 8: // X
SSE_MOVSS_XMM_to_M32(offset, reg);
break;
case 9: // XW
SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
SSE_MOVSS_XMM_to_M32(offset, reg);
if ( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSLDUP_XMM_to_XMM(xmmT1, xmmT1);
else SSE_SHUFPS_XMM_to_XMM(xmmT1, xmmT1, 0x55);
SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
break;
case 10: //XZ
SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
SSE_MOVSS_XMM_to_M32(offset, reg);
SSE_MOVSS_XMM_to_M32(offset+8, xmmT1);
break;
case 11: //XZW
SSE_MOVSS_XMM_to_M32(offset, reg);
SSE_MOVHPS_XMM_to_M64(offset+8, reg);
break;
case 12: // XY
SSE_MOVLPS_XMM_to_M64(offset, reg);
break;
case 13: // XYW
SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x4b); //YXZW
SSE_MOVHPS_XMM_to_M64(offset, xmmT1);
SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
break;
case 14: // XYZ
SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
SSE_MOVLPS_XMM_to_M64(offset, reg);
SSE_MOVSS_XMM_to_M32(offset+8, xmmT1);
break;
case 15: // XYZW
SSE_MOVAPS_XMM_to_M128(offset, reg);
break;
case 5: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xB1);
SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
SSE_MOVSS_XMM_to_M32(offset+4, reg);
SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
break; // YW
case 6: SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0xc9);
SSE_MOVLPS_XMM_to_M64(offset+4, xmmT1);
break; // YZ
case 7: SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x93); //ZYXW
SSE_MOVHPS_XMM_to_M64(offset+4, xmmT1);
SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
break; // YZW
case 9: SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
SSE_MOVSS_XMM_to_M32(offset, reg);
if ( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSLDUP_XMM_to_XMM(xmmT1, xmmT1);
else SSE2_PSHUFD_XMM_to_XMM(xmmT1, xmmT1, 0x55);
SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
break; // XW
case 10: SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
SSE_MOVSS_XMM_to_M32(offset, reg);
SSE_MOVSS_XMM_to_M32(offset+8, xmmT1);
break; //XZ
case 11: SSE_MOVSS_XMM_to_M32(offset, reg);
SSE_MOVHPS_XMM_to_M64(offset+8, reg);
break; //XZW
case 13: SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x4b); //YXZW
SSE_MOVHPS_XMM_to_M64(offset, xmmT1);
SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
break; // XYW
case 14: SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
SSE_MOVLPS_XMM_to_M64(offset, reg);
SSE_MOVSS_XMM_to_M32(offset+8, xmmT1);
break; // XYZ
case 8: SSE_MOVSS_XMM_to_M32(offset, reg); break; // X
case 4: SSE_MOVSS_XMM_to_M32(offset+4, reg); break; // Y
case 2: SSE_MOVSS_XMM_to_M32(offset+8, reg); break; // Z
case 1: SSE_MOVSS_XMM_to_M32(offset+12, reg); break; // W
case 12: SSE_MOVLPS_XMM_to_M64(offset, reg); break; // XY
case 3: SSE_MOVHPS_XMM_to_M64(offset+8, reg); break; // ZW
default: SSE_MOVAPS_XMM_to_M128(offset, reg); break; // XYZW
}
}
@@ -174,10 +174,10 @@ microVUx(void) mVUmergeRegs(int dest, int src, int xyzw) {
SSE2_MOVSD_XMM_to_XMM(dest, src);
break;
case 5: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8);
SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd8);
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0xd8);
break;
case 6: SSE_SHUFPS_XMM_to_XMM(dest, src, 0x9c);
SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x78);
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0x78);
break;
case 7: SSE_MOVSS_XMM_to_XMM(src, dest);
SSE_MOVAPS_XMM_to_XMM(dest, src);
@@ -185,10 +185,10 @@ microVUx(void) mVUmergeRegs(int dest, int src, int xyzw) {
case 8: SSE_MOVSS_XMM_to_XMM(dest, src);
break;
case 9: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc9);
SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd2);
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0xd2);
break;
case 10: SSE_SHUFPS_XMM_to_XMM(dest, src, 0x8d);
SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x72);
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0x72);
break;
case 11: SSE_MOVSS_XMM_to_XMM(dest, src);
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
@@ -210,4 +196,24 @@ microVUx(void) mVUmergeRegs(int dest, int src, int xyzw) {
}
}
// Transforms the Address in gprReg to valid VU0/VU1 Address
microVUt(void) mVUaddrFix(int gprReg) {
if ( vuIndex == 1 ) {
AND32ItoR(EAX, 0x3ff); // wrap around
SHL32ItoR(EAX, 4);
}
else {
u8 *jmpA, *jmpB;
CMP32ItoR(EAX, 0x400);
jmpA = JL8(0); // if addr >= 0x4000, reads VU1's VF regs and VI regs
AND32ItoR(EAX, 0x43f);
jmpB = JMP8(0);
x86SetJ8(jmpA);
AND32ItoR(EAX, 0xff); // if addr < 0x4000, wrap around
x86SetJ8(jmpB);
SHL32ItoR(EAX, 4); // multiply by 16 (shift left by 4)
}
}
#endif //PCSX2_MICROVU