Some tweaks based on optimization tips Jake found.

The idea is that with SSE, instructions that write only some elements of an XMM register (like movss) can create a false dependency on the register's previous contents, and in some situations prevent the CPU's out-of-order execution (slows things down). So the solution is to only use instructions like "movss" when you actually need to preserve the upper 3 elements; if not, it's always better (faster) to use a full-register move like "movaps" (and likewise "pshufd" instead of "shufps").

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@751 96395faa-99c1-11dd-bbfe-3dabce05a288
commit b894a7a217 (parent 3870debc22)
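For illustration (not part of the commit), a minimal C++ sketch of the dependency being avoided: _mm_move_ss compiles to movss, which writes only the low lane and therefore has to merge with (and wait on) whatever last wrote the destination register, while a whole-register copy compiles to movaps and reads nothing from the old destination.

    #include <xmmintrin.h>

    // movss a, b writes only lane 0, so the result still depends on a's
    // upper three lanes -- a false dependency when those lanes are dead.
    __m128 low_lane_copy(__m128 a, __m128 b) {
        return _mm_move_ss(a, b);
    }

    // A full-register copy (movaps) has no input dependency on the old value.
    __m128 full_copy(__m128 b) {
        return b;
    }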
@@ -520,8 +520,8 @@ microVUt(void) mVUallocFMAC18a(int& ACC, int& Fs, int& Ft) {
     if (!_Ft_) { getZero4(Ft); }
     else { getReg4(Ft, _Ft_); }
 
-    SSE_SHUFPS_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY
-    SSE_SHUFPS_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ
+    SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY
+    SSE2_PSHUFD_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ
 }
 
 microVUt(void) mVUallocFMAC18b(int& ACC, int& Fs) {
@@ -546,8 +546,8 @@ microVUt(void) mVUallocFMAC19a(int& Fd, int&ACC, int& Fs, int& Ft) {
     if (!_Ft_) { getZero4(Ft); }
     else { getReg4(Ft, _Ft_); }
 
-    SSE_SHUFPS_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY
-    SSE_SHUFPS_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ
+    SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY
+    SSE2_PSHUFD_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ
 }
 
 microVUt(void) mVUallocFMAC19b(int& Fd) {
@@ -137,7 +137,7 @@ microVUf(void) mVU_RSQRT() {
 #define EATANhelper(addr) { \
     SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \
     SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \
-    SSE_MOVSS_XMM_to_XMM(xmmFt, xmmT1); \
+    SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmT1); \
     SSE_MULSS_M32_to_XMM(xmmFt, (uptr)addr); \
     SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFt); \
 }
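For context, each EATANhelper expansion accumulates one term of the arctangent series into the P/Q register. Roughly, in scalar form (a sketch with hypothetical names, not the emitter code):

    // t1 carries the running odd power of x; pq accumulates the series.
    static void eatanTerm(float& t1, float& pq, float x, float coeff) {
        t1 *= x * x;       // the two MULSS steps: x^(2n-1) -> x^(2n+1)
        pq += t1 * coeff;  // MOVAPS + MULSS + ADDSS: pq += coeff * x^(2n+1)
    }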
@@ -147,7 +147,7 @@ microVUt(void) mVU_EATAN_() {
     // ToDo: Can Be Optimized Further? (takes approximately (~115 cycles + mem access time) on a c2d)
     SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
     SSE_MULSS_M32_to_XMM(xmmPQ, (uptr)mVU_T1);
-    SSE_MOVSS_XMM_to_XMM(xmmT1, xmmFs);
+    SSE_MOVAPS_XMM_to_XMM(xmmT1, xmmFs);
 
     EATANhelper(mVU_T2);
     EATANhelper(mVU_T3);
@@ -158,14 +158,14 @@ microVUt(void) mVU_EATAN_() {
     EATANhelper(mVU_T8);
 
     SSE_ADDSS_M32_to_XMM(xmmPQ, (uptr)mVU_Pi4);
-    SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6);
+    SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6);
 }
 microVUf(void) mVU_EATAN() {
     microVU* mVU = mVUx;
     if (recPass == 0) {}
     else {
         getReg5(xmmFs, _Fs_, _Fsf_);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
 
         SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
         SSE_SUBSS_M32_to_XMM(xmmFs, (uptr)mVU_one);
@@ -181,7 +181,7 @@ microVUf(void) mVU_EATANxy() {
     else {
         getReg6(xmmFt, _Fs_);
         SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFt, 0x01);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
 
         SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
         SSE_SUBSS_M32_to_XMM(xmmFs, (uptr)mVU_one);
@@ -197,7 +197,7 @@ microVUf(void) mVU_EATANxz() {
     else {
         getReg6(xmmFt, _Fs_);
         SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFt, 0x02);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
 
         SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
         SSE_SUBSS_XMM_to_XMM(xmmFs, xmmFt);
@@ -209,7 +209,7 @@ microVUf(void) mVU_EATANxz() {
 }
 #define eexpHelper(addr) { \
     SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \
-    SSE_MOVSS_XMM_to_XMM(xmmFt, xmmT1); \
+    SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmT1); \
     SSE_MULSS_M32_to_XMM(xmmFt, (uptr)addr); \
     SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFt); \
 }
@@ -218,14 +218,14 @@ microVUf(void) mVU_EEXP() {
     if (recPass == 0) {}
     else {
         getReg5(xmmFs, _Fs_, _Fsf_);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
         SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
         SSE_MULSS_M32_to_XMM(xmmPQ, (uptr)mVU_E1);
         SSE_ADDSS_M32_to_XMM(xmmPQ, (uptr)mVU_one);
 
-        SSE_MOVSS_XMM_to_XMM(xmmFt, xmmFs);
+        SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmFs);
         SSE_MULSS_XMM_to_XMM(xmmFt, xmmFs);
-        SSE_MOVSS_XMM_to_XMM(xmmT1, xmmFt);
+        SSE_MOVAPS_XMM_to_XMM(xmmT1, xmmFt);
         SSE_MULSS_M32_to_XMM(xmmFt, (uptr)mVU_E2);
         SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFt);
 
@@ -241,7 +241,7 @@ microVUf(void) mVU_EEXP() {
         SSE_MOVSS_M32_to_XMM(xmmT1, (uptr)mVU_one);
         SSE_DIVSS_XMM_to_XMM(xmmT1, xmmPQ);
         SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmT1);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
     }
 }
 microVUt(void) mVU_sumXYZ() {
@@ -253,9 +253,9 @@ microVUt(void) mVU_sumXYZ() {
     else {
         SSE_MULPS_XMM_to_XMM(xmmFs, xmmFs); // wzyx ^ 2
         SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
-        SSE_SHUFPS_XMM_to_XMM(xmmFs, xmmFs, 0xe1); // wzyx -> wzxy
+        SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFs, 0xe1); // wzyx -> wzxy
         SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs); // x ^ 2 + y ^ 2
-        SSE_SHUFPS_XMM_to_XMM(xmmFs, xmmFs, 0xD2); // wzxy -> wxyz
+        SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFs, 0xD2); // wzxy -> wxyz
         SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs); // x ^ 2 + y ^ 2 + z ^ 2
     }
 }
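A note on the shuffle immediates used throughout: each 2-bit field of the imm8 picks the source lane for one destination lane (bits 1:0 choose lane 0, up through bits 7:6 for lane 3). A small sketch using SSE2 intrinsics (hypothetical helper name):

    #include <emmintrin.h>

    // 0xE1 = 11'10'00'01 binary: dst = { src[1], src[0], src[2], src[3] },
    // the x/y swap mVU_sumXYZ uses to bring y^2 into lane 0.
    __m128i swapLowPair(__m128i v) {
        return _mm_shuffle_epi32(v, 0xE1);
    }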
@@ -264,10 +264,10 @@ microVUf(void) mVU_ELENG() {
     if (recPass == 0) {}
     else {
         getReg6(xmmFs, _Fs_);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
         mVU_sumXYZ<vuIndex>();
         SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmPQ);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
     }
 }
 microVUf(void) mVU_ERCPR() {
@@ -275,12 +275,12 @@ microVUf(void) mVU_ERCPR() {
     if (recPass == 0) {}
     else {
         getReg5(xmmFs, _Fs_, _Fsf_);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
         SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
         SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one);
         SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ);
         SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
     }
 }
 microVUf(void) mVU_ERLENG() {
@@ -288,13 +288,13 @@ microVUf(void) mVU_ERLENG() {
     if (recPass == 0) {}
     else {
         getReg6(xmmFs, _Fs_);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
         mVU_sumXYZ<vuIndex>();
         SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmPQ);
         SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one);
         SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ);
         SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
     }
 }
 microVUf(void) mVU_ERSADD() {
@@ -302,13 +302,13 @@ microVUf(void) mVU_ERSADD() {
     if (recPass == 0) {}
     else {
         getReg6(xmmFs, _Fs_);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
         mVU_sumXYZ<vuIndex>();
         //SSE_RCPSS_XMM_to_XMM(xmmPQ, xmmPQ); // Lower Precision is bad?
         SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one);
         SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ);
         SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
     }
 }
 microVUf(void) mVU_ERSQRT() {
@@ -316,12 +316,12 @@ microVUf(void) mVU_ERSQRT() {
     if (recPass == 0) {}
     else {
         getReg5(xmmFs, _Fs_, _Fsf_);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
         SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmFs);
         SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one);
         SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ);
         SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
     }
 }
 microVUf(void) mVU_ESADD() {
@@ -329,14 +329,14 @@ microVUf(void) mVU_ESADD() {
     if (recPass == 0) {}
     else {
         getReg6(xmmFs, _Fs_);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
         mVU_sumXYZ<vuIndex>();
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
     }
 }
 #define esinHelper(addr) { \
     SSE_MULSS_XMM_to_XMM(xmmT1, xmmFt); \
-    SSE_MOVSS_XMM_to_XMM(xmmFs, xmmT1); \
+    SSE_MOVAPS_XMM_to_XMM(xmmFs, xmmT1); \
     SSE_MULSS_M32_to_XMM(xmmFs, (uptr)addr); \
     SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs); \
 }
@@ -345,14 +345,14 @@ microVUf(void) mVU_ESIN() {
     if (recPass == 0) {}
     else {
         getReg5(xmmFs, _Fs_, _Fsf_);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
         SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
         //SSE_MULSS_M32_to_XMM(xmmPQ, (uptr)mVU_one); // Multiplying by 1 is redundant?
-        SSE_MOVSS_XMM_to_XMM(xmmFt, xmmFs);
+        SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmFs);
         SSE_MULSS_XMM_to_XMM(xmmFs, xmmFt);
-        SSE_MOVSS_XMM_to_XMM(xmmT1, xmmFs);
+        SSE_MOVAPS_XMM_to_XMM(xmmT1, xmmFs);
         SSE_MULSS_XMM_to_XMM(xmmFs, xmmFt);
-        SSE_MOVSS_XMM_to_XMM(xmmFt, xmmFs);
+        SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmFs);
         SSE_MULSS_M32_to_XMM(xmmFs, (uptr)mVU_S2);
         SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs);
 
@@ -362,7 +362,7 @@ microVUf(void) mVU_ESIN() {
         SSE_MULSS_XMM_to_XMM(xmmT1, xmmFt);
         SSE_MULSS_M32_to_XMM(xmmT1, (uptr)mVU_S5);
         SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmT1);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
     }
 }
 microVUf(void) mVU_ESQRT() {
@@ -370,9 +370,9 @@ microVUf(void) mVU_ESQRT() {
     if (recPass == 0) {}
     else {
         getReg5(xmmFs, _Fs_, _Fsf_);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
         SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmFs);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
     }
 }
 microVUf(void) mVU_ESUM() {
@@ -380,13 +380,13 @@ microVUf(void) mVU_ESUM() {
     if (recPass == 0) {}
     else {
         getReg6(xmmFs, _Fs_);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
         SSE2_PSHUFD_XMM_to_XMM(xmmFt, xmmFs, 0x1b);
         SSE_ADDPS_XMM_to_XMM(xmmFs, xmmFt);
         SSE2_PSHUFD_XMM_to_XMM(xmmFt, xmmFs, 0x01);
         SSE_ADDSS_XMM_to_XMM(xmmFs, xmmFt);
         SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
-        SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
+        SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, writeP ? 0x27 : 0xC6); // Flip back
     }
 }
 
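ESUM's PSHUFD/ADD pairs are a standard horizontal sum. In intrinsic form (a sketch of the same pattern, not the emitter code):

    #include <emmintrin.h>

    // Sum all four floats of v into lane 0, as the ESUM sequence does.
    __m128 hsum4(__m128 v) {
        __m128 t = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), 0x1b)); // reverse lanes
        v = _mm_add_ps(v, t);                                                      // {x+w, y+z, z+y, w+x}
        t = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), 0x01));        // lane 1 -> lane 0
        return _mm_add_ss(v, t);                                                   // lane 0 = x+y+z+w
    }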
@@ -605,12 +605,29 @@ microVUf(void) mVU_MR32() {
     if (recPass == 0) {}
     else {
         mVUloadReg<vuIndex>(xmmT1, (uptr)&mVU->regs->VF[_Fs_].UL[0], (_X_Y_Z_W == 8) ? 4 : 15);
-        if (_X_Y_Z_W != 8) { SSE_SHUFPS_XMM_to_XMM(xmmT1, xmmT1, 0x39); }
+        if (_X_Y_Z_W != 8) { SSE2_PSHUFD_XMM_to_XMM(xmmT1, xmmT1, 0x39); }
         mVUsaveReg<vuIndex>(xmmT1, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W);
     }
 }
 
-microVUf(void) mVU_LQ() {}
+microVUf(void) mVU_LQ() {
+    microVU* mVU = mVUx;
+    if (recPass == 0) {}
+    else {
+        if (!_Fs_) {
+            MOV32ItoR(gprT1, _Imm11_);
+            mVUaddrFix<vuIndex>(gprT1);
+            mVUloadReg<vuIndex>(xmmFt, (uptr)mVU->regs->Mem, _X_Y_Z_W);
+            mVUsaveReg<vuIndex>(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W);
+        }
+        else {
+            mVUallocVIa<vuIndex>(gprT1, _Fs_);
+            ADD32ItoR(gprT1, _Imm11_);
+            mVUaddrFix<vuIndex>(gprT1);
+            mVUloadReg2<vuIndex>(xmmFt, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W);
+            mVUsaveReg<vuIndex>(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W);
+        }
+    }
+}
 microVUf(void) mVU_LQD() {}
 microVUf(void) mVU_LQI() {}
 microVUf(void) mVU_SQ() {}
@@ -73,82 +73,68 @@ microVUx(void) mVUunpack_xyzw(int dstreg, int srcreg, int xyzw) {
     }
 }
 
-microVUx(void) mVUloadReg(int reg, u32 offset, int xyzw) {
+microVUx(void) mVUloadReg(int reg, uptr offset, int xyzw) {
     switch( xyzw ) {
         case 8:  SSE_MOVSS_M32_to_XMM(reg, offset);    break; // X
         case 4:  SSE_MOVSS_M32_to_XMM(reg, offset+4);  break; // Y
         case 2:  SSE_MOVSS_M32_to_XMM(reg, offset+8);  break; // Z
         case 1:  SSE_MOVSS_M32_to_XMM(reg, offset+12); break; // W
         //case 3:  SSE_MOVHPS_M64_to_XMM(reg, offset+8); break; // ZW (not sure if this is faster than default)
         //case 12: SSE_MOVLPS_M64_to_XMM(reg, offset);   break; // XY (not sure if this is faster than default)
         default: SSE_MOVAPS_M128_to_XMM(reg, offset);  break;
     }
 }
 
+microVUx(void) mVUloadReg2(int reg, int gprReg, uptr offset, int xyzw) {
+    switch( xyzw ) {
+        case 8:  SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset);    break; // X
+        case 4:  SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset+4);  break; // Y
+        case 2:  SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset+8);  break; // Z
+        case 1:  SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset+12); break; // W
+        default: SSE_MOVAPSRmtoROffset(reg, gprReg, offset);        break;
+    }
+}
+
 microVUx(void) mVUsaveReg(int reg, u32 offset, int xyzw) {
     switch ( xyzw ) {
-        case 1: // W
-            SSE_MOVSS_XMM_to_M32(offset+12, reg);
-            break;
-        case 2: // Z
-            SSE_MOVSS_XMM_to_M32(offset+8, reg);
-            break;
-        case 3: // ZW
-            SSE_MOVHPS_XMM_to_M64(offset+8, reg);
-            break;
-        case 4: // Y
-            SSE_MOVSS_XMM_to_M32(offset+4, reg);
-            break;
-        case 5: // YW
-            SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xB1);
-            SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
-            SSE_MOVSS_XMM_to_M32(offset+4, reg);
-            SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
-            break;
-        case 6: // YZ
-            SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0xc9);
-            SSE_MOVLPS_XMM_to_M64(offset+4, xmmT1);
-            break;
-        case 7: // YZW
-            SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x93); //ZYXW
-            SSE_MOVHPS_XMM_to_M64(offset+4, xmmT1);
-            SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
-            break;
-        case 8: // X
-            SSE_MOVSS_XMM_to_M32(offset, reg);
-            break;
-        case 9: // XW
-            SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
-            SSE_MOVSS_XMM_to_M32(offset, reg);
-            if ( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSLDUP_XMM_to_XMM(xmmT1, xmmT1);
-            else SSE_SHUFPS_XMM_to_XMM(xmmT1, xmmT1, 0x55);
-            SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
-            break;
-        case 10: //XZ
-            SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
-            SSE_MOVSS_XMM_to_M32(offset, reg);
-            SSE_MOVSS_XMM_to_M32(offset+8, xmmT1);
-            break;
-        case 11: //XZW
-            SSE_MOVSS_XMM_to_M32(offset, reg);
-            SSE_MOVHPS_XMM_to_M64(offset+8, reg);
-            break;
-        case 12: // XY
-            SSE_MOVLPS_XMM_to_M64(offset, reg);
-            break;
-        case 13: // XYW
-            SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x4b); //YXZW
-            SSE_MOVHPS_XMM_to_M64(offset, xmmT1);
-            SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
-            break;
-        case 14: // XYZ
-            SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
-            SSE_MOVLPS_XMM_to_M64(offset, reg);
-            SSE_MOVSS_XMM_to_M32(offset+8, xmmT1);
-            break;
-        case 15: // XYZW
-            SSE_MOVAPS_XMM_to_M128(offset, reg);
-            break;
+        case 5:  SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xB1);
+                 SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
+                 SSE_MOVSS_XMM_to_M32(offset+4, reg);
+                 SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
+                 break; // YW
+        case 6:  SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0xc9);
+                 SSE_MOVLPS_XMM_to_M64(offset+4, xmmT1);
+                 break; // YZ
+        case 7:  SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x93); //ZYXW
+                 SSE_MOVHPS_XMM_to_M64(offset+4, xmmT1);
+                 SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
+                 break; // YZW
+        case 9:  SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
+                 SSE_MOVSS_XMM_to_M32(offset, reg);
+                 if ( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSLDUP_XMM_to_XMM(xmmT1, xmmT1);
+                 else SSE2_PSHUFD_XMM_to_XMM(xmmT1, xmmT1, 0x55);
+                 SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
+                 break; // XW
+        case 10: SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
+                 SSE_MOVSS_XMM_to_M32(offset, reg);
+                 SSE_MOVSS_XMM_to_M32(offset+8, xmmT1);
+                 break; //XZ
+        case 11: SSE_MOVSS_XMM_to_M32(offset, reg);
+                 SSE_MOVHPS_XMM_to_M64(offset+8, reg);
+                 break; //XZW
+        case 13: SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x4b); //YXZW
+                 SSE_MOVHPS_XMM_to_M64(offset, xmmT1);
+                 SSE_MOVSS_XMM_to_M32(offset+12, xmmT1);
+                 break; // XYW
+        case 14: SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
+                 SSE_MOVLPS_XMM_to_M64(offset, reg);
+                 SSE_MOVSS_XMM_to_M32(offset+8, xmmT1);
+                 break; // XYZ
+        case 8:  SSE_MOVSS_XMM_to_M32(offset, reg);    break; // X
+        case 4:  SSE_MOVSS_XMM_to_M32(offset+4, reg);  break; // Y
+        case 2:  SSE_MOVSS_XMM_to_M32(offset+8, reg);  break; // Z
+        case 1:  SSE_MOVSS_XMM_to_M32(offset+12, reg); break; // W
+        case 12: SSE_MOVLPS_XMM_to_M64(offset, reg);   break; // XY
+        case 3:  SSE_MOVHPS_XMM_to_M64(offset+8, reg); break; // ZW
+        default: SSE_MOVAPS_XMM_to_M128(offset, reg);  break; // XYZW
     }
 }
 
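For reference, the xyzw argument in these helpers is a 4-bit write mask with X in the high bit: 8 = X, 4 = Y, 2 = Z, 1 = W, so 15 = XYZW. A tiny sketch of the convention (hypothetical helper):

    // lane 0 = X ... lane 3 = W; e.g. writesLane(8, 0) is true (X only).
    static bool writesLane(int xyzw, int lane) {
        return (xyzw >> (3 - lane)) & 1;
    }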
@@ -174,10 +160,10 @@ microVUx(void) mVUmergeRegs(int dest, int src, int xyzw) {
            SSE2_MOVSD_XMM_to_XMM(dest, src);
            break;
        case 5: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8);
-           SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd8);
+           SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0xd8);
            break;
        case 6: SSE_SHUFPS_XMM_to_XMM(dest, src, 0x9c);
-           SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x78);
+           SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0x78);
            break;
        case 7: SSE_MOVSS_XMM_to_XMM(src, dest);
            SSE_MOVAPS_XMM_to_XMM(dest, src);
@@ -185,10 +171,10 @@ microVUx(void) mVUmergeRegs(int dest, int src, int xyzw) {
        case 8: SSE_MOVSS_XMM_to_XMM(dest, src);
            break;
        case 9: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc9);
-           SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd2);
+           SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0xd2);
            break;
        case 10: SSE_SHUFPS_XMM_to_XMM(dest, src, 0x8d);
-           SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x72);
+           SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0x72);
            break;
        case 11: SSE_MOVSS_XMM_to_XMM(dest, src);
            SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
@@ -210,4 +196,24 @@ microVUx(void) mVUmergeRegs(int dest, int src, int xyzw) {
    }
 }
 
+// Transforms the Address in gprReg to valid VU0/VU1 Address
+microVUt(void) mVUaddrFix(int gprReg) {
+   if ( vuIndex == 1 ) {
+       AND32ItoR(EAX, 0x3ff); // wrap around
+       SHL32ItoR(EAX, 4);
+   }
+   else {
+       u8 *jmpA, *jmpB;
+       CMP32ItoR(EAX, 0x400);
+       jmpA = JL8(0); // if addr >= 0x4000, reads VU1's VF regs and VI regs
+       AND32ItoR(EAX, 0x43f);
+       jmpB = JMP8(0);
+       x86SetJ8(jmpA);
+       AND32ItoR(EAX, 0xff); // if addr < 0x4000, wrap around
+       x86SetJ8(jmpB);
+
+       SHL32ItoR(EAX, 4); // multiply by 16 (shift left by 4)
+   }
+}
+
 #endif //PCSX2_MICROVU
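The branchy emitter code in mVUaddrFix computes, roughly, the following scalar function (a sketch, assuming VU addresses are in quadwords, with VU1 data memory spanning 0x400 quadwords and VU0's smaller memory wrapping at 0x100, plus a window onto VU1's registers at 0x400-0x43f):

    static unsigned addrFix(unsigned addr, int vuIndex) {
        if (vuIndex == 1)      addr &= 0x3ff; // wrap within VU1 data memory
        else if (addr < 0x400) addr &= 0xff;  // wrap within VU0 data memory
        else                   addr &= 0x43f; // 0x400+ reads VU1's VF/VI regs
        return addr << 4;                     // quadwords -> byte offset
    }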