Tmmk fixed a bug in saveEAX where a vector wasn't saved in some situations. He also optimized some VU functions, so this revision brings about 5% more speed ;)

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@631 a6443dda-0b58-4228-96e9-037be469359c
ramapcsx2 2009-01-25 13:09:54 +00:00 committed by Gregory Hainaut
parent dde14aab30
commit c787b00cc7
5 changed files with 607 additions and 206 deletions
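
Side note (not part of the commit): the _unpackVF_xyzw change in the first file below swaps a MOVAPS + SHUFPS pair for a single PSHUFD, which can shuffle straight from the source register into the destination (an integer-domain shuffle, but only the bit pattern matters here), saving one instruction and one register copy per unpack. A minimal C sketch of the two forms, using SSE intrinsics and made-up function names:

#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stdio.h>

/* Broadcast lane 1 of 'src' into all four lanes (the 0x55 case in _unpackVF_xyzw). */

/* Old form: SHUFPS here shuffles a register with itself, so it needs a MOVAPS copy first. */
static __m128 broadcast_lane1_old(__m128 src)
{
    __m128 dst = src;                           /* MOVAPS dstreg, srcreg        */
    return _mm_shuffle_ps(dst, dst, 0x55);      /* SHUFPS dstreg, dstreg, 0x55  */
}

/* New form: PSHUFD reads srcreg and writes dstreg in one instruction. */
static __m128 broadcast_lane1_new(__m128 src)
{
    return _mm_castsi128_ps(
        _mm_shuffle_epi32(_mm_castps_si128(src), 0x55));  /* PSHUFD dstreg, srcreg, 0x55 */
}

int main(void)
{
    float out[4];
    __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    _mm_storeu_ps(out, broadcast_lane1_old(v));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 2 2 2 2 */
    _mm_storeu_ps(out, broadcast_lane1_new(v));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 2 2 2 2 */
    return 0;
}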


@ -576,12 +576,11 @@ int _vuGetTempXMMreg(int info)
//------------------------------------------------------------------ //------------------------------------------------------------------
void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw) void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw)
{ {
SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg);
switch (xyzw) { switch (xyzw) {
case 0: SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0x00); break; case 0: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x00); break;
case 1: SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0x55); break; case 1: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x55); break;
case 2: SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0xaa); break; case 2: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xaa); break;
case 3: SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0xff); break; case 3: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xff); break;
} }
} }
@ -657,8 +656,9 @@ void VU_MERGE3(int dest, int src) { // 1100s
} }
void VU_MERGE4(int dest, int src) { // 0010 void VU_MERGE4(int dest, int src) { // 0010
SSE_MOVSS_XMM_to_XMM(src, dest); SSE_MOVSS_XMM_to_XMM(src, dest);
SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4); //SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4);
SSE_MOVAPS_XMM_to_XMM(dest, src); //SSE_MOVAPS_XMM_to_XMM(dest, src);
SSE2_MOVSD_XMM_to_XMM(dest, src);
} }
void VU_MERGE4b(int dest, int src) { // 0010s void VU_MERGE4b(int dest, int src) { // 0010s
SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
@ -743,16 +743,7 @@ void VU_MERGE11(int dest, int src) { // 1101s
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
} }
void VU_MERGE12(int dest, int src) { // 0011 void VU_MERGE12(int dest, int src) { // 0011
SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4); SSE2_MOVSD_XMM_to_XMM(dest, src);
SSE_MOVAPS_XMM_to_XMM(dest, src);
}
void VU_MERGE12b(int dest, int src) { // 0011
SSE_MOVSS_XMM_to_XMM(dest, src);
SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
SSE_MOVSS_XMM_to_XMM(dest, src);
SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
} }
void VU_MERGE13(int dest, int src) { // 1011 void VU_MERGE13(int dest, int src) { // 1011
SSE_MOVHLPS_XMM_to_XMM(dest, src); SSE_MOVHLPS_XMM_to_XMM(dest, src);
@ -806,7 +797,7 @@ static VUMERGEFN s_VuMerge2[16] = {
VU_MERGE0, VU_MERGE1b, VU_MERGE2b, VU_MERGE3, VU_MERGE0, VU_MERGE1b, VU_MERGE2b, VU_MERGE3,
VU_MERGE4b, VU_MERGE5b, VU_MERGE6b, VU_MERGE7b, VU_MERGE4b, VU_MERGE5b, VU_MERGE6b, VU_MERGE7b,
VU_MERGE8, VU_MERGE9b, VU_MERGE10b, VU_MERGE11, VU_MERGE8, VU_MERGE9b, VU_MERGE10b, VU_MERGE11,
VU_MERGE12b, VU_MERGE13b, VU_MERGE14b, VU_MERGE15 }; VU_MERGE12, VU_MERGE13b, VU_MERGE14b, VU_MERGE15 };
// Modifies the Source Reg! // Modifies the Source Reg!
void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw) { void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw) {
@ -876,6 +867,12 @@ void vFloat3(int regd, int regTemp) { //1100
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
} }
void vFloat3b(int regd, int regTemp) { //1100 //regTemp is Modified
SSE2_MOVSD_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
}
void vFloat3c(int regd, int regTemp) { //1100 void vFloat3c(int regd, int regTemp) { //1100
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -912,6 +909,12 @@ void vFloat5(int regd, int regTemp) { //1010
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
} }
void vFloat5b(int regd, int regTemp) { //1010
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x5);
}
void vFloat5c(int regd, int regTemp) { //1010 void vFloat5c(int regd, int regTemp) { //1010
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -933,6 +936,12 @@ void vFloat6(int regd, int regTemp) { //0110
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
} }
void vFloat6b(int regd, int regTemp) { //0110
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x9);
}
void vFloat6c(int regd, int regTemp) { //0110 void vFloat6c(int regd, int regTemp) { //0110
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -946,37 +955,44 @@ void vFloat6c(int regd, int regTemp) { //0110
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_ORPS_XMM_to_XMM(regd, regTemp);
} }
void vFloat7(int regd, int regTemp) { //1110 void vFloat7(int regd, int regTemp) { //1110
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE2_MOVD_XMM_to_R(EAX, regd);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); }
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); else {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
}
} }
void vFloat7b(int regd, int regTemp) { //1110 //regTemp is Modified void vFloat7b(int regd, int regTemp) { //1110 //regTemp is Modified
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVSS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regTemp, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regTemp, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
VU_MERGE_REGS_CUSTOM(regd, regTemp, 7); SSE_MOVSS_XMM_to_XMM(regd, regTemp);
} }
void vFloat7c(int regd, int regTemp) { //1110 void vFloat7c(int regd, int regTemp) { //1110
SSE2_MOVD_XMM_to_R(EAX, regd);
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_ORPS_XMM_to_XMM(regd, regTemp);
if ( cpucaps.hasStreamingSIMD4Extensions )
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
else {
SSE2_MOVD_R_to_XMM(regTemp, EAX);
SSE_MOVSS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat8(int regd, int regTemp) { //0001 void vFloat8(int regd, int regTemp) { //0001
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
@ -997,6 +1013,12 @@ void vFloat9(int regd, int regTemp) { //1001
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
} }
void vFloat9b(int regd, int regTemp) { //1001
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x6);
}
void vFloat9c(int regd, int regTemp) { //1001 void vFloat9c(int regd, int regTemp) { //1001
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -1016,6 +1038,12 @@ void vFloat10(int regd, int regTemp) { //0101
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
} }
void vFloat10b(int regd, int regTemp) { //0101
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0xa);
}
void vFloat10c(int regd, int regTemp) { //0101 void vFloat10c(int regd, int regTemp) { //0101
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -1028,35 +1056,54 @@ void vFloat10c(int regd, int regTemp) { //0101
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_ORPS_XMM_to_XMM(regd, regTemp);
} }
void vFloat11(int regd, int regTemp) { //1101 void vFloat11(int regd, int regTemp) { //1101
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE2_MOVD_XMM_to_R(EAX, regd);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); }
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); else {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
}
} }
void vFloat11b(int regd, int regTemp) { //1101 //regTemp is Modified void vFloat11b(int regd, int regTemp) { //1101 //regTemp is Modified
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regTemp, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regTemp, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
VU_MERGE_REGS_CUSTOM(regd, regTemp, 11); if ( cpucaps.hasStreamingSIMD4Extensions )
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x02);
else {
SSE_MOVSS_XMM_to_XMM(regTemp, regd);
SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat11c(int regd, int regTemp) { //1101 void vFloat11c(int regd, int regTemp) { //1101
SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0xe1);
SSE2_MOVD_XMM_to_R(EAX, regTemp);
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_ORPS_XMM_to_XMM(regd, regTemp);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x01);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); }
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); else {
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_ORPS_XMM_to_XMM(regTemp, regd);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); SSE2_MOVD_R_to_XMM(regd, EAX);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_MOVLHPS_XMM_to_XMM(regd, regTemp);
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xe2);
}
} }
void vFloat12(int regd, int regTemp) { //0011 void vFloat12(int regd, int regTemp) { //0011
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
@ -1066,6 +1113,12 @@ void vFloat12(int regd, int regTemp) { //0011
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
} }
void vFloat12b(int regd, int regTemp) { //0011 //regTemp is Modified
SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE2_PUNPCKLQDQ_XMM_to_XMM(regd, regTemp);
}
void vFloat12c(int regd, int regTemp) { //0011 void vFloat12c(int regd, int regTemp) { //0011
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -1078,66 +1131,100 @@ void vFloat12c(int regd, int regTemp) { //0011
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_ORPS_XMM_to_XMM(regd, regTemp);
} }
void vFloat13(int regd, int regTemp) { //1011 void vFloat13(int regd, int regTemp) { //1011
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE2_MOVD_XMM_to_R(EAX, regd);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); }
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); else {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
}
} }
void vFloat13b(int regd, int regTemp) { //1011 //regTemp is Modified void vFloat13b(int regd, int regTemp) { //1011 //regTemp is Modified
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regTemp, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regTemp, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
VU_MERGE_REGS_CUSTOM(regd, regTemp, 13); if ( cpucaps.hasStreamingSIMD4Extensions )
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x04);
else {
SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x64);
}
} }
void vFloat13c(int regd, int regTemp) { //1011 void vFloat13c(int regd, int regTemp) { //1011
SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0xd2);
SSE2_MOVD_XMM_to_R(EAX, regTemp);
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_ORPS_XMM_to_XMM(regd, regTemp);
if ( cpucaps.hasStreamingSIMD4Extensions )
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x02);
else {
SSE2_MOVD_R_to_XMM(regTemp, EAX);
SSE_SHUFPS_XMM_to_XMM(regTemp, regd, 0xf0);
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x84);
}
} }
void vFloat14(int regd, int regTemp) { //0111 void vFloat14(int regd, int regTemp) { //0111
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE2_MOVD_XMM_to_R(EAX, regd);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); }
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); else {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
}
} }
void vFloat14b(int regd, int regTemp) { //0111 //regTemp is Modified void vFloat14b(int regd, int regTemp) { //0111 //regTemp is Modified
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regTemp, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regTemp, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
VU_MERGE_REGS_CUSTOM(regd, regTemp, 14); if ( cpucaps.hasStreamingSIMD4Extensions )
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x08);
else {
SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xc4);
}
} }
void vFloat14c(int regd, int regTemp) { //0111 void vFloat14c(int regd, int regTemp) { //0111
SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0x93);
SSE2_MOVD_XMM_to_R(EAX, regTemp);
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_ORPS_XMM_to_XMM(regd, regTemp);
if ( cpucaps.hasStreamingSIMD4Extensions )
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x03);
else {
SSE2_MOVD_R_to_XMM(regTemp, EAX);
SSE_SHUFPS_XMM_to_XMM(regTemp, regd, 0xa0);
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x24);
}
} }
void vFloat15(int regd, int regTemp) { //1111 void vFloat15(int regd, int regTemp) { //1111
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
@ -1158,10 +1245,10 @@ vFloat vFloats1[16] = { //regTemp is not modified
vFloat12, vFloat13, vFloat14, vFloat15 }; vFloat12, vFloat13, vFloat14, vFloat15 };
vFloat vFloats2[16] = { //regTemp is modified vFloat vFloats2[16] = { //regTemp is modified
vFloat0, vFloat1, vFloat2, vFloat3, vFloat0, vFloat1, vFloat2, vFloat3b,
vFloat4, vFloat5, vFloat6, vFloat7b, vFloat4, vFloat5, vFloat6, vFloat7b,
vFloat8, vFloat9, vFloat10, vFloat11b, vFloat8, vFloat9, vFloat10, vFloat11b,
vFloat12, vFloat13b, vFloat14b, vFloat15 }; vFloat12b, vFloat13b, vFloat14b, vFloat15 };
vFloat vFloats4[16] = { //regTemp is modified vFloat vFloats4[16] = { //regTemp is modified
vFloat0, vFloat1c, vFloat2c, vFloat3c, vFloat0, vFloat1c, vFloat2c, vFloat3c,
@ -1269,4 +1356,4 @@ void SetVUNanMode(int mode)
{ {
g_VuNanHandling = mode; g_VuNanHandling = mode;
if ( mode ) SysPrintf("enabling vunan mode"); if ( mode ) SysPrintf("enabling vunan mode");
} }
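
Side note (not part of the commit): the new vFloatXb variants above share one pattern: keep a copy of the register, clamp every lane at once with MINPS/MAXPS, then restore the lanes that were not selected for clamping, either with a single SSE4.1 BLENDPS or, on older CPUs, with MOVSS/MOVSD/MOVHLPS merges. A rough intrinsics sketch of the BLENDPS form; constants and names are illustrative, not the emulator's:

#include <smmintrin.h>  /* SSE4.1 intrinsics */
#include <math.h>
#include <stdio.h>

/* Clamp lanes 1 and 3 to +/-FLT_MAX and leave lanes 0 and 2 untouched,
 * mirroring the "save copy, MINPS/MAXPS, BLENDPS 0x5" sequence above. */
static __m128 clamp_lanes_1_and_3(__m128 reg)
{
    const __m128 maxvals = _mm_set1_ps( 3.402823466e38f);  /* stand-in for g_maxvals */
    const __m128 minvals = _mm_set1_ps(-3.402823466e38f);  /* stand-in for g_minvals */
    __m128 saved = reg;                    /* MOVAPS regTemp, regd       */
    reg = _mm_min_ps(reg, maxvals);        /* MINPS  regd, g_maxvals     */
    reg = _mm_max_ps(reg, minvals);        /* MAXPS  regd, g_minvals     */
    return _mm_blend_ps(reg, saved, 0x5);  /* BLENDPS regd, regTemp, 0x5 */
}

int main(void)
{
    float out[4];
    __m128 v = _mm_setr_ps(INFINITY, INFINITY, 2.0f, 3.0f);
    _mm_storeu_ps(out, clamp_lanes_1_and_3(v));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* lane 0 keeps inf, lane 1 is clamped */
    return 0;
}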


@ -799,10 +799,9 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
switch ( _X_Y_Z_W ) { switch ( _X_Y_Z_W ) {
case 1: // W* case 1: // W*
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0x27); SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x27);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset+12); if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
else SSE_MOVSS_XMM_to_M32(offset+12, EEREC_S); else SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0x27);
break; break;
case 2: // Z* case 2: // Z*
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
@ -817,39 +816,35 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB1); SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB1);
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
if ( x86reg >= 0 ) { if ( x86reg >= 0 ) {
SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4); SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12); SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
} }
else { else {
SSE_MOVLPS_XMM_to_M64(offset+4, EEREC_S); SSE_MOVSS_XMM_to_M32(offset+4, EEREC_S);
SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP); SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
} }
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB1); SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB1);
break; break;
case 4: // Y* case 4: // Y*
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xe1); SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0xe1);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4); if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4);
else SSE_MOVSS_XMM_to_M32(offset+4, EEREC_S); else SSE_MOVSS_XMM_to_M32(offset+4, EEREC_TEMP);
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xe1);
break; break;
case 6: // YZ case 6: // YZ
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xc9); SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0xc9);
if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4); if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4);
else SSE_MOVLPS_XMM_to_M64(offset+4, EEREC_S); else SSE_MOVLPS_XMM_to_M64(offset+4, EEREC_TEMP);
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xd2);
break; break;
case 7: // YZW case 7: // YZW
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0x39); SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x93); //ZYXW
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
if ( x86reg >= 0 ) { if ( x86reg >= 0 ) {
SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4); SSE_MOVHPS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12); SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
} }
else { else {
SSE_MOVLPS_XMM_to_M64(offset+4, EEREC_S); SSE_MOVHPS_XMM_to_M64(offset+4, EEREC_TEMP);
SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP); SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
} }
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0x93);
break; break;
case 8: // X* case 8: // X*
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset); if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset);
@ -867,26 +862,41 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
else SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP); else SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
break; break;
//case 10: break; case 10: //XZ
//case 11: break; SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
if ( x86reg >= 0 ) {
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+8);
}
else {
SSE_MOVSS_XMM_to_M32(offset, EEREC_S);
SSE_MOVSS_XMM_to_M32(offset+8, EEREC_TEMP);
}
break;
case 11: break; //XZW
if ( x86reg >= 0 ) {
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset);
SSE_MOVHPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+8);
}
else {
SSE_MOVSS_XMM_to_M32(offset, EEREC_S);
SSE_MOVHPS_XMM_to_M64(offset+8, EEREC_S);
}
break;
case 12: // XY case 12: // XY
if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+0); if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+0);
else SSE_MOVLPS_XMM_to_M64(offset, EEREC_S); else SSE_MOVLPS_XMM_to_M64(offset, EEREC_S);
break; break;
case 13: // XYW case 13: // XYW
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB4); //ZWYX SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x4b); //YXZW
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
if ( x86reg >= 0 ) { if ( x86reg >= 0 ) {
SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+0); SSE_MOVHPS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+0);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12); SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
} }
else { else {
SSE_MOVLPS_XMM_to_M64(offset, EEREC_S); SSE_MOVHPS_XMM_to_M64(offset, EEREC_TEMP);
SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP); SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
} }
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB4);
break; break;
case 14: // XYZ case 14: // XYZ
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
@ -2006,4 +2016,4 @@ void VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
mtgsThread->SendDataPacket(); mtgsThread->SendDataPacket();
} }
} }
//------------------------------------------------------------------ //------------------------------------------------------------------
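
Side note (not part of the commit): the _saveEAX hunks above remove the shuffle-then-restore pairs on EEREC_S; the partial stores now shuffle into EEREC_TEMP with PSHUFD (or store straight from EEREC_S), so the source register is never modified. A rough intrinsics sketch of the reworked YZW (case 7) store, with illustrative names and the x/y/z/w at offsets 0/4/8/12 layout used in the diff:

#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stdio.h>

/* Store only y, z and w of 'vf' to dst[1..3], leaving dst[0] and 'vf' untouched. */
static void store_yzw(float *dst, __m128 vf)
{
    /* PSHUFD temp, vf, 0x93 -> temp = (w, x, y, z) */
    __m128 t = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vf), 0x93));
    _mm_storeh_pi((__m64 *)(dst + 1), t);  /* MOVHPS: writes y to dst[1], z to dst[2] */
    _mm_store_ss(dst + 3, t);              /* MOVSS:  writes w to dst[3]              */
}

int main(void)
{
    float mem[4] = { -1.0f, -1.0f, -1.0f, -1.0f };
    __m128 vf = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);  /* x, y, z, w */
    store_yzw(mem, vf);
    printf("%g %g %g %g\n", mem[0], mem[1], mem[2], mem[3]);  /* -1 2 3 4: the x slot is untouched */
    return 0;
}
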


@ -182,7 +182,6 @@ void recUpdateFlags(VURegs * VU, int reg, int info)
} }
SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip wzyx to xyzw SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip wzyx to xyzw
XOR32RtoR(x86macflag, x86macflag); // Clear Mac Flag
MOV32MtoR(x86temp, prevstataddr); // Load the previous status in to x86temp MOV32MtoR(x86temp, prevstataddr); // Load the previous status in to x86temp
AND16ItoR(x86temp, 0xff0); // Keep Sticky and D/I flags AND16ItoR(x86temp, 0xff0); // Keep Sticky and D/I flags
@ -202,13 +201,12 @@ void recUpdateFlags(VURegs * VU, int reg, int info)
SSE_ANDPS_M128_to_XMM(t1reg, (uptr)VU_Zero_Helper_Mask); SSE_ANDPS_M128_to_XMM(t1reg, (uptr)VU_Zero_Helper_Mask);
SSE_CMPEQPS_M128_to_XMM(t1reg, (uptr)VU_Pos_Infinity); // If infinity, then overflow has occured (NaN's don't report as overflow) SSE_CMPEQPS_M128_to_XMM(t1reg, (uptr)VU_Pos_Infinity); // If infinity, then overflow has occured (NaN's don't report as overflow)
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the previous calculation
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being modified) AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being modified)
pjmp = JZ8(0); // Skip if none are pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x208); // OS, O flags OR16ItoR(x86temp, 0x208); // OS, O flags
SHL16ItoR(EAX, 12); SHL16ItoR(x86macflag, 12);
OR32RtoR(x86macflag, EAX);
if (_XYZW_SS) pjmp32 = JMP32(0); // Skip Underflow Check if (_XYZW_SS) pjmp32 = JMP32(0); // Skip Underflow Check
x86SetJ8(pjmp); x86SetJ8(pjmp);
@ -246,35 +244,91 @@ void recUpdateFlags(VURegs * VU, int reg, int info)
vuFloat2(reg, t1reg, flipMask[_X_Y_Z_W]); // Clamp overflowed vectors that were modified (remember reg's vectors have been flipped, so have to use a flipmask) vuFloat2(reg, t1reg, flipMask[_X_Y_Z_W]); // Clamp overflowed vectors that were modified (remember reg's vectors have been flipped, so have to use a flipmask)
//-------------------------Check for Signed flags------------------------------ if (_XYZW_SS) {
//-------------------------Check for Signed flags------------------------------
// The following code makes sure the Signed Bit isn't set with Negative Zero // The following code makes sure the Signed Bit isn't set with Negative Zero
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
SSE_CMPNEPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is not zero SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
SSE_ANDPS_XMM_to_XMM(t1reg, reg);
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg if (CHECK_VU_EXTRA_FLAGS) {
SSE_ANDNPS_XMM_to_XMM(t1reg, reg);
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
pjmp = JZ8(0); // Skip if none are pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x82); // SS, S flags OR16ItoR(x86temp, 0x82); // SS, S flags
SHL16ItoR(EAX, 4); SHL16ItoR(EAX, 4);
OR32RtoR(x86macflag, EAX); OR32RtoR(x86macflag, EAX);
if (_XYZW_SS) pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking
x86SetJ8(pjmp); x86SetJ8(pjmp);
}
else {
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg (for zero flag)
SSE_ANDNPS_XMM_to_XMM(t1reg, reg);
SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the t1reg
//-------------------------Check for Zero flags------------------------------ AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
pjmp = JZ8(0); // Skip if none are
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg OR16ItoR(x86temp, 0x82); // SS, S flags
SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero SHL16ItoR(x86macflag, 4);
pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking
x86SetJ8(pjmp);
}
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation //-------------------------Check for Zero flags------------------------------
if (CHECK_VU_EXTRA_FLAGS) {
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation
pjmp = JZ8(0); // Skip if none are }
OR16ItoR(x86temp, 0x41); // ZS, Z flags
OR32RtoR(x86macflag, EAX); AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
x86SetJ8(pjmp); pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x41); // ZS, Z flags
OR32RtoR(x86macflag, EAX);
x86SetJ8(pjmp);
}
else {
//-------------------------Check for Zero flags------------------------------
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
if (CHECK_VU_EXTRA_FLAGS) {
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x41); // ZS, Z flags
OR32RtoR(x86macflag, EAX);
x86SetJ8(pjmp);
}
else {
SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the previous calculation
AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x41); // ZS, Z flags
x86SetJ8(pjmp);
}
//-------------------------Check for Signed flags------------------------------
// The following code makes sure the Signed Bit isn't set with Negative Zero
SSE_ANDNPS_XMM_to_XMM(t1reg, reg);
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x82); // SS, S flags
SHL16ItoR(EAX, 4);
OR32RtoR(x86macflag, EAX);
x86SetJ8(pjmp);
}
//-------------------------Finally: Send the Flags to the Mac Flag Address------------------------------ //-------------------------Finally: Send the Flags to the Mac Flag Address------------------------------
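
Side note (not part of the commit): the restructured sign/zero checks above keep the same rule as before: a lane sets the zero bit if it compares equal to zero (so -0.0 counts as zero) and sets the sign bit only when it is negative and not zero, which is what the CMPEQPS + ANDNPS + MOVMSKPS sequence computes before masking with _X_Y_Z_W. A plain-C reference model of that rule; lane numbering here ignores the wzyx flip noted in the code's comments:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sign/zero part of the MAC flag bits computed above. */
static unsigned mac_sign_zero(const float v[4], unsigned xyzw_mask)
{
    unsigned zero = 0, sign = 0;
    for (int i = 0; i < 4; i++) {
        uint32_t bits;
        memcpy(&bits, &v[i], sizeof(bits));
        if (v[i] == 0.0f)             zero |= 1u << i;  /* CMPEQPS lane -> zero bit              */
        else if (bits & 0x80000000u)  sign |= 1u << i;  /* ANDNPS keeps the sign only if nonzero */
    }
    zero &= xyzw_mask;
    sign &= xyzw_mask;
    return (sign << 4) | zero;  /* SHL 4 on the sign bits, then OR into the mac flag */
}

int main(void)
{
    float v[4] = { -0.0f, -1.0f, 0.0f, 2.0f };
    printf("0x%02x\n", mac_sign_zero(v, 0xf));  /* -0.0 sets only a zero bit, never a sign bit */
    return 0;
}
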
@ -298,8 +352,6 @@ void recUpdateFlags(VURegs * VU, int reg, int info)
// //
// Note: See FPU_ADD_SUB() for more info on what this is doing. // Note: See FPU_ADD_SUB() for more info on what this is doing.
//------------------------------------------------------------------ //------------------------------------------------------------------
static const PCSX2_ALIGNED16(u32 VU_fullmask[4]) = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
static const PCSX2_ALIGNED16(u32 VU_helperbyte[4]) = {0xff, 0xff, 0xff, 0xff};
static PCSX2_ALIGNED16(u32 VU_addsuband[2][4]); static PCSX2_ALIGNED16(u32 VU_addsuband[2][4]);
static PCSX2_ALIGNED16(u32 VU_addsub_reg[2][4]); static PCSX2_ALIGNED16(u32 VU_addsub_reg[2][4]);
static u32 ecx_temp_loc; static u32 ecx_temp_loc;
@ -313,17 +365,16 @@ void VU_ADD_SUB(u32 regd, u32 regt, int is_sub, int info)
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd); SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt); SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_fullmask[0]); SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[0][0], regd); SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[0][0], regd);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[1][0], regd); SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[1][0], regd);
SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]);
SSE2_PSRLD_I8_to_XMM(regd, 23); SSE2_PSLLD_I8_to_XMM(regd, 1);
SSE2_PSRLD_I8_to_XMM(regt, 23); SSE2_PSLLD_I8_to_XMM(regt, 1);
SSE2_PAND_M128_to_XMM(regd, (uptr)&VU_helperbyte[0]); SSE2_PSRLD_I8_to_XMM(regd, 24);
SSE2_PAND_M128_to_XMM(regt, (uptr)&VU_helperbyte[0]); SSE2_PSRLD_I8_to_XMM(regt, 24);
SSE2_PSUBD_XMM_to_XMM(regd, regt); SSE2_PSUBD_XMM_to_XMM(regd, regt);
@ -389,6 +440,88 @@ void VU_ADD_SUB(u32 regd, u32 regt, int is_sub, int info)
_freeX86reg(temp2); _freeX86reg(temp2);
} }
void VU_ADD_SUB_SSE4(u32 regd, u32 regt, int is_sub, int info)
{
u8 *localptr[4][8];
int temp1 = _allocX86reg(ECX, X86TYPE_TEMP, 0, 0); //receives regd//_allocX86reg(ECX, X86TYPE_TEMP, 0, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode);
int temp2 = ALLOCTEMPX86(0);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
SSE2_PSLLD_I8_to_XMM(regd, 1);
SSE2_PSLLD_I8_to_XMM(regt, 1);
SSE2_PSRLD_I8_to_XMM(regd, 24);
SSE2_PSRLD_I8_to_XMM(regt, 24);
SSE2_PSUBD_XMM_to_XMM(regd, regt);
#define PERFORM_SSE4(i) \
\
SSE_PEXTRW_XMM_to_R32(temp1, regd, i*2); \
MOVSX32R16toR(temp1, temp1); \
CMP32ItoR(temp1, 25);\
localptr[i][0] = JGE8(0);\
CMP32ItoR(temp1, 0);\
localptr[i][1] = JG8(0);\
localptr[i][2] = JE8(0);\
CMP32ItoR(temp1, -25);\
localptr[i][3] = JLE8(0);\
\
NEG32R(temp1); \
DEC32R(temp1);\
MOV32ItoR(temp2, 0xffffffff); \
SHL32CLtoR(temp2); \
SSE4_PINSRD_R32_to_XMM(regd, temp2, i); \
localptr[i][4] = JMP8(0);\
\
x86SetJ8(localptr[i][0]);\
MOV32ItoR(temp2, 0xffffffff); \
SSE4_PINSRD_R32_to_XMM(regd, temp2, i); \
SHL32ItoR(temp2, 31); \
SSE4_PINSRD_R32_to_XMM(regt, temp2, i); \
localptr[i][5] = JMP8(0);\
\
x86SetJ8(localptr[i][1]);\
DEC32R(temp1);\
MOV32ItoR(temp2, 0xffffffff);\
SSE4_PINSRD_R32_to_XMM(regd, temp2, i); \
SHL32CLtoR(temp2); \
SSE4_PINSRD_R32_to_XMM(regt, temp2, i); \
localptr[i][6] = JMP8(0);\
\
x86SetJ8(localptr[i][3]);\
MOV32ItoR(temp2, 0x80000000); \
SSE4_PINSRD_R32_to_XMM(regd, temp2, i); \
localptr[i][7] = JMP8(0);\
\
x86SetJ8(localptr[i][2]);\
\
x86SetJ8(localptr[i][4]);\
x86SetJ8(localptr[i][5]);\
x86SetJ8(localptr[i][6]);\
x86SetJ8(localptr[i][7]);
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
PERFORM_SSE4(0);
PERFORM_SSE4(1);
PERFORM_SSE4(2);
PERFORM_SSE4(3);
#undef PERFORM_SSE4
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); //regt contains mask
if (is_sub) SSE_SUBPS_XMM_to_XMM(regd, regt);
else SSE_ADDPS_XMM_to_XMM(regd, regt);
SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]);
_freeX86reg(temp1);
_freeX86reg(temp2);
}
void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info) void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
{ {
u8 *localptr[8]; u8 *localptr[8];
@ -399,22 +532,17 @@ void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd); SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
if (!is_mem) SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt); if (!is_mem) SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_fullmask[0]); SSE2_MOVD_XMM_to_R(temp1, regd);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[0][0], regd); SHR32ItoR(temp1, 23);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[1][0], regd);
SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]);
SSE_PEXTRW_XMM_to_R32(temp1, regd, 1);
SHR32ItoR(temp1, 23 - 16);
if (is_mem) { if (is_mem) {
MOV32MtoR(temp2, addrt); MOV32MtoR(temp2, addrt);
MOV32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
SHR32ItoR(temp2, 23); SHR32ItoR(temp2, 23);
} }
else { else {
SSE_PEXTRW_XMM_to_R32(temp2, regt, 1); SSE2_MOVD_XMM_to_R(temp2, regt);
SHR32ItoR(temp2, 23 - 16); SHR32ItoR(temp2, 23);
} }
AND32ItoR(temp1, 0xff); AND32ItoR(temp1, 0xff);
@ -432,24 +560,60 @@ void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
NEG32R(temp1); NEG32R(temp1);
DEC32R(temp1); DEC32R(temp1);
MOV32ItoR(temp2, 0xffffffff); MOV32ItoR(temp2, 0xffffffff);
SHL32CLtoR(temp2); SHL32CLtoR(temp2);
MOV32RtoM((uptr)&VU_addsuband[0][0], temp2); SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
if (is_mem) {
SSE_PINSRW_R32_to_XMM(regd, temp2, 0);
SHR32ItoR(temp2, 16);
SSE_PINSRW_R32_to_XMM(regd, temp2, 1);
}
else {
SSE2_MOVD_R_to_XMM(regt, temp2);
SSE_MOVSS_XMM_to_XMM(regd, regt);
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
}
localptr[4] = JMP8(0); localptr[4] = JMP8(0);
x86SetJ8(localptr[0]); x86SetJ8(localptr[0]);
MOV32ItoM((uptr)&VU_addsuband[1][0], 0x80000000); MOV32ItoR(temp2, 0x80000000);
if (is_mem)
AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
else {
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
SSE2_MOVD_R_to_XMM(regd, temp2);
SSE_MOVSS_XMM_to_XMM(regt, regd);
}
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
localptr[5] = JMP8(0); localptr[5] = JMP8(0);
x86SetJ8(localptr[1]); x86SetJ8(localptr[1]);
DEC32R(temp1); DEC32R(temp1);
MOV32ItoR(temp2, 0xffffffff); MOV32ItoR(temp2, 0xffffffff);
SHL32CLtoR(temp2); SHL32CLtoR(temp2);
MOV32RtoM((uptr)&VU_addsuband[1][0], temp2); if (is_mem)
AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
else {
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
SSE2_MOVD_R_to_XMM(regd, temp2);
SSE_MOVSS_XMM_to_XMM(regt, regd);
}
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
localptr[6] = JMP8(0); localptr[6] = JMP8(0);
x86SetJ8(localptr[3]); x86SetJ8(localptr[3]);
MOV32ItoM((uptr)&VU_addsuband[0][0], 0x80000000); MOV32ItoR(temp2, 0x80000000);
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
if (is_mem) {
SSE_PINSRW_R32_to_XMM(regd, temp2, 0);
SHR32ItoR(temp2, 16);
SSE_PINSRW_R32_to_XMM(regd, temp2, 1);
}
else {
SSE2_MOVD_R_to_XMM(regt, temp2);
SSE_MOVSS_XMM_to_XMM(regd, regt);
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
}
localptr[7] = JMP8(0); localptr[7] = JMP8(0);
x86SetJ8(localptr[2]); x86SetJ8(localptr[2]);
@ -460,21 +624,121 @@ void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
if (is_mem) if (is_mem)
{ {
SSE_MOVSS_M32_to_XMM(regd, addrt); SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsuband[1][0]); //regd contains addrt
SSE_MOVSS_XMM_to_M32((uptr)&VU_addsub_reg[1][0], regd);
SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]);
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsuband[0][0]);
if (is_sub) SSE_SUBSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]); if (is_sub) SSE_SUBSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
else SSE_ADDSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]); else SSE_ADDSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
} }
else else
{ {
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsuband[0][0]); SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsuband[1][0]); SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); //regt contains mask
if (is_sub) SSE_SUBSS_XMM_to_XMM(regd, regt);
else SSE_ADDSS_XMM_to_XMM(regd, regt);
SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]);
}
_freeX86reg(temp1);
_freeX86reg(temp2);
}
void VU_ADD_SUB_SS_SSE4(u32 regd, u32 regt, int is_sub, int is_mem, int info)
{
u8 *localptr[8];
u32 addrt = regt; //for case is_mem
int temp1 = _allocX86reg(ECX, X86TYPE_TEMP, 0, 0); //receives regd //_allocX86reg(ECX, X86TYPE_TEMP, 0, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode);
int temp2 = ALLOCTEMPX86(0);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
if (!is_mem) SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
SSE2_MOVD_XMM_to_R(temp1, regd);
SHR32ItoR(temp1, 23);
if (is_mem) {
MOV32MtoR(temp2, addrt);
MOV32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
SHR32ItoR(temp2, 23);
}
else {
SSE2_MOVD_XMM_to_R(temp2, regt);
SHR32ItoR(temp2, 23);
}
AND32ItoR(temp1, 0xff);
AND32ItoR(temp2, 0xff);
SUB32RtoR(temp1, temp2); //temp1 = exponent difference
CMP32ItoR(temp1, 25);
localptr[0] = JGE8(0);
CMP32ItoR(temp1, 0);
localptr[1] = JG8(0);
localptr[2] = JE8(0);
CMP32ItoR(temp1, -25);
localptr[3] = JLE8(0);
NEG32R(temp1);
DEC32R(temp1);
MOV32ItoR(temp2, 0xffffffff);
SHL32CLtoR(temp2);
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
SSE4_PINSRD_R32_to_XMM(regd, temp2, 0);
if (!is_mem)
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
localptr[4] = JMP8(0);
x86SetJ8(localptr[0]);
MOV32ItoR(temp2, 0x80000000);
if (is_mem)
AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
else {
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
SSE4_PINSRD_R32_to_XMM(regt, temp2, 0);
}
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
localptr[5] = JMP8(0);
x86SetJ8(localptr[1]);
DEC32R(temp1);
MOV32ItoR(temp2, 0xffffffff);
SHL32CLtoR(temp2);
if (is_mem)
AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
else {
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
SSE4_PINSRD_R32_to_XMM(regt, temp2, 0);
}
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
localptr[6] = JMP8(0);
x86SetJ8(localptr[3]);
MOV32ItoR(temp2, 0x80000000);
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
SSE4_PINSRD_R32_to_XMM(regd, temp2, 0);
if (!is_mem)
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
localptr[7] = JMP8(0);
x86SetJ8(localptr[2]);
x86SetJ8(localptr[4]);
x86SetJ8(localptr[5]);
x86SetJ8(localptr[6]);
x86SetJ8(localptr[7]);
if (is_mem)
{
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
if (is_sub) SSE_SUBSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
else SSE_ADDSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
}
else
{
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); //regt contains mask
if (is_sub) SSE_SUBSS_XMM_to_XMM(regd, regt); if (is_sub) SSE_SUBSS_XMM_to_XMM(regd, regt);
else SSE_ADDSS_XMM_to_XMM(regd, regt); else SSE_ADDSS_XMM_to_XMM(regd, regt);
@ -487,27 +751,57 @@ void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
} }
void SSE_ADDPS_XMM_to_XMM_custom(int info, int regd, int regt) { void SSE_ADDPS_XMM_to_XMM_custom(int info, int regd, int regt) {
if (CHECK_VUADDSUBHACK) VU_ADD_SUB(regd, regt, 0, info); if (CHECK_VUADDSUBHACK) {
if ( cpucaps.hasStreamingSIMD4Extensions )
VU_ADD_SUB_SSE4(regd, regt, 0, info);
else
VU_ADD_SUB(regd, regt, 0, info);
}
else SSE_ADDPS_XMM_to_XMM(regd, regt); else SSE_ADDPS_XMM_to_XMM(regd, regt);
} }
void SSE_SUBPS_XMM_to_XMM_custom(int info, int regd, int regt) { void SSE_SUBPS_XMM_to_XMM_custom(int info, int regd, int regt) {
if (CHECK_VUADDSUBHACK) VU_ADD_SUB(regd, regt, 1, info); if (CHECK_VUADDSUBHACK) {
if ( cpucaps.hasStreamingSIMD4Extensions )
VU_ADD_SUB_SSE4(regd, regt, 1, info);
else
VU_ADD_SUB(regd, regt, 1, info);
}
else SSE_SUBPS_XMM_to_XMM(regd, regt); else SSE_SUBPS_XMM_to_XMM(regd, regt);
} }
void SSE_ADDSS_XMM_to_XMM_custom(int info, int regd, int regt) { void SSE_ADDSS_XMM_to_XMM_custom(int info, int regd, int regt) {
if (CHECK_VUADDSUBHACK) VU_ADD_SUB_SS(regd, regt, 0, 0, info); if (CHECK_VUADDSUBHACK) {
if ( cpucaps.hasStreamingSIMD4Extensions )
VU_ADD_SUB_SS_SSE4(regd, regt, 0, 0, info);
else
VU_ADD_SUB_SS(regd, regt, 0, 0, info);
}
else SSE_ADDSS_XMM_to_XMM(regd, regt); else SSE_ADDSS_XMM_to_XMM(regd, regt);
} }
void SSE_SUBSS_XMM_to_XMM_custom(int info, int regd, int regt) { void SSE_SUBSS_XMM_to_XMM_custom(int info, int regd, int regt) {
if (CHECK_VUADDSUBHACK) VU_ADD_SUB_SS(regd, regt, 1, 0, info); if (CHECK_VUADDSUBHACK) {
if ( cpucaps.hasStreamingSIMD4Extensions )
VU_ADD_SUB_SS_SSE4(regd, regt, 1, 0, info);
else
VU_ADD_SUB_SS(regd, regt, 1, 0, info);
}
else SSE_SUBSS_XMM_to_XMM(regd, regt); else SSE_SUBSS_XMM_to_XMM(regd, regt);
} }
void SSE_ADDSS_M32_to_XMM_custom(int info, int regd, int regt) { void SSE_ADDSS_M32_to_XMM_custom(int info, int regd, int regt) {
if (CHECK_VUADDSUBHACK) VU_ADD_SUB_SS(regd, regt, 0, 1, info); if (CHECK_VUADDSUBHACK) {
if ( cpucaps.hasStreamingSIMD4Extensions )
VU_ADD_SUB_SS_SSE4(regd, regt, 0, 1, info);
else
VU_ADD_SUB_SS(regd, regt, 0, 1, info);
}
else SSE_ADDSS_M32_to_XMM(regd, regt); else SSE_ADDSS_M32_to_XMM(regd, regt);
} }
void SSE_SUBSS_M32_to_XMM_custom(int info, int regd, int regt) { void SSE_SUBSS_M32_to_XMM_custom(int info, int regd, int regt) {
if (CHECK_VUADDSUBHACK) VU_ADD_SUB_SS(regd, regt, 1, 1, info); if (CHECK_VUADDSUBHACK) {
if ( cpucaps.hasStreamingSIMD4Extensions )
VU_ADD_SUB_SS_SSE4(regd, regt, 1, 1, info);
else
VU_ADD_SUB_SS(regd, regt, 1, 1, info);
}
else SSE_SUBSS_M32_to_XMM(regd, regt); else SSE_SUBSS_M32_to_XMM(regd, regt);
} }
//------------------------------------------------------------------ //------------------------------------------------------------------
@ -2814,4 +3108,4 @@ void recVUMI_CLIP(VURegs *VU, int info)
_freeX86reg(x86temp1); _freeX86reg(x86temp1);
_freeX86reg(x86temp2); _freeX86reg(x86temp2);
} }
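
Side note (not part of the commit): in VU_ADD_SUB above, each lane's exponent is now isolated with PSLLD 1 / PSRLD 24 instead of PSRLD 23 plus an AND with VU_helperbyte, and the all-ones mask comes from PCMPEQB reg,reg instead of a load of VU_fullmask, which is why both constants could be deleted. A small plain-C check (illustrative only) that the two exponent forms agree:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t exp_shift_and(uint32_t bits)  { return (bits >> 23) & 0xff; }  /* old: PSRLD 23 + PAND    */
static uint32_t exp_shift_only(uint32_t bits) { return (bits << 1) >> 24; }    /* new: PSLLD 1 + PSRLD 24 */

int main(void)
{
    float tests[] = { 1.0f, -2.5f, 3.4e38f, 1.2e-38f, 0.0f };
    for (unsigned i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
        uint32_t bits;
        memcpy(&bits, &tests[i], sizeof(bits));
        printf("%g: %u == %u\n", (double)tests[i], exp_shift_and(bits), exp_shift_only(bits));
    }
    return 0;
}
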


@ -1617,6 +1617,7 @@ extern void SSE4_BLENDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm
extern void SSE4_BLENDVPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from); extern void SSE4_BLENDVPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
extern void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from); extern void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from);
extern void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from); extern void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
extern void SSE4_PINSRD_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8);
//********************* //*********************
// SSE-X - uses both SSE,SSE2 code and tries to keep consistensies between the data // SSE-X - uses both SSE,SSE2 code and tries to keep consistensies between the data


@ -1143,6 +1143,15 @@ __forceinline void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from
ModRM(3, to, from); ModRM(3, to, from);
} }
__forceinline void SSE4_PINSRD_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8)
{
write8(0x66);
RexRB(0, to, from);
write24(0x223A0F);
ModRM(3, to, from);
write8(imm8);
}
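
For reference, the byte sequence emitted above (0x66, optional REX, 0F 3A 22, ModRM, imm8) is the encoding of PINSRD xmm, r/m32, imm8, the SSE4.1 instruction the new clamp and add/sub paths use to drop a 32-bit GPR value into one dword lane without disturbing the others. A tiny intrinsics illustration (not part of the commit):

#include <smmintrin.h>  /* SSE4.1 intrinsics */
#include <stdio.h>

int main(void)
{
    __m128i v = _mm_set_epi32(4, 3, 2, 1);           /* lanes 0..3 = 1, 2, 3, 4       */
    __m128i r = _mm_insert_epi32(v, 0x7f7fffff, 0);  /* PINSRD: overwrite lane 0 only */
    int out[4];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);  /* 7f7fffff 00000002 00000003 00000004 */
    return 0;
}
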
// SSE-X // SSE-X
__forceinline void SSEX_MOVDQA_M128_to_XMM( x86SSERegType to, uptr from ) __forceinline void SSEX_MOVDQA_M128_to_XMM( x86SSERegType to, uptr from )
{ {