mirror of https://github.com/PCSX2/pcsx2.git
Tmmk fixed a bug in _saveEAX where a vector wasn't saved in some situations. He also optimized some VU functions, so this revision brings about 5% more speed ;)
git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@631 a6443dda-0b58-4228-96e9-037be469359c
This commit is contained in:
parent dde14aab30
commit c787b00cc7
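One recurring optimization in this revision replaces a MOVAPS+SHUFPS pair with a single PSHUFD that reads the source register directly, so the destination no longer has to be pre-loaded. A rough C intrinsics sketch of the same pattern (illustrative only, not code from this commit; function names are made up):

#include <emmintrin.h>  /* SSE2 */

/* Old pattern: copy src into dst, then shuffle dst against itself. */
static __m128 broadcast_x_old(__m128 src)
{
    __m128 dst = src;                        /* SSE_MOVAPS_XMM_to_XMM */
    return _mm_shuffle_ps(dst, dst, 0x00);   /* SSE_SHUFPS_XMM_to_XMM */
}

/* New pattern: one PSHUFD reads src directly and writes dst. */
static __m128 broadcast_x_new(__m128 src)
{
    return _mm_castsi128_ps(                 /* SSE2_PSHUFD_XMM_to_XMM */
        _mm_shuffle_epi32(_mm_castps_si128(src), 0x00));
}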
@@ -576,12 +576,11 @@ int _vuGetTempXMMreg(int info)
 //------------------------------------------------------------------
 void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw)
 {
-	SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg);
 	switch (xyzw) {
-		case 0: SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0x00); break;
-		case 1: SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0x55); break;
-		case 2: SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0xaa); break;
-		case 3: SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0xff); break;
+		case 0: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x00); break;
+		case 1: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x55); break;
+		case 2: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xaa); break;
+		case 3: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xff); break;
 	}
 }
@@ -657,8 +656,9 @@ void VU_MERGE3(int dest, int src) { // 1100s
 }
 void VU_MERGE4(int dest, int src) { // 0010
 	SSE_MOVSS_XMM_to_XMM(src, dest);
-	SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4);
-	SSE_MOVAPS_XMM_to_XMM(dest, src);
+	//SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4);
+	//SSE_MOVAPS_XMM_to_XMM(dest, src);
+	SSE2_MOVSD_XMM_to_XMM(dest, src);
 }
 void VU_MERGE4b(int dest, int src) { // 0010s
 	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
@@ -743,16 +743,7 @@ void VU_MERGE11(int dest, int src) { // 1101s
 	SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
 }
 void VU_MERGE12(int dest, int src) { // 0011
-	SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4);
-	SSE_MOVAPS_XMM_to_XMM(dest, src);
-}
-void VU_MERGE12b(int dest, int src) { // 0011
-	SSE_MOVSS_XMM_to_XMM(dest, src);
-	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
-	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
-	SSE_MOVSS_XMM_to_XMM(dest, src);
-	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
-	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
+	SSE2_MOVSD_XMM_to_XMM(dest, src);
 }
 void VU_MERGE13(int dest, int src) { // 1011
 	SSE_MOVHLPS_XMM_to_XMM(dest, src);
@@ -806,7 +797,7 @@ static VUMERGEFN s_VuMerge2[16] = {
 	VU_MERGE0, VU_MERGE1b, VU_MERGE2b, VU_MERGE3,
 	VU_MERGE4b, VU_MERGE5b, VU_MERGE6b, VU_MERGE7b,
 	VU_MERGE8, VU_MERGE9b, VU_MERGE10b, VU_MERGE11,
-	VU_MERGE12b, VU_MERGE13b, VU_MERGE14b, VU_MERGE15 };
+	VU_MERGE12, VU_MERGE13b, VU_MERGE14b, VU_MERGE15 };

 // Modifies the Source Reg!
 void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw) {
@@ -876,6 +867,12 @@ void vFloat3(int regd, int regTemp) { //1100
 	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
 	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
 }
+void vFloat3b(int regd, int regTemp) { //1100 //regTemp is Modified
+	SSE2_MOVSD_XMM_to_XMM(regTemp, regd);
+	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+	SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
+}
 void vFloat3c(int regd, int regTemp) { //1100
 	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
 	SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@@ -912,6 +909,12 @@ void vFloat5(int regd, int regTemp) { //1010
 	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
 	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
 }
+void vFloat5b(int regd, int regTemp) { //1010
+	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+	SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x5);
+}
 void vFloat5c(int regd, int regTemp) { //1010
 	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
 	SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@@ -933,6 +936,12 @@ void vFloat6(int regd, int regTemp) { //0110
 	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
 	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
 }
+void vFloat6b(int regd, int regTemp) { //0110
+	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+	SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x9);
+}
 void vFloat6c(int regd, int regTemp) { //0110
 	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
 	SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -946,37 +955,44 @@ void vFloat6c(int regd, int regTemp) { //0110
|
|||
SSE_ORPS_XMM_to_XMM(regd, regTemp);
|
||||
}
|
||||
void vFloat7(int regd, int regTemp) { //1110
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions ) {
|
||||
SSE2_MOVD_XMM_to_R(EAX, regd);
|
||||
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
|
||||
}
|
||||
else {
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
|
||||
}
|
||||
}
|
||||
void vFloat7b(int regd, int regTemp) { //1110 //regTemp is Modified
|
||||
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
|
||||
SSE_MINPS_M128_to_XMM(regTemp, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regTemp, (uptr)g_minvals);
|
||||
VU_MERGE_REGS_CUSTOM(regd, regTemp, 7);
|
||||
SSE_MOVSS_XMM_to_XMM(regTemp, regd);
|
||||
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_MOVSS_XMM_to_XMM(regd, regTemp);
|
||||
}
|
||||
void vFloat7c(int regd, int regTemp) { //1110
|
||||
SSE2_MOVD_XMM_to_R(EAX, regd);
|
||||
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
|
||||
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
|
||||
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_ORPS_XMM_to_XMM(regd, regTemp);
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions )
|
||||
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
|
||||
else {
|
||||
SSE2_MOVD_R_to_XMM(regTemp, EAX);
|
||||
SSE_MOVSS_XMM_to_XMM(regd, regTemp);
|
||||
}
|
||||
}
|
||||
void vFloat8(int regd, int regTemp) { //0001
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
|
@@ -997,6 +1013,12 @@ void vFloat9(int regd, int regTemp) { //1001
 	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
 	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
 }
+void vFloat9b(int regd, int regTemp) { //1001
+	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+	SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x6);
+}
 void vFloat9c(int regd, int regTemp) { //1001
 	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
 	SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@@ -1016,6 +1038,12 @@ void vFloat10(int regd, int regTemp) { //0101
 	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
 	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
 }
+void vFloat10b(int regd, int regTemp) { //0101
+	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+	SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0xa);
+}
 void vFloat10c(int regd, int regTemp) { //0101
 	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
 	SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -1028,35 +1056,54 @@ void vFloat10c(int regd, int regTemp) { //0101
|
|||
SSE_ORPS_XMM_to_XMM(regd, regTemp);
|
||||
}
|
||||
void vFloat11(int regd, int regTemp) { //1101
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions ) {
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
|
||||
SSE2_MOVD_XMM_to_R(EAX, regd);
|
||||
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
|
||||
}
|
||||
else {
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
|
||||
}
|
||||
}
|
||||
void vFloat11b(int regd, int regTemp) { //1101 //regTemp is Modified
|
||||
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
|
||||
SSE_MINPS_M128_to_XMM(regTemp, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regTemp, (uptr)g_minvals);
|
||||
VU_MERGE_REGS_CUSTOM(regd, regTemp, 11);
|
||||
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions )
|
||||
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x02);
|
||||
else {
|
||||
SSE_MOVSS_XMM_to_XMM(regTemp, regd);
|
||||
SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
|
||||
}
|
||||
}
|
||||
void vFloat11c(int regd, int regTemp) { //1101
|
||||
SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0xe1);
|
||||
SSE2_MOVD_XMM_to_R(EAX, regTemp);
|
||||
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
|
||||
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
|
||||
SSE_ORPS_XMM_to_XMM(regd, regTemp);
|
||||
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions ) {
|
||||
SSE_ORPS_XMM_to_XMM(regd, regTemp);
|
||||
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x01);
|
||||
}
|
||||
else {
|
||||
SSE_ORPS_XMM_to_XMM(regTemp, regd);
|
||||
SSE2_MOVD_R_to_XMM(regd, EAX);
|
||||
SSE_MOVLHPS_XMM_to_XMM(regd, regTemp);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xe2);
|
||||
}
|
||||
}
|
||||
void vFloat12(int regd, int regTemp) { //0011
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
|
@@ -1066,6 +1113,12 @@ void vFloat12(int regd, int regTemp) { //0011
 	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
 	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
 }
+void vFloat12b(int regd, int regTemp) { //0011 //regTemp is Modified
+	SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
+	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+	SSE2_PUNPCKLQDQ_XMM_to_XMM(regd, regTemp);
+}
 void vFloat12c(int regd, int regTemp) { //0011
 	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
 	SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -1078,66 +1131,100 @@ void vFloat12c(int regd, int regTemp) { //0011
|
|||
SSE_ORPS_XMM_to_XMM(regd, regTemp);
|
||||
}
|
||||
void vFloat13(int regd, int regTemp) { //1011
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions ) {
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
|
||||
SSE2_MOVD_XMM_to_R(EAX, regd);
|
||||
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
|
||||
}
|
||||
else {
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
|
||||
}
|
||||
}
|
||||
void vFloat13b(int regd, int regTemp) { //1011 //regTemp is Modified
|
||||
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
|
||||
SSE_MINPS_M128_to_XMM(regTemp, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regTemp, (uptr)g_minvals);
|
||||
VU_MERGE_REGS_CUSTOM(regd, regTemp, 13);
|
||||
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions )
|
||||
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x04);
|
||||
else {
|
||||
SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x64);
|
||||
}
|
||||
}
|
||||
void vFloat13c(int regd, int regTemp) { //1011
|
||||
SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0xd2);
|
||||
SSE2_MOVD_XMM_to_R(EAX, regTemp);
|
||||
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
|
||||
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
|
||||
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_ORPS_XMM_to_XMM(regd, regTemp);
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions )
|
||||
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x02);
|
||||
else {
|
||||
SSE2_MOVD_R_to_XMM(regTemp, EAX);
|
||||
SSE_SHUFPS_XMM_to_XMM(regTemp, regd, 0xf0);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x84);
|
||||
}
|
||||
}
|
||||
void vFloat14(int regd, int regTemp) { //0111
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions ) {
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
|
||||
SSE2_MOVD_XMM_to_R(EAX, regd);
|
||||
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
|
||||
}
|
||||
else {
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
|
||||
}
|
||||
}
|
||||
void vFloat14b(int regd, int regTemp) { //0111 //regTemp is Modified
|
||||
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
|
||||
SSE_MINPS_M128_to_XMM(regTemp, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regTemp, (uptr)g_minvals);
|
||||
VU_MERGE_REGS_CUSTOM(regd, regTemp, 14);
|
||||
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions )
|
||||
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x08);
|
||||
else {
|
||||
SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xc4);
|
||||
}
|
||||
}
|
||||
void vFloat14c(int regd, int regTemp) { //0111
|
||||
SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0x93);
|
||||
SSE2_MOVD_XMM_to_R(EAX, regTemp);
|
||||
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
|
||||
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
|
||||
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
|
||||
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
|
||||
SSE_ORPS_XMM_to_XMM(regd, regTemp);
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions )
|
||||
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x03);
|
||||
else {
|
||||
SSE2_MOVD_R_to_XMM(regTemp, EAX);
|
||||
SSE_SHUFPS_XMM_to_XMM(regTemp, regd, 0xa0);
|
||||
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x24);
|
||||
}
|
||||
}
|
||||
void vFloat15(int regd, int regTemp) { //1111
|
||||
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
|
||||
|
@@ -1158,10 +1245,10 @@ vFloat vFloats1[16] = { //regTemp is not modified
 	vFloat12, vFloat13, vFloat14, vFloat15 };

 vFloat vFloats2[16] = { //regTemp is modified
-	vFloat0, vFloat1, vFloat2, vFloat3,
+	vFloat0, vFloat1, vFloat2, vFloat3b,
 	vFloat4, vFloat5, vFloat6, vFloat7b,
 	vFloat8, vFloat9, vFloat10, vFloat11b,
-	vFloat12, vFloat13b, vFloat14b, vFloat15 };
+	vFloat12b, vFloat13b, vFloat14b, vFloat15 };

 vFloat vFloats4[16] = { //regTemp is modified
 	vFloat0, vFloat1c, vFloat2c, vFloat3c,
@@ -1269,4 +1356,4 @@ void SetVUNanMode(int mode)
 {
 	g_VuNanHandling = mode;
 	if ( mode ) SysPrintf("enabling vunan mode");
-}
+}
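The next hunks are the _saveEAX fix from the commit message: instead of shuffling EEREC_S in place (and, in one write-mask case, spilling two floats with MOVLPS where only one field was selected), the new code shuffles into EEREC_TEMP and stores exactly the selected fields, leaving EEREC_S untouched. A rough intrinsics sketch of the idea (illustrative only, not the emitter code):

#include <emmintrin.h>  /* SSE2 */

/* Store only the W field of v to dst[3], leaving v itself untouched. */
static void store_w_field(float *dst, __m128 v)
{
    __m128i t = _mm_shuffle_epi32(_mm_castps_si128(v), 0x27); /* W -> lane 0 */
    _mm_store_ss(dst + 3, _mm_castsi128_ps(t));               /* 4-byte store */
}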
@@ -799,10 +799,9 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)

 	switch ( _X_Y_Z_W ) {
 		case 1: // W*
-			SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0x27);
-			if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset+12);
-			else SSE_MOVSS_XMM_to_M32(offset+12, EEREC_S);
-			SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0x27);
+			SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x27);
+			if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
+			else SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
 			break;
 		case 2: // Z*
 			SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
@@ -817,39 +816,35 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
 			SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB1);
 			SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
 			if ( x86reg >= 0 ) {
-				SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4);
+				SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4);
 				SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
 			}
 			else {
-				SSE_MOVLPS_XMM_to_M64(offset+4, EEREC_S);
+				SSE_MOVSS_XMM_to_M32(offset+4, EEREC_S);
 				SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
 			}
 			SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB1);
 			break;
 		case 4: // Y*
-			SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xe1);
-			if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4);
-			else SSE_MOVSS_XMM_to_M32(offset+4, EEREC_S);
-			SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xe1);
+			SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0xe1);
+			if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4);
+			else SSE_MOVSS_XMM_to_M32(offset+4, EEREC_TEMP);
 			break;
 		case 6: // YZ
-			SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xc9);
-			if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4);
-			else SSE_MOVLPS_XMM_to_M64(offset+4, EEREC_S);
-			SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xd2);
+			SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0xc9);
+			if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4);
+			else SSE_MOVLPS_XMM_to_M64(offset+4, EEREC_TEMP);
 			break;
 		case 7: // YZW
-			SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0x39);
-			SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+			SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x93); //ZYXW
 			if ( x86reg >= 0 ) {
-				SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4);
+				SSE_MOVHPS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4);
 				SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
 			}
 			else {
-				SSE_MOVLPS_XMM_to_M64(offset+4, EEREC_S);
+				SSE_MOVHPS_XMM_to_M64(offset+4, EEREC_TEMP);
 				SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
 			}
-			SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0x93);
 			break;
 		case 8: // X*
 			if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset);
@@ -867,26 +862,41 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
 			else SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);

 			break;
-		//case 10: break;
-		//case 11: break;
+		case 10: //XZ
+			SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+			if ( x86reg >= 0 ) {
+				SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset);
+				SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+8);
+			}
+			else {
+				SSE_MOVSS_XMM_to_M32(offset, EEREC_S);
+				SSE_MOVSS_XMM_to_M32(offset+8, EEREC_TEMP);
+			}
+			break;
+		case 11: break; //XZW
+			if ( x86reg >= 0 ) {
+				SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset);
+				SSE_MOVHPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+8);
+			}
+			else {
+				SSE_MOVSS_XMM_to_M32(offset, EEREC_S);
+				SSE_MOVHPS_XMM_to_M64(offset+8, EEREC_S);
+			}
+			break;
 		case 12: // XY
 			if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+0);
 			else SSE_MOVLPS_XMM_to_M64(offset, EEREC_S);
 			break;
-
 		case 13: // XYW
-			SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB4); //ZWYX
-			SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+			SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x4b); //YXZW
 			if ( x86reg >= 0 ) {
-				SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+0);
+				SSE_MOVHPS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+0);
 				SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
 			}
 			else {
-				SSE_MOVLPS_XMM_to_M64(offset, EEREC_S);
+				SSE_MOVHPS_XMM_to_M64(offset, EEREC_TEMP);
 				SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
-
 			}
-			SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB4);
 			break;
 		case 14: // XYZ
 			SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
@@ -2006,4 +2016,4 @@ void VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
 		mtgsThread->SendDataPacket();
 	}
 }
-//------------------------------------------------------------------
+//------------------------------------------------------------------
@@ -182,7 +182,6 @@ void recUpdateFlags(VURegs * VU, int reg, int info)
 	}

 	SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip wzyx to xyzw
-	XOR32RtoR(x86macflag, x86macflag); // Clear Mac Flag
 	MOV32MtoR(x86temp, prevstataddr); // Load the previous status in to x86temp
 	AND16ItoR(x86temp, 0xff0); // Keep Sticky and D/I flags

@@ -202,13 +201,12 @@ void recUpdateFlags(VURegs * VU, int reg, int info)
 	SSE_ANDPS_M128_to_XMM(t1reg, (uptr)VU_Zero_Helper_Mask);
 	SSE_CMPEQPS_M128_to_XMM(t1reg, (uptr)VU_Pos_Infinity); // If infinity, then overflow has occurred (NaN's don't report as overflow)

-	SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation
+	SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the previous calculation

-	AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being modified)
+	AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being modified)
 	pjmp = JZ8(0); // Skip if none are
 	OR16ItoR(x86temp, 0x208); // OS, O flags
-	SHL16ItoR(EAX, 12);
-	OR32RtoR(x86macflag, EAX);
+	SHL16ItoR(x86macflag, 12);
 	if (_XYZW_SS) pjmp32 = JMP32(0); // Skip Underflow Check
 	x86SetJ8(pjmp);

@ -246,35 +244,91 @@ void recUpdateFlags(VURegs * VU, int reg, int info)
|
|||
|
||||
vuFloat2(reg, t1reg, flipMask[_X_Y_Z_W]); // Clamp overflowed vectors that were modified (remember reg's vectors have been flipped, so have to use a flipmask)
|
||||
|
||||
//-------------------------Check for Signed flags------------------------------
|
||||
if (_XYZW_SS) {
|
||||
//-------------------------Check for Signed flags------------------------------
|
||||
|
||||
// The following code makes sure the Signed Bit isn't set with Negative Zero
|
||||
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
|
||||
SSE_CMPNEPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is not zero
|
||||
SSE_ANDPS_XMM_to_XMM(t1reg, reg);
|
||||
// The following code makes sure the Signed Bit isn't set with Negative Zero
|
||||
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
|
||||
SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
|
||||
|
||||
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg
|
||||
if (CHECK_VU_EXTRA_FLAGS) {
|
||||
SSE_ANDNPS_XMM_to_XMM(t1reg, reg);
|
||||
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg
|
||||
|
||||
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
|
||||
pjmp = JZ8(0); // Skip if none are
|
||||
OR16ItoR(x86temp, 0x82); // SS, S flags
|
||||
SHL16ItoR(EAX, 4);
|
||||
OR32RtoR(x86macflag, EAX);
|
||||
if (_XYZW_SS) pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking
|
||||
x86SetJ8(pjmp);
|
||||
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
|
||||
pjmp = JZ8(0); // Skip if none are
|
||||
OR16ItoR(x86temp, 0x82); // SS, S flags
|
||||
SHL16ItoR(EAX, 4);
|
||||
OR32RtoR(x86macflag, EAX);
|
||||
pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking
|
||||
x86SetJ8(pjmp);
|
||||
}
|
||||
else {
|
||||
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg (for zero flag)
|
||||
SSE_ANDNPS_XMM_to_XMM(t1reg, reg);
|
||||
SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the t1reg
|
||||
|
||||
//-------------------------Check for Zero flags------------------------------
|
||||
|
||||
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
|
||||
SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
|
||||
AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
|
||||
pjmp = JZ8(0); // Skip if none are
|
||||
OR16ItoR(x86temp, 0x82); // SS, S flags
|
||||
SHL16ItoR(x86macflag, 4);
|
||||
pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking
|
||||
x86SetJ8(pjmp);
|
||||
}
|
||||
|
||||
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation
|
||||
//-------------------------Check for Zero flags------------------------------
|
||||
|
||||
if (CHECK_VU_EXTRA_FLAGS) {
|
||||
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
|
||||
SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
|
||||
|
||||
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
|
||||
pjmp = JZ8(0); // Skip if none are
|
||||
OR16ItoR(x86temp, 0x41); // ZS, Z flags
|
||||
OR32RtoR(x86macflag, EAX);
|
||||
x86SetJ8(pjmp);
|
||||
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation
|
||||
}
|
||||
|
||||
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
|
||||
pjmp = JZ8(0); // Skip if none are
|
||||
OR16ItoR(x86temp, 0x41); // ZS, Z flags
|
||||
OR32RtoR(x86macflag, EAX);
|
||||
x86SetJ8(pjmp);
|
||||
}
|
||||
else {
|
||||
//-------------------------Check for Zero flags------------------------------
|
||||
|
||||
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
|
||||
SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
|
||||
|
||||
if (CHECK_VU_EXTRA_FLAGS) {
|
||||
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation
|
||||
|
||||
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
|
||||
pjmp = JZ8(0); // Skip if none are
|
||||
OR16ItoR(x86temp, 0x41); // ZS, Z flags
|
||||
OR32RtoR(x86macflag, EAX);
|
||||
x86SetJ8(pjmp);
|
||||
}
|
||||
else {
|
||||
SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the previous calculation
|
||||
|
||||
AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
|
||||
pjmp = JZ8(0); // Skip if none are
|
||||
OR16ItoR(x86temp, 0x41); // ZS, Z flags
|
||||
x86SetJ8(pjmp);
|
||||
}
|
||||
|
||||
//-------------------------Check for Signed flags------------------------------
|
||||
|
||||
// The following code makes sure the Signed Bit isn't set with Negative Zero
|
||||
SSE_ANDNPS_XMM_to_XMM(t1reg, reg);
|
||||
|
||||
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg
|
||||
|
||||
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
|
||||
pjmp = JZ8(0); // Skip if none are
|
||||
OR16ItoR(x86temp, 0x82); // SS, S flags
|
||||
SHL16ItoR(EAX, 4);
|
||||
OR32RtoR(x86macflag, EAX);
|
||||
x86SetJ8(pjmp);
|
||||
}
|
||||
|
||||
//-------------------------Finally: Send the Flags to the Mac Flag Address------------------------------
|
||||
|
||||
|
@@ -298,8 +352,6 @@ void recUpdateFlags(VURegs * VU, int reg, int info)
 //
 // Note: See FPU_ADD_SUB() for more info on what this is doing.
 //------------------------------------------------------------------
-static const PCSX2_ALIGNED16(u32 VU_fullmask[4]) = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
-static const PCSX2_ALIGNED16(u32 VU_helperbyte[4]) = {0xff, 0xff, 0xff, 0xff};
 static PCSX2_ALIGNED16(u32 VU_addsuband[2][4]);
 static PCSX2_ALIGNED16(u32 VU_addsub_reg[2][4]);
 static u32 ecx_temp_loc;
@@ -313,17 +365,16 @@ void VU_ADD_SUB(u32 regd, u32 regt, int is_sub, int info)
 	SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
 	SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);

-	SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_fullmask[0]);
+	SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
 	SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[0][0], regd);
 	SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[1][0], regd);

 	SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]);

-	SSE2_PSRLD_I8_to_XMM(regd, 23);
-	SSE2_PSRLD_I8_to_XMM(regt, 23);
+	SSE2_PSLLD_I8_to_XMM(regd, 1);
+	SSE2_PSLLD_I8_to_XMM(regt, 1);

-	SSE2_PAND_M128_to_XMM(regd, (uptr)&VU_helperbyte[0]);
-	SSE2_PAND_M128_to_XMM(regt, (uptr)&VU_helperbyte[0]);
+	SSE2_PSRLD_I8_to_XMM(regd, 24);
+	SSE2_PSRLD_I8_to_XMM(regt, 24);

 	SSE2_PSUBD_XMM_to_XMM(regd, regt);

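The shift change above isolates the biased exponent without the VU_helperbyte mask: shifting each lane left by 1 drops the sign bit, and shifting right by 24 leaves just the exponent byte. A scalar C sketch of what the new sequence computes (illustrative only, not code from this commit):

#include <stdint.h>

/* Biased exponent of an IEEE-754 single, as PSLLD 1 + PSRLD 24 computes it. */
static uint32_t exponent_byte(uint32_t bits)
{
    return (bits << 1) >> 24;   /* drop sign bit, keep the 8 exponent bits */
}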
@ -389,6 +440,88 @@ void VU_ADD_SUB(u32 regd, u32 regt, int is_sub, int info)
|
|||
_freeX86reg(temp2);
|
||||
}
|
||||
|
||||
void VU_ADD_SUB_SSE4(u32 regd, u32 regt, int is_sub, int info)
|
||||
{
|
||||
u8 *localptr[4][8];
|
||||
int temp1 = _allocX86reg(ECX, X86TYPE_TEMP, 0, 0); //receives regd//_allocX86reg(ECX, X86TYPE_TEMP, 0, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode);
|
||||
int temp2 = ALLOCTEMPX86(0);
|
||||
|
||||
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
|
||||
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
|
||||
|
||||
SSE2_PSLLD_I8_to_XMM(regd, 1);
|
||||
SSE2_PSLLD_I8_to_XMM(regt, 1);
|
||||
|
||||
SSE2_PSRLD_I8_to_XMM(regd, 24);
|
||||
SSE2_PSRLD_I8_to_XMM(regt, 24);
|
||||
|
||||
SSE2_PSUBD_XMM_to_XMM(regd, regt);
|
||||
|
||||
#define PERFORM_SSE4(i) \
|
||||
\
|
||||
SSE_PEXTRW_XMM_to_R32(temp1, regd, i*2); \
|
||||
MOVSX32R16toR(temp1, temp1); \
|
||||
CMP32ItoR(temp1, 25);\
|
||||
localptr[i][0] = JGE8(0);\
|
||||
CMP32ItoR(temp1, 0);\
|
||||
localptr[i][1] = JG8(0);\
|
||||
localptr[i][2] = JE8(0);\
|
||||
CMP32ItoR(temp1, -25);\
|
||||
localptr[i][3] = JLE8(0);\
|
||||
\
|
||||
NEG32R(temp1); \
|
||||
DEC32R(temp1);\
|
||||
MOV32ItoR(temp2, 0xffffffff); \
|
||||
SHL32CLtoR(temp2); \
|
||||
SSE4_PINSRD_R32_to_XMM(regd, temp2, i); \
|
||||
localptr[i][4] = JMP8(0);\
|
||||
\
|
||||
x86SetJ8(localptr[i][0]);\
|
||||
MOV32ItoR(temp2, 0xffffffff); \
|
||||
SSE4_PINSRD_R32_to_XMM(regd, temp2, i); \
|
||||
SHL32ItoR(temp2, 31); \
|
||||
SSE4_PINSRD_R32_to_XMM(regt, temp2, i); \
|
||||
localptr[i][5] = JMP8(0);\
|
||||
\
|
||||
x86SetJ8(localptr[i][1]);\
|
||||
DEC32R(temp1);\
|
||||
MOV32ItoR(temp2, 0xffffffff);\
|
||||
SSE4_PINSRD_R32_to_XMM(regd, temp2, i); \
|
||||
SHL32CLtoR(temp2); \
|
||||
SSE4_PINSRD_R32_to_XMM(regt, temp2, i); \
|
||||
localptr[i][6] = JMP8(0);\
|
||||
\
|
||||
x86SetJ8(localptr[i][3]);\
|
||||
MOV32ItoR(temp2, 0x80000000); \
|
||||
SSE4_PINSRD_R32_to_XMM(regd, temp2, i); \
|
||||
localptr[i][7] = JMP8(0);\
|
||||
\
|
||||
x86SetJ8(localptr[i][2]);\
|
||||
\
|
||||
x86SetJ8(localptr[i][4]);\
|
||||
x86SetJ8(localptr[i][5]);\
|
||||
x86SetJ8(localptr[i][6]);\
|
||||
x86SetJ8(localptr[i][7]);
|
||||
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
|
||||
PERFORM_SSE4(0);
|
||||
PERFORM_SSE4(1);
|
||||
PERFORM_SSE4(2);
|
||||
PERFORM_SSE4(3);
|
||||
#undef PERFORM_SSE4
|
||||
|
||||
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
|
||||
SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); //regt contains mask
|
||||
|
||||
if (is_sub) SSE_SUBPS_XMM_to_XMM(regd, regt);
|
||||
else SSE_ADDPS_XMM_to_XMM(regd, regt);
|
||||
|
||||
SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]);
|
||||
|
||||
_freeX86reg(temp1);
|
||||
_freeX86reg(temp2);
|
||||
}
|
||||
|
||||
void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
|
||||
{
|
||||
u8 *localptr[8];
|
||||
|
@ -399,22 +532,17 @@ void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
|
|||
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
|
||||
if (!is_mem) SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
|
||||
|
||||
SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_fullmask[0]);
|
||||
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[0][0], regd);
|
||||
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[1][0], regd);
|
||||
|
||||
SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]);
|
||||
|
||||
SSE_PEXTRW_XMM_to_R32(temp1, regd, 1);
|
||||
SHR32ItoR(temp1, 23 - 16);
|
||||
SSE2_MOVD_XMM_to_R(temp1, regd);
|
||||
SHR32ItoR(temp1, 23);
|
||||
|
||||
if (is_mem) {
|
||||
MOV32MtoR(temp2, addrt);
|
||||
MOV32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
|
||||
SHR32ItoR(temp2, 23);
|
||||
}
|
||||
else {
|
||||
SSE_PEXTRW_XMM_to_R32(temp2, regt, 1);
|
||||
SHR32ItoR(temp2, 23 - 16);
|
||||
SSE2_MOVD_XMM_to_R(temp2, regt);
|
||||
SHR32ItoR(temp2, 23);
|
||||
}
|
||||
|
||||
AND32ItoR(temp1, 0xff);
|
||||
|
@ -432,24 +560,60 @@ void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
|
|||
|
||||
NEG32R(temp1);
|
||||
DEC32R(temp1);
|
||||
MOV32ItoR(temp2, 0xffffffff);
|
||||
SHL32CLtoR(temp2);
|
||||
MOV32RtoM((uptr)&VU_addsuband[0][0], temp2);
|
||||
MOV32ItoR(temp2, 0xffffffff);
|
||||
SHL32CLtoR(temp2);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
|
||||
if (is_mem) {
|
||||
SSE_PINSRW_R32_to_XMM(regd, temp2, 0);
|
||||
SHR32ItoR(temp2, 16);
|
||||
SSE_PINSRW_R32_to_XMM(regd, temp2, 1);
|
||||
}
|
||||
else {
|
||||
SSE2_MOVD_R_to_XMM(regt, temp2);
|
||||
SSE_MOVSS_XMM_to_XMM(regd, regt);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
|
||||
}
|
||||
localptr[4] = JMP8(0);
|
||||
|
||||
x86SetJ8(localptr[0]);
|
||||
MOV32ItoM((uptr)&VU_addsuband[1][0], 0x80000000);
|
||||
MOV32ItoR(temp2, 0x80000000);
|
||||
if (is_mem)
|
||||
AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
|
||||
else {
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
|
||||
SSE2_MOVD_R_to_XMM(regd, temp2);
|
||||
SSE_MOVSS_XMM_to_XMM(regt, regd);
|
||||
}
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
|
||||
localptr[5] = JMP8(0);
|
||||
|
||||
x86SetJ8(localptr[1]);
|
||||
DEC32R(temp1);
|
||||
MOV32ItoR(temp2, 0xffffffff);
|
||||
SHL32CLtoR(temp2);
|
||||
MOV32RtoM((uptr)&VU_addsuband[1][0], temp2);
|
||||
if (is_mem)
|
||||
AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
|
||||
else {
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
|
||||
SSE2_MOVD_R_to_XMM(regd, temp2);
|
||||
SSE_MOVSS_XMM_to_XMM(regt, regd);
|
||||
}
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
|
||||
localptr[6] = JMP8(0);
|
||||
|
||||
x86SetJ8(localptr[3]);
|
||||
MOV32ItoM((uptr)&VU_addsuband[0][0], 0x80000000);
|
||||
MOV32ItoR(temp2, 0x80000000);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
|
||||
if (is_mem) {
|
||||
SSE_PINSRW_R32_to_XMM(regd, temp2, 0);
|
||||
SHR32ItoR(temp2, 16);
|
||||
SSE_PINSRW_R32_to_XMM(regd, temp2, 1);
|
||||
}
|
||||
else {
|
||||
SSE2_MOVD_R_to_XMM(regt, temp2);
|
||||
SSE_MOVSS_XMM_to_XMM(regd, regt);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
|
||||
}
|
||||
localptr[7] = JMP8(0);
|
||||
|
||||
x86SetJ8(localptr[2]);
|
||||
|
@ -460,21 +624,121 @@ void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
|
|||
|
||||
if (is_mem)
|
||||
{
|
||||
SSE_MOVSS_M32_to_XMM(regd, addrt);
|
||||
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsuband[1][0]); //regd contains addrt
|
||||
SSE_MOVSS_XMM_to_M32((uptr)&VU_addsub_reg[1][0], regd);
|
||||
|
||||
SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]);
|
||||
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsuband[0][0]);
|
||||
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
|
||||
|
||||
if (is_sub) SSE_SUBSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
|
||||
else SSE_ADDSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsuband[0][0]);
|
||||
SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsuband[1][0]);
|
||||
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
|
||||
SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); //regt contains mask
|
||||
|
||||
if (is_sub) SSE_SUBSS_XMM_to_XMM(regd, regt);
|
||||
else SSE_ADDSS_XMM_to_XMM(regd, regt);
|
||||
|
||||
SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]);
|
||||
}
|
||||
|
||||
_freeX86reg(temp1);
|
||||
_freeX86reg(temp2);
|
||||
}
|
||||
|
||||
void VU_ADD_SUB_SS_SSE4(u32 regd, u32 regt, int is_sub, int is_mem, int info)
|
||||
{
|
||||
u8 *localptr[8];
|
||||
u32 addrt = regt; //for case is_mem
|
||||
int temp1 = _allocX86reg(ECX, X86TYPE_TEMP, 0, 0); //receives regd //_allocX86reg(ECX, X86TYPE_TEMP, 0, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode);
|
||||
int temp2 = ALLOCTEMPX86(0);
|
||||
|
||||
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
|
||||
if (!is_mem) SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
|
||||
|
||||
SSE2_MOVD_XMM_to_R(temp1, regd);
|
||||
SHR32ItoR(temp1, 23);
|
||||
|
||||
if (is_mem) {
|
||||
MOV32MtoR(temp2, addrt);
|
||||
MOV32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
|
||||
SHR32ItoR(temp2, 23);
|
||||
}
|
||||
else {
|
||||
SSE2_MOVD_XMM_to_R(temp2, regt);
|
||||
SHR32ItoR(temp2, 23);
|
||||
}
|
||||
|
||||
AND32ItoR(temp1, 0xff);
|
||||
AND32ItoR(temp2, 0xff);
|
||||
|
||||
SUB32RtoR(temp1, temp2); //temp1 = exponent difference
|
||||
|
||||
CMP32ItoR(temp1, 25);
|
||||
localptr[0] = JGE8(0);
|
||||
CMP32ItoR(temp1, 0);
|
||||
localptr[1] = JG8(0);
|
||||
localptr[2] = JE8(0);
|
||||
CMP32ItoR(temp1, -25);
|
||||
localptr[3] = JLE8(0);
|
||||
|
||||
NEG32R(temp1);
|
||||
DEC32R(temp1);
|
||||
MOV32ItoR(temp2, 0xffffffff);
|
||||
SHL32CLtoR(temp2);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
|
||||
SSE4_PINSRD_R32_to_XMM(regd, temp2, 0);
|
||||
if (!is_mem)
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
|
||||
localptr[4] = JMP8(0);
|
||||
|
||||
x86SetJ8(localptr[0]);
|
||||
MOV32ItoR(temp2, 0x80000000);
|
||||
if (is_mem)
|
||||
AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
|
||||
else {
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
|
||||
SSE4_PINSRD_R32_to_XMM(regt, temp2, 0);
|
||||
}
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
|
||||
localptr[5] = JMP8(0);
|
||||
|
||||
x86SetJ8(localptr[1]);
|
||||
DEC32R(temp1);
|
||||
MOV32ItoR(temp2, 0xffffffff);
|
||||
SHL32CLtoR(temp2);
|
||||
if (is_mem)
|
||||
AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
|
||||
else {
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
|
||||
SSE4_PINSRD_R32_to_XMM(regt, temp2, 0);
|
||||
}
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
|
||||
localptr[6] = JMP8(0);
|
||||
|
||||
x86SetJ8(localptr[3]);
|
||||
MOV32ItoR(temp2, 0x80000000);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
|
||||
SSE4_PINSRD_R32_to_XMM(regd, temp2, 0);
|
||||
if (!is_mem)
|
||||
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
|
||||
localptr[7] = JMP8(0);
|
||||
|
||||
x86SetJ8(localptr[2]);
|
||||
x86SetJ8(localptr[4]);
|
||||
x86SetJ8(localptr[5]);
|
||||
x86SetJ8(localptr[6]);
|
||||
x86SetJ8(localptr[7]);
|
||||
|
||||
if (is_mem)
|
||||
{
|
||||
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
|
||||
|
||||
if (is_sub) SSE_SUBSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
|
||||
else SSE_ADDSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
|
||||
SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); //regt contains mask
|
||||
|
||||
if (is_sub) SSE_SUBSS_XMM_to_XMM(regd, regt);
|
||||
else SSE_ADDSS_XMM_to_XMM(regd, regt);
|
||||
|
@ -487,27 +751,57 @@ void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
|
|||
}
|
||||
|
||||
void SSE_ADDPS_XMM_to_XMM_custom(int info, int regd, int regt) {
|
||||
if (CHECK_VUADDSUBHACK) VU_ADD_SUB(regd, regt, 0, info);
|
||||
if (CHECK_VUADDSUBHACK) {
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions )
|
||||
VU_ADD_SUB_SSE4(regd, regt, 0, info);
|
||||
else
|
||||
VU_ADD_SUB(regd, regt, 0, info);
|
||||
}
|
||||
else SSE_ADDPS_XMM_to_XMM(regd, regt);
|
||||
}
|
||||
void SSE_SUBPS_XMM_to_XMM_custom(int info, int regd, int regt) {
|
||||
if (CHECK_VUADDSUBHACK) VU_ADD_SUB(regd, regt, 1, info);
|
||||
if (CHECK_VUADDSUBHACK) {
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions )
|
||||
VU_ADD_SUB_SSE4(regd, regt, 1, info);
|
||||
else
|
||||
VU_ADD_SUB(regd, regt, 1, info);
|
||||
}
|
||||
else SSE_SUBPS_XMM_to_XMM(regd, regt);
|
||||
}
|
||||
void SSE_ADDSS_XMM_to_XMM_custom(int info, int regd, int regt) {
|
||||
if (CHECK_VUADDSUBHACK) VU_ADD_SUB_SS(regd, regt, 0, 0, info);
|
||||
if (CHECK_VUADDSUBHACK) {
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions )
|
||||
VU_ADD_SUB_SS_SSE4(regd, regt, 0, 0, info);
|
||||
else
|
||||
VU_ADD_SUB_SS(regd, regt, 0, 0, info);
|
||||
}
|
||||
else SSE_ADDSS_XMM_to_XMM(regd, regt);
|
||||
}
|
||||
void SSE_SUBSS_XMM_to_XMM_custom(int info, int regd, int regt) {
|
||||
if (CHECK_VUADDSUBHACK) VU_ADD_SUB_SS(regd, regt, 1, 0, info);
|
||||
if (CHECK_VUADDSUBHACK) {
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions )
|
||||
VU_ADD_SUB_SS_SSE4(regd, regt, 1, 0, info);
|
||||
else
|
||||
VU_ADD_SUB_SS(regd, regt, 1, 0, info);
|
||||
}
|
||||
else SSE_SUBSS_XMM_to_XMM(regd, regt);
|
||||
}
|
||||
void SSE_ADDSS_M32_to_XMM_custom(int info, int regd, int regt) {
|
||||
if (CHECK_VUADDSUBHACK) VU_ADD_SUB_SS(regd, regt, 0, 1, info);
|
||||
if (CHECK_VUADDSUBHACK) {
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions )
|
||||
VU_ADD_SUB_SS_SSE4(regd, regt, 0, 1, info);
|
||||
else
|
||||
VU_ADD_SUB_SS(regd, regt, 0, 1, info);
|
||||
}
|
||||
else SSE_ADDSS_M32_to_XMM(regd, regt);
|
||||
}
|
||||
void SSE_SUBSS_M32_to_XMM_custom(int info, int regd, int regt) {
|
||||
if (CHECK_VUADDSUBHACK) VU_ADD_SUB_SS(regd, regt, 1, 1, info);
|
||||
if (CHECK_VUADDSUBHACK) {
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions )
|
||||
VU_ADD_SUB_SS_SSE4(regd, regt, 1, 1, info);
|
||||
else
|
||||
VU_ADD_SUB_SS(regd, regt, 1, 1, info);
|
||||
}
|
||||
else SSE_SUBSS_M32_to_XMM(regd, regt);
|
||||
}
|
||||
//------------------------------------------------------------------
|
||||
|
@@ -2814,4 +3108,4 @@ void recVUMI_CLIP(VURegs *VU, int info)

 	_freeX86reg(x86temp1);
 	_freeX86reg(x86temp2);
-}
+}
@@ -1617,6 +1617,7 @@ extern void SSE4_BLENDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm
 extern void SSE4_BLENDVPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
 extern void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from);
 extern void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
+extern void SSE4_PINSRD_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8);

 //*********************
 // SSE-X - uses both SSE,SSE2 code and tries to keep consistencies between the data
@@ -1143,6 +1143,15 @@ __forceinline void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from
 	ModRM(3, to, from);
 }

+__forceinline void SSE4_PINSRD_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8)
+{
+	write8(0x66);
+	RexRB(0, to, from);
+	write24(0x223A0F);
+	ModRM(3, to, from);
+	write8(imm8);
+}
+
 // SSE-X
 __forceinline void SSEX_MOVDQA_M128_to_XMM( x86SSERegType to, uptr from )
 {
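The new SSE4_PINSRD_R32_to_XMM emitter writes the PINSRD xmm, r32, imm8 encoding (66 0F 3A 22 /r ib); the clamping paths in this commit use it to put a saved 32-bit lane back into a register after a full-vector clamp. A rough intrinsics equivalent of that restore step (illustrative only, not the recompiler code):

#include <smmintrin.h>  /* SSE4.1 */

/* Re-insert a saved 32-bit lane (here lane 0) into a clamped vector. */
static __m128 restore_lane0(__m128 clamped, int saved_bits)
{
    return _mm_castsi128_ps(
        _mm_insert_epi32(_mm_castps_si128(clamped), saved_bits, 0));
}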