From f93e8c1beb8ae64c5457f06e61552e28db4a4606 Mon Sep 17 00:00:00 2001 From: cottonvibes Date: Thu, 25 Dec 2008 18:39:06 +0000 Subject: [PATCH] fixed/added/optimized some VU stuff. i plan to go over most (all) VU micro instructions over the next few days/weeks, if i don't get bored of it that is ;) 3 down, 100+ more to go! :D git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@495 a6443dda-0b58-4228-96e9-037be469359c --- pcsx2/IPU/IPU.c | 4 +- pcsx2/x86/iVUmicro.cpp | 178 +++++++++++++++++++++++++++++------- pcsx2/x86/iVUmicro.h | 1 + pcsx2/x86/iVUmicroLower.cpp | 114 +++++++---------------- pcsx2/x86/iVUmicroUpper.cpp | 127 ++++++++++++------------- pcsx2/x86/iVUops.h | 13 +++ 6 files changed, 255 insertions(+), 182 deletions(-) diff --git a/pcsx2/IPU/IPU.c b/pcsx2/IPU/IPU.c index 8f1ddf98ad..d4620c55c3 100644 --- a/pcsx2/IPU/IPU.c +++ b/pcsx2/IPU/IPU.c @@ -1516,7 +1516,7 @@ void ipu_vq(struct macroblock_rgb16 *rgb16, u8* indx4){ SysPrintf("IPU: VQ not implemented"); } -void ipu_copy(struct macroblock_8 *mb8, struct macroblock_16 *mb16){ +void ipu_copy(struct macroblock_8 *mb8, struct macroblock_16 *mb16) { unsigned char *s=(unsigned char*)mb8; signed short *d=(signed short*)mb16; int i; @@ -1749,7 +1749,7 @@ int IPU1dma() break; default: - SysPrintf("IPU ERROR: different transfer mode!, Please report to PCSX2 Team\n"); + Console::Error("IPU ERROR: different transfer mode!, Please report to PCSX2 Team\n"); break; } diff --git a/pcsx2/x86/iVUmicro.cpp b/pcsx2/x86/iVUmicro.cpp index f13903232f..606e29638b 100644 --- a/pcsx2/x86/iVUmicro.cpp +++ b/pcsx2/x86/iVUmicro.cpp @@ -724,26 +724,17 @@ void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw) void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw) { - if( cpucaps.hasStreamingSIMD4Extensions ) { - switch (xyzw) { - case 0: if( dstreg != srcreg ) { - SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(0, 0, 0));} break; - case 1: SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(1, 0, 0)); break; - case 2: SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(2, 0, 0)); break; - case 3: SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(3, 0, 0)); break; - } - } - else { - switch (xyzw) { - case 0: if( dstreg != srcreg ) SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg); break; - case 1: if( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSHDUP_XMM_to_XMM(dstreg, srcreg); - else { if( dstreg != srcreg ) SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg); SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0x55); } - break; - case 2: SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); break; - case 3: if( cpucaps.hasStreamingSIMD3Extensions && dstreg != srcreg ) { SSE3_MOVSHDUP_XMM_to_XMM(dstreg, srcreg); SSE_MOVHLPS_XMM_to_XMM(dstreg, dstreg); } - else { if( dstreg != srcreg ) SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg); SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0xff); } - break; - } + switch (xyzw) { + case 0: if( dstreg != srcreg ) SSE_MOVSS_XMM_to_XMM(dstreg, srcreg); break; + case 1: if ( cpucaps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(1, 0, 0)); + else if ( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSHDUP_XMM_to_XMM(dstreg, srcreg); + else { if( dstreg != srcreg ) { SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg); } SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0x55); } + break; + case 2: SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); break; + case 3: if ( cpucaps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(3, 0, 0)); + else if ( cpucaps.hasStreamingSIMD3Extensions && dstreg != 
srcreg ) { SSE3_MOVSHDUP_XMM_to_XMM(dstreg, srcreg); SSE_MOVHLPS_XMM_to_XMM(dstreg, dstreg); } + else { if( dstreg != srcreg ) { SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg); } SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0xff); } + break; } } @@ -765,65 +756,168 @@ void _vuMoveSS(VURegs * VU, int dstreg, int srcreg) } // 1 - src, 0 - dest wzyx -void VU_MERGE0(int dest, int src) { // 0000 +void VU_MERGE0(int dest, int src) { // 0000s } void VU_MERGE1(int dest, int src) { // 1000 SSE_MOVHLPS_XMM_to_XMM(src, dest); SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4); } +void VU_MERGE1b(int dest, int src) { // 1000s + SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); +} void VU_MERGE2(int dest, int src) { // 0100 SSE_MOVHLPS_XMM_to_XMM(src, dest); SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64); } -void VU_MERGE3(int dest, int src) { // 1100 +void VU_MERGE2b(int dest, int src) { // 0100s + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); +} +void VU_MERGE3(int dest, int src) { // 1100s SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); } -void VU_MERGE4(int dest, int src) { // 0010s +void VU_MERGE4(int dest, int src) { // 0010 SSE_MOVSS_XMM_to_XMM(src, dest); SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4); SSE_MOVAPS_XMM_to_XMM(dest, src); } +void VU_MERGE4b(int dest, int src) { // 0010s + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); +} void VU_MERGE5(int dest, int src) { // 1010 SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8); SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd8); } +void VU_MERGE5b(int dest, int src) { // 1010s + SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); +} void VU_MERGE6(int dest, int src) { // 0110 SSE_SHUFPS_XMM_to_XMM(dest, src, 0x9c); SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x78); } -void VU_MERGE7(int dest, int src) { // 1110s +void VU_MERGE6b(int dest, int src) { // 0110s + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); +} +void VU_MERGE7(int dest, int src) { // 1110 SSE_MOVSS_XMM_to_XMM(src, dest); SSE_MOVAPS_XMM_to_XMM(dest, src); } -void VU_MERGE8(int dest, int src) { // 0001 +void VU_MERGE7b(int dest, int src) { // 1110s + SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); +} +void VU_MERGE8(int dest, int src) { // 0001s SSE_MOVSS_XMM_to_XMM(dest, src); } void VU_MERGE9(int dest, int src) { // 1001 
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc9); SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd2); } +void VU_MERGE9b(int dest, int src) { // 1001s + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); +} void VU_MERGE10(int dest, int src) { // 0101 SSE_SHUFPS_XMM_to_XMM(dest, src, 0x8d); SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x72); } -void VU_MERGE11(int dest, int src) { // 1101 +void VU_MERGE10b(int dest, int src) { // 0101s + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); +} +void VU_MERGE11(int dest, int src) { // 1101s SSE_MOVSS_XMM_to_XMM(dest, src); SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); } -void VU_MERGE12(int dest, int src) { // 0011s +void VU_MERGE12(int dest, int src) { // 0011 SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4); SSE_MOVAPS_XMM_to_XMM(dest, src); } -void VU_MERGE13(int dest, int src) { // 1011s +void VU_MERGE12b(int dest, int src) { // 0011 + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); +} +void VU_MERGE13(int dest, int src) { // 1011 SSE_MOVHLPS_XMM_to_XMM(dest, src); SSE_SHUFPS_XMM_to_XMM(src, dest, 0x64); SSE_MOVAPS_XMM_to_XMM(dest, src); } -void VU_MERGE14(int dest, int src) { // 0111s +void VU_MERGE13b(int dest, int src) { // 1011s + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); +} +void VU_MERGE14(int dest, int src) { // 0111 SSE_MOVHLPS_XMM_to_XMM(dest, src); SSE_SHUFPS_XMM_to_XMM(src, dest, 0xc4); SSE_MOVAPS_XMM_to_XMM(dest, src); } +void VU_MERGE14b(int dest, int src) { // 0111s + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); +} void VU_MERGE15(int dest, int src) { // 1111s SSE_MOVAPS_XMM_to_XMM(dest, src); } @@ -836,18 +930,34 @@ static VUMERGEFN s_VuMerge[16] = { VU_MERGE8, VU_MERGE9, VU_MERGE10, VU_MERGE11, VU_MERGE12, VU_MERGE13, VU_MERGE14, VU_MERGE15 }; -void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw) -{ - xyzw &= 0xf; +static VUMERGEFN s_VuMerge2[16] = { + VU_MERGE0, VU_MERGE1b, VU_MERGE2b, VU_MERGE3, + VU_MERGE4b, VU_MERGE5b, VU_MERGE6b, VU_MERGE7b, + VU_MERGE8, VU_MERGE9b, VU_MERGE10b, VU_MERGE11, + VU_MERGE12b, VU_MERGE13b, VU_MERGE14b, VU_MERGE15 }; - if(dest != src && xyzw != 0) { - if(cpucaps.hasStreamingSIMD4Extensions) { +// Modifies the Source Reg! 
+void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw) { + xyzw &= 0xf; + if ( (dest != src) && (xyzw != 0) ) { + if ( cpucaps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) { xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3); SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw); } else s_VuMerge[xyzw](dest, src); } } +// Doesn't Modify the Source Reg! (ToDo: s_VuMerge2() has room for optimization) +void VU_MERGE_REGS_SAFE(int dest, int src, int xyzw) { + xyzw &= 0xf; + if ( (dest != src) && (xyzw != 0) ) { + if ( cpucaps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) { + xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3); + SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw); + } + else s_VuMerge2[xyzw](dest, src); + } +} //------------------------------------------------------------------ diff --git a/pcsx2/x86/iVUmicro.h b/pcsx2/x86/iVUmicro.h index 1cfc04b7fc..6c979bc91a 100644 --- a/pcsx2/x86/iVUmicro.h +++ b/pcsx2/x86/iVUmicro.h @@ -112,6 +112,7 @@ void _vuMoveSS(VURegs * VU, int dstreg, int srcreg); void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw); void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw); void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw); +void VU_MERGE_REGS_SAFE(int dest, int src, int xyzw); #define VU_MERGE_REGS(dest, src) { \ VU_MERGE_REGS_CUSTOM(dest, src, _X_Y_Z_W); \ } diff --git a/pcsx2/x86/iVUmicroLower.cpp b/pcsx2/x86/iVUmicroLower.cpp index 2f14c15820..049fded427 100644 --- a/pcsx2/x86/iVUmicroLower.cpp +++ b/pcsx2/x86/iVUmicroLower.cpp @@ -835,11 +835,10 @@ void recVUMI_LQI(VURegs *VU, int info) //------------------------------------------------------------------ -// _saveEAX() +// _saveEAX() ToDo: Needs Checking/Fixing! (cottonvibes) //------------------------------------------------------------------ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info) { - int t1reg; assert( offset < 0x80000000 ); if( _Fs_ == 0 ) { @@ -996,90 +995,51 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info) } } break; - default: - //SysPrintf("SAVEEAX Default %d\n", _X_Y_Z_W); + default: // ToDo: Needs checking! 
(cottonvibes) + SysPrintf("SAVEEAX Default %d\n", _X_Y_Z_W); // EEREC_D is a temp reg // find the first nonwrite reg +/* t1reg = _vuGetTempXMMreg(info); - if( t1reg < 0 ) { for(t1reg = 0; t1reg < XMMREGS; ++t1reg) { if( xmmregs[t1reg].inuse && !(xmmregs[t1reg].mode&MODE_WRITE) ) break; } - if( t1reg == XMMREGS ) t1reg = -1; - else { - if( t1reg != EEREC_S ) _allocTempXMMreg(XMMT_FPS, t1reg); - } + else if( t1reg != EEREC_S ) _allocTempXMMreg(XMMT_FPS, t1reg); } +*/ + // do it with one reg + //SSE_MOVAPS_XMM_to_M128((uptr)&VU->VF[_Fs_], EEREC_S); - if( t1reg >= 0 ) { - // found a temp reg - if( VU == &VU1 ) { - if( x86reg >= 0 ) SSE_MOVAPSRmtoROffset(EEREC_TEMP, x86reg, offset); - else SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, offset); - } - else { - if( x86reg >= 0 ) SSE_MOVUPSRmtoROffset(EEREC_TEMP, x86reg, offset); - else { - if( offset & 15 ) SSE_MOVUPS_M128_to_XMM(EEREC_TEMP, offset); - else SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, offset); - } - } - - if( t1reg != EEREC_S ) SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S); - - VU_MERGE_REGS(EEREC_TEMP, t1reg); - - if( VU == &VU1 ) { - if( x86reg >= 0 ) SSE_MOVAPSRtoRmOffset(x86reg, EEREC_TEMP, offset); - else SSE_MOVAPS_XMM_to_M128(offset, EEREC_TEMP); - } - else { - if( x86reg >= 0 ) SSE_MOVUPSRtoRmOffset(x86reg, EEREC_TEMP, offset); - else SSE_MOVUPS_XMM_to_M128(offset, EEREC_TEMP); - } - - if( t1reg != EEREC_S ) _freeXMMreg(t1reg); - else { - // read back the data - SSE_MOVAPS_M128_to_XMM(EEREC_S, (uptr)&VU->VF[_Fs_]); - } + if( VU == &VU1 ) { + if( x86reg >= 0 ) SSE_MOVAPSRmtoROffset(EEREC_TEMP, x86reg, offset); + else SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, offset); } else { - // do it with one reg - SSE_MOVAPS_XMM_to_M128((uptr)&VU->VF[_Fs_], EEREC_S); - - if( VU == &VU1 ) { - if( x86reg >= 0 ) SSE_MOVAPSRmtoROffset(EEREC_TEMP, x86reg, offset); + if( x86reg >= 0 ) SSE_MOVUPSRmtoROffset(EEREC_TEMP, x86reg, offset); + else { + if( offset & 15 ) SSE_MOVUPS_M128_to_XMM(EEREC_TEMP, offset); else SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, offset); } - else { - if( x86reg >= 0 ) SSE_MOVUPSRmtoROffset(EEREC_TEMP, x86reg, offset); - else { - if( offset & 15 ) SSE_MOVUPS_M128_to_XMM(EEREC_TEMP, offset); - else SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, offset); - } - } - - VU_MERGE_REGS(EEREC_TEMP, EEREC_S); - - if( VU == &VU1 ) { - if( x86reg >= 0 ) SSE_MOVAPSRtoRmOffset(x86reg, EEREC_TEMP, offset); - else SSE_MOVAPS_XMM_to_M128(offset, EEREC_TEMP); - } - else { - if( x86reg >= 0 ) SSE_MOVUPSRtoRmOffset(x86reg, EEREC_TEMP, offset); - else { - if( offset & 15 ) SSE_MOVUPS_XMM_to_M128(offset, EEREC_TEMP); - else SSE_MOVAPS_XMM_to_M128(offset, EEREC_TEMP); - } - } - - // read back the data - SSE_MOVAPS_M128_to_XMM(EEREC_S, (uptr)&VU->VF[_Fs_]); } + VU_MERGE_REGS_SAFE(EEREC_TEMP, EEREC_S, _X_Y_Z_W); + + if( VU == &VU1 ) { + if( x86reg >= 0 ) SSE_MOVAPSRtoRmOffset(x86reg, EEREC_TEMP, offset); + else SSE_MOVAPS_XMM_to_M128(offset, EEREC_TEMP); + } + else { + if( x86reg >= 0 ) SSE_MOVUPSRtoRmOffset(x86reg, EEREC_TEMP, offset); + else { + if( offset & 15 ) SSE_MOVUPS_XMM_to_M128(offset, EEREC_TEMP); + else SSE_MOVAPS_XMM_to_M128(offset, EEREC_TEMP); + } + } + + // read back the data + //SSE_MOVAPS_M128_to_XMM(EEREC_S, (uptr)&VU->VF[_Fs_]); break; } } @@ -1094,9 +1054,7 @@ void recVUMI_SQ(VURegs *VU, int info) s16 imm; imm = ( VU->code & 0x400) ? 
( VU->code & 0x3ff) | 0xfc00 : ( VU->code & 0x3ff); - if ( _Ft_ == 0 ) { - _saveEAX(VU, -1, (uptr)GET_VU_MEM(VU, (int)imm * 16), info); - } + if ( _Ft_ == 0 ) _saveEAX(VU, -1, (uptr)GET_VU_MEM(VU, (int)imm * 16), info); else { int ftreg = ALLOCVI(_Ft_, MODE_READ); _saveEAX(VU, recVUTransformAddr(ftreg, VU, _Ft_, imm), (uptr)VU->Mem, info); @@ -1110,9 +1068,8 @@ void recVUMI_SQ(VURegs *VU, int info) //------------------------------------------------------------------ void recVUMI_SQD(VURegs *VU, int info) { - if (_Ft_ == 0) { - _saveEAX(VU, -1, (uptr)VU->Mem, info); - } else { + if (_Ft_ == 0) _saveEAX(VU, -1, (uptr)VU->Mem, info); + else { int ftreg = ALLOCVI(_Ft_, MODE_READ|MODE_WRITE); SUB16ItoR( ftreg, 1 ); _saveEAX(VU, recVUTransformAddr(ftreg, VU, _Ft_, 0), (uptr)VU->Mem, info); @@ -1126,9 +1083,8 @@ void recVUMI_SQD(VURegs *VU, int info) //------------------------------------------------------------------ void recVUMI_SQI(VURegs *VU, int info) { - if (_Ft_ == 0) { - _saveEAX(VU, -1, (uptr)VU->Mem, info); - } else { + if (_Ft_ == 0) _saveEAX(VU, -1, (uptr)VU->Mem, info); + else { int ftreg = ALLOCVI(_Ft_, MODE_READ|MODE_WRITE); _saveEAX(VU, recVUTransformAddr(ftreg, VU, _Ft_, 0), (uptr)VU->Mem, info); diff --git a/pcsx2/x86/iVUmicroUpper.cpp b/pcsx2/x86/iVUmicroUpper.cpp index 108ba15b1a..61d2aeac20 100644 --- a/pcsx2/x86/iVUmicroUpper.cpp +++ b/pcsx2/x86/iVUmicroUpper.cpp @@ -103,22 +103,22 @@ static const PCSX2_ALIGNED16(int SSEmovMask[ 16 ][ 4 ]) = static const PCSX2_ALIGNED16(u32 const_abs_table[16][4]) = { - { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, - { 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff }, - { 0xffffffff, 0xffffffff, 0x7fffffff, 0xffffffff }, - { 0xffffffff, 0xffffffff, 0x7fffffff, 0x7fffffff }, - { 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff }, - { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }, - { 0xffffffff, 0x7fffffff, 0x7fffffff, 0xffffffff }, - { 0xffffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, - { 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff }, - { 0x7fffffff, 0xffffffff, 0xffffffff, 0x7fffffff }, - { 0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff }, - { 0x7fffffff, 0xffffffff, 0x7fffffff, 0x7fffffff }, - { 0x7fffffff, 0x7fffffff, 0xffffffff, 0xffffffff }, - { 0x7fffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }, - { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0xffffffff }, - { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, + { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //0000 + { 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff }, //0001 + { 0xffffffff, 0xffffffff, 0x7fffffff, 0xffffffff }, //0010 + { 0xffffffff, 0xffffffff, 0x7fffffff, 0x7fffffff }, //0011 + { 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff }, //0100 + { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }, //0101 + { 0xffffffff, 0x7fffffff, 0x7fffffff, 0xffffffff }, //0110 + { 0xffffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //0111 + { 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000 + { 0x7fffffff, 0xffffffff, 0xffffffff, 0x7fffffff }, //1001 + { 0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff }, //1010 + { 0x7fffffff, 0xffffffff, 0x7fffffff, 0x7fffffff }, //1011 + { 0x7fffffff, 0x7fffffff, 0xffffffff, 0xffffffff }, //1100 + { 0x7fffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }, //1101 + { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0xffffffff }, //1110 + { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1111 }; static const PCSX2_ALIGNED16(float recMult_float_to_int4[4]) = { 16.0, 16.0, 16.0, 16.0 }; @@ -236,7 +236,7 @@ void recUpdateFlags(VURegs * VU, int reg, int info) 
//-------------------------Optional Code: Denormals Are Zero------------------------------ if (CHECK_VU_UNDERFLOW) { // Sets underflow/denormals to zero SSE_ANDNPS_XMM_to_XMM(t1reg, reg); // t1reg = !t1reg & reg (t1reg = denormals are positive zero) - VU_MERGE_REGS_CUSTOM(t1reg, reg, (15 - flipMask[_X_Y_Z_W])); // Send t1reg the vectors that shouldn't be modified (since reg was flipped, we need a mask to get the unmodified vectors) + VU_MERGE_REGS_SAFE(t1reg, reg, (15 - flipMask[_X_Y_Z_W])); // Send t1reg the vectors that shouldn't be modified (since reg was flipped, we need a mask to get the unmodified vectors) // Now we have Denormals are Positive Zero in t1reg; the next two lines take Signed Zero into account SSE_ANDPS_M128_to_XMM(reg, (uptr)&VU_Signed_Zero_Mask[ 0 ]); // Only keep the sign bit for each vector SSE_ORPS_XMM_to_XMM(reg, t1reg); // Denormals are Signed Zero, and unmodified vectors stay the same! @@ -305,40 +305,45 @@ void recUpdateFlags(VURegs * VU, int reg, int info) //------------------------------------------------------------------ // *VU Upper Instructions!* +// +// Note: * = Checked for errors by cottonvibes //------------------------------------------------------------------ //------------------------------------------------------------------ -// ABS +// ABS* //------------------------------------------------------------------ -void recVUMI_ABS(VURegs *VU, int info) +void recVUMI_ABS(VURegs *VU, int info) { - if ( _Ft_ == 0 ) return; + //SysPrintf("recVUMI_ABS()\n"); + if ( (_Ft_ == 0) || (_X_Y_Z_W == 0) ) return; - if (_X_Y_Z_W != 0xf) { // here we use a temp reg because not all xyzw are being modified + if ((_X_Y_Z_W == 0x8) || (_X_Y_Z_W == 0xf)) { + VU_MERGE_REGS(EEREC_T, EEREC_S); + SSE_ANDPS_M128_to_XMM(EEREC_T, (uptr)&const_abs_table[ _X_Y_Z_W ][ 0 ] ); + } + else { // Use a temp reg because VU_MERGE_REGS() modifies source reg! 
SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_abs_table[ _X_Y_Z_W ][ 0 ] ); - VU_MERGE_REGS(EEREC_T, EEREC_TEMP); - } - else { // all xyzw are being modified, so no need to use temp reg - if( EEREC_T != EEREC_S ) SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_S); - SSE_ANDPS_M128_to_XMM(EEREC_T, (uptr)&const_abs_table[ _X_Y_Z_W ][ 0 ] ); } } //------------------------------------------------------------------ //------------------------------------------------------------------ -// ADD +// ADD*, ADD_iq, ADD_xyzw* //------------------------------------------------------------------ PCSX2_ALIGNED16(float s_two[4]) = {0,0,0,2}; void recVUMI_ADD(VURegs *VU, int info) { - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); + //SysPrintf("recVUMI_ADD()\n"); + if ( _X_Y_Z_W == 0 ) goto flagUpdate; + if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); - if( _Fs_ == 0 && _Ft_ == 0 ) { // if adding VF00 with VF00, then the result is always 0,0,0,2 - if( _X_Y_Z_W != 0xf ) { + if ( _Fs_ == 0 && _Ft_ == 0 ) { // if adding VF00 with VF00, then the result is always 0,0,0,2 + if ( _X_Y_Z_W == 0x8 ) SSE_MOVSS_M32_to_XMM(EEREC_D, (uptr)s_two); + else if ( _X_Y_Z_W != 0xf ) { SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)s_two); VU_MERGE_REGS(EEREC_D, EEREC_TEMP); } @@ -346,8 +351,8 @@ void recVUMI_ADD(VURegs *VU, int info) } else { if (CHECK_VU_EXTRA_OVERFLOW) { - vuFloat( info, EEREC_S, _X_Y_Z_W); - vuFloat( info, EEREC_T, _X_Y_Z_W); + vuFloat5( EEREC_S, EEREC_TEMP, _X_Y_Z_W); + vuFloat5( EEREC_T, EEREC_TEMP, _X_Y_Z_W); } if( _X_Y_Z_W == 8 ) { // If only adding x, then we can do a Scalar Add if (EEREC_D == EEREC_S) SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T); @@ -360,7 +365,6 @@ void recVUMI_ADD(VURegs *VU, int info) else if (_X_Y_Z_W != 0xf) { // If xyzw != 1111, then we have to use a temp reg SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); } else { // All xyzw being modified (xyzw == 1111) @@ -372,7 +376,7 @@ void recVUMI_ADD(VURegs *VU, int info) } } } - +flagUpdate: recUpdateFlags(VU, EEREC_D, info); } @@ -433,30 +437,31 @@ void recVUMI_ADD_iq(VURegs *VU, uptr addr, int info) } } } + recUpdateFlags(VU, EEREC_D, info); } void recVUMI_ADD_xyzw(VURegs *VU, int xyzw, int info) { + //SysPrintf("recVUMI_ADD_xyzw()\n"); + if ( _X_Y_Z_W == 0 ) goto flagUpdate; + if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); if (CHECK_VU_EXTRA_OVERFLOW) { - vuFloat5( EEREC_S, EEREC_TEMP, _X_Y_Z_W); - vuFloat5( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); + if (_Fs_) vuFloat5( EEREC_S, EEREC_TEMP, _X_Y_Z_W); + if (_Ft_) vuFloat5( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); } - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - if( _Ft_ == 0 && xyzw < 3 ) { - // just move - if( _X_Y_Z_W != 0xf ) { + if ( _Ft_ == 0 && xyzw < 3 ) { // just move since adding zero + if ( _X_Y_Z_W == 0x8 ) { VU_MERGE_REGS(EEREC_D, EEREC_S); } + else if ( _X_Y_Z_W != 0xf ) { SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); VU_MERGE_REGS(EEREC_D, EEREC_TEMP); } - else if( EEREC_D != EEREC_S ) SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); + else if ( EEREC_D != EEREC_S ) SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); } - else if( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP) ) { - if( xyzw == 0 ) { - if( EEREC_D == EEREC_T ) { - SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_S); - } + else if ( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP) ) { + if ( xyzw == 0 ) { + if ( EEREC_D == EEREC_T ) SSE_ADDSS_XMM_to_XMM(EEREC_D, 
EEREC_S); else { if( EEREC_D != EEREC_S ) SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T); @@ -464,38 +469,27 @@ void recVUMI_ADD_xyzw(VURegs *VU, int xyzw, int info) } else { _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); - if( EEREC_D != EEREC_S ) SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + if ( EEREC_D != EEREC_S ) SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); } } - else if( _Fs_ == 0 && !_W ) { // ToDo: Check this! (cottonvibes) - // just move + else if( _Fs_ == 0 && !_W ) { // just move _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - -// SSE_MOVAPS_XMM_to_M128((u32)s_tempmem, EEREC_TEMP); -// SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (u32)const_clip); -// SSE_CMPNLTPS_M128_to_XMM(EEREC_TEMP, (u32)s_FloatMinMax); -// SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (u32)s_tempmem); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); } else { - if( _X_Y_Z_W != 0xf || EEREC_D == EEREC_S || EEREC_D == EEREC_TEMP) + if ( _X_Y_Z_W != 0xf ) { _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - - if (_X_Y_Z_W != 0xf) { SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); VU_MERGE_REGS(EEREC_D, EEREC_TEMP); } else { - if( EEREC_D == EEREC_TEMP ) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); - else if( EEREC_D == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - else { - _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw); - SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); - } + if( EEREC_D == EEREC_TEMP ) { _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); } + else if( EEREC_D == EEREC_S ) { _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); } + else { _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); } } } +flagUpdate: recUpdateFlags(VU, EEREC_D, info); } @@ -2502,7 +2496,6 @@ void recVUMI_ITOF0( VURegs *VU, int info ) if (_X_Y_Z_W != 0xf) { SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - vuFloat( info, EEREC_TEMP, 15); // Clamp infinities VU_MERGE_REGS(EEREC_T, EEREC_TEMP); xmmregs[EEREC_T].mode |= MODE_WRITE; @@ -2519,12 +2512,12 @@ void recVUMI_ITOFX(VURegs *VU, int addr, int info) if (_X_Y_Z_W != 0xf) { SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_MULPS_M128_to_XMM(EEREC_TEMP, addr); vuFloat( info, EEREC_TEMP, 15); // Clamp infinities VU_MERGE_REGS(EEREC_T, EEREC_TEMP); xmmregs[EEREC_T].mode |= MODE_WRITE; - } else { + } + else { SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_T, EEREC_S); SSE_MULPS_M128_to_XMM(EEREC_T, addr); vuFloat2(EEREC_T, EEREC_TEMP, 15); // Clamp infinities diff --git a/pcsx2/x86/iVUops.h b/pcsx2/x86/iVUops.h index 0285f1acac..4c9dde53dd 100644 --- a/pcsx2/x86/iVUops.h +++ b/pcsx2/x86/iVUops.h @@ -27,6 +27,19 @@ CALLFunc((uptr)VU##MI_##f); \ } +#define REC_VUOPs(VU, f) { \ + _freeXMMregs(); \ + X86_32CODE(_freeMMXregs(); SetFPUstate();) \ + if (VU==&VU1) { \ + MOV32ItoM((uptr)&VU1.code, (u32)VU1.code); \ + CALLFunc((uptr)VU1MI_##f); \ + } \ + else { \ + MOV32ItoM((uptr)&VU0.code, (u32)VU0.code); \ + CALLFunc((uptr)VU0MI_##f); \ + } \ +} + #define REC_VUOPFLAGS(VU, f) { \ _freeXMMregs(/*&VU*/); \ X86_32CODE(_freeMMXregs(); SetFPUstate();) \
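
Implementation notes (reference sketches, not part of the diff above; the recompiler emits these instructions at runtime through the SSE*_XMM_to_XMM macros, so the standalone C++ functions below are only illustrative and their names are invented for the example):

1) _unpackVFSS_xyzw moves a single VU component into lane 0 of an XMM reg, picking the cheapest instruction the host CPU offers. Equivalent intrinsics for the y component (lane 1), mirroring the SSE4 / SSE3 / plain-SSE paths (this one include covers all the snippets below):

#include <smmintrin.h>  // SSE4.1; also pulls in the SSE/SSE2/SSE3 headers

// Illustrative helpers, not part of PCSX2.
static __m128 get_y_sse4(__m128 dst, __m128 v)
{
    // INSERTPS: read field 1 of v, write field 0 of dst, zero nothing
    return _mm_insert_ps(dst, v, _MM_MK_INSERTPS_NDX(1, 0, 0));
}
static __m128 get_y_sse3(__m128 v)
{
    return _mm_movehdup_ps(v);         // {y, y, w, w} -- y lands in lane 0
}
static __m128 get_y_sse(__m128 v)
{
    return _mm_shuffle_ps(v, v, 0x55); // broadcast lane 1 to all lanes
}

(The z case stays on MOVHLPS -- _mm_movehl_ps(v, v) puts lane 2 into lane 0 -- on every path, since a single move can't be beaten there.)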
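
2) The SSE4 fast path shared by VU_MERGE_REGS_CUSTOM and VU_MERGE_REGS_SAFE uses BLENDPS, which numbers its mask bits by XMM lane (x = lane 0 = bit 0), while the VU dest field _X_Y_Z_W carries x in bit 3 -- hence the bit reversal before the blend. BLENDPS also needs a compile-time immediate, which the recompiler bakes into the emitted instruction; the fixed x|z example below stands in for that. Masks 0x8 and 0xf stay on the table path, where they reduce to a single MOVSS or MOVAPS:

// Illustrative helpers, not part of PCSX2.
static int vu_mask_to_blend(int xyzw) // _X_Y_Z_W (x = bit 3) -> BLENDPS mask
{
    return ((xyzw & 1) << 3) | ((xyzw & 2) << 1)
         | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
}
static __m128 merge_xz(__m128 dest, __m128 src)
{
    // _X_Y_Z_W == 0xA -> vu_mask_to_blend() == 0x5 -> lanes 0 (x) and 2 (z)
    return _mm_blend_ps(dest, src, 0x5);
}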
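
3) The new VU_MERGEnb variants behind VU_MERGE_REGS_SAFE leave the source register untouched: each wanted lane is swapped into position 0 of both registers with a self-inverse SHUFPS (0x27 swaps lanes 0/3, 0xE1 swaps lanes 0/1, 0xC6 swaps lanes 0/2), copied across with MOVSS, then swapped back. With value-semantics intrinsics the "source survives" part is automatic, so the sketch only shows the arithmetic of the emitted VU_MERGE1b (w-only) sequence:

// Illustrative helper, not part of PCSX2.
static __m128 merge_w(__m128 dest, __m128 src)
{
    __m128 s = _mm_shuffle_ps(src,  src,  0x27); // src's w now in lane 0
    __m128 d = _mm_shuffle_ps(dest, dest, 0x27); // dest's w now in lane 0
    d = _mm_move_ss(d, s);                       // copy lane 0 (the w value)
    return _mm_shuffle_ps(d, d, 0x27);           // restore lane order
}

This is what lets the default case of _saveEAX() merge EEREC_S into EEREC_TEMP directly and drop both the old temp-register hunt and the "read back the data" reload afterwards.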
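
4) recVUMI_ADD's overflow path now clamps through vuFloat5() with EEREC_TEMP as scratch. Conceptually (an assumption about intent here, not the helper's exact per-component behavior), the clamp saturates host floats into the finite range, since the PS2 VUs never produce INF/NaN:

#include <cfloat>  // FLT_MAX (SSE header already included above)

// Illustrative helper, not part of PCSX2.
static __m128 vu_clamp(__m128 v)
{
    v = _mm_min_ps(v, _mm_set1_ps( FLT_MAX)); // pull +INF down to +FMAX
    v = _mm_max_ps(v, _mm_set1_ps(-FLT_MAX)); // pull -INF up to -FMAX
    return v;
}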