mirror of https://github.com/PCSX2/pcsx2.git
microVU:
- More regalloc work/fixes - Implemented some untested SSE4.1 optimizations (can't test since don't have sse4.1 cpu) pcsx2: - Added an SSE4 instruction to the legacy emitter (just a wrapper to the new emitter function). Note: Currently tri-ace fix and logical min-max code (thing that mad DaZ safe to use) is broken with mVU. Will fix later. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1547 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
fb0a990605
commit
98c1855916
|
@ -1349,6 +1349,7 @@ extern void SSE4_DPPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8);
|
|||
extern void SSE4_DPPS_M128_to_XMM(x86SSERegType to, uptr from, u8 imm8);
|
||||
extern void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8);
|
||||
extern void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8);
|
||||
extern void SSE4_EXTRACTPS_XMM_to_M32(uptr to, x86SSERegType from, u8 imm8);
|
||||
extern void SSE4_BLENDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8);
|
||||
extern void SSE4_BLENDVPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
|
||||
extern void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from);
|
||||
|
|
|
@ -364,6 +364,7 @@ emitterT void SSE4_PINSRD_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 im
|
|||
|
||||
emitterT void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8) { xINSERTPS( xRegisterSSE(to), xRegisterSSE(from), imm8 ); }
|
||||
emitterT void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8) { xEXTRACTPS( xRegister32(to), xRegisterSSE(from), imm8 ); }
|
||||
emitterT void SSE4_EXTRACTPS_XMM_to_M32(uptr to, x86SSERegType from, u8 imm8) { xEXTRACTPS( (u32*)to, xRegisterSSE(from), imm8 ); }
|
||||
|
||||
emitterT void SSE4_DPPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8) { xDP.PS( xRegisterSSE(to), xRegisterSSE(from), imm8 ); }
|
||||
emitterT void SSE4_DPPS_M128_to_XMM(x86SSERegType to, uptr from, u8 imm8) { xDP.PS( xRegisterSSE(to), (void*)from, imm8 ); }
|
||||
|
|
|
@ -161,6 +161,7 @@ struct microIR {
|
|||
// Reg Alloc
|
||||
//------------------------------------------------------------------
|
||||
|
||||
void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW);
|
||||
void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW);
|
||||
void mVUloadReg(int reg, uptr offset, int xyzw);
|
||||
|
||||
|
@ -223,7 +224,7 @@ public:
|
|||
}
|
||||
void writeBackReg(int reg) {
|
||||
if ((xmmReg[reg].reg > 0) && xmmReg[reg].xyzw) { // Reg was modified and not Temp or vf0
|
||||
if (xmmReg[reg].reg == 32) SSE_MOVAPS_XMM_to_M128((uptr)&vuRegs->ACC.UL[0], reg);
|
||||
if (xmmReg[reg].reg == 32) mVUsaveReg(reg, (uptr)&vuRegs->ACC.UL[0], xmmReg[reg].xyzw, 1);
|
||||
else mVUsaveReg(reg, (uptr)&vuRegs->VF[xmmReg[reg].reg].UL[0], xmmReg[reg].xyzw, 1);
|
||||
for (int i = 0; i < xmmTotal; i++) {
|
||||
if (i == reg) continue;
|
||||
|
@ -241,20 +242,26 @@ public:
|
|||
clearReg(reg); // Clear Reg
|
||||
}
|
||||
void clearNeeded(int reg) {
|
||||
// ToDo: Merge Regs Support
|
||||
xmmReg[reg].isNeeded = 0;
|
||||
if (xmmReg[reg].xyzw) { // Reg was modified
|
||||
if (xmmReg[reg].reg > 0) {
|
||||
if (xmmReg[reg].xyzw < 0xf) writeBackReg(reg); // Always Write Back Partial Writes
|
||||
if (xmmReg[reg].reg > 0) {
|
||||
for (int i = 0; i < xmmTotal; i++) { // Invalidate any other read-only regs of same vfReg
|
||||
if (i == reg) continue;
|
||||
if (xmmReg[i].reg == xmmReg[reg].reg) {
|
||||
if (xmmReg[i].xyzw && xmmReg[i].xyzw < 0xf) DevCon::Error("microVU Error: clearNeeded()");
|
||||
clearReg(i);
|
||||
int mergeRegs = 0;
|
||||
if (xmmReg[reg].xyzw < 0xf) { mergeRegs = 1; } // Try to merge partial writes
|
||||
for (int i = 0; i < xmmTotal; i++) { // Invalidate any other read-only regs of same vfReg
|
||||
if (i == reg) continue;
|
||||
if (xmmReg[i].reg == xmmReg[reg].reg) {
|
||||
if (xmmReg[i].xyzw && xmmReg[i].xyzw < 0xf) DevCon::Error("microVU Error: clearNeeded() [%d]", params xmmReg[i].reg);
|
||||
if (mergeRegs == 1) {
|
||||
mVUmergeRegs(i, reg, xmmReg[reg].xyzw, 1);
|
||||
xmmReg[i].xyzw = 0xf;
|
||||
xmmReg[i].count = counter;
|
||||
mergeRegs = 2;
|
||||
}
|
||||
else clearReg(i);
|
||||
}
|
||||
}
|
||||
if (mergeRegs == 2) clearReg(reg); // Clear Current Reg if Merged
|
||||
else if (mergeRegs) writeBackReg(reg); // Write Back Partial Writes if couldn't merge
|
||||
}
|
||||
else clearReg(reg); // If Reg was temp or vf0, then invalidate itself
|
||||
}
|
||||
|
@ -263,7 +270,7 @@ public:
|
|||
counter++;
|
||||
if (vfLoadReg >= 0) { // Search For Cached Regs
|
||||
for (int i = 0; i < xmmTotal; i++) {
|
||||
if ((xmmReg[i].reg == vfLoadReg) && (!xmmReg[i].xyzw // Reg Was Not Modified
|
||||
if ((xmmReg[i].reg == vfLoadReg) && (!xmmReg[i].xyzw // Reg Was Not Modified
|
||||
|| (/*!xmmReg[i].isNeeded &&*/ xmmReg[i].reg && (xmmReg[i].xyzw==0xf)))) { // Reg Had All Vectors Modified and != VF0
|
||||
int z = i;
|
||||
if (vfWriteReg >= 0) { // Reg will be modified
|
||||
|
@ -296,7 +303,8 @@ public:
|
|||
writeBackReg(x);
|
||||
|
||||
if (vfWriteReg >= 0) { // Reg Will Be Modified (allow partial reg loading)
|
||||
if (vfLoadReg == 32) mVUloadReg(x, (uptr)&vuRegs->ACC.UL[0], xyzw);
|
||||
if ((vfLoadReg == 0) && !(xyzw & 1)) { SSE2_PXOR_XMM_to_XMM(x, x); }
|
||||
else if (vfLoadReg == 32) mVUloadReg(x, (uptr)&vuRegs->ACC.UL[0], xyzw);
|
||||
else if (vfLoadReg >= 0) mVUloadReg(x, (uptr)&vuRegs->VF[vfLoadReg].UL[0], xyzw);
|
||||
xmmReg[x].reg = vfWriteReg;
|
||||
xmmReg[x].xyzw = xyzw;
|
||||
|
|
|
@ -104,10 +104,16 @@ void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW) {
|
|||
return;*/
|
||||
|
||||
switch ( xyzw ) {
|
||||
case 5: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xe1); //WZXY
|
||||
SSE_MOVSS_XMM_to_M32(offset+4, reg);
|
||||
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW
|
||||
SSE_MOVSS_XMM_to_M32(offset+12, reg);
|
||||
case 5: if (cpucaps.hasStreamingSIMD4Extensions) {
|
||||
SSE4_EXTRACTPS_XMM_to_M32(offset+4, reg, 1);
|
||||
SSE4_EXTRACTPS_XMM_to_M32(offset+12, reg, 3);
|
||||
}
|
||||
else {
|
||||
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xe1); //WZXY
|
||||
SSE_MOVSS_XMM_to_M32(offset+4, reg);
|
||||
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW
|
||||
SSE_MOVSS_XMM_to_M32(offset+12, reg);
|
||||
}
|
||||
break; // YW
|
||||
case 6: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xc9);
|
||||
SSE_MOVLPS_XMM_to_M64(offset+4, reg);
|
||||
|
@ -203,25 +209,33 @@ void mVUsaveReg2(int reg, int gprReg, u32 offset, int xyzw) {
|
|||
}
|
||||
}
|
||||
|
||||
// Modifies the Source Reg!
|
||||
void mVUmergeRegs(int dest, int src, int xyzw) {
|
||||
// Modifies the Source Reg! (ToDo: Optimize modXYZW = 1 cases)
|
||||
void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0) {
|
||||
xyzw &= 0xf;
|
||||
if ( (dest != src) && (xyzw != 0) ) {
|
||||
if ( cpucaps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) {
|
||||
if (cpucaps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf)) {
|
||||
if (modXYZW) {
|
||||
if (xyzw == 1) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 3, 0)); return; }
|
||||
else if (xyzw == 2) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 2, 0)); return; }
|
||||
else if (xyzw == 4) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 1, 0)); return; }
|
||||
}
|
||||
xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
|
||||
SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw);
|
||||
}
|
||||
else {
|
||||
switch (xyzw) {
|
||||
case 1: SSE_MOVHLPS_XMM_to_XMM(src, dest);
|
||||
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4);
|
||||
case 1: if (modXYZW) mVUunpack_xyzw(src, src, 0);
|
||||
SSE_MOVHLPS_XMM_to_XMM(src, dest); // src = Sw Sz Dw Dz
|
||||
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4); // 11 00 01 00
|
||||
break;
|
||||
case 2: SSE_MOVHLPS_XMM_to_XMM(src, dest);
|
||||
case 2: if (modXYZW) mVUunpack_xyzw(src, src, 0);
|
||||
SSE_MOVHLPS_XMM_to_XMM(src, dest);
|
||||
SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64);
|
||||
break;
|
||||
case 3: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
|
||||
break;
|
||||
case 4: SSE_MOVSS_XMM_to_XMM(src, dest);
|
||||
case 4: if (modXYZW) mVUunpack_xyzw(src, src, 0);
|
||||
SSE_MOVSS_XMM_to_XMM(src, dest);
|
||||
SSE2_MOVSD_XMM_to_XMM(dest, src);
|
||||
break;
|
||||
case 5: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8);
|
||||
|
@ -333,8 +347,8 @@ void MIN_MAX_(x86SSERegType to, x86SSERegType from, bool min) {
|
|||
// Warning: Modifies from and to's upper 3 vectors
|
||||
void MIN_MAX_SS(x86SSERegType to, x86SSERegType from, bool min) {
|
||||
SSE_SHUFPS_XMM_to_XMM (to, from, 0);
|
||||
SSE2_PAND_M128_to_XMM (to, (uptr)MIN_MAX_MASK1);
|
||||
SSE2_POR_M128_to_XMM (to, (uptr)MIN_MAX_MASK2);
|
||||
SSE2_PAND_M128_to_XMM (to, (uptr)MIN_MAX_MASK1);
|
||||
SSE2_POR_M128_to_XMM (to, (uptr)MIN_MAX_MASK2);
|
||||
SSE2_PSHUFD_XMM_to_XMM(from, to, 0xee);
|
||||
if (min) SSE2_MINPD_XMM_to_XMM(to, from);
|
||||
else SSE2_MAXPD_XMM_to_XMM(to, from);
|
||||
|
|
|
@ -106,12 +106,21 @@ void mVU_printOP(microVU* mVU, int opCase, char* opName, bool isACC) {
|
|||
opCase4 { if (isACC) { mVUlogACC(); } else { mVUlogFd(); } mVUlogQ(); }
|
||||
}
|
||||
|
||||
// Sets Up Pass1 Info for Normal, BC, I, and Q Cases
|
||||
void setupPass1(microVU* mVU, int opCase, bool isACC, bool noFlagUpdate) {
|
||||
opCase1 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); }
|
||||
opCase2 { mVUanalyzeFMAC3(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); }
|
||||
opCase3 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, 0); }
|
||||
opCase4 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, 0); }
|
||||
if (noFlagUpdate) { sFLAG.doFlag = 0; }
|
||||
}
|
||||
|
||||
// Sets Up Ft Reg for Normal, BC, I, and Q Cases
|
||||
void setupFtReg(microVU* mVU, int& Ft, int opCase) {
|
||||
opCase1 { Ft = mVU->regAlloc->allocReg(_Ft_); }
|
||||
opCase2 {
|
||||
if (!_XYZW_SS) {
|
||||
Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W);
|
||||
Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf);
|
||||
mVUunpack_xyzw(Ft, Ft, _bc_);
|
||||
}
|
||||
else Ft = mVU->regAlloc->allocReg(_Ft_);
|
||||
|
@ -122,13 +131,7 @@ void setupFtReg(microVU* mVU, int& Ft, int opCase) {
|
|||
|
||||
// Normal FMAC Opcodes
|
||||
void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, char* opName) {
|
||||
pass1 {
|
||||
opCase1 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); }
|
||||
opCase2 { mVUanalyzeFMAC3(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); }
|
||||
opCase3 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, 0); }
|
||||
opCase4 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, 0); }
|
||||
if ((opType == 3) || (opType == 4)) { sFLAG.doFlag = 0; }
|
||||
}
|
||||
pass1 { setupPass1(mVU, opCase, isACC, ((opType == 3) || (opType == 4))); }
|
||||
pass2 {
|
||||
int Fs, Ft, ACC;
|
||||
mVU->regAlloc->reset(); // Reset for Testing
|
||||
|
@ -169,12 +172,7 @@ void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, ch
|
|||
|
||||
// MADDA/MSUBA Opcodes
|
||||
void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, char* opName) {
|
||||
pass1 {
|
||||
opCase1 { mVUanalyzeFMAC1(mVU, 0, _Fs_, _Ft_); }
|
||||
opCase2 { mVUanalyzeFMAC3(mVU, 0, _Fs_, _Ft_); }
|
||||
opCase3 { mVUanalyzeFMAC1(mVU, 0, _Fs_, 0); }
|
||||
opCase4 { mVUanalyzeFMAC1(mVU, 0, _Fs_, 0); }
|
||||
}
|
||||
pass1 { setupPass1(mVU, opCase, 1, 0); }
|
||||
pass2 {
|
||||
int Fs, Ft, ACC;
|
||||
mVU->regAlloc->reset(); // Reset for Testing
|
||||
|
@ -218,12 +216,7 @@ void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, char* opName)
|
|||
|
||||
// MADD Opcodes
|
||||
void mVU_FMACc(microVU* mVU, int recPass, int opCase, char* opName) {
|
||||
pass1 {
|
||||
opCase1 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, _Ft_); }
|
||||
opCase2 { mVUanalyzeFMAC3(mVU, _Fd_, _Fs_, _Ft_); }
|
||||
opCase3 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, 0); }
|
||||
opCase4 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, 0); }
|
||||
}
|
||||
pass1 { setupPass1(mVU, opCase, 0, 0); }
|
||||
pass2 {
|
||||
int Fs, Ft, ACC;
|
||||
mVU->regAlloc->reset(); // Reset for Testing
|
||||
|
@ -255,12 +248,7 @@ void mVU_FMACc(microVU* mVU, int recPass, int opCase, char* opName) {
|
|||
|
||||
// MSUB Opcodes
|
||||
void mVU_FMACd(microVU* mVU, int recPass, int opCase, char* opName) {
|
||||
pass1 {
|
||||
opCase1 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, _Ft_); }
|
||||
opCase2 { mVUanalyzeFMAC3(mVU, _Fd_, _Fs_, _Ft_); }
|
||||
opCase3 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, 0); }
|
||||
opCase4 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, 0); }
|
||||
}
|
||||
pass1 { setupPass1(mVU, opCase, 0, 0); }
|
||||
pass2 {
|
||||
int Fs, Ft, Fd;
|
||||
mVU->regAlloc->reset(); // Reset for Testing
|
||||
|
|
Loading…
Reference in New Issue