- More regalloc work/fixes
- Implemented some untested SSE4.1 optimizations (untested because I don't have an SSE4.1-capable CPU)

pcsx2:
- Added an SSE4 instruction to the legacy emitter (just a wrapper to the new emitter function).

Note: Currently the tri-Ace fix and the logical min/max code (the thing that made DaZ safe to use) are broken with mVU. Will fix later.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1547 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
cottonvibes 2009-07-21 06:36:56 +00:00
parent fb0a990605
commit 98c1855916
5 changed files with 62 additions and 50 deletions

View File

@ -1349,6 +1349,7 @@ extern void SSE4_DPPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8);
extern void SSE4_DPPS_M128_to_XMM(x86SSERegType to, uptr from, u8 imm8);
extern void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8);
extern void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8);
extern void SSE4_EXTRACTPS_XMM_to_M32(uptr to, x86SSERegType from, u8 imm8);
extern void SSE4_BLENDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8);
extern void SSE4_BLENDVPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
extern void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from);

View File

@ -364,6 +364,7 @@ emitterT void SSE4_PINSRD_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 im
// Legacy-emitter wrappers: each one simply forwards to the corresponding
// new-style emitter routine (xINSERTPS / xEXTRACTPS / xDP.PS), converting the
// raw register indices (x86SSERegType / x86IntRegType) and uptr addresses into
// the typed operands (xRegisterSSE / xRegister32 / pointer) the new emitter expects.
// NOTE(review): these emit SSE4.1 instructions — callers are presumably expected
// to have checked cpucaps for SSE4.1 support first; confirm at call sites.
emitterT void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8) { xINSERTPS( xRegisterSSE(to), xRegisterSSE(from), imm8 ); }
emitterT void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8) { xEXTRACTPS( xRegister32(to), xRegisterSSE(from), imm8 ); }
// New in this commit: EXTRACTPS with a 32-bit memory destination (imm8 selects the source element).
emitterT void SSE4_EXTRACTPS_XMM_to_M32(uptr to, x86SSERegType from, u8 imm8) { xEXTRACTPS( (u32*)to, xRegisterSSE(from), imm8 ); }
emitterT void SSE4_DPPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8) { xDP.PS( xRegisterSSE(to), xRegisterSSE(from), imm8 ); }
emitterT void SSE4_DPPS_M128_to_XMM(x86SSERegType to, uptr from, u8 imm8) { xDP.PS( xRegisterSSE(to), (void*)from, imm8 ); }

View File

@ -161,6 +161,7 @@ struct microIR {
// Reg Alloc
//------------------------------------------------------------------
void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW);
void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW);
void mVUloadReg(int reg, uptr offset, int xyzw);
@ -223,7 +224,7 @@ public:
}
void writeBackReg(int reg) {
if ((xmmReg[reg].reg > 0) && xmmReg[reg].xyzw) { // Reg was modified and not Temp or vf0
if (xmmReg[reg].reg == 32) SSE_MOVAPS_XMM_to_M128((uptr)&vuRegs->ACC.UL[0], reg);
if (xmmReg[reg].reg == 32) mVUsaveReg(reg, (uptr)&vuRegs->ACC.UL[0], xmmReg[reg].xyzw, 1);
else mVUsaveReg(reg, (uptr)&vuRegs->VF[xmmReg[reg].reg].UL[0], xmmReg[reg].xyzw, 1);
for (int i = 0; i < xmmTotal; i++) {
if (i == reg) continue;
@ -241,20 +242,26 @@ public:
clearReg(reg); // Clear Reg
}
void clearNeeded(int reg) {
// ToDo: Merge Regs Support
xmmReg[reg].isNeeded = 0;
if (xmmReg[reg].xyzw) { // Reg was modified
if (xmmReg[reg].reg > 0) {
if (xmmReg[reg].xyzw < 0xf) writeBackReg(reg); // Always Write Back Partial Writes
if (xmmReg[reg].reg > 0) {
int mergeRegs = 0;
if (xmmReg[reg].xyzw < 0xf) { mergeRegs = 1; } // Try to merge partial writes
for (int i = 0; i < xmmTotal; i++) { // Invalidate any other read-only regs of same vfReg
if (i == reg) continue;
if (xmmReg[i].reg == xmmReg[reg].reg) {
if (xmmReg[i].xyzw && xmmReg[i].xyzw < 0xf) DevCon::Error("microVU Error: clearNeeded()");
clearReg(i);
}
if (xmmReg[i].xyzw && xmmReg[i].xyzw < 0xf) DevCon::Error("microVU Error: clearNeeded() [%d]", params xmmReg[i].reg);
if (mergeRegs == 1) {
mVUmergeRegs(i, reg, xmmReg[reg].xyzw, 1);
xmmReg[i].xyzw = 0xf;
xmmReg[i].count = counter;
mergeRegs = 2;
}
else clearReg(i);
}
}
if (mergeRegs == 2) clearReg(reg); // Clear Current Reg if Merged
else if (mergeRegs) writeBackReg(reg); // Write Back Partial Writes if couldn't merge
}
else clearReg(reg); // If Reg was temp or vf0, then invalidate itself
}
@ -296,7 +303,8 @@ public:
writeBackReg(x);
if (vfWriteReg >= 0) { // Reg Will Be Modified (allow partial reg loading)
if (vfLoadReg == 32) mVUloadReg(x, (uptr)&vuRegs->ACC.UL[0], xyzw);
if ((vfLoadReg == 0) && !(xyzw & 1)) { SSE2_PXOR_XMM_to_XMM(x, x); }
else if (vfLoadReg == 32) mVUloadReg(x, (uptr)&vuRegs->ACC.UL[0], xyzw);
else if (vfLoadReg >= 0) mVUloadReg(x, (uptr)&vuRegs->VF[vfLoadReg].UL[0], xyzw);
xmmReg[x].reg = vfWriteReg;
xmmReg[x].xyzw = xyzw;

View File

@ -104,10 +104,16 @@ void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW) {
return;*/
switch ( xyzw ) {
case 5: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xe1); //WZXY
case 5: if (cpucaps.hasStreamingSIMD4Extensions) {
SSE4_EXTRACTPS_XMM_to_M32(offset+4, reg, 1);
SSE4_EXTRACTPS_XMM_to_M32(offset+12, reg, 3);
}
else {
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xe1); //WZXY
SSE_MOVSS_XMM_to_M32(offset+4, reg);
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW
SSE_MOVSS_XMM_to_M32(offset+12, reg);
}
break; // YW
case 6: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xc9);
SSE_MOVLPS_XMM_to_M64(offset+4, reg);
@ -203,25 +209,33 @@ void mVUsaveReg2(int reg, int gprReg, u32 offset, int xyzw) {
}
}
// Modifies the Source Reg!
void mVUmergeRegs(int dest, int src, int xyzw) {
// Modifies the Source Reg! (ToDo: Optimize modXYZW = 1 cases)
void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0) {
xyzw &= 0xf;
if ( (dest != src) && (xyzw != 0) ) {
if (cpucaps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf)) {
if (modXYZW) {
if (xyzw == 1) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 3, 0)); return; }
else if (xyzw == 2) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 2, 0)); return; }
else if (xyzw == 4) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 1, 0)); return; }
}
xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw);
}
else {
switch (xyzw) {
case 1: SSE_MOVHLPS_XMM_to_XMM(src, dest);
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4);
case 1: if (modXYZW) mVUunpack_xyzw(src, src, 0);
SSE_MOVHLPS_XMM_to_XMM(src, dest); // src = Sw Sz Dw Dz
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4); // 11 00 01 00
break;
case 2: SSE_MOVHLPS_XMM_to_XMM(src, dest);
case 2: if (modXYZW) mVUunpack_xyzw(src, src, 0);
SSE_MOVHLPS_XMM_to_XMM(src, dest);
SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64);
break;
case 3: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
break;
case 4: SSE_MOVSS_XMM_to_XMM(src, dest);
case 4: if (modXYZW) mVUunpack_xyzw(src, src, 0);
SSE_MOVSS_XMM_to_XMM(src, dest);
SSE2_MOVSD_XMM_to_XMM(dest, src);
break;
case 5: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8);

View File

@ -106,12 +106,21 @@ void mVU_printOP(microVU* mVU, int opCase, char* opName, bool isACC) {
opCase4 { if (isACC) { mVUlogACC(); } else { mVUlogFd(); } mVUlogQ(); }
}
// Sets up Pass1 (analysis-pass) info shared by the Normal (opCase1), BC
// (opCase2), I (opCase3), and Q (opCase4) FMAC forms, replacing the four
// per-opcode copies that each mVU_FMACx function previously carried.
//   isACC        - when true, the destination field passed to the analyzer is
//                  forced to 0 instead of _Fd_ (the op writes ACC, not a VF reg).
//   noFlagUpdate - when true, clears sFLAG.doFlag so no status-flag update is
//                  recorded for this instruction.
// NOTE(review): opCaseN are presumably condition macros selecting on opCase —
// only one of the four analyzer calls should run; confirm against their definition.
void setupPass1(microVU* mVU, int opCase, bool isACC, bool noFlagUpdate) {
opCase1 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); } // Normal: Fd (or ACC), Fs, Ft
opCase2 { mVUanalyzeFMAC3(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); } // Broadcast form uses the FMAC3 analyzer
opCase3 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, 0); }    // I-reg form: no Ft operand
opCase4 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, 0); }    // Q-reg form: no Ft operand
if (noFlagUpdate) { sFLAG.doFlag = 0; }
}
// Sets Up Ft Reg for Normal, BC, I, and Q Cases
void setupFtReg(microVU* mVU, int& Ft, int opCase) {
opCase1 { Ft = mVU->regAlloc->allocReg(_Ft_); }
opCase2 {
if (!_XYZW_SS) {
Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W);
Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf);
mVUunpack_xyzw(Ft, Ft, _bc_);
}
else Ft = mVU->regAlloc->allocReg(_Ft_);
@ -122,13 +131,7 @@ void setupFtReg(microVU* mVU, int& Ft, int opCase) {
// Normal FMAC Opcodes
void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, char* opName) {
pass1 {
opCase1 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); }
opCase2 { mVUanalyzeFMAC3(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); }
opCase3 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, 0); }
opCase4 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, 0); }
if ((opType == 3) || (opType == 4)) { sFLAG.doFlag = 0; }
}
pass1 { setupPass1(mVU, opCase, isACC, ((opType == 3) || (opType == 4))); }
pass2 {
int Fs, Ft, ACC;
mVU->regAlloc->reset(); // Reset for Testing
@ -169,12 +172,7 @@ void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, ch
// MADDA/MSUBA Opcodes
void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, char* opName) {
pass1 {
opCase1 { mVUanalyzeFMAC1(mVU, 0, _Fs_, _Ft_); }
opCase2 { mVUanalyzeFMAC3(mVU, 0, _Fs_, _Ft_); }
opCase3 { mVUanalyzeFMAC1(mVU, 0, _Fs_, 0); }
opCase4 { mVUanalyzeFMAC1(mVU, 0, _Fs_, 0); }
}
pass1 { setupPass1(mVU, opCase, 1, 0); }
pass2 {
int Fs, Ft, ACC;
mVU->regAlloc->reset(); // Reset for Testing
@ -218,12 +216,7 @@ void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, char* opName)
// MADD Opcodes
void mVU_FMACc(microVU* mVU, int recPass, int opCase, char* opName) {
pass1 {
opCase1 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, _Ft_); }
opCase2 { mVUanalyzeFMAC3(mVU, _Fd_, _Fs_, _Ft_); }
opCase3 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, 0); }
opCase4 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, 0); }
}
pass1 { setupPass1(mVU, opCase, 0, 0); }
pass2 {
int Fs, Ft, ACC;
mVU->regAlloc->reset(); // Reset for Testing
@ -255,12 +248,7 @@ void mVU_FMACc(microVU* mVU, int recPass, int opCase, char* opName) {
// MSUB Opcodes
void mVU_FMACd(microVU* mVU, int recPass, int opCase, char* opName) {
pass1 {
opCase1 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, _Ft_); }
opCase2 { mVUanalyzeFMAC3(mVU, _Fd_, _Fs_, _Ft_); }
opCase3 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, 0); }
opCase4 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, 0); }
}
pass1 { setupPass1(mVU, opCase, 0, 0); }
pass2 {
int Fs, Ft, Fd;
mVU->regAlloc->reset(); // Reset for Testing