microVU: more regAlloc work...

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1570 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
cottonvibes 2009-07-26 05:54:30 +00:00
parent e768150a4b
commit c595eab036
8 changed files with 357 additions and 297 deletions

View File

@ -119,7 +119,7 @@ microVUt(void) mVUallocVIb(mV, int GPRreg, int _reg_) {
#define getIreg(reg, modXYZW) { \ #define getIreg(reg, modXYZW) { \
SSE_MOVSS_M32_to_XMM(reg, (uptr)&mVU->regs->VI[REG_I].UL); \ SSE_MOVSS_M32_to_XMM(reg, (uptr)&mVU->regs->VI[REG_I].UL); \
if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2(reg, xmmT2, 8); \ if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2(reg, -1, 8); \
if (!((_XYZW_SS && modXYZW) || (_X_Y_Z_W == 8))) { mVUunpack_xyzw(reg, reg, 0); } \ if (!((_XYZW_SS && modXYZW) || (_X_Y_Z_W == 8))) { mVUunpack_xyzw(reg, reg, 0); } \
} }
@ -137,38 +137,6 @@ microVUt(void) mVUallocVIb(mV, int GPRreg, int _reg_) {
// Lower Instruction Allocator Helpers // Lower Instruction Allocator Helpers
//------------------------------------------------------------------ //------------------------------------------------------------------
// getReg: emits a load of VF[_reg_] into xmm register `reg` through mVUloadReg,
// using the _X_Y_Z_W mask. When CHECK_VU_EXTRA_OVERFLOW is set, the loaded
// value is additionally passed through mVUclamp2 with the same mask.
// NOTE(review): xmmT2 is handed to mVUclamp2 as its second argument, presumably
// as a scratch register that may be clobbered; confirm against mVUclamp2.
#define getReg(reg, _reg_) { \
mVUloadReg(reg, (uptr)&mVU->regs->VF[_reg_].UL[0], _X_Y_Z_W); \
if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2(reg, xmmT2, _X_Y_Z_W); \
}
// getZero: materialises VF[0] in xmm register `reg`.
// If _W is set, VF[0] is loaded from memory with the _X_Y_Z_W mask; otherwise
// `reg` is XORed with itself, which leaves it all-zero without touching memory.
// NOTE(review): this presumably relies on only the W component of VF[0] being
// non-zero; confirm against the VU register definition.
#define getZero(reg) { \
if (_W) { mVUloadReg(reg, (uptr)&mVU->regs->VF[0].UL[0], _X_Y_Z_W); } \
else { SSE_XORPS_XMM_to_XMM(reg, reg); } \
}
// getReg6: clamping load that special-cases register index 0.
// When _reg_ is 0 it defers to getZero(reg); for any other index it defers to
// getReg(reg, _reg_), which loads with the _X_Y_Z_W mask and optionally clamps.
#define getReg6(reg, _reg_) { \
if (!_reg_) { getZero(reg); } \
else { getReg(reg, _reg_); } \
}
// getReg5: emits a load of a single field of VF[_reg_] into xmm register `reg`.
// The field is selected by _fxf_ and turned into a load mask of
// (1 << (3 - _fxf_)), i.e. _fxf_ 0,1,2,3 -> mask 8,4,2,1.
//  - _reg_ == 0 and _fxf_ < 3 : `reg` is zeroed with XORPS, no memory access.
//  - _reg_ == 0 and _fxf_ == 3: VF[0] is loaded with mask 1; no clamp is emitted.
//  - _reg_ != 0               : the field is loaded and, when
//                               CHECK_VU_EXTRA_OVERFLOW is set, clamped through
//                               mVUclamp2 with the same mask.
// NOTE(review): xmmT2 is passed to mVUclamp2 as its second argument, presumably
// as a scratch register; confirm against mVUclamp2.
#define getReg5(reg, _reg_, _fxf_) { \
if (!_reg_) { \
if (_fxf_ < 3) { SSE_XORPS_XMM_to_XMM(reg, reg); } \
else { mVUloadReg(reg, (uptr)&mVU->regs->VF[_reg_].UL[0], 1); } \
} \
else { \
mVUloadReg(reg, (uptr)&mVU->regs->VF[_reg_].UL[0], (1 << (3 - _fxf_))); \
if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2(reg, xmmT2, (1 << (3 - _fxf_))); \
} \
}
// getReg7: non-clamping counterpart of getReg6 (Doesn't Clamp).
// When _reg_ is 0 it defers to getZero(reg); otherwise it emits a plain
// mVUloadReg of VF[_reg_] with the _X_Y_Z_W mask and never calls mVUclamp2,
// regardless of CHECK_VU_EXTRA_OVERFLOW.
#define getReg7(reg, _reg_) { \
if (!_reg_) { getZero(reg); } \
else { mVUloadReg(reg, (uptr)&mVU->regs->VF[_reg_].UL[0], _X_Y_Z_W); } \
}
// VF to GPR // VF to GPR
#define getReg8(GPRreg, _reg_, _fxf_) { \ #define getReg8(GPRreg, _reg_, _fxf_) { \
if (!_reg_ && (_fxf_ < 3)) { XOR32RtoR(GPRreg, GPRreg); } \ if (!_reg_ && (_fxf_ < 3)) { XOR32RtoR(GPRreg, GPRreg); } \

View File

@ -31,30 +31,6 @@
} \ } \
} }
// doBackupVF1: first step of the VF backup sequence used by the doSwapOp macro
// (invoked there before the lower op is emitted).
// Only acts when mVUinfo.backupVF is set and the lower op writes a VF register
// (!mVUlow.noWriteVF). It logs a status message and then emits a copy of
// VF[mVUlow.VF_write.reg] into the mVU->xmmVFb buffer, staged through xmmT1.
#define doBackupVF1() { \
if (mVUinfo.backupVF && !mVUlow.noWriteVF) { \
DevCon::Status("microVU%d: Backing Up VF Reg [%04x]", params getIndex, xPC); \
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&mVU->regs->VF[mVUlow.VF_write.reg].UL[0]); \
SSE_MOVAPS_XMM_to_M128((uptr)mVU->xmmVFb, xmmT1); \
} \
}
// doBackupVF2: second step of the VF backup sequence used by the doSwapOp macro
// (invoked there after the lower op and before the upper op).
// Under the same condition as doBackupVF1, it emits code that exchanges the
// contents of mVU->xmmVFb and VF[mVUlow.VF_write.reg]: both are loaded first
// (into xmmT1 and xmmT2) and then stored to each other's location.
#define doBackupVF2() { \
if (mVUinfo.backupVF && !mVUlow.noWriteVF) { \
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)mVU->xmmVFb); \
SSE_MOVAPS_M128_to_XMM(xmmT2, (uptr)&mVU->regs->VF[mVUlow.VF_write.reg].UL[0]); \
SSE_MOVAPS_XMM_to_M128((uptr)&mVU->regs->VF[mVUlow.VF_write.reg].UL[0], xmmT1); \
SSE_MOVAPS_XMM_to_M128((uptr)mVU->xmmVFb, xmmT2); \
} \
}
// doBackupVF3: final step of the VF backup sequence used by the doSwapOp macro
// (invoked there after the upper op).
// Under the same condition as doBackupVF1, it emits a copy of the mVU->xmmVFb
// buffer back into VF[mVUlow.VF_write.reg], staged through xmmT1.
#define doBackupVF3() { \
if (mVUinfo.backupVF && !mVUlow.noWriteVF) { \
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)mVU->xmmVFb); \
SSE_MOVAPS_XMM_to_M128((uptr)&mVU->regs->VF[mVUlow.VF_write.reg].UL[0], xmmT1); \
} \
}
#define startLoop() { \ #define startLoop() { \
mVUdebug1(); \ mVUdebug1(); \
memset(&mVUinfo, 0, sizeof(mVUinfo)); \ memset(&mVUinfo, 0, sizeof(mVUinfo)); \
@ -68,7 +44,6 @@
#define incQ() { mVU->q = (mVU->q+1) & 1; } #define incQ() { mVU->q = (mVU->q+1) & 1; }
#define doUpperOp() { mVUopU(mVU, 1); mVUdivSet(mVU); } #define doUpperOp() { mVUopU(mVU, 1); mVUdivSet(mVU); }
#define doLowerOp() { incPC(-1); mVUopL(mVU, 1); incPC(1); } #define doLowerOp() { incPC(-1); mVUopL(mVU, 1); incPC(1); }
#define doSwapOp() { doBackupVF1(); mVUopL(mVU, 1); doBackupVF2(); incPC(1); doUpperOp(); doBackupVF3(); }
#define doIbit() { if (mVUup.iBit) { incPC(-1); MOV32ItoM((uptr)&mVU->regs->VI[REG_I].UL, curI); incPC(1); } } #define doIbit() { if (mVUup.iBit) { incPC(-1); MOV32ItoM((uptr)&mVU->regs->VI[REG_I].UL, curI); incPC(1); } }
#define blockCreate(addr) { if (!mVUblocks[addr]) mVUblocks[addr] = new microBlockManager(); } #define blockCreate(addr) { if (!mVUblocks[addr]) mVUblocks[addr] = new microBlockManager(); }
@ -76,6 +51,29 @@
// Helper Functions // Helper Functions
//------------------------------------------------------------------ //------------------------------------------------------------------
// doSwapOp: emits an instruction pair in swapped order (lower op first, then
// upper op), called from mVUcompile when mVUinfo.swapOps is set.
//
// When mVUinfo.backupVF is set and the lower op writes a VF register, the
// register written by the lower op is backed up and swapped around the upper
// op using allocator-managed xmm registers (t1/t2). Otherwise the two ops are
// simply emitted back to back.
//
// NOTE(review): the exact meaning of the allocReg() argument lists (load reg,
// write-back reg, xyzw mask, clone flag?) is not visible here; the per-step
// comments below describe the apparent intent and should be confirmed against
// the regAlloc class.
microVUt(void) doSwapOp(mV) {
if (mVUinfo.backupVF && !mVUlow.noWriteVF) {
DevCon::Status("microVU%d: Backing Up VF Reg [%04x]", params getIndex, xPC);
// Step 1: copy the current value of the VF register the lower op will
// write into t2 (allocReg() with no arguments presumably returns a temp).
int t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg);
int t2 = mVU->regAlloc->allocReg();
SSE_MOVAPS_XMM_to_XMM(t2, t1);
mVU->regAlloc->clearNeeded(t1);
// Step 2: emit the lower op.
mVUopL(mVU, 1);
// Step 3: re-acquire the VF register and exchange it with t2. The three
// XORPS instructions form an XOR swap, so no third register is needed.
// Afterwards t1 holds what t2 held (the saved copy) and vice versa.
t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg, mVUlow.VF_write.reg, 0xf, 0);
SSE_XORPS_XMM_to_XMM(t2, t1);
SSE_XORPS_XMM_to_XMM(t1, t2);
SSE_XORPS_XMM_to_XMM(t2, t1);
mVU->regAlloc->clearNeeded(t1);
// Step 4: advance to the upper op and emit it.
incPC(1);
doUpperOp();
// Step 5: copy t2 into a register allocated for the same VF index, then
// release both registers.
t1 = mVU->regAlloc->allocReg(-1, mVUlow.VF_write.reg, 0xf);
SSE_MOVAPS_XMM_to_XMM(t1, t2);
mVU->regAlloc->clearNeeded(t1);
mVU->regAlloc->clearNeeded(t2);
}
// No backup required: lower op, advance PC, upper op.
else { mVUopL(mVU, 1); incPC(1); doUpperOp(); }
}
// Used by mVUsetupRange // Used by mVUsetupRange
microVUt(void) mVUcheckIsSame(mV) { microVUt(void) mVUcheckIsSame(mV) {
@ -169,14 +167,14 @@ microVUt(void) mVUoptimizePipeState(mV) {
microVUt(void) mVUsetupBranch(mV, int* xStatus, int* xMac, int* xClip, int xCycles) { microVUt(void) mVUsetupBranch(mV, int* xStatus, int* xMac, int* xClip, int xCycles) {
mVUprint("mVUsetupBranch"); mVUprint("mVUsetupBranch");
// Flush Allocated Regs
mVU->regAlloc->flushAll();
// Shuffle Flag Instances // Shuffle Flag Instances
mVUsetupFlags(mVU, xStatus, xMac, xClip, xCycles); mVUsetupFlags(mVU, xStatus, xMac, xClip, xCycles);
// Shuffle P/Q regs since every block starts at instance #0 // Shuffle P/Q regs since every block starts at instance #0
if (mVU->p || mVU->q) { SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, shufflePQ); } if (mVU->p || mVU->q) { SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, shufflePQ); }
// Flush Allocated Regs
mVU->regAlloc->flushAll();
} }
microVUt(void) mVUincCycles(mV, int x) { microVUt(void) mVUincCycles(mV, int x) {
@ -435,7 +433,7 @@ microVUr(void*) mVUcompile(microVU* mVU, u32 startPC, uptr pState) {
if (mVUup.mBit) { OR32ItoM((uptr)&mVU->regs->flags, VUFLAG_MFLAGSET); } if (mVUup.mBit) { OR32ItoM((uptr)&mVU->regs->flags, VUFLAG_MFLAGSET); }
if (mVUlow.isNOP) { incPC(1); doUpperOp(); doIbit(); } if (mVUlow.isNOP) { incPC(1); doUpperOp(); doIbit(); }
else if (!mVUinfo.swapOps) { incPC(1); doUpperOp(); doLowerOp(); } else if (!mVUinfo.swapOps) { incPC(1); doUpperOp(); doLowerOp(); }
else { doSwapOp(); } else { doSwapOp(mVU); }
if (mVUinfo.doXGKICK) { mVU_XGKICK_DELAY(mVU, 1); } if (mVUinfo.doXGKICK) { mVU_XGKICK_DELAY(mVU, 1); }
if (!mVUinfo.isBdelay) { incPC(1); } if (!mVUinfo.isBdelay) { incPC(1); }

View File

@ -68,7 +68,6 @@ void mVUdispatcherA(mV) {
SSE_SHUFPS_XMM_to_XMM (xmmT1, xmmT1, 0); SSE_SHUFPS_XMM_to_XMM (xmmT1, xmmT1, 0);
SSE_MOVAPS_XMM_to_M128((uptr)mVU->clipFlag, xmmT1); SSE_MOVAPS_XMM_to_M128((uptr)mVU->clipFlag, xmmT1);
//SSE_MOVAPS_M128_to_XMM(xmmACC, (uptr)&mVU->regs->ACC.UL[0]);
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&mVU->regs->VI[REG_P].UL); SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&mVU->regs->VI[REG_P].UL);
SSE_MOVAPS_M128_to_XMM(xmmPQ, (uptr)&mVU->regs->VI[REG_Q].UL); SSE_MOVAPS_M128_to_XMM(xmmPQ, (uptr)&mVU->regs->VI[REG_Q].UL);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmT1, 0); // wzyx = PPQQ SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmT1, 0); // wzyx = PPQQ
@ -84,9 +83,6 @@ void mVUdispatcherB(mV) {
// Load EE's MXCSR state // Load EE's MXCSR state
SSE_LDMXCSR((uptr)&g_sseMXCSR); SSE_LDMXCSR((uptr)&g_sseMXCSR);
// Save Regs (Other Regs Saved in mVUcompile)
//SSE_MOVAPS_XMM_to_M128((uptr)&mVU->regs->ACC.UL[0], xmmACC);
// __fastcall = The first two DWORD or smaller arguments are passed in ECX and EDX registers; all other arguments are passed right to left. // __fastcall = The first two DWORD or smaller arguments are passed in ECX and EDX registers; all other arguments are passed right to left.
if (!isVU1) { CALLFunc((uptr)mVUcleanUpVU0); } if (!isVU1) { CALLFunc((uptr)mVUcleanUpVU0); }
else { CALLFunc((uptr)mVUcleanUpVU1); } else { CALLFunc((uptr)mVUcleanUpVU1); }

View File

@ -215,9 +215,9 @@ microVUt(void) mVUsetupFlags(mV, int* xStatus, int* xMac, int* xClip, int cycles
if (__Clip) { if (__Clip) {
int bClip[4]; int bClip[4];
sortFlag(xClip, bClip, cycles); sortFlag(xClip, bClip, cycles);
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)mVU->clipFlag); SSE_MOVAPS_M128_to_XMM(xmmT2, (uptr)mVU->clipFlag);
SSE_SHUFPS_XMM_to_XMM (xmmT1, xmmT1, shuffleClip); SSE_SHUFPS_XMM_to_XMM (xmmT2, xmmT2, shuffleClip);
SSE_MOVAPS_XMM_to_M128((uptr)mVU->clipFlag, xmmT1); SSE_MOVAPS_XMM_to_M128((uptr)mVU->clipFlag, xmmT2);
} }
} }

View File

@ -211,9 +211,10 @@ public:
} }
counter = 0; counter = 0;
} }
void flushAll() { void flushAll(bool clearState = 1) {
for (int i = 0; i < xmmTotal; i++) { for (int i = 0; i < xmmTotal; i++) {
writeBackReg(i); writeBackReg(i);
if (clearState) clearReg(i);
} }
} }
void clearReg(int reg) { void clearReg(int reg) {

View File

@ -46,13 +46,15 @@ mVUop(mVU_DIV) {
pass1 { mVUanalyzeFDIV(mVU, _Fs_, _Fsf_, _Ft_, _Ftf_, 7); } pass1 { mVUanalyzeFDIV(mVU, _Fs_, _Fsf_, _Ft_, _Ftf_, 7); }
pass2 { pass2 {
u8 *ajmp, *bjmp, *cjmp, *djmp; u8 *ajmp, *bjmp, *cjmp, *djmp;
getReg5(xmmFs, _Fs_, _Fsf_); mVU->regAlloc->reset(); // Reset for Testing
getReg5(xmmFt, _Ft_, _Ftf_); int Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
int Ft = mVU->regAlloc->allocReg(_Ft_, 0, (1 << (3 - _Ftf_)));
int t1 = mVU->regAlloc->allocReg();
testZero(xmmFt, xmmT1, gprT1); // Test if Ft is zero testZero(Ft, t1, gprT1); // Test if Ft is zero
cjmp = JZ8(0); // Skip if not zero cjmp = JZ8(0); // Skip if not zero
testZero(xmmFs, xmmT1, gprT1); // Test if Fs is zero testZero(Fs, t1, gprT1); // Test if Fs is zero
ajmp = JZ8(0); ajmp = JZ8(0);
MOV32ItoM((uptr)&mVU->divFlag, divI); // Set invalid flag (0/0) MOV32ItoM((uptr)&mVU->divFlag, divI); // Set invalid flag (0/0)
bjmp = JMP8(0); bjmp = JMP8(0);
@ -60,20 +62,25 @@ mVUop(mVU_DIV) {
MOV32ItoM((uptr)&mVU->divFlag, divD); // Zero divide (only when not 0/0) MOV32ItoM((uptr)&mVU->divFlag, divD); // Zero divide (only when not 0/0)
x86SetJ8(bjmp); x86SetJ8(bjmp);
SSE_XORPS_XMM_to_XMM (xmmFs, xmmFt); SSE_XORPS_XMM_to_XMM (Fs, Ft);
SSE_ANDPS_M128_to_XMM(xmmFs, (uptr)mVU_signbit); SSE_ANDPS_M128_to_XMM(Fs, (uptr)mVU_signbit);
SSE_ORPS_M128_to_XMM (xmmFs, (uptr)mVU_maxvals); // If division by zero, then xmmFs = +/- fmax SSE_ORPS_M128_to_XMM (Fs, (uptr)mVU_maxvals); // If division by zero, then xmmFs = +/- fmax
djmp = JMP8(0); djmp = JMP8(0);
x86SetJ8(cjmp); x86SetJ8(cjmp);
MOV32ItoM((uptr)&mVU->divFlag, 0); // Clear I/D flags MOV32ItoM((uptr)&mVU->divFlag, 0); // Clear I/D flags
SSE_DIVSS_XMM_to_XMM(xmmFs, xmmFt); SSE_DIVSS_XMM_to_XMM(Fs, Ft);
mVUclamp1(xmmFs, xmmFt, 8); mVUclamp1(Fs, t1, 8);
x86SetJ8(djmp); x86SetJ8(djmp);
if (mVUinfo.writeQ) SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1); if (mVUinfo.writeQ) SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MOVSS_XMM_to_XMM(xmmPQ, Fs);
if (mVUinfo.writeQ) SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1); if (mVUinfo.writeQ) SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1);
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->clearNeeded(Ft);
mVU->regAlloc->clearNeeded(t1);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("DIV Q, vf%02d%s, vf%02d%s", _Fs_, _Fsf_String, _Ft_, _Ftf_String); } pass3 { mVUlog("DIV Q, vf%02d%s, vf%02d%s", _Fs_, _Fsf_String, _Ft_, _Ftf_String); }
} }
@ -82,16 +89,20 @@ mVUop(mVU_SQRT) {
pass1 { mVUanalyzeFDIV(mVU, 0, 0, _Ft_, _Ftf_, 7); } pass1 { mVUanalyzeFDIV(mVU, 0, 0, _Ft_, _Ftf_, 7); }
pass2 { pass2 {
u8 *ajmp; u8 *ajmp;
getReg5(xmmFt, _Ft_, _Ftf_); mVU->regAlloc->reset(); // Reset for Testing
int Ft = mVU->regAlloc->allocReg(_Ft_, 0, (1 << (3 - _Ftf_)));
MOV32ItoM((uptr)&mVU->divFlag, 0); // Clear I/D flags MOV32ItoM((uptr)&mVU->divFlag, 0); // Clear I/D flags
testNeg(xmmFt, gprT1, ajmp); // Check for negative sqrt testNeg(Ft, gprT1, ajmp); // Check for negative sqrt
if (CHECK_VU_OVERFLOW) SSE_MINSS_M32_to_XMM(xmmFt, (uptr)mVU_maxvals); // Clamp infinities (only need to do positive clamp since xmmFt is positive) if (CHECK_VU_OVERFLOW) SSE_MINSS_M32_to_XMM(Ft, (uptr)mVU_maxvals); // Clamp infinities (only need to do positive clamp since xmmFt is positive)
SSE_SQRTSS_XMM_to_XMM(xmmFt, xmmFt); SSE_SQRTSS_XMM_to_XMM(Ft, Ft);
if (mVUinfo.writeQ) SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1); if (mVUinfo.writeQ) SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFt); SSE_MOVSS_XMM_to_XMM(xmmPQ, Ft);
if (mVUinfo.writeQ) SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1); if (mVUinfo.writeQ) SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1);
mVU->regAlloc->clearNeeded(Ft);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("SQRT Q, vf%02d%s", _Ft_, _Ftf_String); } pass3 { mVUlog("SQRT Q, vf%02d%s", _Ft_, _Ftf_String); }
} }
@ -100,17 +111,19 @@ mVUop(mVU_RSQRT) {
pass1 { mVUanalyzeFDIV(mVU, _Fs_, _Fsf_, _Ft_, _Ftf_, 13); } pass1 { mVUanalyzeFDIV(mVU, _Fs_, _Fsf_, _Ft_, _Ftf_, 13); }
pass2 { pass2 {
u8 *ajmp, *bjmp, *cjmp, *djmp; u8 *ajmp, *bjmp, *cjmp, *djmp;
getReg5(xmmFs, _Fs_, _Fsf_); mVU->regAlloc->reset(); // Reset for Testing
getReg5(xmmFt, _Ft_, _Ftf_); int Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
int Ft = mVU->regAlloc->allocReg(_Ft_, 0, (1 << (3 - _Ftf_)));
int t1 = mVU->regAlloc->allocReg();
MOV32ItoM((uptr)&mVU->divFlag, 0); // Clear I/D flags MOV32ItoM((uptr)&mVU->divFlag, 0); // Clear I/D flags
testNeg(xmmFt, gprT1, ajmp); // Check for negative sqrt testNeg(Ft, gprT1, ajmp); // Check for negative sqrt
SSE_SQRTSS_XMM_to_XMM(xmmFt, xmmFt); SSE_SQRTSS_XMM_to_XMM(Ft, Ft);
testZero(xmmFt, xmmT1, gprT1); // Test if Ft is zero testZero(Ft, t1, gprT1); // Test if Ft is zero
ajmp = JZ8(0); // Skip if not zero ajmp = JZ8(0); // Skip if not zero
testZero(xmmFs, xmmT1, gprT1); // Test if Fs is zero testZero(Fs, t1, gprT1); // Test if Fs is zero
bjmp = JZ8(0); // Skip if none are bjmp = JZ8(0); // Skip if none are
MOV32ItoM((uptr)&mVU->divFlag, divI); // Set invalid flag (0/0) MOV32ItoM((uptr)&mVU->divFlag, divI); // Set invalid flag (0/0)
cjmp = JMP8(0); cjmp = JMP8(0);
@ -118,18 +131,23 @@ mVUop(mVU_RSQRT) {
MOV32ItoM((uptr)&mVU->divFlag, divD); // Zero divide flag (only when not 0/0) MOV32ItoM((uptr)&mVU->divFlag, divD); // Zero divide flag (only when not 0/0)
x86SetJ8(cjmp); x86SetJ8(cjmp);
SSE_ANDPS_M128_to_XMM(xmmFs, (uptr)mVU_signbit); SSE_ANDPS_M128_to_XMM(Fs, (uptr)mVU_signbit);
SSE_ORPS_M128_to_XMM (xmmFs, (uptr)mVU_maxvals); // xmmFs = +/-Max SSE_ORPS_M128_to_XMM (Fs, (uptr)mVU_maxvals); // xmmFs = +/-Max
djmp = JMP8(0); djmp = JMP8(0);
x86SetJ8(ajmp); x86SetJ8(ajmp);
SSE_DIVSS_XMM_to_XMM(xmmFs, xmmFt); SSE_DIVSS_XMM_to_XMM(Fs, Ft);
mVUclamp1(xmmFs, xmmFt, 8); mVUclamp1(Fs, t1, 8);
x86SetJ8(djmp); x86SetJ8(djmp);
if (mVUinfo.writeQ) SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1); if (mVUinfo.writeQ) SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MOVSS_XMM_to_XMM(xmmPQ, Fs);
if (mVUinfo.writeQ) SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1); if (mVUinfo.writeQ) SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1);
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->clearNeeded(Ft);
mVU->regAlloc->clearNeeded(t1);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("RSQRT Q, vf%02d%s, vf%02d%s", _Fs_, _Fsf_String, _Ft_, _Ftf_String); } pass3 { mVUlog("RSQRT Q, vf%02d%s, vf%02d%s", _Fs_, _Fsf_String, _Ft_, _Ftf_String); }
} }
@ -139,20 +157,18 @@ mVUop(mVU_RSQRT) {
//------------------------------------------------------------------ //------------------------------------------------------------------
#define EATANhelper(addr) { \ #define EATANhelper(addr) { \
SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \ SSE_MULSS_XMM_to_XMM (t2, Fs); \
SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \ SSE_MULSS_XMM_to_XMM (t2, Fs); \
SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmT1); \ SSE_MOVAPS_XMM_to_XMM(t1, t2); \
SSE_MULSS_M32_to_XMM(xmmFt, (uptr)addr); \ SSE_MULSS_M32_to_XMM (t1, (uptr)addr); \
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFt); \ SSE_ADDSS_XMM_to_XMM (PQ, t1); \
} }
microVUt(void) mVU_EATAN_(mV) {
// ToDo: Can Be Optimized Further? (takes approximately (~115 cycles + mem access time) on a c2d) // ToDo: Can Be Optimized Further? (takes approximately (~115 cycles + mem access time) on a c2d)
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); microVUt(void) mVU_EATAN_(mV, int PQ, int Fs, int t1, int t2) {
SSE_MULSS_M32_to_XMM(xmmPQ, (uptr)mVU_T1); SSE_MOVSS_XMM_to_XMM (PQ, Fs);
SSE_MOVAPS_XMM_to_XMM(xmmT1, xmmFs); SSE_MULSS_M32_to_XMM (PQ, (uptr)mVU_T1);
SSE_MOVAPS_XMM_to_XMM(t2, Fs);
EATANhelper(mVU_T2); EATANhelper(mVU_T2);
EATANhelper(mVU_T3); EATANhelper(mVU_T3);
EATANhelper(mVU_T4); EATANhelper(mVU_T4);
@ -160,23 +176,26 @@ microVUt(void) mVU_EATAN_(mV) {
EATANhelper(mVU_T6); EATANhelper(mVU_T6);
EATANhelper(mVU_T7); EATANhelper(mVU_T7);
EATANhelper(mVU_T8); EATANhelper(mVU_T8);
SSE_ADDSS_M32_to_XMM (PQ, (uptr)mVU_Pi4);
SSE_ADDSS_M32_to_XMM(xmmPQ, (uptr)mVU_Pi4); SSE2_PSHUFD_XMM_to_XMM(PQ, PQ, mVUinfo.writeP ? 0x27 : 0xC6);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6);
} }
mVUop(mVU_EATAN) { mVUop(mVU_EATAN) {
pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 54); } pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 54); }
pass2 { pass2 {
getReg5(xmmFs, _Fs_, _Fsf_); int Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
int t1 = mVU->regAlloc->allocReg();
int t2 = mVU->regAlloc->allocReg();
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_MOVSS_XMM_to_XMM (xmmPQ, Fs);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_SUBSS_M32_to_XMM (Fs, (uptr)mVU_one);
SSE_SUBSS_M32_to_XMM(xmmFs, (uptr)mVU_one);
SSE_ADDSS_M32_to_XMM (xmmPQ, (uptr)mVU_one); SSE_ADDSS_M32_to_XMM (xmmPQ, (uptr)mVU_one);
SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ); SSE_DIVSS_XMM_to_XMM (Fs, xmmPQ);
mVU_EATAN_(mVU, xmmPQ, Fs, t1, t2);
mVU_EATAN_(mVU); mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->clearNeeded(t1);
mVU->regAlloc->clearNeeded(t2);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("EATAN P"); } pass3 { mVUlog("EATAN P"); }
} }
@ -184,16 +203,21 @@ mVUop(mVU_EATAN) {
mVUop(mVU_EATANxy) { mVUop(mVU_EATANxy) {
pass1 { mVUanalyzeEFU2(mVU, _Fs_, 54); } pass1 { mVUanalyzeEFU2(mVU, _Fs_, 54); }
pass2 { pass2 {
getReg6(xmmFt, _Fs_); mVU->regAlloc->reset(); // Reset for Testing
SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFt, 0x01); int t1 = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
int Fs = mVU->regAlloc->allocReg();
int t2 = mVU->regAlloc->allocReg();
SSE2_PSHUFD_XMM_to_XMM(Fs, t1, 0x01);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_MOVSS_XMM_to_XMM (xmmPQ, Fs);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_SUBSS_XMM_to_XMM (Fs, t1); // y-x, not y-1? ><
SSE_SUBSS_XMM_to_XMM(xmmFs, xmmFt); // y-x, not y-1? >< SSE_ADDSS_XMM_to_XMM (t1, xmmPQ);
SSE_ADDSS_XMM_to_XMM(xmmFt, xmmPQ); SSE_DIVSS_XMM_to_XMM (Fs, t1);
SSE_DIVSS_XMM_to_XMM(xmmFs, xmmFt); mVU_EATAN_(mVU, xmmPQ, Fs, t1, t2);
mVU->regAlloc->clearNeeded(Fs);
mVU_EATAN_(mVU); mVU->regAlloc->clearNeeded(t1);
mVU->regAlloc->clearNeeded(t2);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("EATANxy P"); } pass3 { mVUlog("EATANxy P"); }
} }
@ -201,83 +225,95 @@ mVUop(mVU_EATANxy) {
mVUop(mVU_EATANxz) { mVUop(mVU_EATANxz) {
pass1 { mVUanalyzeEFU2(mVU, _Fs_, 54); } pass1 { mVUanalyzeEFU2(mVU, _Fs_, 54); }
pass2 { pass2 {
getReg6(xmmFt, _Fs_); mVU->regAlloc->reset(); // Reset for Testing
SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFt, 0x02); int t1 = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
int Fs = mVU->regAlloc->allocReg();
int t2 = mVU->regAlloc->allocReg();
SSE2_PSHUFD_XMM_to_XMM(Fs, t1, 0x02);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_MOVSS_XMM_to_XMM (xmmPQ, Fs);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_SUBSS_XMM_to_XMM (Fs, t1);
SSE_SUBSS_XMM_to_XMM(xmmFs, xmmFt); SSE_ADDSS_XMM_to_XMM (t1, xmmPQ);
SSE_ADDSS_XMM_to_XMM(xmmFt, xmmPQ); SSE_DIVSS_XMM_to_XMM (Fs, t1);
SSE_DIVSS_XMM_to_XMM(xmmFs, xmmFt); mVU_EATAN_(mVU, xmmPQ, Fs, t1, t2);
mVU->regAlloc->clearNeeded(Fs);
mVU_EATAN_(mVU); mVU->regAlloc->clearNeeded(t1);
mVU->regAlloc->clearNeeded(t2);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("EATANxz P"); } pass3 { mVUlog("EATANxz P"); }
} }
#define eexpHelper(addr) { \ #define eexpHelper(addr) { \
SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \ SSE_MULSS_XMM_to_XMM (t2, Fs); \
SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmT1); \ SSE_MOVAPS_XMM_to_XMM(t1, t2); \
SSE_MULSS_M32_to_XMM(xmmFt, (uptr)addr); \ SSE_MULSS_M32_to_XMM (t1, (uptr)addr); \
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFt); \ SSE_ADDSS_XMM_to_XMM (xmmPQ, t1); \
} }
mVUop(mVU_EEXP) { mVUop(mVU_EEXP) {
pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 44); } pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 44); }
pass2 { pass2 {
getReg5(xmmFs, _Fs_, _Fsf_); mVU->regAlloc->reset(); // Reset for Testing
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
int t1 = mVU->regAlloc->allocReg();
int t2 = mVU->regAlloc->allocReg();
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MOVSS_XMM_to_XMM (xmmPQ, Fs);
SSE_MULSS_M32_to_XMM (xmmPQ, (uptr)mVU_E1); SSE_MULSS_M32_to_XMM (xmmPQ, (uptr)mVU_E1);
SSE_ADDSS_M32_to_XMM (xmmPQ, (uptr)mVU_one); SSE_ADDSS_M32_to_XMM (xmmPQ, (uptr)mVU_one);
SSE_MOVAPS_XMM_to_XMM (t1, Fs);
SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmFs); SSE_MULSS_XMM_to_XMM (t1, Fs);
SSE_MULSS_XMM_to_XMM(xmmFt, xmmFs); SSE_MOVAPS_XMM_to_XMM (t2, t1);
SSE_MOVAPS_XMM_to_XMM(xmmT1, xmmFt); SSE_MULSS_M32_to_XMM (t1, (uptr)mVU_E2);
SSE_MULSS_M32_to_XMM(xmmFt, (uptr)mVU_E2); SSE_ADDSS_XMM_to_XMM (xmmPQ, t1);
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFt);
eexpHelper(mVU_E3); eexpHelper(mVU_E3);
eexpHelper(mVU_E4); eexpHelper(mVU_E4);
eexpHelper(mVU_E5); eexpHelper(mVU_E5);
SSE_MULSS_XMM_to_XMM (t2, Fs);
SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); SSE_MULSS_M32_to_XMM (t2, (uptr)mVU_E6);
SSE_MULSS_M32_to_XMM(xmmT1, (uptr)mVU_E6); SSE_ADDSS_XMM_to_XMM (xmmPQ, t2);
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmT1);
SSE_MULSS_XMM_to_XMM (xmmPQ, xmmPQ); SSE_MULSS_XMM_to_XMM (xmmPQ, xmmPQ);
SSE_MULSS_XMM_to_XMM (xmmPQ, xmmPQ); SSE_MULSS_XMM_to_XMM (xmmPQ, xmmPQ);
SSE_MOVSS_M32_to_XMM(xmmT1, (uptr)mVU_one); SSE_MOVSS_M32_to_XMM (t2, (uptr)mVU_one);
SSE_DIVSS_XMM_to_XMM(xmmT1, xmmPQ); SSE_DIVSS_XMM_to_XMM (t2, xmmPQ);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmT1); SSE_MOVSS_XMM_to_XMM (xmmPQ, t2);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->clearNeeded(t1);
mVU->regAlloc->clearNeeded(t2);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("EEXP P"); } pass3 { mVUlog("EEXP P"); }
} }
microVUt(void) mVU_sumXYZ() { // sumXYZ(): PQ.x = x ^ 2 + y ^ 2 + z ^ 2
// xmmPQ.x = x ^ 2 + y ^ 2 + z ^ 2 microVUt(void) mVU_sumXYZ(int PQ, int Fs) {
if( cpucaps.hasStreamingSIMD4Extensions ) { if( cpucaps.hasStreamingSIMD4Extensions ) {
SSE4_DPPS_XMM_to_XMM(xmmFs, xmmFs, 0x71); SSE4_DPPS_XMM_to_XMM(Fs, Fs, 0x71);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MOVSS_XMM_to_XMM(PQ, Fs);
} }
else { else {
SSE_MULPS_XMM_to_XMM(xmmFs, xmmFs); // wzyx ^ 2 SSE_MULPS_XMM_to_XMM (Fs, Fs); // wzyx ^ 2
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); // x ^ 2 SSE_MOVSS_XMM_to_XMM (PQ, Fs); // x ^ 2
SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFs, 0xe1); // wzyx -> wzxy SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xe1); // wzyx -> wzxy
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs); // x ^ 2 + y ^ 2 SSE_ADDSS_XMM_to_XMM (PQ, Fs); // x ^ 2 + y ^ 2
SSE2_PSHUFD_XMM_to_XMM(xmmFs, xmmFs, 0xD2); // wzxy -> wxyz SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xD2); // wzxy -> wxyz
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs); // x ^ 2 + y ^ 2 + z ^ 2 SSE_ADDSS_XMM_to_XMM (PQ, Fs); // x ^ 2 + y ^ 2 + z ^ 2
} }
} }
mVUop(mVU_ELENG) { mVUop(mVU_ELENG) {
pass1 { mVUanalyzeEFU2(mVU, _Fs_, 18); } pass1 { mVUanalyzeEFU2(mVU, _Fs_, 18); }
pass2 { pass2 {
getReg6(xmmFs, _Fs_); mVU->regAlloc->reset(); // Reset for Testing
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
mVU_sumXYZ(); mVU_sumXYZ(xmmPQ, Fs);
SSE_SQRTSS_XMM_to_XMM (xmmPQ, xmmPQ); SSE_SQRTSS_XMM_to_XMM (xmmPQ, xmmPQ);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("ELENG P"); } pass3 { mVUlog("ELENG P"); }
} }
@ -285,13 +321,16 @@ mVUop(mVU_ELENG) {
mVUop(mVU_ERCPR) { mVUop(mVU_ERCPR) {
pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 12); } pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 12); }
pass2 { pass2 {
getReg5(xmmFs, _Fs_, _Fsf_); mVU->regAlloc->reset(); // Reset for Testing
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MOVSS_XMM_to_XMM (xmmPQ, Fs);
SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one); SSE_MOVSS_M32_to_XMM (Fs, (uptr)mVU_one);
SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ); SSE_DIVSS_XMM_to_XMM (Fs, xmmPQ);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MOVSS_XMM_to_XMM (xmmPQ, Fs);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("ERCPR P"); } pass3 { mVUlog("ERCPR P"); }
} }
@ -299,14 +338,17 @@ mVUop(mVU_ERCPR) {
mVUop(mVU_ERLENG) { mVUop(mVU_ERLENG) {
pass1 { mVUanalyzeEFU2(mVU, _Fs_, 24); } pass1 { mVUanalyzeEFU2(mVU, _Fs_, 24); }
pass2 { pass2 {
getReg6(xmmFs, _Fs_); mVU->regAlloc->reset(); // Reset for Testing
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
mVU_sumXYZ(); mVU_sumXYZ(xmmPQ, Fs);
SSE_SQRTSS_XMM_to_XMM (xmmPQ, xmmPQ); SSE_SQRTSS_XMM_to_XMM (xmmPQ, xmmPQ);
SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one); SSE_MOVSS_M32_to_XMM (Fs, (uptr)mVU_one);
SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ); SSE_DIVSS_XMM_to_XMM (Fs, xmmPQ);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MOVSS_XMM_to_XMM (xmmPQ, Fs);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("ERLENG P"); } pass3 { mVUlog("ERLENG P"); }
} }
@ -314,14 +356,16 @@ mVUop(mVU_ERLENG) {
mVUop(mVU_ERSADD) { mVUop(mVU_ERSADD) {
pass1 { mVUanalyzeEFU2(mVU, _Fs_, 18); } pass1 { mVUanalyzeEFU2(mVU, _Fs_, 18); }
pass2 { pass2 {
getReg6(xmmFs, _Fs_); mVU->regAlloc->reset(); // Reset for Testing
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
mVU_sumXYZ(); mVU_sumXYZ(xmmPQ, Fs);
//SSE_RCPSS_XMM_to_XMM(xmmPQ, xmmPQ); // Lower Precision is bad? SSE_MOVSS_M32_to_XMM (Fs, (uptr)mVU_one);
SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one); SSE_DIVSS_XMM_to_XMM (Fs, xmmPQ);
SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ); SSE_MOVSS_XMM_to_XMM (xmmPQ, Fs);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("ERSADD P"); } pass3 { mVUlog("ERSADD P"); }
} }
@ -329,13 +373,16 @@ mVUop(mVU_ERSADD) {
mVUop(mVU_ERSQRT) { mVUop(mVU_ERSQRT) {
pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 18); } pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 18); }
pass2 { pass2 {
getReg5(xmmFs, _Fs_, _Fsf_); mVU->regAlloc->reset(); // Reset for Testing
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_SQRTSS_XMM_to_XMM (xmmPQ, Fs);
SSE_MOVSS_M32_to_XMM(xmmFs, (uptr)mVU_one); SSE_MOVSS_M32_to_XMM (Fs, (uptr)mVU_one);
SSE_DIVSS_XMM_to_XMM(xmmFs, xmmPQ); SSE_DIVSS_XMM_to_XMM (Fs, xmmPQ);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MOVSS_XMM_to_XMM (xmmPQ, Fs);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("ERSQRT P"); } pass3 { mVUlog("ERSQRT P"); }
} }
@ -343,43 +390,50 @@ mVUop(mVU_ERSQRT) {
mVUop(mVU_ESADD) { mVUop(mVU_ESADD) {
pass1 { mVUanalyzeEFU2(mVU, _Fs_, 11); } pass1 { mVUanalyzeEFU2(mVU, _Fs_, 11); }
pass2 { pass2 {
getReg6(xmmFs, _Fs_); mVU->regAlloc->reset(); // Reset for Testing
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
mVU_sumXYZ(); mVU_sumXYZ(xmmPQ, Fs);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("ESADD P"); } pass3 { mVUlog("ESADD P"); }
} }
#define esinHelper(addr) { \ #define esinHelper(addr) { \
SSE_MULSS_XMM_to_XMM(xmmT1, xmmFt); \ SSE_MULSS_XMM_to_XMM (t2, t1); \
SSE_MOVAPS_XMM_to_XMM(xmmFs, xmmT1); \ SSE_MOVAPS_XMM_to_XMM(Fs, t2); \
SSE_MULSS_M32_to_XMM(xmmFs, (uptr)addr); \ SSE_MULSS_M32_to_XMM (Fs, (uptr)addr); \
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs); \ SSE_ADDSS_XMM_to_XMM (xmmPQ, Fs); \
} }
mVUop(mVU_ESIN) { mVUop(mVU_ESIN) {
pass1 { mVUanalyzeEFU2(mVU, _Fs_, 29); } pass1 { mVUanalyzeEFU2(mVU, _Fs_, 29); }
pass2 { pass2 {
getReg5(xmmFs, _Fs_, _Fsf_); mVU->regAlloc->reset(); // Reset for Testing
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
int t1 = mVU->regAlloc->allocReg();
int t2 = mVU->regAlloc->allocReg();
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MOVSS_XMM_to_XMM (xmmPQ, Fs);
//SSE_MULSS_M32_to_XMM(xmmPQ, (uptr)mVU_one); // Multiplying by 1 is redundant? SSE_MOVAPS_XMM_to_XMM (t1, Fs);
SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmFs); SSE_MULSS_XMM_to_XMM (Fs, t1);
SSE_MULSS_XMM_to_XMM(xmmFs, xmmFt); SSE_MOVAPS_XMM_to_XMM (t2, Fs);
SSE_MOVAPS_XMM_to_XMM(xmmT1, xmmFs); SSE_MULSS_XMM_to_XMM (Fs, t1);
SSE_MULSS_XMM_to_XMM(xmmFs, xmmFt); SSE_MOVAPS_XMM_to_XMM (t1, Fs);
SSE_MOVAPS_XMM_to_XMM(xmmFt, xmmFs); SSE_MULSS_M32_to_XMM (Fs, (uptr)mVU_S2);
SSE_MULSS_M32_to_XMM(xmmFs, (uptr)mVU_S2); SSE_ADDSS_XMM_to_XMM (xmmPQ, Fs);
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmFs);
esinHelper(mVU_S3); esinHelper(mVU_S3);
esinHelper(mVU_S4); esinHelper(mVU_S4);
SSE_MULSS_XMM_to_XMM (t2, t1);
SSE_MULSS_XMM_to_XMM(xmmT1, xmmFt); SSE_MULSS_M32_to_XMM (t2, (uptr)mVU_S5);
SSE_MULSS_M32_to_XMM(xmmT1, (uptr)mVU_S5); SSE_ADDSS_XMM_to_XMM (xmmPQ, t2);
SSE_ADDSS_XMM_to_XMM(xmmPQ, xmmT1);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->clearNeeded(t1);
mVU->regAlloc->clearNeeded(t2);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("ESIN P"); } pass3 { mVUlog("ESIN P"); }
} }
@ -387,10 +441,13 @@ mVUop(mVU_ESIN) {
mVUop(mVU_ESQRT) { mVUop(mVU_ESQRT) {
pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 12); } pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 12); }
pass2 { pass2 {
getReg5(xmmFs, _Fs_, _Fsf_); mVU->regAlloc->reset(); // Reset for Testing
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE_SQRTSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_SQRTSS_XMM_to_XMM (xmmPQ, Fs);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("ESQRT P"); } pass3 { mVUlog("ESQRT P"); }
} }
@ -398,14 +455,19 @@ mVUop(mVU_ESQRT) {
mVUop(mVU_ESUM) { mVUop(mVU_ESUM) {
pass1 { mVUanalyzeEFU2(mVU, _Fs_, 12); } pass1 { mVUanalyzeEFU2(mVU, _Fs_, 12); }
pass2 { pass2 {
getReg6(xmmFs, _Fs_); mVU->regAlloc->reset(); // Reset for Testing
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
int t1 = mVU->regAlloc->allocReg();
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
SSE2_PSHUFD_XMM_to_XMM(xmmFt, xmmFs, 0x1b); SSE2_PSHUFD_XMM_to_XMM(t1, Fs, 0x1b);
SSE_ADDPS_XMM_to_XMM(xmmFs, xmmFt); SSE_ADDPS_XMM_to_XMM (Fs, t1);
SSE2_PSHUFD_XMM_to_XMM(xmmFt, xmmFs, 0x01); SSE2_PSHUFD_XMM_to_XMM(t1, Fs, 0x01);
SSE_ADDSS_XMM_to_XMM(xmmFs, xmmFt); SSE_ADDSS_XMM_to_XMM (Fs, t1);
SSE_MOVSS_XMM_to_XMM(xmmPQ, xmmFs); SSE_MOVSS_XMM_to_XMM (xmmPQ, Fs);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->clearNeeded(t1);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("ESUM P"); } pass3 { mVUlog("ESUM P"); }
} }
@ -691,11 +753,14 @@ mVUop(mVU_ISUBIU) {
mVUop(mVU_MFIR) { mVUop(mVU_MFIR) {
pass1 { if (!_Ft_) { mVUlow.isNOP = 1; } analyzeVIreg1(_Is_, mVUlow.VI_read[0]); analyzeReg2(_Ft_, mVUlow.VF_write, 1); } pass1 { if (!_Ft_) { mVUlow.isNOP = 1; } analyzeVIreg1(_Is_, mVUlow.VI_read[0]); analyzeReg2(_Ft_, mVUlow.VF_write, 1); }
pass2 { pass2 {
mVU->regAlloc->reset(); // Reset for Testing
int Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
mVUallocVIa(mVU, gprT1, _Is_); mVUallocVIa(mVU, gprT1, _Is_);
MOVSX32R16toR(gprT1, gprT1); MOVSX32R16toR(gprT1, gprT1);
SSE2_MOVD_R_to_XMM(xmmT1, gprT1); SSE2_MOVD_R_to_XMM(Ft, gprT1);
if (!_XYZW_SS) { mVUunpack_xyzw(xmmT1, xmmT1, 0); } if (!_XYZW_SS) { mVUunpack_xyzw(Ft, Ft, 0); }
mVUsaveReg(xmmT1, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W, 1); mVU->regAlloc->clearNeeded(Ft);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("MFIR.%s vf%02d, vi%02d", _XYZW_String, _Ft_, _Fs_); } pass3 { mVUlog("MFIR.%s vf%02d, vi%02d", _XYZW_String, _Ft_, _Fs_); }
} }
@ -703,8 +768,11 @@ mVUop(mVU_MFIR) {
mVUop(mVU_MFP) { mVUop(mVU_MFP) {
pass1 { mVUanalyzeMFP(mVU, _Ft_); } pass1 { mVUanalyzeMFP(mVU, _Ft_); }
pass2 { pass2 {
getPreg(xmmFt); mVU->regAlloc->reset(); // Reset for Testing
mVUsaveReg(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W, 1); int Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
getPreg(Ft);
mVU->regAlloc->clearNeeded(Ft);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("MFP.%s vf%02d, P", _XYZW_String, _Ft_); } pass3 { mVUlog("MFP.%s vf%02d, P", _XYZW_String, _Ft_); }
} }
@ -712,8 +780,10 @@ mVUop(mVU_MFP) {
mVUop(mVU_MOVE) { mVUop(mVU_MOVE) {
pass1 { mVUanalyzeMOVE(mVU, _Fs_, _Ft_); } pass1 { mVUanalyzeMOVE(mVU, _Fs_, _Ft_); }
pass2 { pass2 {
mVUloadReg(xmmT1, (uptr)&mVU->regs->VF[_Fs_].UL[0], _X_Y_Z_W); mVU->regAlloc->reset(); // Reset for Testing
mVUsaveReg(xmmT1, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W, 1); int Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W);
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("MOVE.%s vf%02d, vf%02d", _XYZW_String, _Ft_, _Fs_); } pass3 { mVUlog("MOVE.%s vf%02d, vf%02d", _XYZW_String, _Ft_, _Fs_); }
} }
@ -721,9 +791,14 @@ mVUop(mVU_MOVE) {
mVUop(mVU_MR32) { mVUop(mVU_MR32) {
pass1 { mVUanalyzeMR32(mVU, _Fs_, _Ft_); } pass1 { mVUanalyzeMR32(mVU, _Fs_, _Ft_); }
pass2 { pass2 {
mVUloadReg(xmmT1, (uptr)&mVU->regs->VF[_Fs_].UL[0], (_X_Y_Z_W == 8) ? 4 : 15); mVU->regAlloc->reset(); // Reset for Testing
if (_X_Y_Z_W != 8) { SSE2_PSHUFD_XMM_to_XMM(xmmT1, xmmT1, 0x39); } int Fs = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
mVUsaveReg(xmmT1, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W, 0); int Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
if (_XYZW_SS) mVUunpack_xyzw(Ft, Fs, (_X ? 1 : (_Y ? 2 : (_Z ? 3 : 0))));
else SSE2_PSHUFD_XMM_to_XMM(Ft, Fs, 0x39);
mVU->regAlloc->clearNeeded(Ft);
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("MR32.%s vf%02d, vf%02d", _XYZW_String, _Ft_, _Fs_); } pass3 { mVUlog("MR32.%s vf%02d, vf%02d", _XYZW_String, _Ft_, _Fs_); }
} }
@ -731,8 +806,12 @@ mVUop(mVU_MR32) {
mVUop(mVU_MTIR) { mVUop(mVU_MTIR) {
pass1 { if (!_It_) { mVUlow.isNOP = 1; } analyzeReg5(_Fs_, _Fsf_, mVUlow.VF_read[0]); analyzeVIreg2(_It_, mVUlow.VI_write, 1); } pass1 { if (!_It_) { mVUlow.isNOP = 1; } analyzeReg5(_Fs_, _Fsf_, mVUlow.VF_read[0]); analyzeVIreg2(_It_, mVUlow.VI_write, 1); }
pass2 { pass2 {
MOVZX32M16toR(gprT1, (uptr)&mVU->regs->VF[_Fs_].UL[_Fsf_]); mVU->regAlloc->reset(); // Reset for Testing
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
SSE2_MOVD_XMM_to_R(gprT1, Fs);
mVUallocVIb(mVU, gprT1, _It_); mVUallocVIb(mVU, gprT1, _It_);
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("MTIR vi%02d, vf%02d%s", _Ft_, _Fs_, _Fsf_String); } pass3 { mVUlog("MTIR vi%02d, vf%02d%s", _Ft_, _Fs_, _Fsf_String); }
} }
@ -835,17 +914,17 @@ mVUop(mVU_ISWR) {
mVUop(mVU_LQ) { mVUop(mVU_LQ) {
pass1 { mVUanalyzeLQ(mVU, _Ft_, _Is_, 0); } pass1 { mVUanalyzeLQ(mVU, _Ft_, _Is_, 0); }
pass2 { pass2 {
if (!_Is_) { mVU->regAlloc->reset(); // Reset for Testing
mVUloadReg(xmmFt, (uptr)mVU->regs->Mem + getVUmem(_Imm11_), _X_Y_Z_W); int Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
mVUsaveReg(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W, 1); if (_Is_) {
}
else {
mVUallocVIa(mVU, gprT1, _Is_); mVUallocVIa(mVU, gprT1, _Is_);
ADD32ItoR(gprT1, _Imm11_); ADD32ItoR(gprT1, _Imm11_);
mVUaddrFix(mVU, gprT1); mVUaddrFix(mVU, gprT1);
mVUloadReg2(xmmFt, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W); mVUloadReg2(Ft, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W);
mVUsaveReg(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W, 1);
} }
else mVUloadReg(Ft, (uptr)mVU->regs->Mem + getVUmem(_Imm11_), _X_Y_Z_W);
mVU->regAlloc->clearNeeded(Ft);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("LQ.%s vf%02d, vi%02d + %d", _XYZW_String, _Ft_, _Fs_, _Imm11_); } pass3 { mVUlog("LQ.%s vf%02d, vi%02d + %d", _XYZW_String, _Ft_, _Fs_, _Imm11_); }
} }
@ -853,20 +932,24 @@ mVUop(mVU_LQ) {
mVUop(mVU_LQD) { mVUop(mVU_LQD) {
pass1 { mVUanalyzeLQ(mVU, _Ft_, _Is_, 1); } pass1 { mVUanalyzeLQ(mVU, _Ft_, _Is_, 1); }
pass2 { pass2 {
if (!_Is_ && !mVUlow.noWriteVF) { mVU->regAlloc->reset(); // Reset for Testing
mVUloadReg(xmmFt, (uptr)mVU->regs->Mem, _X_Y_Z_W); if (_Is_) {
mVUsaveReg(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W, 1);
}
else {
mVUallocVIa(mVU, gprT1, _Is_); mVUallocVIa(mVU, gprT1, _Is_);
SUB16ItoR(gprT1, 1); SUB16ItoR(gprT1, 1);
mVUallocVIb(mVU, gprT1, _Is_); mVUallocVIb(mVU, gprT1, _Is_);
if (!mVUlow.noWriteVF) { if (!mVUlow.noWriteVF) {
int Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
mVUaddrFix(mVU, gprT1); mVUaddrFix(mVU, gprT1);
mVUloadReg2(xmmFt, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W); mVUloadReg2(Ft, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W);
mVUsaveReg(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W, 1); mVU->regAlloc->clearNeeded(Ft);
} }
} }
else if (!mVUlow.noWriteVF) {
int Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
mVUloadReg(Ft, (uptr)mVU->regs->Mem, _X_Y_Z_W);
mVU->regAlloc->clearNeeded(Ft);
}
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("LQD.%s vf%02d, --vi%02d", _XYZW_String, _Ft_, _Is_); } pass3 { mVUlog("LQD.%s vf%02d, --vi%02d", _XYZW_String, _Ft_, _Is_); }
} }
@ -874,21 +957,25 @@ mVUop(mVU_LQD) {
mVUop(mVU_LQI) { mVUop(mVU_LQI) {
pass1 { mVUanalyzeLQ(mVU, _Ft_, _Is_, 1); } pass1 { mVUanalyzeLQ(mVU, _Ft_, _Is_, 1); }
pass2 { pass2 {
if (!_Is_ && !mVUlow.noWriteVF) { mVU->regAlloc->reset(); // Reset for Testing
mVUloadReg(xmmFt, (uptr)mVU->regs->Mem, _X_Y_Z_W); if (_Is_) {
mVUsaveReg(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W, 1);
}
else {
mVUallocVIa(mVU, (!mVUlow.noWriteVF) ? gprT1 : gprT2, _Is_); mVUallocVIa(mVU, (!mVUlow.noWriteVF) ? gprT1 : gprT2, _Is_);
if (!mVUlow.noWriteVF) { if (!mVUlow.noWriteVF) {
int Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
MOV32RtoR(gprT2, gprT1); MOV32RtoR(gprT2, gprT1);
mVUaddrFix(mVU, gprT1); mVUaddrFix(mVU, gprT1);
mVUloadReg2(xmmFt, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W); mVUloadReg2(Ft, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W);
mVUsaveReg(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W, 1); mVU->regAlloc->clearNeeded(Ft);
} }
ADD16ItoR(gprT2, 1); ADD16ItoR(gprT2, 1);
mVUallocVIb(mVU, gprT2, _Is_); mVUallocVIb(mVU, gprT2, _Is_);
} }
else if (!mVUlow.noWriteVF) {
int Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
mVUloadReg(Ft, (uptr)mVU->regs->Mem, _X_Y_Z_W);
mVU->regAlloc->clearNeeded(Ft);
}
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("LQI.%s vf%02d, vi%02d++", _XYZW_String, _Ft_, _Fs_); } pass3 { mVUlog("LQI.%s vf%02d, vi%02d++", _XYZW_String, _Ft_, _Fs_); }
} }
@ -900,17 +987,17 @@ mVUop(mVU_LQI) {
mVUop(mVU_SQ) { mVUop(mVU_SQ) {
pass1 { mVUanalyzeSQ(mVU, _Fs_, _It_, 0); } pass1 { mVUanalyzeSQ(mVU, _Fs_, _It_, 0); }
pass2 { pass2 {
if (!_It_) { mVU->regAlloc->reset(); // Reset for Testing
getReg7(xmmFs, _Fs_); int Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
mVUsaveReg(xmmFs, (uptr)mVU->regs->Mem + getVUmem(_Imm11_), _X_Y_Z_W, 1); if (_It_) {
}
else {
mVUallocVIa(mVU, gprT1, _It_); mVUallocVIa(mVU, gprT1, _It_);
ADD32ItoR(gprT1, _Imm11_); ADD32ItoR(gprT1, _Imm11_);
mVUaddrFix(mVU, gprT1); mVUaddrFix(mVU, gprT1);
getReg7(xmmFs, _Fs_); mVUsaveReg2(Fs, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W);
mVUsaveReg2(xmmFs, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W);
} }
else mVUsaveReg(Fs, (uptr)mVU->regs->Mem + getVUmem(_Imm11_), _X_Y_Z_W, 1);
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("SQ.%s vf%02d, vi%02d + %d", _XYZW_String, _Fs_, _Ft_, _Imm11_); } pass3 { mVUlog("SQ.%s vf%02d, vi%02d + %d", _XYZW_String, _Fs_, _Ft_, _Imm11_); }
} }
@ -918,18 +1005,18 @@ mVUop(mVU_SQ) {
mVUop(mVU_SQD) { mVUop(mVU_SQD) {
pass1 { mVUanalyzeSQ(mVU, _Fs_, _It_, 1); } pass1 { mVUanalyzeSQ(mVU, _Fs_, _It_, 1); }
pass2 { pass2 {
if (!_It_) { mVU->regAlloc->reset(); // Reset for Testing
getReg7(xmmFs, _Fs_); int Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
mVUsaveReg(xmmFs, (uptr)mVU->regs->Mem, _X_Y_Z_W, 1); if (_It_) {
}
else {
mVUallocVIa(mVU, gprT1, _It_); mVUallocVIa(mVU, gprT1, _It_);
SUB16ItoR(gprT1, 1); SUB16ItoR(gprT1, 1);
mVUallocVIb(mVU, gprT1, _It_); mVUallocVIb(mVU, gprT1, _It_);
mVUaddrFix(mVU, gprT1); mVUaddrFix(mVU, gprT1);
getReg7(xmmFs, _Fs_); mVUsaveReg2(Fs, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W);
mVUsaveReg2(xmmFs, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W);
} }
else mVUsaveReg(Fs, (uptr)mVU->regs->Mem, _X_Y_Z_W, 1);
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("SQD.%s vf%02d, --vi%02d", _XYZW_String, _Fs_, _Ft_); } pass3 { mVUlog("SQD.%s vf%02d, --vi%02d", _XYZW_String, _Fs_, _Ft_); }
} }
@ -937,19 +1024,19 @@ mVUop(mVU_SQD) {
mVUop(mVU_SQI) { mVUop(mVU_SQI) {
pass1 { mVUanalyzeSQ(mVU, _Fs_, _It_, 1); } pass1 { mVUanalyzeSQ(mVU, _Fs_, _It_, 1); }
pass2 { pass2 {
if (!_It_) { mVU->regAlloc->reset(); // Reset for Testing
getReg7(xmmFs, _Fs_); int Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
mVUsaveReg(xmmFs, (uptr)mVU->regs->Mem, _X_Y_Z_W, 1); if (_It_) {
}
else {
mVUallocVIa(mVU, gprT1, _It_); mVUallocVIa(mVU, gprT1, _It_);
MOV32RtoR(gprT2, gprT1); MOV32RtoR(gprT2, gprT1);
mVUaddrFix(mVU, gprT1); mVUaddrFix(mVU, gprT1);
getReg7(xmmFs, _Fs_); mVUsaveReg2(Fs, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W);
mVUsaveReg2(xmmFs, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W);
ADD16ItoR(gprT2, 1); ADD16ItoR(gprT2, 1);
mVUallocVIb(mVU, gprT2, _It_); mVUallocVIb(mVU, gprT2, _It_);
} }
else mVUsaveReg(Fs, (uptr)mVU->regs->Mem, _X_Y_Z_W, 1);
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
pass3 { mVUlog("SQI.%s vf%02d, vi%02d++", _XYZW_String, _Fs_, _Ft_); } pass3 { mVUlog("SQI.%s vf%02d, vi%02d++", _XYZW_String, _Fs_, _Ft_); }
} }
@ -962,10 +1049,14 @@ mVUop(mVU_RINIT) {
pass1 { mVUanalyzeR1(mVU, _Fs_, _Fsf_); } pass1 { mVUanalyzeR1(mVU, _Fs_, _Fsf_); }
pass2 { pass2 {
if (_Fs_ || (_Fsf_ == 3)) { if (_Fs_ || (_Fsf_ == 3)) {
getReg8(gprT1, _Fs_, _Fsf_); mVU->regAlloc->reset(); // Reset for Testing
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
SSE2_MOVD_XMM_to_R(gprT1, Fs);
AND32ItoR(gprT1, 0x007fffff); AND32ItoR(gprT1, 0x007fffff);
OR32ItoR (gprT1, 0x3f800000); OR32ItoR (gprT1, 0x3f800000);
MOV32RtoM(Rmem, gprT1); MOV32RtoM(Rmem, gprT1);
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
else MOV32ItoM(Rmem, 0x3f800000); else MOV32ItoM(Rmem, 0x3f800000);
} }
@ -974,10 +1065,12 @@ mVUop(mVU_RINIT) {
microVUt(void) mVU_RGET_(mV, int Rreg) { microVUt(void) mVU_RGET_(mV, int Rreg) {
if (!mVUlow.noWriteVF) { if (!mVUlow.noWriteVF) {
if (_X) MOV32RtoM((uptr)&mVU->regs->VF[_Ft_].UL[0], Rreg); mVU->regAlloc->reset(); // Reset for Testing
if (_Y) MOV32RtoM((uptr)&mVU->regs->VF[_Ft_].UL[1], Rreg); int Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
if (_Z) MOV32RtoM((uptr)&mVU->regs->VF[_Ft_].UL[2], Rreg); SSE2_MOVD_R_to_XMM(Ft, Rreg);
if (_W) MOV32RtoM((uptr)&mVU->regs->VF[_Ft_].UL[3], Rreg); if (!_XYZW_SS) mVUunpack_xyzw(Ft, Ft, 0);
mVU->regAlloc->clearNeeded(Ft);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
} }
@ -1016,9 +1109,13 @@ mVUop(mVU_RXOR) {
pass1 { mVUanalyzeR1(mVU, _Fs_, _Fsf_); } pass1 { mVUanalyzeR1(mVU, _Fs_, _Fsf_); }
pass2 { pass2 {
if (_Fs_ || (_Fsf_ == 3)) { if (_Fs_ || (_Fsf_ == 3)) {
getReg8(gprT1, _Fs_, _Fsf_); mVU->regAlloc->reset(); // Reset for Testing
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
SSE2_MOVD_XMM_to_R(gprT1, Fs);
AND32ItoR(gprT1, 0x7fffff); AND32ItoR(gprT1, 0x7fffff);
XOR32RtoM(Rmem, gprT1); XOR32RtoM(Rmem, gprT1);
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->flushAll(); // Flush All for Testing
} }
} }
pass3 { mVUlog("RXOR R, vf%02d%s", _Fs_, _Fsf_String); } pass3 { mVUlog("RXOR R, vf%02d%s", _Fs_, _Fsf_String); }

View File

@ -115,13 +115,13 @@ declareAllVariables
#define offsetSS ((_X) ? (0) : ((_Y) ? (4) : ((_Z) ? 8: 12))) #define offsetSS ((_X) ? (0) : ((_Y) ? (4) : ((_Z) ? 8: 12)))
#define offsetReg ((_X) ? (0) : ((_Y) ? (1) : ((_Z) ? 2: 3))) #define offsetReg ((_X) ? (0) : ((_Y) ? (1) : ((_Z) ? 2: 3)))
#define xmmT1 0 // Temp Reg #define xmmT1 0 // Used for regAlloc
#define xmmFs 1 // Holds the Value of Fs (writes back result Fd) #define xmmT2 1 // Used for regAlloc
#define xmmFt 2 // Holds the Value of Ft #define xmmT3 2 // Used for regAlloc
#define xmmT2 3 // Temp Reg? #define xmmT4 3 // Used for regAlloc
#define xmmT3 4 // Temp Reg? #define xmmT5 4 // Used for regAlloc
#define xmmT4 5 // Temp Reg? #define xmmT6 5 // Used for regAlloc
#define xmmACC 6 // Holds ACC #define xmmT7 6 // Used for regAlloc
#define xmmPQ 7 // Holds the Value and Backup Values of P and Q regs #define xmmPQ 7 // Holds the Value and Backup Values of P and Q regs
#define gprT1 0 // Temp Reg #define gprT1 0 // Temp Reg

View File

@ -40,7 +40,7 @@ void mVUclamp1(int reg, int regT1, int xyzw) {
// Used for Operand Clamping // Used for Operand Clamping
void mVUclamp2(int reg, int regT1, int xyzw) { void mVUclamp2(int reg, int regT1, int xyzw) {
if (CHECK_VU_SIGN_OVERFLOW) { if (CHECK_VU_SIGN_OVERFLOW && (regT1 >= 0)) {
switch (xyzw) { switch (xyzw) {
case 1: case 2: case 4: case 8: case 1: case 2: case 4: case 8:
SSE_MOVSS_XMM_to_XMM (regT1, reg); SSE_MOVSS_XMM_to_XMM (regT1, reg);