microVU: converted all code to the new emitter style. If anything breaks, blame the guy below me.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3406 96395faa-99c1-11dd-bbfe-3dabce05a288
sudonim1 2010-07-06 20:05:21 +00:00
parent a4afe629e5
commit b53a92e019
19 changed files with 1242 additions and 1264 deletions


@ -363,6 +363,8 @@ template< typename T > void xWrite( T val );
bool operator==( const xRegisterSSE& src ) const { return this->Id == src.Id; } bool operator==( const xRegisterSSE& src ) const { return this->Id == src.Id; }
bool operator!=( const xRegisterSSE& src ) const { return this->Id != src.Id; } bool operator!=( const xRegisterSSE& src ) const { return this->Id != src.Id; }
void operator=( xRegisterSSE src ) { Id = src.Id; }
xRegisterSSE& operator++() xRegisterSSE& operator++()
{ {
++Id &= (iREGCNT_XMM-1); ++Id &= (iREGCNT_XMM-1);


@ -289,6 +289,8 @@ void EmitSibMagic( uint regfield, const xIndirectVoid& info )
int displacement_size = (info.Displacement == 0) ? 0 : int displacement_size = (info.Displacement == 0) ? 0 :
( ( info.IsByteSizeDisp() ) ? 1 : 2 ); ( ( info.IsByteSizeDisp() ) ? 1 : 2 );
assert(!info.Base.IsEmpty() || !info.Index.IsEmpty() || displacement_size == 2);
if( !NeedsSibMagic( info ) ) if( !NeedsSibMagic( info ) )
{ {
// Use ModRm-only encoding, with the rm field holding an index/base register, if // Use ModRm-only encoding, with the rm field holding an index/base register, if


@ -29,8 +29,8 @@ using namespace x86Emitter;
#include "R5900OpcodeTables.h" #include "R5900OpcodeTables.h"
#include "x86emitter/x86emitter.h" #include "x86emitter/x86emitter.h"
#include "SamplProf.h" #include "SamplProf.h"
#include "microVU_IR.h"
#include "microVU_Misc.h" #include "microVU_Misc.h"
#include "microVU_IR.h"
struct microBlockLink { struct microBlockLink {
microBlock* block; microBlock* block;


@ -23,148 +23,164 @@
// Flag Allocators // Flag Allocators
//------------------------------------------------------------------ //------------------------------------------------------------------
#define getFlagReg(regX, fInst) { \ _f static x32 getFlagReg(int fInst)
switch (fInst) { \ {
case 0: regX = gprF0; break; \ if (fInst >= 0 && fInst < 4)
case 1: regX = gprF1; break; \ {
case 2: regX = gprF2; break; \ return gprF[fInst];
case 3: regX = gprF3; break; \ }
default: \ else
Console.Error("microVU Error: fInst = %d", fInst); \ {
regX = gprF0; \ Console.Error("microVU Error: fInst = %d", fInst);
break; \ return gprF[0];
} \ }
} }
#define setBitSFLAG(bitTest, bitSet) { \ _f void setBitSFLAG(x32 reg, x32 regT, int bitTest, int bitSet)
TEST32ItoR(regT, bitTest); \ {
pjmp = JZ8(0); \ xTEST(regT, bitTest);
OR32ItoR(reg, bitSet); \ xForwardJZ8 skip;
x86SetJ8(pjmp); \ xOR(reg, bitSet);
skip.SetTarget();
} }
#define setBitFSEQ(bitX) { \ _f void setBitFSEQ(x32 reg, int bitX)
TEST32ItoR(gprT1, bitX); \ {
pjmp = JZ8(0); \ xTEST(reg, bitX);
OR32ItoR(gprT1, bitX); \ xForwardJump8 skip(Jcc_Zero);
x86SetJ8(pjmp); \ xOR(reg, bitX);
skip.SetTarget();
} }
_f void mVUallocSFLAGa(int reg, int fInstance) { _f void mVUallocSFLAGa(x32 reg, int fInstance)
getFlagReg(fInstance, fInstance); {
MOV32RtoR(reg, fInstance); xMOV(reg, getFlagReg(fInstance));
} }
_f void mVUallocSFLAGb(int reg, int fInstance) { _f void mVUallocSFLAGb(x32 reg, int fInstance)
getFlagReg(fInstance, fInstance); {
MOV32RtoR(fInstance, reg); xMOV(getFlagReg(fInstance), reg);
} }
// Normalize Status Flag // Normalize Status Flag
_f void mVUallocSFLAGc(int reg, int regT, int fInstance) { _f void mVUallocSFLAGc(x32 reg, x32 regT, int fInstance)
u8 *pjmp; {
XOR32RtoR(reg, reg); xXOR(reg, reg);
mVUallocSFLAGa(regT, fInstance); mVUallocSFLAGa(regT, fInstance);
setBitSFLAG(0x0f00, 0x0001); // Z Bit setBitSFLAG(reg, regT, 0x0f00, 0x0001); // Z Bit
setBitSFLAG(0xf000, 0x0002); // S Bit setBitSFLAG(reg, regT, 0xf000, 0x0002); // S Bit
setBitSFLAG(0x000f, 0x0040); // ZS Bit setBitSFLAG(reg, regT, 0x000f, 0x0040); // ZS Bit
setBitSFLAG(0x00f0, 0x0080); // SS Bit setBitSFLAG(reg, regT, 0x00f0, 0x0080); // SS Bit
AND32ItoR(regT, 0xffff0000); // DS/DI/OS/US/D/I/O/U Bits xAND(regT, 0xffff0000); // DS/DI/OS/US/D/I/O/U Bits
SHR32ItoR(regT, 14); xSHR(regT, 14);
OR32RtoR(reg, regT); xOR(reg, regT);
} }
// Denormalizes Status Flag // Denormalizes Status Flag
_f void mVUallocSFLAGd(uptr memAddr, bool setAllflags) { _f void mVUallocSFLAGd(u32* memAddr, bool setAllflags) {
// Cannot use EBP (gprF1) here; as this function is used by mVU0 macro and // Cannot use EBP (gprF[1]) here; as this function is used by mVU0 macro and
// the EErec needs EBP preserved. // the EErec needs EBP preserved.
MOV32MtoR(gprF0, memAddr); xMOV(gprF[0], ptr32[memAddr]);
MOV32RtoR(gprF3, gprF0); xMOV(gprF[3], gprF[0]);
SHR32ItoR(gprF3, 3); xSHR(gprF[3], 3);
AND32ItoR(gprF3, 0x18); xAND(gprF[3], 0x18);
MOV32RtoR(gprF2, gprF0); xMOV(gprF[2], gprF[0]);
SHL32ItoR(gprF2, 11); xSHL(gprF[2], 11);
AND32ItoR(gprF2, 0x1800); xAND(gprF[2], 0x1800);
OR32RtoR (gprF3, gprF2); xOR (gprF[3], gprF[2]);
SHL32ItoR(gprF0, 14); xSHL(gprF[0], 14);
AND32ItoR(gprF0, 0x3cf0000); xAND(gprF[0], 0x3cf0000);
OR32RtoR (gprF3, gprF0); xOR (gprF[3], gprF[0]);
if (setAllflags) { if (setAllflags) {
// this code should be run in mVU micro mode only, so writing to // this code should be run in mVU micro mode only, so writing to
// EBP (gprF1) is ok (and needed for vuMicro optimizations). // EBP (gprF[1]) is ok (and needed for vuMicro optimizations).
MOV32RtoR(gprF0, gprF3); xMOV(gprF[0], gprF[3]);
MOV32RtoR(gprF1, gprF3); xMOV(gprF[1], gprF[3]);
MOV32RtoR(gprF2, gprF3); xMOV(gprF[2], gprF[3]);
} }
} }
_f void mVUallocMFLAGa(mV, int reg, int fInstance) { _f void mVUallocMFLAGa(mV, x32 reg, int fInstance)
MOVZX32M16toR(reg, (uptr)&mVU->macFlag[fInstance]); {
xMOVZX(reg, ptr16[&mVU->macFlag[fInstance]]);
} }
_f void mVUallocMFLAGb(mV, int reg, int fInstance) { _f void mVUallocMFLAGb(mV, x32 reg, int fInstance)
//AND32ItoR(reg, 0xffff); {
if (fInstance < 4) MOV32RtoM((uptr)&mVU->macFlag[fInstance], reg); // microVU //xAND(reg, 0xffff);
else MOV32RtoM((uptr)&mVU->regs->VI[REG_MAC_FLAG].UL, reg); // macroVU if (fInstance < 4) xMOV(ptr32[&mVU->macFlag[fInstance]], reg); // microVU
else xMOV(ptr32[&mVU->regs->VI[REG_MAC_FLAG].UL], reg); // macroVU
} }
_f void mVUallocCFLAGa(mV, int reg, int fInstance) { _f void mVUallocCFLAGa(mV, x32 reg, int fInstance)
if (fInstance < 4) MOV32MtoR(reg, (uptr)&mVU->clipFlag[fInstance]); // microVU {
else MOV32MtoR(reg, (uptr)&mVU->regs->VI[REG_CLIP_FLAG].UL); // macroVU if (fInstance < 4) xMOV(reg, ptr32[&mVU->clipFlag[fInstance]]); // microVU
else xMOV(reg, ptr32[&mVU->regs->VI[REG_CLIP_FLAG].UL]); // macroVU
} }
_f void mVUallocCFLAGb(mV, int reg, int fInstance) { _f void mVUallocCFLAGb(mV, x32 reg, int fInstance)
if (fInstance < 4) MOV32RtoM((uptr)&mVU->clipFlag[fInstance], reg); // microVU {
else MOV32RtoM((uptr)&mVU->regs->VI[REG_CLIP_FLAG].UL, reg); // macroVU if (fInstance < 4) xMOV(ptr32[&mVU->clipFlag[fInstance]], reg); // microVU
else xMOV(ptr32[&mVU->regs->VI[REG_CLIP_FLAG].UL], reg); // macroVU
} }
//------------------------------------------------------------------ //------------------------------------------------------------------
// VI Reg Allocators // VI Reg Allocators
//------------------------------------------------------------------ //------------------------------------------------------------------
_f void mVUallocVIa(mV, int GPRreg, int _reg_) { _f void mVUallocVIa(mV, x32 GPRreg, int _reg_, bool signext = false)
if (!_reg_) { XOR32RtoR(GPRreg, GPRreg); } {
else { MOVZX32M16toR(GPRreg, (uptr)&mVU->regs->VI[_reg_].UL); } if (!_reg_)
xXOR(GPRreg, GPRreg);
else
if (signext)
xMOVSX(GPRreg, ptr16[&mVU->regs->VI[_reg_].SL]);
else
xMOVZX(GPRreg, ptr16[&mVU->regs->VI[_reg_].UL]);
} }
_f void mVUallocVIb(mV, int GPRreg, int _reg_) { _f void mVUallocVIb(mV, x32 GPRreg, int _reg_)
{
if (mVUlow.backupVI) { // Backs up reg to memory (used when VI is modified b4 a branch) if (mVUlow.backupVI) { // Backs up reg to memory (used when VI is modified b4 a branch)
MOVZX32M16toR(gprT3, (uptr)&mVU->regs->VI[_reg_].UL); xMOVZX(edx, ptr16[&mVU->regs->VI[_reg_].UL]);
MOV32RtoM((uptr)&mVU->VIbackup, gprT3); xMOV(ptr32[&mVU->VIbackup], edx);
} }
if (_reg_ == 0) { return; } if (_reg_ == 0) { return; }
else if (_reg_ < 16) { MOV16RtoM((uptr)&mVU->regs->VI[_reg_].UL, GPRreg); } else if (_reg_ < 16) { xMOV(ptr16[&mVU->regs->VI[_reg_].UL], xRegister16(GPRreg.Id)); }
} }
//------------------------------------------------------------------ //------------------------------------------------------------------
// P/Q Reg Allocators // P/Q Reg Allocators
//------------------------------------------------------------------ //------------------------------------------------------------------
_f void getPreg(mV, int reg) { _f void getPreg(mV, xmm reg)
{
mVUunpack_xyzw(reg, xmmPQ, (2 + mVUinfo.readP)); mVUunpack_xyzw(reg, xmmPQ, (2 + mVUinfo.readP));
/*if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2(reg, xmmT1, 15);*/ /*if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2(reg, xmmT1, 15);*/
} }
_f void getQreg(int reg, int qInstance) { _f void getQreg(xmm reg, int qInstance)
{
mVUunpack_xyzw(reg, xmmPQ, qInstance); mVUunpack_xyzw(reg, xmmPQ, qInstance);
/*if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2<vuIndex>(reg, xmmT1, 15);*/ /*if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2<vuIndex>(reg, xmmT1, 15);*/
} }
_f void writeQreg(int reg, int qInstance) { _f void writeQreg(xmm reg, int qInstance)
{
if (qInstance) { if (qInstance) {
if (!x86caps.hasStreamingSIMD4Extensions) { if (!x86caps.hasStreamingSIMD4Extensions) {
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1); xPSHUF.D(xmmPQ, xmmPQ, 0xe1);
SSE_MOVSS_XMM_to_XMM(xmmPQ, reg); xMOVSS(xmmPQ, reg);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1); xPSHUF.D(xmmPQ, xmmPQ, 0xe1);
} }
else SSE4_INSERTPS_XMM_to_XMM(xmmPQ, reg, _MM_MK_INSERTPS_NDX(0, 1, 0)); else xINSERTPS(xmmPQ, reg, _MM_MK_INSERTPS_NDX(0, 1, 0));
} }
else SSE_MOVSS_XMM_to_XMM(xmmPQ, reg); else xMOVSS(xmmPQ, reg);
} }
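
The hunk above replaces the old macro-based flag helpers, which emitted raw opcodes and patched jumps by hand with JZ8(0)/x86SetJ8(), with inline functions built on the new emitter's scoped forward-jump objects. A minimal sketch of the new pattern, assuming the x86emitter headers from this tree and illustrative registers/masks (not code from the commit itself):

#include "x86emitter/x86emitter.h"
using namespace x86Emitter;

// Mirrors setBitSFLAG above: if any bit of testMask is set in edx, set
// setMask in eax. The xForwardJZ8 object records the jump site when it is
// constructed and SetTarget() resolves the 8-bit displacement, so no manual
// patching is needed.
static void emitSetBitIf(u32 testMask, u32 setMask)
{
	xTEST(edx, testMask);
	xForwardJZ8 skip;
	xOR(eax, setMask);
	skip.SetTarget();
}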


@ -55,34 +55,33 @@ _f void mVUendProgram(mV, microFlagCycles* mFC, int isEbit) {
} }
// Save P/Q Regs // Save P/Q Regs
if (qInst) { SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe5); } if (qInst) { xPSHUF.D(xmmPQ, xmmPQ, 0xe5); }
SSE_MOVSS_XMM_to_M32((uptr)&mVU->regs->VI[REG_Q].UL, xmmPQ); xMOVSS(ptr32[&mVU->regs->VI[REG_Q].UL], xmmPQ);
if (isVU1) { if (isVU1) {
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, pInst ? 3 : 2); xPSHUF.D(xmmPQ, xmmPQ, pInst ? 3 : 2);
SSE_MOVSS_XMM_to_M32((uptr)&mVU->regs->VI[REG_P].UL, xmmPQ); xMOVSS(ptr32[&mVU->regs->VI[REG_P].UL], xmmPQ);
} }
// Save Flag Instances // Save Flag Instances
#if 1 // CHECK_MACROVU0 - Always on now #if 1 // CHECK_MACROVU0 - Always on now
getFlagReg(fStatus, fStatus); xMOV(ptr32[&mVU->regs->VI[REG_STATUS_FLAG].UL], getFlagReg(fStatus));
MOV32RtoM((uptr)&mVU->regs->VI[REG_STATUS_FLAG].UL, fStatus);
#else #else
mVUallocSFLAGc(gprT1, gprT2, fStatus); mVUallocSFLAGc(gprT1, fStatus);
MOV32RtoM((uptr)&mVU->regs->VI[REG_STATUS_FLAG].UL, gprT1); xMOV(ptr32[&mVU->regs->VI[REG_STATUS_FLAG].UL], gprT1);
#endif #endif
mVUallocMFLAGa(mVU, gprT1, fMac); mVUallocMFLAGa(mVU, gprT1, fMac);
mVUallocCFLAGa(mVU, gprT2, fClip); mVUallocCFLAGa(mVU, gprT2, fClip);
MOV32RtoM((uptr)&mVU->regs->VI[REG_MAC_FLAG].UL, gprT1); xMOV(ptr32[&mVU->regs->VI[REG_MAC_FLAG].UL], gprT1);
MOV32RtoM((uptr)&mVU->regs->VI[REG_CLIP_FLAG].UL, gprT2); xMOV(ptr32[&mVU->regs->VI[REG_CLIP_FLAG].UL], gprT2);
if (isEbit || isVU1) { // Clear 'is busy' Flags if (isEbit || isVU1) { // Clear 'is busy' Flags
AND32ItoM((uptr)&VU0.VI[REG_VPU_STAT].UL, (isVU1 ? ~0x100 : ~0x001)); // VBS0/VBS1 flag xAND(ptr32[&VU0.VI[REG_VPU_STAT].UL], (isVU1 ? ~0x100 : ~0x001)); // VBS0/VBS1 flag
AND32ItoM((uptr)&mVU->regs->vifRegs->stat, ~VIF1_STAT_VEW); // Clear VU 'is busy' signal for vif xAND(ptr32[&mVU->regs->vifRegs->stat], ~VIF1_STAT_VEW); // Clear VU 'is busy' signal for vif
} }
if (isEbit != 2) { // Save PC, and Jump to Exit Point if (isEbit != 2) { // Save PC, and Jump to Exit Point
MOV32ItoM((uptr)&mVU->regs->VI[REG_TPC].UL, xPC); xMOV(ptr32[&mVU->regs->VI[REG_TPC].UL], xPC);
JMP32((uptr)mVU->exitFunct - ((uptr)x86Ptr + 5)); xJMP(mVU->exitFunct);
} }
} }
@ -93,7 +92,7 @@ _f void mVUsetupBranch(mV, microFlagCycles& mFC) {
mVUsetupFlags(mVU, mFC); // Shuffle Flag Instances mVUsetupFlags(mVU, mFC); // Shuffle Flag Instances
// Shuffle P/Q regs since every block starts at instance #0 // Shuffle P/Q regs since every block starts at instance #0
if (mVU->p || mVU->q) { SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, shufflePQ); } if (mVU->p || mVU->q) { xPSHUF.D(xmmPQ, xmmPQ, shufflePQ); }
} }
void normBranchCompile(microVU* mVU, u32 branchPC) { void normBranchCompile(microVU* mVU, u32 branchPC) {
@ -109,15 +108,15 @@ void normJumpCompile(mV, microFlagCycles& mFC, bool isEvilJump) {
mVUsetupBranch(mVU, mFC); mVUsetupBranch(mVU, mFC);
mVUbackupRegs(mVU); mVUbackupRegs(mVU);
if (isEvilJump) MOV32MtoR(gprT2, (uptr)&mVU->evilBranch); if (isEvilJump) xMOV(gprT2, ptr32[&mVU->evilBranch]);
else MOV32MtoR(gprT2, (uptr)&mVU->branch); else xMOV(gprT2, ptr32[&mVU->branch]);
MOV32ItoR(gprT3, (u32)&mVUpBlock->pStateEnd); xMOV(gprT3, (uptr)&mVUpBlock->pStateEnd);
if (!mVU->index) xCALL(mVUcompileJIT<0>); //(u32 startPC, uptr pState) if (!mVU->index) xCALL(mVUcompileJIT<0>); //(u32 startPC, uptr pState)
else xCALL(mVUcompileJIT<1>); else xCALL(mVUcompileJIT<1>);
mVUrestoreRegs(mVU); mVUrestoreRegs(mVU);
JMPR(gprT1); // Jump to rec-code address xJMP(gprT1); // Jump to rec-code address
} }
void normBranch(mV, microFlagCycles& mFC) { void normBranch(mV, microFlagCycles& mFC) {
@ -132,7 +131,7 @@ void normBranch(mV, microFlagCycles& mFC) {
void condBranch(mV, microFlagCycles& mFC, int JMPcc) { void condBranch(mV, microFlagCycles& mFC, int JMPcc) {
mVUsetupBranch(mVU, mFC); mVUsetupBranch(mVU, mFC);
xCMP(ptr16[(u16*)&mVU->branch], 0); xCMP(ptr16[&mVU->branch], 0);
incPC(3); incPC(3);
if (mVUup.eBit) { // Conditional Branch With E-Bit Set if (mVUup.eBit) { // Conditional Branch With E-Bit Set
mVUendProgram(mVU, &mFC, 2); mVUendProgram(mVU, &mFC, 2);
@ -190,8 +189,8 @@ void normJump(mV, microFlagCycles& mFC) {
if (mVUup.eBit) { // E-bit Jump if (mVUup.eBit) { // E-bit Jump
mVUendProgram(mVU, &mFC, 2); mVUendProgram(mVU, &mFC, 2);
MOV32MtoR(gprT1, (uptr)&mVU->branch); xMOV(gprT1, ptr32[&mVU->branch]);
MOV32RtoM((uptr)&mVU->regs->VI[REG_TPC].UL, gprT1); xMOV(ptr32[&mVU->regs->VI[REG_TPC].UL], gprT1);
xJMP(mVU->exitFunct); xJMP(mVU->exitFunct);
} }
else normJumpCompile(mVU, mFC, 0); else normJumpCompile(mVU, mFC, 0);


@ -34,16 +34,16 @@ const __aligned16 u32 sse4_maxvals[2][4] = {
// gotten a NaN value, then something went wrong; and the NaN's sign // gotten a NaN value, then something went wrong; and the NaN's sign
// is not to be trusted. Games like positive values better usually, // is not to be trusted. Games like positive values better usually,
// and its faster... so just always make NaNs into positive infinity. // and its faster... so just always make NaNs into positive infinity.
void mVUclamp1(int reg, int regT1, int xyzw, bool bClampE = 0) { void mVUclamp1(xmm reg, xmm regT1, int xyzw, bool bClampE = 0) {
if ((!clampE && CHECK_VU_OVERFLOW) || (clampE && bClampE)) { if ((!clampE && CHECK_VU_OVERFLOW) || (clampE && bClampE)) {
switch (xyzw) { switch (xyzw) {
case 1: case 2: case 4: case 8: case 1: case 2: case 4: case 8:
SSE_MINSS_M32_to_XMM(reg, (uptr)mVUglob.maxvals); xMIN.SS(reg, ptr32[mVUglob.maxvals]);
SSE_MAXSS_M32_to_XMM(reg, (uptr)mVUglob.minvals); xMAX.SS(reg, ptr32[mVUglob.minvals]);
break; break;
default: default:
SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals); xMIN.PS(reg, ptr32[mVUglob.maxvals]);
SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals); xMAX.PS(reg, ptr32[mVUglob.minvals]);
break; break;
} }
} }
@ -54,44 +54,41 @@ void mVUclamp1(int reg, int regT1, int xyzw, bool bClampE = 0) {
// Note 2: Using regalloc here seems to contaminate some regs in certain games. // Note 2: Using regalloc here seems to contaminate some regs in certain games.
// Must be some specific case I've overlooked (or I used regalloc improperly on an opcode) // Must be some specific case I've overlooked (or I used regalloc improperly on an opcode)
// so we just use a temporary mem location for our backup for now... (non-sse4 version only) // so we just use a temporary mem location for our backup for now... (non-sse4 version only)
void mVUclamp2(microVU* mVU, int reg, int regT1, int xyzw, bool bClampE = 0) { void mVUclamp2(microVU* mVU, xmm reg, xmm regT1in, int xyzw, bool bClampE = 0) {
if ((!clampE && CHECK_VU_SIGN_OVERFLOW) || (clampE && bClampE && CHECK_VU_SIGN_OVERFLOW)) { if ((!clampE && CHECK_VU_SIGN_OVERFLOW) || (clampE && bClampE && CHECK_VU_SIGN_OVERFLOW)) {
if (x86caps.hasStreamingSIMD4Extensions) { if (x86caps.hasStreamingSIMD4Extensions) {
int i = (xyzw==1||xyzw==2||xyzw==4||xyzw==8) ? 0: 1; int i = (xyzw==1||xyzw==2||xyzw==4||xyzw==8) ? 0: 1;
SSE4_PMINSD_M128_to_XMM(reg, (uptr)&sse4_maxvals[i][0]); xPMIN.SD(reg, ptr128[&sse4_maxvals[i][0]]);
SSE4_PMINUD_M128_to_XMM(reg, (uptr)&sse4_minvals[i][0]); xPMIN.UD(reg, ptr128[&sse4_minvals[i][0]]);
return; return;
} }
int regT1b = 0; //xmm regT1 = regT1b ? mVU->regAlloc->allocReg() : regT1in;
if (regT1 < 0) { xmm regT1 = regT1in.IsEmpty() ? xmm((reg.Id + 1) % 8) : regT1in;
regT1b = 1; regT1=(reg+1)%8; if (regT1 != regT1in) xMOVAPS(ptr128[mVU->xmmCTemp], regT1);
SSE_MOVAPS_XMM_to_M128((uptr)mVU->xmmCTemp, regT1);
//regT1 = mVU->regAlloc->allocReg();
}
switch (xyzw) { switch (xyzw) {
case 1: case 2: case 4: case 8: case 1: case 2: case 4: case 8:
SSE_MOVAPS_XMM_to_XMM(regT1, reg); xMOVAPS(regT1, reg);
SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit); xAND.PS(regT1, ptr128[mVUglob.signbit]);
SSE_MINSS_M32_to_XMM (reg, (uptr)mVUglob.maxvals); xMIN.SS(reg, ptr128[mVUglob.maxvals]);
SSE_MAXSS_M32_to_XMM (reg, (uptr)mVUglob.minvals); xMAX.SS(reg, ptr128[mVUglob.minvals]);
SSE_ORPS_XMM_to_XMM (reg, regT1); xOR.PS (reg, regT1);
break; break;
default: default:
SSE_MOVAPS_XMM_to_XMM(regT1, reg); xMOVAPS(regT1, reg);
SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit); xAND.PS(regT1, ptr128[mVUglob.signbit]);
SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals); xMIN.PS(reg, ptr128[mVUglob.maxvals]);
SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals); xMAX.PS(reg, ptr128[mVUglob.minvals]);
SSE_ORPS_XMM_to_XMM (reg, regT1); xOR.PS (reg, regT1);
break; break;
} }
//if (regT1b) mVU->regAlloc->clearNeeded(regT1); //if (regT1 != regT1in) mVU->regAlloc->clearNeeded(regT1);
if (regT1b) SSE_MOVAPS_M128_to_XMM(regT1, (uptr)mVU->xmmCTemp); if (regT1 != regT1in) xMOVAPS(regT1, ptr128[mVU->xmmCTemp]);
} }
else mVUclamp1(reg, regT1, xyzw, bClampE); else mVUclamp1(reg, regT1in, xyzw, bClampE);
} }
// Used for operand clamping on every SSE instruction (add/sub/mul/div) // Used for operand clamping on every SSE instruction (add/sub/mul/div)
void mVUclamp3(microVU* mVU, int reg, int regT1, int xyzw) { void mVUclamp3(microVU* mVU, xmm reg, xmm regT1, int xyzw) {
if (clampE) mVUclamp2(mVU, reg, regT1, xyzw, 1); if (clampE) mVUclamp2(mVU, reg, regT1, xyzw, 1);
} }
@ -101,6 +98,6 @@ void mVUclamp3(microVU* mVU, int reg, int regT1, int xyzw) {
// emulated opcodes (causing crashes). Since we're clamping the operands // emulated opcodes (causing crashes). Since we're clamping the operands
// with mVUclamp3, we should almost never be getting a NaN result, // with mVUclamp3, we should almost never be getting a NaN result,
// but this clamp is just a precaution just-in-case. // but this clamp is just a precaution just-in-case.
void mVUclamp4(int reg, int regT1, int xyzw) { void mVUclamp4(xmm reg, xmm regT1, int xyzw) {
if (clampE && !CHECK_VU_SIGN_OVERFLOW) mVUclamp1(reg, regT1, xyzw, 1); if (clampE && !CHECK_VU_SIGN_OVERFLOW) mVUclamp1(reg, regT1, xyzw, 1);
} }
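
Both clamp paths above pull infinities and NaNs back into range while keeping the sign bit: the non-SSE4 branch copies the sign bits aside, clamps against mVUglob.maxvals/minvals, then ORs the signs back in. As a plain-intrinsics illustration of what that emitted sequence computes (added for clarity, not part of this commit):

#include <xmmintrin.h>

// Illustration only: the and/min/max/or sequence mVUclamp2 emits, written
// with SSE intrinsics. signmask, maxvals and minvals stand in for
// mVUglob.signbit, mVUglob.maxvals and mVUglob.minvals.
static __m128 clampPreserveSign(__m128 reg, __m128 signmask, __m128 maxvals, __m128 minvals)
{
	__m128 sign = _mm_and_ps(reg, signmask); // save the original sign bits
	reg = _mm_min_ps(reg, maxvals);          // NaN/+inf become +fmax
	reg = _mm_max_ps(reg, minvals);          // -inf becomes -fmax
	return _mm_or_ps(reg, sign);             // restore the saved signs
}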


@ -126,7 +126,7 @@ void doIbit(mV) {
} }
else tempI = curI; else tempI = curI;
MOV32ItoM((uptr)&mVU->regs->VI[REG_I].UL, tempI); xMOV(ptr32[&mVU->regs->VI[REG_I].UL], tempI);
incPC(1); incPC(1);
} }
} }
@ -134,21 +134,27 @@ void doIbit(mV) {
void doSwapOp(mV) { void doSwapOp(mV) {
if (mVUinfo.backupVF && !mVUlow.noWriteVF) { if (mVUinfo.backupVF && !mVUlow.noWriteVF) {
DevCon.WriteLn(Color_Green, "microVU%d: Backing Up VF Reg [%04x]", getIndex, xPC); DevCon.WriteLn(Color_Green, "microVU%d: Backing Up VF Reg [%04x]", getIndex, xPC);
int t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg); xmm t2 = mVU->regAlloc->allocReg();
int t2 = mVU->regAlloc->allocReg(); {
SSE_MOVAPS_XMM_to_XMM(t2, t1); xmm t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg);
mVU->regAlloc->clearNeeded(t1); xMOVAPS(t2, t1);
mVU->regAlloc->clearNeeded(t1);
}
mVUopL(mVU, 1); mVUopL(mVU, 1);
t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg, mVUlow.VF_write.reg, 0xf, 0); {
SSE_XORPS_XMM_to_XMM(t2, t1); xmm t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg, mVUlow.VF_write.reg, 0xf, 0);
SSE_XORPS_XMM_to_XMM(t1, t2); xXOR.PS(t2, t1);
SSE_XORPS_XMM_to_XMM(t2, t1); xXOR.PS(t1, t2);
mVU->regAlloc->clearNeeded(t1); xXOR.PS(t2, t1);
mVU->regAlloc->clearNeeded(t1);
}
incPC(1); incPC(1);
doUpperOp(); doUpperOp();
t1 = mVU->regAlloc->allocReg(-1, mVUlow.VF_write.reg, 0xf); {
SSE_MOVAPS_XMM_to_XMM(t1, t2); xmm t1 = mVU->regAlloc->allocReg(-1, mVUlow.VF_write.reg, 0xf);
mVU->regAlloc->clearNeeded(t1); xMOVAPS(t1, t2);
mVU->regAlloc->clearNeeded(t1);
}
mVU->regAlloc->clearNeeded(t2); mVU->regAlloc->clearNeeded(t2);
} }
else { mVUopL(mVU, 1); incPC(1); doUpperOp(); } else { mVUopL(mVU, 1); incPC(1); doUpperOp(); }
@ -165,9 +171,9 @@ _f void mVUcheckBadOp(mV) {
// Prints msg when exiting block early if 1st op was a bad opcode (Dawn of Mana Level 2) // Prints msg when exiting block early if 1st op was a bad opcode (Dawn of Mana Level 2)
_f void handleBadOp(mV, int count) { _f void handleBadOp(mV, int count) {
if (mVUinfo.isBadOp && count == 0) { if (mVUinfo.isBadOp && count == 0) {
MOV32ItoR(gprT2, (uptr)mVU); xMOV(ecx, (uptr)mVU);
if (!isVU1) CALLFunc((uptr)mVUbadOp0); if (!isVU1) xCALL(mVUbadOp0);
else CALLFunc((uptr)mVUbadOp1); else xCALL(mVUbadOp1);
} }
} }
@ -302,7 +308,7 @@ _f bool doEarlyExit(microVU* mVU) {
_f void mVUsavePipelineState(microVU* mVU) { _f void mVUsavePipelineState(microVU* mVU) {
u32* lpS = (u32*)&mVU->prog.lpState.vi15; u32* lpS = (u32*)&mVU->prog.lpState.vi15;
for (int i = 0; i < (sizeof(microRegInfo)-4)/4; i++, lpS++) { for (int i = 0; i < (sizeof(microRegInfo)-4)/4; i++, lpS++) {
MOV32ItoM((uptr)lpS, lpS[0]); xMOV(ptr32[lpS], lpS[0]);
} }
} }
@ -311,18 +317,19 @@ void mVUtestCycles(microVU* mVU) {
iPC = mVUstartPC; iPC = mVUstartPC;
mVUdebugNOW(0); mVUdebugNOW(0);
if (doEarlyExit(mVU)) { if (doEarlyExit(mVU)) {
CMP32ItoM((uptr)&mVU->cycles, 0); xCMP(ptr32[&mVU->cycles], 0);
u32* jmp32 = JG32(0); xForwardJG32 skip;
// FIXME: uh... actually kind of a pain with xForwardJump
//if (!isVU1) { TEST32ItoM((uptr)&mVU->regs->flags, VUFLAG_MFLAGSET); vu0jmp = JZ32(0); } //if (!isVU1) { TEST32ItoM((uptr)&mVU->regs->flags, VUFLAG_MFLAGSET); vu0jmp = JZ32(0); }
MOV32ItoR(gprT2, (uptr)mVU); xMOV(ecx, (uptr)mVU);
if (isVU1) CALLFunc((uptr)mVUwarning1); if (isVU1) xCALL(mVUwarning1);
//else CALLFunc((uptr)mVUwarning0); // VU0 is allowed early exit for COP2 Interlock Simulation //else xCALL(mVUwarning0); // VU0 is allowed early exit for COP2 Interlock Simulation
mVUsavePipelineState(mVU); mVUsavePipelineState(mVU);
mVUendProgram(mVU, NULL, 0); mVUendProgram(mVU, NULL, 0);
//if (!isVU1) x86SetJ32(vu0jmp); //if (!isVU1) vu0jmp.SetTarget();
x86SetJ32(jmp32); skip.SetTarget();
} }
SUB32ItoM((uptr)&mVU->cycles, mVUcycles); xSUB(ptr32[&mVU->cycles], mVUcycles);
} }
// Initialize VI Constants (vi15 propagates through blocks) // Initialize VI Constants (vi15 propagates through blocks)
@ -410,7 +417,7 @@ _r void* mVUcompile(microVU* mVU, u32 startPC, uptr pState) {
u32 x = 0; u32 x = 0;
for (; x < endCount; x++) { for (; x < endCount; x++) {
if (mVUinfo.isEOB) { handleBadOp(mVU, x); x = 0xffff; } if (mVUinfo.isEOB) { handleBadOp(mVU, x); x = 0xffff; }
if (mVUup.mBit) { OR32ItoM((uptr)&mVU->regs->flags, VUFLAG_MFLAGSET); } if (mVUup.mBit) { xOR(ptr32[&mVU->regs->flags], VUFLAG_MFLAGSET); }
if (mVUlow.isNOP) { incPC(1); doUpperOp(); doIbit(mVU); } if (mVUlow.isNOP) { incPC(1); doUpperOp(); doIbit(mVU); }
else if (!mVUinfo.swapOps) { incPC(1); doUpperOp(); doLowerOp(); } else if (!mVUinfo.swapOps) { incPC(1); doUpperOp(); doLowerOp(); }
else { doSwapOp(mVU); } else { doSwapOp(mVU); }


@ -43,25 +43,25 @@ void mVUdispatcherA(mV) {
// Load Regs // Load Regs
#if 1 // CHECK_MACROVU0 - Always on now #if 1 // CHECK_MACROVU0 - Always on now
MOV32MtoR(gprF0, (uptr)&mVU->regs->VI[REG_STATUS_FLAG].UL); xMOV(gprF[0], ptr32[&mVU->regs->VI[REG_STATUS_FLAG].UL]);
MOV32RtoR(gprF1, gprF0); xMOV(gprF[1], gprF[0]);
MOV32RtoR(gprF2, gprF0); xMOV(gprF[2], gprF[0]);
MOV32RtoR(gprF3, gprF0); xMOV(gprF[3], gprF[0]);
#else #else
mVUallocSFLAGd((uptr)&mVU->regs->VI[REG_STATUS_FLAG].UL, 1); mVUallocSFLAGd((uptr)&mVU->regs->VI[REG_STATUS_FLAG].UL, 1);
#endif #endif
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&mVU->regs->VI[REG_MAC_FLAG].UL); xMOVAPS(xmmT1, ptr128[&mVU->regs->VI[REG_MAC_FLAG].UL]);
SSE_SHUFPS_XMM_to_XMM (xmmT1, xmmT1, 0); xSHUF.PS(xmmT1, xmmT1, 0);
SSE_MOVAPS_XMM_to_M128((uptr)mVU->macFlag, xmmT1); xMOVAPS(ptr128[&mVU->macFlag[0]], xmmT1);
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&mVU->regs->VI[REG_CLIP_FLAG].UL); xMOVAPS(xmmT1, ptr128[&mVU->regs->VI[REG_CLIP_FLAG].UL]);
SSE_SHUFPS_XMM_to_XMM (xmmT1, xmmT1, 0); xSHUF.PS(xmmT1, xmmT1, 0);
SSE_MOVAPS_XMM_to_M128((uptr)mVU->clipFlag, xmmT1); xMOVAPS(ptr128[&mVU->clipFlag[0]], xmmT1);
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&mVU->regs->VI[REG_P].UL); xMOVAPS(xmmT1, ptr128[&mVU->regs->VI[REG_P].UL]);
SSE_MOVAPS_M128_to_XMM(xmmPQ, (uptr)&mVU->regs->VI[REG_Q].UL); xMOVAPS(xmmPQ, ptr128[&mVU->regs->VI[REG_Q].UL]);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmT1, 0); // wzyx = PPQQ xSHUF.PS(xmmPQ, xmmT1, 0); // wzyx = PPQQ
// Jump to Recompiled Code Block // Jump to Recompiled Code Block
xJMP(eax); xJMP(eax);


@ -20,12 +20,10 @@
// Sets FDIV Flags at the proper time // Sets FDIV Flags at the proper time
_f void mVUdivSet(mV) { _f void mVUdivSet(mV) {
int flagReg1, flagReg2;
if (mVUinfo.doDivFlag) { if (mVUinfo.doDivFlag) {
getFlagReg(flagReg1, sFLAG.write); if (!sFLAG.doFlag) { xMOV(getFlagReg(sFLAG.write), getFlagReg(sFLAG.lastWrite)); }
if (!sFLAG.doFlag) { getFlagReg(flagReg2, sFLAG.lastWrite); MOV32RtoR(flagReg1, flagReg2); } xAND(getFlagReg(sFLAG.write), 0xfff3ffff);
AND32ItoR(flagReg1, 0xfff3ffff); xOR (getFlagReg(sFLAG.write), ptr32[&mVU->divFlag]);
OR32MtoR (flagReg1, (uptr)&mVU->divFlag);
} }
} }
@ -159,9 +157,8 @@ _f void mVUsetFlags(mV, microFlagCycles& mFC) {
iPC = endPC; iPC = endPC;
} }
#define getFlagReg1(x) ((x == 3) ? gprF3 : ((x == 2) ? gprF2 : ((x == 1) ? gprF1 : gprF0))) #define getFlagReg2(x) ((bStatus[0] == x) ? getFlagReg(x) : gprT1)
#define getFlagReg2(x) ((bStatus[0] == x) ? getFlagReg1(x) : gprT1) #define getFlagReg3(x) ((gFlag == x) ? gprT1 : getFlagReg(x))
#define getFlagReg3(x) ((gFlag == x) ? gprT1 : getFlagReg1(x))
#define getFlagReg4(x) ((gFlag == x) ? gprT1 : gprT2) #define getFlagReg4(x) ((gFlag == x) ? gprT1 : gprT2)
#define shuffleMac ((bMac [3]<<6)|(bMac [2]<<4)|(bMac [1]<<2)|bMac [0]) #define shuffleMac ((bMac [3]<<6)|(bMac [2]<<4)|(bMac [1]<<2)|bMac [0])
#define shuffleClip ((bClip[3]<<6)|(bClip[2]<<4)|(bClip[1]<<2)|bClip[0]) #define shuffleClip ((bClip[3]<<6)|(bClip[2]<<4)|(bClip[1]<<2)|bClip[0])
@ -175,52 +172,52 @@ _f void mVUsetupFlags(mV, microFlagCycles& mFC) {
// DevCon::Status("sortRegs = %d", params sortRegs); // DevCon::Status("sortRegs = %d", params sortRegs);
// Note: Emitter will optimize out mov(reg1, reg1) cases... // Note: Emitter will optimize out mov(reg1, reg1) cases...
if (sortRegs == 1) { if (sortRegs == 1) {
MOV32RtoR(gprF0, getFlagReg1(bStatus[0])); xMOV(gprF[0], getFlagReg(bStatus[0]));
MOV32RtoR(gprF1, getFlagReg1(bStatus[1])); xMOV(gprF[1], getFlagReg(bStatus[1]));
MOV32RtoR(gprF2, getFlagReg1(bStatus[2])); xMOV(gprF[2], getFlagReg(bStatus[2]));
MOV32RtoR(gprF3, getFlagReg1(bStatus[3])); xMOV(gprF[3], getFlagReg(bStatus[3]));
} }
else if (sortRegs == 2) { else if (sortRegs == 2) {
MOV32RtoR(gprT1, getFlagReg1(bStatus[3])); xMOV(gprT1, getFlagReg(bStatus[3]));
MOV32RtoR(gprF0, getFlagReg1(bStatus[0])); xMOV(gprF[0], getFlagReg(bStatus[0]));
MOV32RtoR(gprF1, getFlagReg2(bStatus[1])); xMOV(gprF[1], getFlagReg2(bStatus[1]));
MOV32RtoR(gprF2, getFlagReg2(bStatus[2])); xMOV(gprF[2], getFlagReg2(bStatus[2]));
MOV32RtoR(gprF3, gprT1); xMOV(gprF[3], gprT1);
} }
else if (sortRegs == 3) { else if (sortRegs == 3) {
int gFlag = (bStatus[0] == bStatus[1]) ? bStatus[2] : bStatus[1]; int gFlag = (bStatus[0] == bStatus[1]) ? bStatus[2] : bStatus[1];
MOV32RtoR(gprT1, getFlagReg1(gFlag)); xMOV(gprT1, getFlagReg(gFlag));
MOV32RtoR(gprT2, getFlagReg1(bStatus[3])); xMOV(gprT2, getFlagReg(bStatus[3]));
MOV32RtoR(gprF0, getFlagReg1(bStatus[0])); xMOV(gprF[0], getFlagReg(bStatus[0]));
MOV32RtoR(gprF1, getFlagReg3(bStatus[1])); xMOV(gprF[1], getFlagReg3(bStatus[1]));
MOV32RtoR(gprF2, getFlagReg4(bStatus[2])); xMOV(gprF[2], getFlagReg4(bStatus[2]));
MOV32RtoR(gprF3, gprT2); xMOV(gprF[3], gprT2);
} }
else { else {
MOV32RtoR(gprT1, getFlagReg1(bStatus[0])); xMOV(gprT1, getFlagReg(bStatus[0]));
MOV32RtoR(gprT2, getFlagReg1(bStatus[1])); xMOV(gprT2, getFlagReg(bStatus[1]));
MOV32RtoR(gprT3, getFlagReg1(bStatus[2])); xMOV(gprT3, getFlagReg(bStatus[2]));
MOV32RtoR(gprF3, getFlagReg1(bStatus[3])); xMOV(gprF[3], getFlagReg(bStatus[3]));
MOV32RtoR(gprF0, gprT1); xMOV(gprF[0], gprT1);
MOV32RtoR(gprF1, gprT2); xMOV(gprF[1], gprT2);
MOV32RtoR(gprF2, gprT3); xMOV(gprF[2], gprT3);
} }
} }
if (__Mac) { if (__Mac) {
int bMac[4]; int bMac[4];
sortFlag(mFC.xMac, bMac, mFC.cycles); sortFlag(mFC.xMac, bMac, mFC.cycles);
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)mVU->macFlag); xMOVAPS(xmmT1, ptr128[mVU->macFlag]);
SSE_SHUFPS_XMM_to_XMM (xmmT1, xmmT1, shuffleMac); xSHUF.PS(xmmT1, xmmT1, shuffleMac);
SSE_MOVAPS_XMM_to_M128((uptr)mVU->macFlag, xmmT1); xMOVAPS(ptr128[mVU->macFlag], xmmT1);
} }
if (__Clip) { if (__Clip) {
int bClip[4]; int bClip[4];
sortFlag(mFC.xClip, bClip, mFC.cycles); sortFlag(mFC.xClip, bClip, mFC.cycles);
SSE_MOVAPS_M128_to_XMM(xmmT2, (uptr)mVU->clipFlag); xMOVAPS(xmmT2, ptr128[mVU->clipFlag]);
SSE_SHUFPS_XMM_to_XMM (xmmT2, xmmT2, shuffleClip); xSHUF.PS(xmmT2, xmmT2, shuffleClip);
SSE_MOVAPS_XMM_to_M128((uptr)mVU->clipFlag, xmmT2); xMOVAPS(ptr128[mVU->clipFlag], xmmT2);
} }
} }


@ -163,11 +163,6 @@ struct microIR {
// Reg Alloc // Reg Alloc
//------------------------------------------------------------------ //------------------------------------------------------------------
void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW);
void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW);
void mVUloadReg(int reg, uptr offset, int xyzw);
void mVUloadIreg(int reg, int xyzw, VURegs* vuRegs);
struct microMapXMM { struct microMapXMM {
int VFreg; // VF Reg Number Stored (-1 = Temp; 0 = vf0 and will not be written back; 32 = ACC; 33 = I reg) int VFreg; // VF Reg Number Stored (-1 = Temp; 0 = vf0 and will not be written back; 32 = ACC; 33 = I reg)
int xyzw; // xyzw to write back (0 = Don't write back anything AND cached vfReg has all vectors valid) int xyzw; // xyzw to write back (0 = Don't write back anything AND cached vfReg has all vectors valid)
@ -209,18 +204,18 @@ public:
} }
void reset() { void reset() {
for (int i = 0; i < xmmTotal; i++) { for (int i = 0; i < xmmTotal; i++) {
clearReg(i); clearReg(xmm(i));
} }
counter = 0; counter = 0;
} }
void flushAll(bool clearState = 1) { void flushAll(bool clearState = 1) {
for (int i = 0; i < xmmTotal; i++) { for (int i = 0; i < xmmTotal; i++) {
writeBackReg(i); writeBackReg(xmm(i));
if (clearState) clearReg(i); if (clearState) clearReg(xmm(i));
} }
} }
void clearReg(int reg) { void clearReg(xmm reg) {
microMapXMM& clear( xmmMap[reg] ); microMapXMM& clear( xmmMap[reg.Id] );
clear.VFreg = -1; clear.VFreg = -1;
clear.count = 0; clear.count = 0;
clear.xyzw = 0; clear.xyzw = 0;
@ -228,23 +223,23 @@ public:
} }
void clearRegVF(int VFreg) { void clearRegVF(int VFreg) {
for (int i = 0; i < xmmTotal; i++) { for (int i = 0; i < xmmTotal; i++) {
if (xmmMap[i].VFreg == VFreg) clearReg(i); if (xmmMap[i].VFreg == VFreg) clearReg(xmm(i));
} }
} }
void writeBackReg(int reg, bool invalidateRegs = 1) { void writeBackReg(xmm reg, bool invalidateRegs = 1) {
microMapXMM& write( xmmMap[reg] ); microMapXMM& write( xmmMap[reg.Id] );
if ((write.VFreg > 0) && write.xyzw) { // Reg was modified and not Temp or vf0 if ((write.VFreg > 0) && write.xyzw) { // Reg was modified and not Temp or vf0
if (write.VFreg == 33) SSE_MOVSS_XMM_to_M32((uptr)&vuRegs->VI[REG_I].UL, reg); if (write.VFreg == 33) xMOVSS(ptr32[&vuRegs->VI[REG_I].UL], reg);
else if (write.VFreg == 32) mVUsaveReg(reg, (uptr)&vuRegs->ACC.UL[0], write.xyzw, 1); else if (write.VFreg == 32) mVUsaveReg(reg, ptr[&vuRegs->ACC.UL[0]], write.xyzw, 1);
else mVUsaveReg(reg, (uptr)&vuRegs->VF[write.VFreg].UL[0], write.xyzw, 1); else mVUsaveReg(reg, ptr[&vuRegs->VF[write.VFreg].UL[0]], write.xyzw, 1);
if (invalidateRegs) { if (invalidateRegs) {
for (int i = 0; i < xmmTotal; i++) { for (int i = 0; i < xmmTotal; i++) {
microMapXMM& imap (xmmMap[i]); microMapXMM& imap (xmmMap[i]);
if ((i == reg) || imap.isNeeded) continue; if ((i == reg.Id) || imap.isNeeded) continue;
if (imap.VFreg == write.VFreg) { if (imap.VFreg == write.VFreg) {
if (imap.xyzw && imap.xyzw < 0xf) DevCon.Error("microVU Error: writeBackReg() [%d]", imap.VFreg); if (imap.xyzw && imap.xyzw < 0xf) DevCon.Error("microVU Error: writeBackReg() [%d]", imap.VFreg);
clearReg(i); // Invalidate any Cached Regs of same vf Reg clearReg(xmm(i)); // Invalidate any Cached Regs of same vf Reg
} }
} }
} }
@ -257,27 +252,28 @@ public:
} }
clearReg(reg); // Clear Reg clearReg(reg); // Clear Reg
} }
void clearNeeded(int reg) { void clearNeeded(xmm reg)
if ((reg < 0) || (reg >= xmmTotal)) return; {
if ((reg.Id < 0) || (reg.Id >= xmmTotal)) return;
microMapXMM& clear (xmmMap[reg]); microMapXMM& clear (xmmMap[reg.Id]);
clear.isNeeded = 0; clear.isNeeded = 0;
if (clear.xyzw) { // Reg was modified if (clear.xyzw) { // Reg was modified
if (clear.VFreg > 0) { if (clear.VFreg > 0) {
int mergeRegs = 0; int mergeRegs = 0;
if (clear.xyzw < 0xf) { mergeRegs = 1; } // Try to merge partial writes if (clear.xyzw < 0xf) { mergeRegs = 1; } // Try to merge partial writes
for (int i = 0; i < xmmTotal; i++) { // Invalidate any other read-only regs of same vfReg for (int i = 0; i < xmmTotal; i++) { // Invalidate any other read-only regs of same vfReg
if (i == reg) continue; if (i == reg.Id) continue;
microMapXMM& imap (xmmMap[i]); microMapXMM& imap (xmmMap[i]);
if (imap.VFreg == clear.VFreg) { if (imap.VFreg == clear.VFreg) {
if (imap.xyzw && imap.xyzw < 0xf) DevCon.Error("microVU Error: clearNeeded() [%d]", imap.VFreg); if (imap.xyzw && imap.xyzw < 0xf) DevCon.Error("microVU Error: clearNeeded() [%d]", imap.VFreg);
if (mergeRegs == 1) { if (mergeRegs == 1) {
mVUmergeRegs(i, reg, clear.xyzw, 1); mVUmergeRegs(xmm(i), reg, clear.xyzw, 1);
imap.xyzw = 0xf; imap.xyzw = 0xf;
imap.count = counter; imap.count = counter;
mergeRegs = 2; mergeRegs = 2;
} }
else clearReg(i); else clearReg(xmm(i));
} }
} }
if (mergeRegs == 2) clearReg(reg); // Clear Current Reg if Merged if (mergeRegs == 2) clearReg(reg); // Clear Current Reg if Merged
@ -286,10 +282,11 @@ public:
else clearReg(reg); // If Reg was temp or vf0, then invalidate itself else clearReg(reg); // If Reg was temp or vf0, then invalidate itself
} }
} }
int allocReg(int vfLoadReg = -1, int vfWriteReg = -1, int xyzw = 0, bool cloneWrite = 1) { xmm allocReg(int vfLoadReg = -1, int vfWriteReg = -1, int xyzw = 0, bool cloneWrite = 1) {
counter++; counter++;
if (vfLoadReg >= 0) { // Search For Cached Regs if (vfLoadReg >= 0) { // Search For Cached Regs
for (int i = 0; i < xmmTotal; i++) { for (int i = 0; i < xmmTotal; i++) {
xmm xmmi(i);
microMapXMM& imap (xmmMap[i]); microMapXMM& imap (xmmMap[i]);
if ((imap.VFreg == vfLoadReg) && (!imap.xyzw // Reg Was Not Modified if ((imap.VFreg == vfLoadReg) && (!imap.xyzw // Reg Was Not Modified
|| (imap.VFreg && (imap.xyzw==0xf)))) { // Reg Had All Vectors Modified and != VF0 || (imap.VFreg && (imap.xyzw==0xf)))) { // Reg Had All Vectors Modified and != VF0
@ -297,49 +294,51 @@ public:
if (vfWriteReg >= 0) { // Reg will be modified if (vfWriteReg >= 0) { // Reg will be modified
if (cloneWrite) { // Clone Reg so as not to use the same Cached Reg if (cloneWrite) { // Clone Reg so as not to use the same Cached Reg
z = findFreeReg(); z = findFreeReg();
writeBackReg(z); xmm xmmz(z);
if (z!=i && xyzw==8) SSE_MOVAPS_XMM_to_XMM (z, i); writeBackReg(xmmz);
else if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1); if (z!=i && xyzw==8) xMOVAPS (xmmz, xmmi);
else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2); else if (xyzw == 4) xPSHUF.D(xmmz, xmmi, 1);
else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3); else if (xyzw == 2) xPSHUF.D(xmmz, xmmi, 2);
else if (z != i) SSE_MOVAPS_XMM_to_XMM (z, i); else if (xyzw == 1) xPSHUF.D(xmmz, xmmi, 3);
else if (z != i) xMOVAPS (xmmz, xmmi);
imap.count = counter; // Reg i was used, so update counter imap.count = counter; // Reg i was used, so update counter
} }
else { // Don't clone reg, but shuffle to adjust for SS ops else { // Don't clone reg, but shuffle to adjust for SS ops
if ((vfLoadReg != vfWriteReg) || (xyzw != 0xf)) { writeBackReg(z); } if ((vfLoadReg != vfWriteReg) || (xyzw != 0xf)) { writeBackReg(xmmi); }
if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1); if (xyzw == 4) xPSHUF.D(xmmi, xmmi, 1);
else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2); else if (xyzw == 2) xPSHUF.D(xmmi, xmmi, 2);
else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3); else if (xyzw == 1) xPSHUF.D(xmmi, xmmi, 3);
} }
xmmMap[z].VFreg = vfWriteReg; xmmMap[z].VFreg = vfWriteReg;
xmmMap[z].xyzw = xyzw; xmmMap[z].xyzw = xyzw;
} }
xmmMap[z].count = counter; xmmMap[z].count = counter;
xmmMap[z].isNeeded = 1; xmmMap[z].isNeeded = 1;
return z; return xmm(z);
} }
} }
} }
int x = findFreeReg(); int x = findFreeReg();
writeBackReg(x); xmm xmmx(x);
writeBackReg(xmmx);
if (vfWriteReg >= 0) { // Reg Will Be Modified (allow partial reg loading) if (vfWriteReg >= 0) { // Reg Will Be Modified (allow partial reg loading)
if ((vfLoadReg == 0) && !(xyzw & 1)) { SSE2_PXOR_XMM_to_XMM(x, x); } if ((vfLoadReg == 0) && !(xyzw & 1)) { xPXOR(xmmx, xmmx); }
else if (vfLoadReg == 33) mVUloadIreg(x, xyzw, vuRegs); else if (vfLoadReg == 33) mVUloadIreg(xmmx, xyzw, vuRegs);
else if (vfLoadReg == 32) mVUloadReg (x, (uptr)&vuRegs->ACC.UL[0], xyzw); else if (vfLoadReg == 32) mVUloadReg (xmmx, ptr[&vuRegs->ACC.UL[0]], xyzw);
else if (vfLoadReg >= 0) mVUloadReg (x, (uptr)&vuRegs->VF[vfLoadReg].UL[0], xyzw); else if (vfLoadReg >= 0) mVUloadReg (xmmx, ptr[&vuRegs->VF[vfLoadReg].UL[0]], xyzw);
xmmMap[x].VFreg = vfWriteReg; xmmMap[x].VFreg = vfWriteReg;
xmmMap[x].xyzw = xyzw; xmmMap[x].xyzw = xyzw;
} }
else { // Reg Will Not Be Modified (always load full reg for caching) else { // Reg Will Not Be Modified (always load full reg for caching)
if (vfLoadReg == 33) mVUloadIreg(x, 0xf, vuRegs); if (vfLoadReg == 33) mVUloadIreg(xmmx, 0xf, vuRegs);
else if (vfLoadReg == 32) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->ACC.UL[0]); else if (vfLoadReg == 32) xMOVAPS(xmmx, ptr128[&vuRegs->ACC.UL[0]]);
else if (vfLoadReg >= 0) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->VF[vfLoadReg].UL[0]); else if (vfLoadReg >= 0) xMOVAPS(xmmx, ptr128[&vuRegs->VF[vfLoadReg].UL[0]]);
xmmMap[x].VFreg = vfLoadReg; xmmMap[x].VFreg = vfLoadReg;
xmmMap[x].xyzw = 0; xmmMap[x].xyzw = 0;
} }
xmmMap[x].count = counter; xmmMap[x].count = counter;
xmmMap[x].isNeeded = 1; xmmMap[x].isNeeded = 1;
return x; return xmmx;
} }
}; };
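
The allocator now hands out xRegisterSSE handles (the xmm typedef) instead of raw register indices, so emitter calls type-check against the register they were given. A hedged usage sketch, following the doSwapOp hunk earlier in this commit (the scoping block and variable names are illustrative):

// Sketch only: allocate, use, and release a cached VF register through the
// typed allocator, the same way doSwapOp above backs up a VF reg.
{
	xmm t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg); // cached copy of the VF reg
	xmm t2 = mVU->regAlloc->allocReg();                    // scratch register
	xMOVAPS(t2, t1);                                       // back the value up
	mVU->regAlloc->clearNeeded(t1);                        // release the handles
	mVU->regAlloc->clearNeeded(t2);
}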

File diff suppressed because it is too large.


@ -38,7 +38,7 @@ void setupMacroOp(int mode, const char* opName) {
iFlushCall(FLUSH_EVERYTHING); iFlushCall(FLUSH_EVERYTHING);
microVU0.regAlloc->reset(); microVU0.regAlloc->reset();
if (mode & 0x01) { // Q-Reg will be Read if (mode & 0x01) { // Q-Reg will be Read
SSE_MOVSS_M32_to_XMM(xmmPQ, (uptr)&microVU0.regs->VI[REG_Q].UL); xMOVSSZX(xmmPQ, ptr32[&microVU0.regs->VI[REG_Q].UL]);
} }
if (mode & 0x08) { // Clip Instruction if (mode & 0x08) { // Clip Instruction
microVU0.prog.IRinfo.info[0].cFlag.write = 0xff; microVU0.prog.IRinfo.info[0].cFlag.write = 0xff;
@ -51,16 +51,16 @@ void setupMacroOp(int mode, const char* opName) {
microVU0.prog.IRinfo.info[0].sFlag.lastWrite = 0; microVU0.prog.IRinfo.info[0].sFlag.lastWrite = 0;
microVU0.prog.IRinfo.info[0].mFlag.doFlag = 1; microVU0.prog.IRinfo.info[0].mFlag.doFlag = 1;
microVU0.prog.IRinfo.info[0].mFlag.write = 0xff; microVU0.prog.IRinfo.info[0].mFlag.write = 0xff;
MOV32MtoR(gprF0, (uptr)&microVU0.regs->VI[REG_STATUS_FLAG].UL); xMOV(gprF[0], ptr32[&microVU0.regs->VI[REG_STATUS_FLAG].UL]);
} }
} }
void endMacroOp(int mode) { void endMacroOp(int mode) {
if (mode & 0x02) { // Q-Reg was Written To if (mode & 0x02) { // Q-Reg was Written To
SSE_MOVSS_XMM_to_M32((uptr)&microVU0.regs->VI[REG_Q].UL, xmmPQ); xMOVSS(ptr32[&microVU0.regs->VI[REG_Q].UL], xmmPQ);
} }
if (mode & 0x10) { // Status/Mac Flags were Updated if (mode & 0x10) { // Status/Mac Flags were Updated
MOV32RtoM((uptr)&microVU0.regs->VI[REG_STATUS_FLAG].UL, gprF0); xMOV(ptr32[&microVU0.regs->VI[REG_STATUS_FLAG].UL], gprF[0]);
} }
microVU0.regAlloc->flushAll(); microVU0.regAlloc->flushAll();
microVU0.cop2 = 0; microVU0.cop2 = 0;
@ -253,11 +253,11 @@ void COP2_Interlock(bool mBitSync) {
} }
void TEST_FBRST_RESET(FnType_Void* resetFunct, int vuIndex) { void TEST_FBRST_RESET(FnType_Void* resetFunct, int vuIndex) {
TEST32ItoR(EAX, (vuIndex) ? 0x200 : 0x002); xTEST(eax, (vuIndex) ? 0x200 : 0x002);
j8Ptr[0] = JZ8(0); xForwardJZ8 skip;
xCALL(resetFunct); xCALL(resetFunct);
MOV32MtoR(EAX, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]); xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
x86SetJ8(j8Ptr[0]); skip.SetTarget();
} }
static void recCFC2() { static void recCFC2() {
@ -269,19 +269,19 @@ static void recCFC2() {
iFlushCall(FLUSH_EVERYTHING); iFlushCall(FLUSH_EVERYTHING);
if (_Rd_ == REG_STATUS_FLAG) { // Normalize Status Flag if (_Rd_ == REG_STATUS_FLAG) { // Normalize Status Flag
MOV32MtoR(gprF0, (uptr)&microVU0.regs->VI[REG_STATUS_FLAG].UL); xMOV(gprF[0], ptr32[&microVU0.regs->VI[REG_STATUS_FLAG].UL]);
mVUallocSFLAGc(EAX, gprF0, 0); mVUallocSFLAGc(eax, gprF[0], 0);
} }
else MOV32MtoR(EAX, (uptr)&microVU0.regs->VI[_Rd_].UL); else xMOV(eax, ptr32[&microVU0.regs->VI[_Rd_].UL]);
// FixMe: Should R-Reg have upper 9 bits 0? // FixMe: Should R-Reg have upper 9 bits 0?
MOV32RtoM((uptr)&cpuRegs.GPR.r[_Rt_].UL[0], EAX); xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]], eax);
if (_Rd_ >= 16) { if (_Rd_ >= 16) {
CDQ(); // Sign Extend xCDQ(); // Sign Extend
MOV32RtoM ((uptr)&cpuRegs.GPR.r[_Rt_].UL[1], EDX); xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[1]], edx);
} }
else MOV32ItoM((uptr)&cpuRegs.GPR.r[_Rt_].UL[1], 0); else xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[1]], 0);
// FixMe: I think this is needed, but not sure how it works // FixMe: I think this is needed, but not sure how it works
_eeOnWriteReg(_Rt_, 1); _eeOnWriteReg(_Rt_, 1);
@ -298,36 +298,36 @@ static void recCTC2() {
case REG_MAC_FLAG: case REG_TPC: case REG_MAC_FLAG: case REG_TPC:
case REG_VPU_STAT: break; // Read Only Regs case REG_VPU_STAT: break; // Read Only Regs
case REG_R: case REG_R:
MOV32MtoR(EAX, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]); xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
OR32ItoR (EAX, 0x3f800000); xOR (eax, 0x3f800000);
MOV32RtoM((uptr)&microVU0.regs->VI[REG_R].UL, EAX); xMOV(ptr32[&microVU0.regs->VI[REG_R].UL], eax);
break; break;
case REG_STATUS_FLAG: case REG_STATUS_FLAG:
if (_Rt_) { // Denormalizes flag into gprF1 if (_Rt_) { // Denormalizes flag into gprF1
mVUallocSFLAGd((uptr)&cpuRegs.GPR.r[_Rt_].UL[0], 0); mVUallocSFLAGd(&cpuRegs.GPR.r[_Rt_].UL[0], 0);
MOV32RtoM((uptr)&microVU0.regs->VI[_Rd_].UL, gprF1); xMOV(ptr32[&microVU0.regs->VI[_Rd_].UL], gprF[1]);
} }
else MOV32ItoM((uptr)&microVU0.regs->VI[_Rd_].UL, 0); else xMOV(ptr32[&microVU0.regs->VI[_Rd_].UL], 0);
break; break;
case REG_CMSAR1: // Execute VU1 Micro SubRoutine case REG_CMSAR1: // Execute VU1 Micro SubRoutine
if (_Rt_) { if (_Rt_) {
MOV32MtoR(ECX, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]); xMOV(ecx, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
} }
else XOR32RtoR(ECX,ECX); else xXOR(ecx, ecx);
xCALL(vu1ExecMicro); xCALL(vu1ExecMicro);
break; break;
case REG_FBRST: case REG_FBRST:
if (!_Rt_) { if (!_Rt_) {
MOV32ItoM((uptr)&microVU0.regs->VI[REG_FBRST].UL, 0); xMOV(ptr32[&microVU0.regs->VI[REG_FBRST].UL], 0);
return; return;
} }
else MOV32MtoR(EAX, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]); else xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
TEST_FBRST_RESET(vu0ResetRegs, 0); TEST_FBRST_RESET(vu0ResetRegs, 0);
TEST_FBRST_RESET(vu1ResetRegs, 1); TEST_FBRST_RESET(vu1ResetRegs, 1);
AND32ItoR(EAX, 0x0C0C); xAND(eax, 0x0C0C);
MOV32RtoM((uptr)&microVU0.regs->VI[REG_FBRST].UL, EAX); xMOV(ptr32[&microVU0.regs->VI[REG_FBRST].UL], eax);
break; break;
default: default:
// Executing vu0 block here fixes the intro of Ratchet and Clank // Executing vu0 block here fixes the intro of Ratchet and Clank
@ -349,8 +349,8 @@ static void recQMFC2() {
// FixMe: For some reason this line is needed or else games break: // FixMe: For some reason this line is needed or else games break:
_eeOnWriteReg(_Rt_, 0); _eeOnWriteReg(_Rt_, 0);
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&microVU0.regs->VF[_Rd_].UL[0]); xMOVAPS(xmmT1, ptr128[&microVU0.regs->VF[_Rd_]]);
SSE_MOVAPS_XMM_to_M128((uptr)&cpuRegs.GPR.r[_Rt_].UL[0], xmmT1); xMOVAPS(ptr128[&cpuRegs.GPR.r[_Rt_]], xmmT1);
} }
static void recQMTC2() { static void recQMTC2() {
@ -360,8 +360,8 @@ static void recQMTC2() {
if (!_Rd_) return; if (!_Rd_) return;
iFlushCall(FLUSH_EVERYTHING); iFlushCall(FLUSH_EVERYTHING);
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]); xMOVAPS(xmmT1, ptr128[&cpuRegs.GPR.r[_Rt_]]);
SSE_MOVAPS_XMM_to_M128((uptr)&microVU0.regs->VF[_Rd_].UL[0], xmmT1); xMOVAPS(ptr128[&microVU0.regs->VF[_Rd_]], xmmT1);
} }
//------------------------------------------------------------------ //------------------------------------------------------------------


@ -15,6 +15,8 @@
#pragma once #pragma once
using namespace x86Emitter;
//------------------------------------------------------------------ //------------------------------------------------------------------
// Global Variables // Global Variables
//------------------------------------------------------------------ //------------------------------------------------------------------
@ -32,6 +34,9 @@ struct mVU_Globals {
extern const __aligned(32) mVU_Globals mVUglob; extern const __aligned(32) mVU_Globals mVUglob;
typedef xRegisterSSE xmm;
typedef xRegister32 x32;
//------------------------------------------------------------------ //------------------------------------------------------------------
// Helper Macros // Helper Macros
//------------------------------------------------------------------ //------------------------------------------------------------------
@ -87,23 +92,21 @@ extern const __aligned(32) mVU_Globals mVUglob;
#define offsetSS ((_X) ? (0) : ((_Y) ? (4) : ((_Z) ? 8: 12))) #define offsetSS ((_X) ? (0) : ((_Y) ? (4) : ((_Z) ? 8: 12)))
#define offsetReg ((_X) ? (0) : ((_Y) ? (1) : ((_Z) ? 2: 3))) #define offsetReg ((_X) ? (0) : ((_Y) ? (1) : ((_Z) ? 2: 3)))
#define xmmT1 0 // Used for regAlloc const xmm
#define xmmT2 1 // Used for regAlloc xmmT1 = xmm(0), // Used for regAlloc
#define xmmT3 2 // Used for regAlloc xmmT2 = xmm(1), // Used for regAlloc
#define xmmT4 3 // Used for regAlloc xmmT3 = xmm(2), // Used for regAlloc
#define xmmT5 4 // Used for regAlloc xmmT4 = xmm(3), // Used for regAlloc
#define xmmT6 5 // Used for regAlloc xmmT5 = xmm(4), // Used for regAlloc
#define xmmT7 6 // Used for regAlloc xmmT6 = xmm(5), // Used for regAlloc
#define xmmPQ 7 // Holds the Value and Backup Values of P and Q regs xmmT7 = xmm(6), // Used for regAlloc
xmmPQ = xmm(7); // Holds the Value and Backup Values of P and Q regs
#define gprT1 0 // Temp Reg const x32
#define gprT2 1 // Temp Reg gprT1 = x32(0), // eax - Temp Reg
#define gprT3 2 // Temp Reg gprT2 = x32(1), // ecx - Temp Reg
#define gprF0 3 // Status Flag 0 gprT3 = x32(2), // edx - Temp Reg
#define gprESP 4 // Don't use? gprF[4] = {x32(3), x32(5), x32(6), x32(7)}; // ebx, ebp, esi, edi - Status Flags
#define gprF1 5 // Status Flag 1
#define gprF2 6 // Status Flag 2
#define gprF3 7 // Status Flag 3
// Function Params // Function Params
#define mP microVU* mVU, int recPass #define mP microVU* mVU, int recPass
@ -192,7 +195,7 @@ typedef u32 (__fastcall *mVUCall)(void*, void*);
#define branchAddrN ((xPC + 16 + (_Imm11_ * 8)) & (mVU->microMemSize-8)) #define branchAddrN ((xPC + 16 + (_Imm11_ * 8)) & (mVU->microMemSize-8))
#define shufflePQ (((mVU->p) ? 0xb0 : 0xe0) | ((mVU->q) ? 0x01 : 0x04)) #define shufflePQ (((mVU->p) ? 0xb0 : 0xe0) | ((mVU->q) ? 0x01 : 0x04))
#define cmpOffset(x) ((u8*)&(((u8*)x)[it[0].start])) #define cmpOffset(x) ((u8*)&(((u8*)x)[it[0].start]))
#define Rmem (uptr)&mVU->regs->VI[REG_R].UL #define Rmem &mVU->regs->VI[REG_R].UL
#define aWrap(x, m) ((x > m) ? 0 : x) #define aWrap(x, m) ((x > m) ? 0 : x)
#define shuffleSS(x) ((x==1)?(0x27):((x==2)?(0xc6):((x==4)?(0xe1):(0xe4)))) #define shuffleSS(x) ((x==1)?(0x27):((x==2)?(0xc6):((x==4)?(0xe1):(0xe4))))
#define _1mb (0x100000) #define _1mb (0x100000)
@ -295,8 +298,13 @@ typedef u32 (__fastcall *mVUCall)(void*, void*);
#define mVUdebugNOW(isEndPC) { \ #define mVUdebugNOW(isEndPC) { \
if (mVUdebugNow) { \ if (mVUdebugNow) { \
MOV32ItoR(gprT2, xPC); \ xMOV(gprT2, xPC); \
if (isEndPC) { CALLFunc((uptr)mVUprintPC2); } \ if (isEndPC) { xCALL(mVUprintPC2); } \
else { CALLFunc((uptr)mVUprintPC1); } \ else { xCALL(mVUprintPC1); } \
} \ } \
} }
void mVUmergeRegs(xmm dest, xmm src, int xyzw, bool modXYZW=false);
void mVUsaveReg(xmm reg, xAddressVoid ptr, int xyzw, bool modXYZW);
void mVUloadReg(xmm reg, xAddressVoid ptr, int xyzw);
void mVUloadIreg(xmm reg, int xyzw, VURegs* vuRegs);


@ -1,6 +1,6 @@
/* PCSX2 - PS2 Emulator for PCs /* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2010 PCSX2 Dev Team * Copyright (C) 2002-2010 PCSX2 Dev Team
* *
* PCSX2 is free software: you can redistribute it and/or modify it under the terms * PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found- * of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version. * ation, either version 3 of the License, or (at your option) any later version.
@ -19,247 +19,190 @@
// Micro VU - Reg Loading/Saving/Shuffling/Unpacking/Merging... // Micro VU - Reg Loading/Saving/Shuffling/Unpacking/Merging...
//------------------------------------------------------------------ //------------------------------------------------------------------
void mVUunpack_xyzw(int dstreg, int srcreg, int xyzw) { void mVUunpack_xyzw(xmm dstreg, xmm srcreg, int xyzw)
{
switch ( xyzw ) { switch ( xyzw ) {
case 0: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x00); break; case 0: xPSHUF.D(dstreg, srcreg, 0x00); break;
case 1: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x55); break; case 1: xPSHUF.D(dstreg, srcreg, 0x55); break;
case 2: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xaa); break; case 2: xPSHUF.D(dstreg, srcreg, 0xaa); break;
case 3: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xff); break; case 3: xPSHUF.D(dstreg, srcreg, 0xff); break;
} }
} }
void mVUloadReg(int reg, uptr offset, int xyzw) { void mVUloadReg(xmm reg, xAddressVoid ptr, int xyzw)
{
switch( xyzw ) { switch( xyzw ) {
case 8: SSE_MOVSS_M32_to_XMM(reg, offset); break; // X case 8: xMOVSSZX(reg, ptr32[ptr]); break; // X
case 4: SSE_MOVSS_M32_to_XMM(reg, offset+4); break; // Y case 4: xMOVSSZX(reg, ptr32[ptr+4]); break; // Y
case 2: SSE_MOVSS_M32_to_XMM(reg, offset+8); break; // Z case 2: xMOVSSZX(reg, ptr32[ptr+8]); break; // Z
case 1: SSE_MOVSS_M32_to_XMM(reg, offset+12); break; // W case 1: xMOVSSZX(reg, ptr32[ptr+12]); break; // W
default: SSE_MOVAPS_M128_to_XMM(reg, offset); break; default: xMOVAPS(reg, ptr128[ptr]); break;
} }
} }
void mVUloadReg2(int reg, int gprReg, uptr offset, int xyzw) { void mVUloadIreg(xmm reg, int xyzw, VURegs* vuRegs)
switch( xyzw ) { {
case 8: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset); break; // X xMOVSSZX(reg, ptr32[&vuRegs->VI[REG_I].UL]);
case 4: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset+4); break; // Y if (!_XYZWss(xyzw)) xSHUF.PS(reg, reg, 0);
case 2: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset+8); break; // Z
case 1: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset+12); break; // W
default: SSE_MOVAPSRmtoR(reg, gprReg, offset); break;
}
}
void mVUloadIreg(int reg, int xyzw, VURegs* vuRegs) {
SSE_MOVSS_M32_to_XMM(reg, (uptr)&vuRegs->VI[REG_I].UL);
if (!_XYZWss(xyzw)) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0);
} }
// Modifies the Source Reg! // Modifies the Source Reg!
void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW) { void mVUsaveReg(xmm reg, xAddressVoid ptr, int xyzw, bool modXYZW)
/*SSE_MOVAPS_M128_to_XMM(xmmT2, offset); {
/*xMOVAPS(xmmT2, ptr128[ptr]);
if (modXYZW && (xyzw == 8 || xyzw == 4 || xyzw == 2 || xyzw == 1)) { if (modXYZW && (xyzw == 8 || xyzw == 4 || xyzw == 2 || xyzw == 1)) {
mVUunpack_xyzw<vuIndex>(reg, reg, 0); mVUunpack_xyzw(reg, reg, 0);
} }
mVUmergeRegs(xmmT2, reg, xyzw); mVUmergeRegs(xmmT2, reg, xyzw);
SSE_MOVAPS_XMM_to_M128(offset, xmmT2); xMOVAPS(ptr128[ptr], xmmT2);
return;*/ return;*/
switch ( xyzw ) { switch ( xyzw ) {
case 5: if (x86caps.hasStreamingSIMD4Extensions) { case 5: if (x86caps.hasStreamingSIMD4Extensions) {
SSE4_EXTRACTPS_XMM_to_M32(offset+4, reg, 1); xEXTRACTPS(ptr32[ptr+4], reg, 1);
SSE4_EXTRACTPS_XMM_to_M32(offset+12, reg, 3); xEXTRACTPS(ptr32[ptr+12], reg, 3);
} }
else { else {
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xe1); //WZXY xPSHUF.D(reg, reg, 0xe1); //WZXY
SSE_MOVSS_XMM_to_M32(offset+4, reg); xMOVSS(ptr32[ptr+4], reg);
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW xPSHUF.D(reg, reg, 0xff); //WWWW
SSE_MOVSS_XMM_to_M32(offset+12, reg); xMOVSS(ptr32[ptr+12], reg);
} }
break; // YW break; // YW
case 6: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xc9); case 6: xPSHUF.D(reg, reg, 0xc9);
SSE_MOVLPS_XMM_to_M64(offset+4, reg); xMOVL.PS(ptr64[ptr+4], reg);
break; // YZ break; // YZ
case 7: if (x86caps.hasStreamingSIMD4Extensions) { case 7: if (x86caps.hasStreamingSIMD4Extensions) {
SSE_MOVHPS_XMM_to_M64(offset+8, reg); xMOVH.PS(ptr64[ptr+8], reg);
SSE4_EXTRACTPS_XMM_to_M32(offset+4, reg, 1); xEXTRACTPS(ptr32[ptr+4], reg, 1);
} }
else { else {
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0x93); //ZYXW xPSHUF.D(reg, reg, 0x93); //ZYXW
SSE_MOVHPS_XMM_to_M64(offset+4, reg); xMOVH.PS(ptr64[ptr+4], reg);
SSE_MOVSS_XMM_to_M32(offset+12, reg); xMOVSS(ptr32[ptr+12], reg);
} }
break; // YZW break; // YZW
case 9: if (x86caps.hasStreamingSIMD4Extensions) { case 9: if (x86caps.hasStreamingSIMD4Extensions) {
SSE_MOVSS_XMM_to_M32(offset, reg); xMOVSS(ptr32[ptr], reg);
SSE4_EXTRACTPS_XMM_to_M32(offset+12, reg, 3); xEXTRACTPS(ptr32[ptr+12], reg, 3);
} }
else { else {
SSE_MOVSS_XMM_to_M32(offset, reg); xMOVSS(ptr32[ptr], reg);
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW xPSHUF.D(reg, reg, 0xff); //WWWW
SSE_MOVSS_XMM_to_M32(offset+12, reg); xMOVSS(ptr32[ptr+12], reg);
} }
break; // XW break; // XW
case 10: if (x86caps.hasStreamingSIMD4Extensions) { case 10: if (x86caps.hasStreamingSIMD4Extensions) {
SSE_MOVSS_XMM_to_M32(offset, reg); xMOVSS(ptr32[ptr], reg);
SSE4_EXTRACTPS_XMM_to_M32(offset+8, reg, 2); xEXTRACTPS(ptr32[ptr+8], reg, 2);
} }
else { else {
SSE_MOVSS_XMM_to_M32(offset, reg); xMOVSS(ptr32[ptr], reg);
SSE_MOVHLPS_XMM_to_XMM(reg, reg); xMOVHL.PS(reg, reg);
SSE_MOVSS_XMM_to_M32(offset+8, reg); xMOVSS(ptr32[ptr+8], reg);
} }
break; //XZ break; //XZ
case 11: SSE_MOVSS_XMM_to_M32(offset, reg); case 11: xMOVSS(ptr32[ptr], reg);
SSE_MOVHPS_XMM_to_M64(offset+8, reg); xMOVH.PS(ptr64[ptr+8], reg);
break; //XZW break; //XZW
case 13: if (x86caps.hasStreamingSIMD4Extensions) { case 13: if (x86caps.hasStreamingSIMD4Extensions) {
SSE_MOVLPS_XMM_to_M64(offset, reg); xMOVL.PS(ptr64[ptr], reg);
SSE4_EXTRACTPS_XMM_to_M32(offset+12, reg, 3); xEXTRACTPS(ptr32[ptr+12], reg, 3);
} }
else { else {
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0x4b); //YXZW xPSHUF.D(reg, reg, 0x4b); //YXZW
SSE_MOVHPS_XMM_to_M64(offset, reg); xMOVH.PS(ptr64[ptr], reg);
SSE_MOVSS_XMM_to_M32(offset+12, reg); xMOVSS(ptr32[ptr+12], reg);
} }
break; // XYW break; // XYW
case 14: if (x86caps.hasStreamingSIMD4Extensions) { case 14: if (x86caps.hasStreamingSIMD4Extensions) {
SSE_MOVLPS_XMM_to_M64(offset, reg); xMOVL.PS(ptr64[ptr], reg);
SSE4_EXTRACTPS_XMM_to_M32(offset+8, reg, 2); xEXTRACTPS(ptr32[ptr+8], reg, 2);
} }
else { else {
SSE_MOVLPS_XMM_to_M64(offset, reg); xMOVL.PS(ptr64[ptr], reg);
SSE_MOVHLPS_XMM_to_XMM(reg, reg); xMOVHL.PS(reg, reg);
SSE_MOVSS_XMM_to_M32(offset+8, reg); xMOVSS(ptr32[ptr+8], reg);
} }
break; // XYZ break; // XYZ
case 4: if (!modXYZW) mVUunpack_xyzw(reg, reg, 1); case 4: if (!modXYZW) mVUunpack_xyzw(reg, reg, 1);
SSE_MOVSS_XMM_to_M32(offset+4, reg); xMOVSS(ptr32[ptr+4], reg);
break; // Y break; // Y
case 2: if (!modXYZW) mVUunpack_xyzw(reg, reg, 2); case 2: if (!modXYZW) mVUunpack_xyzw(reg, reg, 2);
SSE_MOVSS_XMM_to_M32(offset+8, reg); xMOVSS(ptr32[ptr+8], reg);
break; // Z break; // Z
case 1: if (!modXYZW) mVUunpack_xyzw(reg, reg, 3); case 1: if (!modXYZW) mVUunpack_xyzw(reg, reg, 3);
SSE_MOVSS_XMM_to_M32(offset+12, reg); xMOVSS(ptr32[ptr+12], reg);
break; // W break; // W
case 8: SSE_MOVSS_XMM_to_M32(offset, reg); break; // X case 8: xMOVSS(ptr32[ptr], reg); break; // X
case 12: SSE_MOVLPS_XMM_to_M64(offset, reg); break; // XY case 12: xMOVL.PS(ptr64[ptr], reg); break; // XY
case 3: SSE_MOVHPS_XMM_to_M64(offset+8, reg); break; // ZW case 3: xMOVH.PS(ptr64[ptr+8], reg); break; // ZW
default: SSE_MOVAPS_XMM_to_M128(offset, reg); break; // XYZW default: xMOVAPS(ptr128[ptr], reg); break; // XYZW
}
}
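For quick reference, the xyzw field select used by these stores packs the lanes as X=8, Y=4, Z=2, W=1. A minimal plain-C++ sketch of that convention (the helper name storeVF and the lane mapping are my own illustration, not emitter code):

// Illustration only: xyzw bit 3 = X, bit 2 = Y, bit 1 = Z, bit 0 = W.
static void storeVF(float* dst, const float* src, int xyzw)
{
    if (xyzw & 8) dst[0] = src[0]; // X
    if (xyzw & 4) dst[1] = src[1]; // Y
    if (xyzw & 2) dst[2] = src[2]; // Z
    if (xyzw & 1) dst[3] = src[3]; // W
}

For example, case 6 (YZ) above stores dst[1] and dst[2] with a single MOVLPS after the shuffle instead of two scalar writes.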
// Modifies the Source Reg!
void mVUsaveReg2(int reg, int gprReg, u32 offset, int xyzw) {
/*SSE_MOVAPSRmtoR(xmmT2, gprReg, offset);
if (xyzw == 8 || xyzw == 4 || xyzw == 2 || xyzw == 1) {
mVUunpack_xyzw<vuIndex>(reg, reg, 0);
}
mVUmergeRegs(xmmT2, reg, xyzw);
SSE_MOVAPSRtoRm(gprReg, xmmT2, offset);
return;*/
switch ( xyzw ) {
case 5: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xe1); //WZXY
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+4);
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12);
break; // YW
case 6: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xc9);
SSE_MOVLPS_XMM_to_Rm(gprReg, reg, offset+4);
break; // YZ
case 7: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0x93); //ZYXW
SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset+4);
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12);
break; // YZW
case 9: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset);
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12);
break; // XW
case 10: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset);
SSE_MOVHLPS_XMM_to_XMM(reg, reg);
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+8);
break; //XZ
case 11: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset);
SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset+8);
break; //XZW
case 13: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0x4b); //YXZW
SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset);
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12);
break; // XYW
case 14: SSE_MOVLPS_XMM_to_Rm(gprReg, reg, offset);
SSE_MOVHLPS_XMM_to_XMM(reg, reg);
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+8);
break; // XYZ
case 8: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset); break; // X
case 4: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+4); break; // Y
case 2: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+8); break; // Z
case 1: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12); break; // W
case 12: SSE_MOVLPS_XMM_to_Rm(gprReg, reg, offset); break; // XY
case 3: SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset+8); break; // ZW
default: SSE_MOVAPSRtoRm(gprReg, reg, offset); break; // XYZW
} }
} }
// Modifies the Source Reg! (ToDo: Optimize modXYZW = 1 cases) // Modifies the Source Reg! (ToDo: Optimize modXYZW = 1 cases)
void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0) { void mVUmergeRegs(xmm dest, xmm src, int xyzw, bool modXYZW)
{
xyzw &= 0xf; xyzw &= 0xf;
if ( (dest != src) && (xyzw != 0) ) { if ( (dest != src) && (xyzw != 0) ) {
if (x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf)) { if (x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf)) {
if (modXYZW) { if (modXYZW) {
if (xyzw == 1) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 3, 0)); return; } if (xyzw == 1) { xINSERTPS(dest, src, _MM_MK_INSERTPS_NDX(0, 3, 0)); return; }
else if (xyzw == 2) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 2, 0)); return; } else if (xyzw == 2) { xINSERTPS(dest, src, _MM_MK_INSERTPS_NDX(0, 2, 0)); return; }
else if (xyzw == 4) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 1, 0)); return; } else if (xyzw == 4) { xINSERTPS(dest, src, _MM_MK_INSERTPS_NDX(0, 1, 0)); return; }
} }
xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3); xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw); xBLEND.PS(dest, src, xyzw);
} }
else { else {
switch (xyzw) { switch (xyzw) {
case 1: if (modXYZW) mVUunpack_xyzw(src, src, 0); case 1: if (modXYZW) mVUunpack_xyzw(src, src, 0);
SSE_MOVHLPS_XMM_to_XMM(src, dest); // src = Sw Sz Dw Dz xMOVHL.PS(src, dest); // src = Sw Sz Dw Dz
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4); // 11 00 01 00 xSHUF.PS(dest, src, 0xc4); // 11 00 01 00
break; break;
case 2: if (modXYZW) mVUunpack_xyzw(src, src, 0); case 2: if (modXYZW) mVUunpack_xyzw(src, src, 0);
SSE_MOVHLPS_XMM_to_XMM(src, dest); xMOVHL.PS(src, dest);
SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64); xSHUF.PS(dest, src, 0x64);
break; break;
case 3: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); case 3: xSHUF.PS(dest, src, 0xe4);
break; break;
case 4: if (modXYZW) mVUunpack_xyzw(src, src, 0); case 4: if (modXYZW) mVUunpack_xyzw(src, src, 0);
SSE_MOVSS_XMM_to_XMM(src, dest); xMOVSS(src, dest);
SSE2_MOVSD_XMM_to_XMM(dest, src); xMOVSD(dest, src);
break; break;
case 5: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8); case 5: xSHUF.PS(dest, src, 0xd8);
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0xd8); xPSHUF.D(dest, dest, 0xd8);
break; break;
case 6: SSE_SHUFPS_XMM_to_XMM(dest, src, 0x9c); case 6: xSHUF.PS(dest, src, 0x9c);
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0x78); xPSHUF.D(dest, dest, 0x78);
break; break;
case 7: SSE_MOVSS_XMM_to_XMM(src, dest); case 7: xMOVSS(src, dest);
SSE_MOVAPS_XMM_to_XMM(dest, src); xMOVAPS(dest, src);
break; break;
case 8: SSE_MOVSS_XMM_to_XMM(dest, src); case 8: xMOVSS(dest, src);
break; break;
case 9: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc9); case 9: xSHUF.PS(dest, src, 0xc9);
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0xd2); xPSHUF.D(dest, dest, 0xd2);
break; break;
case 10: SSE_SHUFPS_XMM_to_XMM(dest, src, 0x8d); case 10: xSHUF.PS(dest, src, 0x8d);
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0x72); xPSHUF.D(dest, dest, 0x72);
break; break;
case 11: SSE_MOVSS_XMM_to_XMM(dest, src); case 11: xMOVSS(dest, src);
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); xSHUF.PS(dest, src, 0xe4);
break; break;
case 12: SSE2_MOVSD_XMM_to_XMM(dest, src); case 12: xMOVSD(dest, src);
break; break;
case 13: SSE_MOVHLPS_XMM_to_XMM(dest, src); case 13: xMOVHL.PS(dest, src);
SSE_SHUFPS_XMM_to_XMM(src, dest, 0x64); xSHUF.PS(src, dest, 0x64);
SSE_MOVAPS_XMM_to_XMM(dest, src); xMOVAPS(dest, src);
break; break;
case 14: SSE_MOVHLPS_XMM_to_XMM(dest, src); case 14: xMOVHL.PS(dest, src);
SSE_SHUFPS_XMM_to_XMM(src, dest, 0xc4); xSHUF.PS(src, dest, 0xc4);
SSE_MOVAPS_XMM_to_XMM(dest, src); xMOVAPS(dest, src);
break; break;
default: SSE_MOVAPS_XMM_to_XMM(dest, src); default: xMOVAPS(dest, src);
break; break;
} }
} }
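One note on the SSE4 path in this function: BLENDPS counts its immediate bits from lane 0 (x) upward, whereas microVU's xyzw field keeps x in bit 3, hence the bit reversal before xBLEND.PS. The same remap as a hypothetical helper:

// Illustration only: convert microVU's xyzw mask (x=8 .. w=1) to a BLENDPS
// immediate (bit0=x .. bit3=w).
static int xyzwToBlendImm(int xyzw)
{
    return ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
}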
@ -271,33 +214,35 @@ void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0) {
//------------------------------------------------------------------ //------------------------------------------------------------------
// Transforms the Address in gprReg to valid VU0/VU1 Address // Transforms the Address in gprReg to valid VU0/VU1 Address
_f void mVUaddrFix(mV, int gprReg) { _f void mVUaddrFix(mV, x32 gprReg)
{
if (isVU1) { if (isVU1) {
AND32ItoR(gprReg, 0x3ff); // wrap around xAND(gprReg, 0x3ff); // wrap around
SHL32ItoR(gprReg, 4); xSHL(gprReg, 4);
} }
else { else {
u8 *jmpA, *jmpB; xCMP(gprReg, 0x400);
CMP32ItoR(gprReg, 0x400); xForwardJL8 jmpA; // if addr >= 0x4000, reads VU1's VF regs and VI regs
jmpA = JL8(0); // if addr >= 0x4000, reads VU1's VF regs and VI regs xAND(gprReg, 0x43f); // ToDo: there's a potential problem if VU0 overrides VU1's VF0/VI0 regs!
AND32ItoR(gprReg, 0x43f); // ToDo: there's a potential problem if VU0 overrides VU1's VF0/VI0 regs! xForwardJump8 jmpB;
jmpB = JMP8(0); jmpA.SetTarget();
x86SetJ8(jmpA); xAND(gprReg, 0xff); // if addr < 0x4000, wrap around
AND32ItoR(gprReg, 0xff); // if addr < 0x4000, wrap around jmpB.SetTarget();
x86SetJ8(jmpB); xSHL(gprReg, 4); // multiply by 16 (shift left by 4)
SHL32ItoR(gprReg, 4); // multiply by 16 (shift left by 4)
} }
} }
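A rough plain-C++ reading of the transform emitted above (vuAddrFix is a hypothetical helper; the constants mirror the instructions, assuming VU1's 16KB and VU0's 4KB data spaces):

#include <cstdint>

// Illustration only: map a VU qword address to a byte offset into VU memory.
static uint32_t vuAddrFix(uint32_t addr, bool isVU1)
{
    if (isVU1)
        addr &= 0x3ff;       // VU1: wrap within 1024 qwords (16KB)
    else if (addr >= 0x400)
        addr &= 0x43f;       // VU0: 0x400+ maps onto VU1's VF/VI register area
    else
        addr &= 0xff;        // VU0: wrap within 256 qwords (4KB)
    return addr << 4;        // qword index -> byte offset
}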
// Backup Volatile Regs (EAX, ECX, EDX, MM0~7, XMM0~7, are all volatile according to 32bit Win/Linux ABI) // Backup Volatile Regs (EAX, ECX, EDX, MM0~7, XMM0~7, are all volatile according to 32bit Win/Linux ABI)
_f void mVUbackupRegs(microVU* mVU) { _f void mVUbackupRegs(microVU* mVU)
{
mVU->regAlloc->flushAll(); mVU->regAlloc->flushAll();
SSE_MOVAPS_XMM_to_M128((uptr)&mVU->xmmPQb[0], xmmPQ); xMOVAPS(ptr128[&mVU->xmmPQb[0]], xmmPQ);
} }
// Restore Volatile Regs // Restore Volatile Regs
_f void mVUrestoreRegs(microVU* mVU) { _f void mVUrestoreRegs(microVU* mVU)
SSE_MOVAPS_M128_to_XMM(xmmPQ, (uptr)&mVU->xmmPQb[0]); {
xMOVAPS(xmmPQ, ptr128[&mVU->xmmPQb[0]]);
} }
//------------------------------------------------------------------ //------------------------------------------------------------------
@ -306,7 +251,7 @@ _f void mVUrestoreRegs(microVU* mVU) {
struct SSEMaskPair { u32 mask1[4], mask2[4]; }; struct SSEMaskPair { u32 mask1[4], mask2[4]; };
static const __aligned16 SSEMaskPair MIN_MAX = static const __aligned16 SSEMaskPair MIN_MAX =
{ {
{0xffffffff, 0x80000000, 0xffffffff, 0x80000000}, {0xffffffff, 0x80000000, 0xffffffff, 0x80000000},
{0x00000000, 0x40000000, 0x00000000, 0x40000000} {0x00000000, 0x40000000, 0x00000000, 0x40000000}
@ -314,121 +259,117 @@ static const __aligned16 SSEMaskPair MIN_MAX =
// Warning: Modifies t1 and t2 // Warning: Modifies t1 and t2
void MIN_MAX_PS(microVU* mVU, int to, int from, int t1, int t2, bool min) { void MIN_MAX_PS(microVU* mVU, xmm to, xmm from, xmm t1in, xmm t2in, bool min)
bool t1b = 0, t2b = 0; {
if (t1 < 0) { t1 = mVU->regAlloc->allocReg(); t1b = 1; } xmm t1 = t1in.IsEmpty() ? mVU->regAlloc->allocReg() : t1in;
if (t2 < 0) { t2 = mVU->regAlloc->allocReg(); t2b = 1; } xmm t2 = t2in.IsEmpty() ? mVU->regAlloc->allocReg() : t2in;
// ZW // ZW
SSE2_PSHUFD_XMM_to_XMM(t1, to, 0xfa); xPSHUF.D(t1, to, 0xfa);
SSE2_PAND_M128_to_XMM (t1, (uptr)MIN_MAX.mask1); xPAND (t1, ptr128[MIN_MAX.mask1]);
SSE2_POR_M128_to_XMM (t1, (uptr)MIN_MAX.mask2); xPOR (t1, ptr128[MIN_MAX.mask2]);
SSE2_PSHUFD_XMM_to_XMM(t2, from, 0xfa); xPSHUF.D(t2, from, 0xfa);
SSE2_PAND_M128_to_XMM (t2, (uptr)MIN_MAX.mask1); xPAND (t2, ptr128[MIN_MAX.mask1]);
SSE2_POR_M128_to_XMM (t2, (uptr)MIN_MAX.mask2); xPOR (t2, ptr128[MIN_MAX.mask2]);
if (min) SSE2_MINPD_XMM_to_XMM(t1, t2); if (min) xMIN.PD(t1, t2);
else SSE2_MAXPD_XMM_to_XMM(t1, t2); else xMAX.PD(t1, t2);
// XY // XY
SSE2_PSHUFD_XMM_to_XMM(t2, from, 0x50); xPSHUF.D(t2, from, 0x50);
SSE2_PAND_M128_to_XMM (t2, (uptr)MIN_MAX.mask1); xPAND (t2, ptr128[MIN_MAX.mask1]);
SSE2_POR_M128_to_XMM (t2, (uptr)MIN_MAX.mask2); xPOR (t2, ptr128[MIN_MAX.mask2]);
SSE2_PSHUFD_XMM_to_XMM(to, to, 0x50); xPSHUF.D(to, to, 0x50);
SSE2_PAND_M128_to_XMM (to, (uptr)MIN_MAX.mask1); xPAND (to, ptr128[MIN_MAX.mask1]);
SSE2_POR_M128_to_XMM (to, (uptr)MIN_MAX.mask2); xPOR (to, ptr128[MIN_MAX.mask2]);
if (min) SSE2_MINPD_XMM_to_XMM(to, t2); if (min) xMIN.PD(to, t2);
else SSE2_MAXPD_XMM_to_XMM(to, t2); else xMAX.PD(to, t2);
SSE_SHUFPS_XMM_to_XMM(to, t1, 0x88); xSHUF.PS(to, t1, 0x88);
if (t1b) mVU->regAlloc->clearNeeded(t1); if (t1 != t1in) mVU->regAlloc->clearNeeded(t1);
if (t2b) mVU->regAlloc->clearNeeded(t2); if (t2 != t2in) mVU->regAlloc->clearNeeded(t2);
} }
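My reading of the masks above, offered as a sketch rather than a spec: each float's raw bits are placed into the low mantissa of a double whose sign is copied from the float and whose exponent field is forced to a safe finite value, so MINPD/MAXPD order the values the way the VU would without tripping over NaN/Inf bit patterns. A hypothetical scalar equivalent:

#include <cstdint>
#include <cstring>

// Illustration only: widen a VU float into an always-finite double whose
// ordering matches the VU's, then compare with ordinary double min/max.
static double vuWiden(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    uint64_t q = (uint64_t)bits
               | ((uint64_t)((bits & 0x80000000u) | 0x40000000u) << 32);
    double d;
    std::memcpy(&d, &q, sizeof(d));
    return d;
}

static float vuMax(float a, float b) { return vuWiden(a) > vuWiden(b) ? a : b; }
static float vuMin(float a, float b) { return vuWiden(a) < vuWiden(b) ? a : b; }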
// Warning: Modifies to's upper 3 vectors, and t1 // Warning: Modifies to's upper 3 vectors, and t1
void MIN_MAX_SS(mV, int to, int from, int t1, bool min) { void MIN_MAX_SS(mV, xmm to, xmm from, xmm t1in, bool min)
bool t1b = 0; {
if (t1 < 0) { t1 = mVU->regAlloc->allocReg(); t1b = 1; } xmm t1 = t1in.IsEmpty() ? mVU->regAlloc->allocReg() : t1in;
SSE_SHUFPS_XMM_to_XMM (to, from, 0); xSHUF.PS(to, from, 0);
SSE2_PAND_M128_to_XMM (to, (uptr)MIN_MAX.mask1); xPAND (to, ptr128[MIN_MAX.mask1]);
SSE2_POR_M128_to_XMM (to, (uptr)MIN_MAX.mask2); xPOR (to, ptr128[MIN_MAX.mask2]);
SSE2_PSHUFD_XMM_to_XMM(t1, to, 0xee); xPSHUF.D(t1, to, 0xee);
if (min) SSE2_MINPD_XMM_to_XMM(to, t1); if (min) xMIN.PD(to, t1);
else SSE2_MAXPD_XMM_to_XMM(to, t1); else xMAX.PD(to, t1);
if (t1b) mVU->regAlloc->clearNeeded(t1); if (t1 != t1in) mVU->regAlloc->clearNeeded(t1);
} }
// Warning: Modifies all vectors in 'to' and 'from', and Modifies xmmT1 and xmmT2 // Warning: Modifies all vectors in 'to' and 'from', and Modifies xmmT1 and xmmT2
void ADD_SS(microVU* mVU, int to, int from, int t1, int t2) { void ADD_SS(microVU* mVU, xmm to, xmm from, xmm t1in, xmm t2in)
{
xmm t1 = t1in.IsEmpty() ? mVU->regAlloc->allocReg() : t1in;
xmm t2 = t2in.IsEmpty() ? mVU->regAlloc->allocReg() : t2in;
u8 *localptr[8]; xMOVAPS(t1, to);
bool t1b = 0, t2b = 0; xMOVAPS(t2, from);
if (t1 < 0) { t1 = mVU->regAlloc->allocReg(); t1b = 1; } xMOVD(ecx, to);
if (t2 < 0) { t2 = mVU->regAlloc->allocReg(); t2b = 1; } xSHR(ecx, 23);
xMOVD(eax, from);
xSHR(eax, 23);
xAND(ecx, 0xff);
xAND(eax, 0xff);
xSUB(ecx, eax); //ecx = exponent difference
SSE_MOVAPS_XMM_to_XMM(t1, to); xCMP(ecx, 25);
SSE_MOVAPS_XMM_to_XMM(t2, from); xForwardJGE8 case2;
SSE2_MOVD_XMM_to_R(gprT2, to); xCMP(ecx, 0);
SHR32ItoR(gprT2, 23); xForwardJG8 case3;
SSE2_MOVD_XMM_to_R(gprT1, from); xForwardJE8 toend1;
SHR32ItoR(gprT1, 23); xCMP(ecx, -25);
AND32ItoR(gprT2, 0xff); xForwardJLE8 case4;
AND32ItoR(gprT1, 0xff);
SUB32RtoR(gprT2, gprT1); //gprT2 = exponent difference
CMP32ItoR(gprT2, 25); // negative small
localptr[0] = JGE8(0); xNOT(ecx); // -ecx - 1
CMP32ItoR(gprT2, 0); xMOV(eax, 0xffffffff);
localptr[1] = JG8(0); xSHL(eax, cl);
localptr[2] = JE8(0); xPCMP.EQB(to, to);
CMP32ItoR(gprT2, -25); xMOVDZX(from, eax);
localptr[3] = JLE8(0); xMOVSS(to, from);
NEG32R(gprT2); xPCMP.EQB(from, from);
DEC32R(gprT2); xForwardJump8 toend2;
MOV32ItoR(gprT1, 0xffffffff);
SHL32CLtoR(gprT1);
SSE2_PCMPEQB_XMM_to_XMM(to, to);
SSE2_MOVD_R_to_XMM(from, gprT1);
SSE_MOVSS_XMM_to_XMM(to, from);
SSE2_PCMPEQB_XMM_to_XMM(from, from);
localptr[4] = JMP8(0);
x86SetJ8(localptr[0]); case2.SetTarget(); // positive large
MOV32ItoR(gprT1, 0x80000000); xMOV(eax, 0x80000000);
SSE2_PCMPEQB_XMM_to_XMM(from, from); xPCMP.EQB(from, from);
SSE2_MOVD_R_to_XMM(to, gprT1); xMOVDZX(to, eax);
SSE_MOVSS_XMM_to_XMM(from, to); xMOVSS(from, to);
SSE2_PCMPEQB_XMM_to_XMM(to, to); xPCMP.EQB(to, to);
localptr[5] = JMP8(0); xForwardJump8 toend3;
x86SetJ8(localptr[1]); case3.SetTarget(); // positive small
DEC32R(gprT2); xDEC(ecx);
MOV32ItoR(gprT1, 0xffffffff); xMOV(eax, 0xffffffff);
SHL32CLtoR(gprT1); xSHL(eax, cl);
SSE2_PCMPEQB_XMM_to_XMM(from, from); xPCMP.EQB(from, from);
SSE2_MOVD_R_to_XMM(to, gprT1); xMOVDZX(to, eax);
SSE_MOVSS_XMM_to_XMM(from, to); xMOVSS(from, to);
SSE2_PCMPEQB_XMM_to_XMM(to, to); xPCMP.EQB(to, to);
localptr[6] = JMP8(0); xForwardJump8 toend4;
x86SetJ8(localptr[3]); case4.SetTarget(); // negative large
MOV32ItoR(gprT1, 0x80000000); xMOV(eax, 0x80000000);
SSE2_PCMPEQB_XMM_to_XMM(to, to); xPCMP.EQB(to, to);
SSE2_MOVD_R_to_XMM(from, gprT1); xMOVDZX(from, eax);
SSE_MOVSS_XMM_to_XMM(to, from); xMOVSS(to, from);
SSE2_PCMPEQB_XMM_to_XMM(from, from); xPCMP.EQB(from, from);
localptr[7] = JMP8(0);
x86SetJ8(localptr[2]); toend1.SetTarget();
x86SetJ8(localptr[4]); toend2.SetTarget();
x86SetJ8(localptr[5]); toend3.SetTarget();
x86SetJ8(localptr[6]); toend4.SetTarget();
x86SetJ8(localptr[7]);
SSE_ANDPS_XMM_to_XMM(to, t1); // to contains mask xAND.PS(to, t1); // to contains mask
SSE_ANDPS_XMM_to_XMM(from, t2); // from contains mask xAND.PS(from, t2); // from contains mask
SSE_ADDSS_XMM_to_XMM(to, from); xADD.SS(to, from);
if (t1b) mVU->regAlloc->clearNeeded(t1); if (t1 != t1in) mVU->regAlloc->clearNeeded(t1);
if (t2b) mVU->regAlloc->clearNeeded(t2); if (t2 != t2in) mVU->regAlloc->clearNeeded(t2);
} }
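As I read the branches above, the operand with the smaller exponent has the mantissa bits it would lose during alignment cleared (the VU truncates rather than rounds), and an exponent gap of 25 or more reduces it to a signed zero. A hypothetical scalar rendering (vuAdd is illustration only):

#include <cstdint>
#include <cstring>

// Illustration only: emulate the VU's truncating add by pre-masking the
// smaller operand, mirroring the four cases in the emitted code above.
static float vuAdd(float a, float b)
{
    uint32_t ia, ib;
    std::memcpy(&ia, &a, 4);
    std::memcpy(&ib, &b, 4);
    int diff = int((ia >> 23) & 0xff) - int((ib >> 23) & 0xff);

    if      (diff >=  25) ib &= 0x80000000u;                // b contributes nothing
    else if (diff >    0) ib &= 0xffffffffu << (diff - 1);  // drop bits b loses on alignment
    else if (diff <= -25) ia &= 0x80000000u;                // a contributes nothing
    else if (diff <    0) ia &= 0xffffffffu << (-diff - 1);

    std::memcpy(&a, &ia, 4);
    std::memcpy(&b, &ib, 4);
    return a + b;
}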
#define clampOp(opX, isPS) { \ #define clampOp(opX, isPS) { \
@ -438,53 +379,68 @@ void ADD_SS(microVU* mVU, int to, int from, int t1, int t2) {
mVUclamp4(to, t1, (isPS)?0xf:0x8); \ mVUclamp4(to, t1, (isPS)?0xf:0x8); \
} }
void SSE_MAXPS(mV, int to, int from, int t1 = -1, int t2 = -1) { void SSE_MAXPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
if (CHECK_VU_MINMAXHACK) { SSE_MAXPS_XMM_to_XMM(to, from); } {
if (CHECK_VU_MINMAXHACK) { xMAX.PS(to, from); }
else { MIN_MAX_PS(mVU, to, from, t1, t2, 0); } else { MIN_MAX_PS(mVU, to, from, t1, t2, 0); }
} }
void SSE_MINPS(mV, int to, int from, int t1 = -1, int t2 = -1) { void SSE_MINPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
if (CHECK_VU_MINMAXHACK) { SSE_MINPS_XMM_to_XMM(to, from); } {
if (CHECK_VU_MINMAXHACK) { xMIN.PS(to, from); }
else { MIN_MAX_PS(mVU, to, from, t1, t2, 1); } else { MIN_MAX_PS(mVU, to, from, t1, t2, 1); }
} }
void SSE_MAXSS(mV, int to, int from, int t1 = -1, int t2 = -1) { void SSE_MAXSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
if (CHECK_VU_MINMAXHACK) { SSE_MAXSS_XMM_to_XMM(to, from); } {
if (CHECK_VU_MINMAXHACK) { xMAX.SS(to, from); }
else { MIN_MAX_SS(mVU, to, from, t1, 0); } else { MIN_MAX_SS(mVU, to, from, t1, 0); }
} }
void SSE_MINSS(mV, int to, int from, int t1 = -1, int t2 = -1) { void SSE_MINSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
if (CHECK_VU_MINMAXHACK) { SSE_MINSS_XMM_to_XMM(to, from); } {
if (CHECK_VU_MINMAXHACK) { xMIN.SS(to, from); }
else { MIN_MAX_SS(mVU, to, from, t1, 1); } else { MIN_MAX_SS(mVU, to, from, t1, 1); }
} }
void SSE_ADD2SS(mV, int to, int from, int t1 = -1, int t2 = -1) { void SSE_ADD2SS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
if (!CHECK_VUADDSUBHACK) { clampOp(SSE_ADDSS_XMM_to_XMM, 0); } {
if (!CHECK_VUADDSUBHACK) { clampOp(xADD.SS, 0); }
else { ADD_SS(mVU, to, from, t1, t2); } else { ADD_SS(mVU, to, from, t1, t2); }
} }
void SSE_ADD2PS(mV, int to, int from, int t1 = -1, int t2 = -1) { // FIXME: why do we need two identical definitions with different names?
clampOp(SSE_ADDPS_XMM_to_XMM, 1); void SSE_ADD2PS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
clampOp(xADD.PS, 1);
} }
void SSE_ADDPS(mV, int to, int from, int t1 = -1, int t2 = -1) { void SSE_ADDPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
clampOp(SSE_ADDPS_XMM_to_XMM, 1); {
clampOp(xADD.PS, 1);
} }
void SSE_ADDSS(mV, int to, int from, int t1 = -1, int t2 = -1) { void SSE_ADDSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
clampOp(SSE_ADDSS_XMM_to_XMM, 0); {
clampOp(xADD.SS, 0);
} }
void SSE_SUBPS(mV, int to, int from, int t1 = -1, int t2 = -1) { void SSE_SUBPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
clampOp(SSE_SUBPS_XMM_to_XMM, 1); {
clampOp(xSUB.PS, 1);
} }
void SSE_SUBSS(mV, int to, int from, int t1 = -1, int t2 = -1) { void SSE_SUBSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
clampOp(SSE_SUBSS_XMM_to_XMM, 0); {
clampOp(xSUB.SS, 0);
} }
void SSE_MULPS(mV, int to, int from, int t1 = -1, int t2 = -1) { void SSE_MULPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
clampOp(SSE_MULPS_XMM_to_XMM, 1); {
clampOp(xMUL.PS, 1);
} }
void SSE_MULSS(mV, int to, int from, int t1 = -1, int t2 = -1) { void SSE_MULSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
clampOp(SSE_MULSS_XMM_to_XMM, 0); {
clampOp(xMUL.SS, 0);
} }
void SSE_DIVPS(mV, int to, int from, int t1 = -1, int t2 = -1) { void SSE_DIVPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
clampOp(SSE_DIVPS_XMM_to_XMM, 1); {
clampOp(xDIV.PS, 1);
} }
void SSE_DIVSS(mV, int to, int from, int t1 = -1, int t2 = -1) { void SSE_DIVSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
clampOp(SSE_DIVSS_XMM_to_XMM, 0); {
clampOp(xDIV.SS, 0);
} }
//------------------------------------------------------------------ //------------------------------------------------------------------
@ -493,7 +449,7 @@ void SSE_DIVSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
static __pagealigned u8 mVUsearchXMM[__pagesize]; static __pagealigned u8 mVUsearchXMM[__pagesize];
// Generates a custom optimized block-search function // Generates a custom optimized block-search function
// Note: Structs must be 16-byte aligned! (GCC doesn't guarantee this) // Note: Structs must be 16-byte aligned! (GCC doesn't guarantee this)
void mVUcustomSearch() { void mVUcustomSearch() {
HostSys::MemProtectStatic(mVUsearchXMM, Protect_ReadWrite, false); HostSys::MemProtectStatic(mVUsearchXMM, Protect_ReadWrite, false);
View File
@ -21,54 +21,61 @@
#define AND_XYZW ((_XYZW_SS && modXYZW) ? (1) : (mFLAG.doFlag ? (_X_Y_Z_W) : (flipMask[_X_Y_Z_W]))) #define AND_XYZW ((_XYZW_SS && modXYZW) ? (1) : (mFLAG.doFlag ? (_X_Y_Z_W) : (flipMask[_X_Y_Z_W])))
#define ADD_XYZW ((_XYZW_SS && modXYZW) ? (_X ? 3 : (_Y ? 2 : (_Z ? 1 : 0))) : 0) #define ADD_XYZW ((_XYZW_SS && modXYZW) ? (_X ? 3 : (_Y ? 2 : (_Z ? 1 : 0))) : 0)
#define SHIFT_XYZW(gprReg) { if (_XYZW_SS && modXYZW && !_W) { SHL32ItoR(gprReg, ADD_XYZW); } } #define SHIFT_XYZW(gprReg) { if (_XYZW_SS && modXYZW && !_W) { xSHL(gprReg, ADD_XYZW); } }
// Note: If modXYZW is true, then it adjusts XYZW for Single Scalar operations // Note: If modXYZW is true, then it adjusts XYZW for Single Scalar operations
static void mVUupdateFlags(mV, int reg, int regT1 = -1, int regT2 = -1, bool modXYZW = 1) { static void mVUupdateFlags(mV, xmm reg, xmm regT1in = xEmptyReg, xmm regT2 = xEmptyReg, bool modXYZW = 1) {
int sReg, mReg = gprT1, regT1b = 0, regT2b = 0; x32 mReg = gprT1;
bool regT2b = false;
static const u16 flipMask[16] = {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15}; static const u16 flipMask[16] = {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15};
//SysPrintf("Status = %d; Mac = %d\n", sFLAG.doFlag, mFLAG.doFlag); //SysPrintf("Status = %d; Mac = %d\n", sFLAG.doFlag, mFLAG.doFlag);
if (!sFLAG.doFlag && !mFLAG.doFlag) { return; } if (!sFLAG.doFlag && !mFLAG.doFlag) { return; }
if ((mFLAG.doFlag && !(_XYZW_SS && modXYZW))) {
if (regT2 < 0) { regT2 = mVU->regAlloc->allocReg(); regT2b = 1; } xmm regT1 = regT1in.IsEmpty() ? mVU->regAlloc->allocReg() : regT1in;
SSE2_PSHUFD_XMM_to_XMM(regT2, reg, 0x1B); // Flip wzyx to xyzw if ((mFLAG.doFlag && !(_XYZW_SS && modXYZW)))
{
if (regT2.IsEmpty())
{
regT2 = mVU->regAlloc->allocReg();
regT2b = true;
}
xPSHUF.D(regT2, reg, 0x1B); // Flip wzyx to xyzw
} }
else regT2 = reg; else
regT2 = reg;
if (sFLAG.doFlag) { if (sFLAG.doFlag) {
getFlagReg(sReg, sFLAG.write); // Set sReg to valid GPR by Cur Flag Instance mVUallocSFLAGa(getFlagReg(sFLAG.write), sFLAG.lastWrite); // Get Prev Status Flag
mVUallocSFLAGa(sReg, sFLAG.lastWrite); // Get Prev Status Flag if (sFLAG.doNonSticky) xAND(getFlagReg(sFLAG.write), 0xfffc00ff); // Clear O,U,S,Z flags
if (sFLAG.doNonSticky) AND32ItoR(sReg, 0xfffc00ff); // Clear O,U,S,Z flags
} }
if (regT1 < 0) { regT1 = mVU->regAlloc->allocReg(); regT1b = 1; }
//-------------------------Check for Signed flags------------------------------ //-------------------------Check for Signed flags------------------------------
SSE_MOVMSKPS_XMM_to_R32(mReg, regT2); // Move the Sign Bits of the t2reg xMOVMSKPS(mReg, regT2); // Move the Sign Bits of the t2reg
SSE_XORPS_XMM_to_XMM (regT1, regT1); // Clear regT1 xXOR.PS (regT1, regT1); // Clear regT1
SSE_CMPEQPS_XMM_to_XMM (regT1, regT2); // Set all F's if each vector is zero xCMPEQ.PS(regT1, regT2); // Set all F's if each vector is zero
SSE_MOVMSKPS_XMM_to_R32(gprT2, regT1); // Used for Zero Flag Calculation xMOVMSKPS(gprT2, regT1); // Used for Zero Flag Calculation
AND32ItoR(mReg, AND_XYZW); // Grab "Is Signed" bits from the previous calculation xAND(mReg, AND_XYZW); // Grab "Is Signed" bits from the previous calculation
SHL32ItoR(mReg, 4 + ADD_XYZW); xSHL(mReg, 4 + ADD_XYZW);
//-------------------------Check for Zero flags------------------------------ //-------------------------Check for Zero flags------------------------------
AND32ItoR(gprT2, AND_XYZW); // Grab "Is Zero" bits from the previous calculation xAND(gprT2, AND_XYZW); // Grab "Is Zero" bits from the previous calculation
if (mFLAG.doFlag) { SHIFT_XYZW(gprT2); } if (mFLAG.doFlag) { SHIFT_XYZW(gprT2); }
OR32RtoR(mReg, gprT2); xOR(mReg, gprT2);
//-------------------------Write back flags------------------------------ //-------------------------Write back flags------------------------------
if (mFLAG.doFlag) mVUallocMFLAGb(mVU, mReg, mFLAG.write); // Set Mac Flag if (mFLAG.doFlag) mVUallocMFLAGb(mVU, mReg, mFLAG.write); // Set Mac Flag
if (sFLAG.doFlag) { if (sFLAG.doFlag) {
OR32RtoR (sReg, mReg); xOR(getFlagReg(sFLAG.write), mReg);
if (sFLAG.doNonSticky) { if (sFLAG.doNonSticky) {
SHL32ItoR(mReg, 8); xSHL(mReg, 8);
OR32RtoR (sReg, mReg); xOR(getFlagReg(sFLAG.write), mReg);
} }
} }
if (regT1b) mVU->regAlloc->clearNeeded(regT1); if (regT1 != regT1in) mVU->regAlloc->clearNeeded(regT1);
if (regT2b) mVU->regAlloc->clearNeeded(regT2); if (regT2b) mVU->regAlloc->clearNeeded(regT2);
} }
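For orientation, my understanding of the MAC flag value built above: a sign nibble in bits 7..4 and a zero nibble in bits 3..0, each ordered x|y|z|w. A hypothetical scalar sketch that ignores the single-scalar ADD_XYZW shift and the status-flag merge:

#include <cstdint>
#include <cstring>

// Illustration only: build the 8-bit MAC result from a 4-lane result.
static uint32_t macFlag(const float res[4])
{
    uint32_t sign = 0, zero = 0;
    for (int i = 0; i < 4; i++) {
        uint32_t bits;
        std::memcpy(&bits, &res[i], 4);
        uint32_t lane = 8u >> i;               // x=8, y=4, z=2, w=1
        if (bits & 0x80000000u)    sign |= lane;
        if (!(bits & 0x7fffffffu)) zero |= lane;
    }
    return (sign << 4) | zero;
}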
@ -76,7 +83,7 @@ static void mVUupdateFlags(mV, int reg, int regT1 = -1, int regT2 = -1, bool mod
// Helper Macros and Functions // Helper Macros and Functions
//------------------------------------------------------------------ //------------------------------------------------------------------
static void (*SSE_PS[]) (microVU*, int, int, int, int) = { static void (*SSE_PS[]) (microVU*, xmm, xmm, xmm, xmm) = {
SSE_ADDPS, // 0 SSE_ADDPS, // 0
SSE_SUBPS, // 1 SSE_SUBPS, // 1
SSE_MULPS, // 2 SSE_MULPS, // 2
@ -85,7 +92,7 @@ static void (*SSE_PS[]) (microVU*, int, int, int, int) = {
SSE_ADD2PS // 5 SSE_ADD2PS // 5
}; };
static void (*SSE_SS[]) (microVU*, int, int, int, int) = { static void (*SSE_SS[]) (microVU*, xmm, xmm, xmm, xmm) = {
SSE_ADDSS, // 0 SSE_ADDSS, // 0
SSE_SUBSS, // 1 SSE_SUBSS, // 1
SSE_MULSS, // 2 SSE_MULSS, // 2
@ -122,9 +129,9 @@ void setupPass1(microVU* mVU, int opCase, bool isACC, bool noFlagUpdate) {
bool doSafeSub(microVU* mVU, int opCase, int opType, bool isACC) { bool doSafeSub(microVU* mVU, int opCase, int opType, bool isACC) {
opCase1 { opCase1 {
if ((opType == 1) && (_Ft_ == _Fs_)) { if ((opType == 1) && (_Ft_ == _Fs_)) {
int Fs = mVU->regAlloc->allocReg(-1, isACC ? 32 : _Fd_, _X_Y_Z_W); xmm Fs = mVU->regAlloc->allocReg(-1, isACC ? 32 : _Fd_, _X_Y_Z_W);
SSE2_PXOR_XMM_to_XMM(Fs, Fs); // Set to Positive 0 xPXOR(Fs, Fs); // Set to Positive 0
mVUupdateFlags(mVU, Fs, -1); mVUupdateFlags(mVU, Fs);
mVU->regAlloc->clearNeeded(Fs); mVU->regAlloc->clearNeeded(Fs);
return 1; return 1;
} }
@ -133,11 +140,11 @@ bool doSafeSub(microVU* mVU, int opCase, int opType, bool isACC) {
} }
// Sets Up Ft Reg for Normal, BC, I, and Q Cases // Sets Up Ft Reg for Normal, BC, I, and Q Cases
void setupFtReg(microVU* mVU, int& Ft, int& tempFt, int opCase) { void setupFtReg(microVU* mVU, xmm& Ft, xmm& tempFt, int opCase) {
opCase1 { opCase1 {
if (_XYZW_SS2) { Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W); tempFt = Ft; } if (_XYZW_SS2) { Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W); tempFt = Ft; }
else if (clampE) { Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf); tempFt = Ft; } else if (clampE) { Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf); tempFt = Ft; }
else { Ft = mVU->regAlloc->allocReg(_Ft_); tempFt = -1; } else { Ft = mVU->regAlloc->allocReg(_Ft_); tempFt = xEmptyReg; }
} }
opCase2 { opCase2 {
tempFt = mVU->regAlloc->allocReg(_Ft_); tempFt = mVU->regAlloc->allocReg(_Ft_);
@ -148,7 +155,7 @@ void setupFtReg(microVU* mVU, int& Ft, int& tempFt, int opCase) {
} }
opCase3 { Ft = mVU->regAlloc->allocReg(33, 0, _X_Y_Z_W); tempFt = Ft; } opCase3 { Ft = mVU->regAlloc->allocReg(33, 0, _X_Y_Z_W); tempFt = Ft; }
opCase4 { opCase4 {
if (!clampE && _XYZW_SS && !mVUinfo.readQ) { Ft = xmmPQ; tempFt = -1; } if (!clampE && _XYZW_SS && !mVUinfo.readQ) { Ft = xmmPQ; tempFt = xEmptyReg; }
else { Ft = mVU->regAlloc->allocReg(); tempFt = Ft; getQreg(Ft, mVUinfo.readQ); } else { Ft = mVU->regAlloc->allocReg(); tempFt = Ft; getQreg(Ft, mVUinfo.readQ); }
} }
} }
@ -159,27 +166,27 @@ void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, co
pass2 { pass2 {
if (doSafeSub(mVU, opCase, opType, isACC)) return; if (doSafeSub(mVU, opCase, opType, isACC)) return;
int Fs, Ft, ACC, tempFt; xmm Fs, Ft, ACC, tempFt;
setupFtReg(mVU, Ft, tempFt, opCase); setupFtReg(mVU, Ft, tempFt, opCase);
if (isACC) { if (isACC) {
Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W); Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
ACC = mVU->regAlloc->allocReg((_X_Y_Z_W == 0xf) ? -1 : 32, 32, 0xf, 0); ACC = mVU->regAlloc->allocReg((_X_Y_Z_W == 0xf) ? -1 : 32, 32, 0xf, 0);
if (_XYZW_SS2) SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W)); if (_XYZW_SS2) xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W));
} }
else { Fs = mVU->regAlloc->allocReg(_Fs_, _Fd_, _X_Y_Z_W); } else { Fs = mVU->regAlloc->allocReg(_Fs_, _Fd_, _X_Y_Z_W); }
if (clampType & cFt) mVUclamp2(mVU, Ft, -1, _X_Y_Z_W); if (clampType & cFt) mVUclamp2(mVU, Ft, xEmptyReg, _X_Y_Z_W);
if (clampType & cFs) mVUclamp2(mVU, Fs, -1, _X_Y_Z_W); if (clampType & cFs) mVUclamp2(mVU, Fs, xEmptyReg, _X_Y_Z_W);
if (_XYZW_SS) SSE_SS[opType](mVU, Fs, Ft, -1, -1); if (_XYZW_SS) SSE_SS[opType](mVU, Fs, Ft, xEmptyReg, xEmptyReg);
else SSE_PS[opType](mVU, Fs, Ft, -1, -1); else SSE_PS[opType](mVU, Fs, Ft, xEmptyReg, xEmptyReg);
if (isACC) { if (isACC) {
if (_XYZW_SS) SSE_MOVSS_XMM_to_XMM(ACC, Fs); if (_XYZW_SS) xMOVSS(ACC, Fs);
else mVUmergeRegs(ACC, Fs, _X_Y_Z_W); else mVUmergeRegs(ACC, Fs, _X_Y_Z_W);
mVUupdateFlags(mVU, ACC, Fs, tempFt); mVUupdateFlags(mVU, ACC, Fs, tempFt);
if (_XYZW_SS2) SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W)); if (_XYZW_SS2) xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W));
mVU->regAlloc->clearNeeded(ACC); mVU->regAlloc->clearNeeded(ACC);
} }
else mVUupdateFlags(mVU, Fs, tempFt); else mVUupdateFlags(mVU, Fs, tempFt);
@ -195,30 +202,30 @@ void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, co
void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, const char* opName, int clampType) { void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, const char* opName, int clampType) {
pass1 { setupPass1(mVU, opCase, 1, 0); } pass1 { setupPass1(mVU, opCase, 1, 0); }
pass2 { pass2 {
int Fs, Ft, ACC, tempFt; xmm Fs, Ft, ACC, tempFt;
setupFtReg(mVU, Ft, tempFt, opCase); setupFtReg(mVU, Ft, tempFt, opCase);
Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W); Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
ACC = mVU->regAlloc->allocReg(32, 32, 0xf, 0); ACC = mVU->regAlloc->allocReg(32, 32, 0xf, 0);
if (_XYZW_SS2) { SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W)); } if (_XYZW_SS2) { xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
if (clampType & cFt) mVUclamp2(mVU, Ft, -1, _X_Y_Z_W); if (clampType & cFt) mVUclamp2(mVU, Ft, xEmptyReg, _X_Y_Z_W);
if (clampType & cFs) mVUclamp2(mVU, Fs, -1, _X_Y_Z_W); if (clampType & cFs) mVUclamp2(mVU, Fs, xEmptyReg, _X_Y_Z_W);
if (_XYZW_SS) SSE_SS[2](mVU, Fs, Ft, -1, -1); if (_XYZW_SS) SSE_SS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg);
else SSE_PS[2](mVU, Fs, Ft, -1, -1); else SSE_PS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg);
if (_XYZW_SS || _X_Y_Z_W == 0xf) { if (_XYZW_SS || _X_Y_Z_W == 0xf) {
if (_XYZW_SS) SSE_SS[opType](mVU, ACC, Fs, tempFt, -1); if (_XYZW_SS) SSE_SS[opType](mVU, ACC, Fs, tempFt, xEmptyReg);
else SSE_PS[opType](mVU, ACC, Fs, tempFt, -1); else SSE_PS[opType](mVU, ACC, Fs, tempFt, xEmptyReg);
mVUupdateFlags(mVU, ACC, Fs, tempFt); mVUupdateFlags(mVU, ACC, Fs, tempFt);
if (_XYZW_SS && _X_Y_Z_W != 8) SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W)); if (_XYZW_SS && _X_Y_Z_W != 8) xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W));
} }
else { else {
int tempACC = mVU->regAlloc->allocReg(); xmm tempACC = mVU->regAlloc->allocReg();
SSE_MOVAPS_XMM_to_XMM(tempACC, ACC); xMOVAPS(tempACC, ACC);
SSE_PS[opType](mVU, tempACC, Fs, tempFt, -1); SSE_PS[opType](mVU, tempACC, Fs, tempFt, xEmptyReg);
mVUmergeRegs(ACC, tempACC, _X_Y_Z_W); mVUmergeRegs(ACC, tempACC, _X_Y_Z_W);
mVUupdateFlags(mVU, ACC, Fs, tempFt); mVUupdateFlags(mVU, ACC, Fs, tempFt);
mVU->regAlloc->clearNeeded(tempACC); mVU->regAlloc->clearNeeded(tempACC);
@ -236,22 +243,22 @@ void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, const char* op
void mVU_FMACc(microVU* mVU, int recPass, int opCase, const char* opName, int clampType) { void mVU_FMACc(microVU* mVU, int recPass, int opCase, const char* opName, int clampType) {
pass1 { setupPass1(mVU, opCase, 0, 0); } pass1 { setupPass1(mVU, opCase, 0, 0); }
pass2 { pass2 {
int Fs, Ft, ACC, tempFt; xmm Fs, Ft, ACC, tempFt;
setupFtReg(mVU, Ft, tempFt, opCase); setupFtReg(mVU, Ft, tempFt, opCase);
ACC = mVU->regAlloc->allocReg(32); ACC = mVU->regAlloc->allocReg(32);
Fs = mVU->regAlloc->allocReg(_Fs_, _Fd_, _X_Y_Z_W); Fs = mVU->regAlloc->allocReg(_Fs_, _Fd_, _X_Y_Z_W);
if (_XYZW_SS2) { SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W)); } if (_XYZW_SS2) { xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
if (clampType & cFt) mVUclamp2(mVU, Ft, -1, _X_Y_Z_W); if (clampType & cFt) mVUclamp2(mVU, Ft, xEmptyReg, _X_Y_Z_W);
if (clampType & cFs) mVUclamp2(mVU, Fs, -1, _X_Y_Z_W); if (clampType & cFs) mVUclamp2(mVU, Fs, xEmptyReg, _X_Y_Z_W);
if (clampType & cACC) mVUclamp2(mVU, ACC, -1, _X_Y_Z_W); if (clampType & cACC) mVUclamp2(mVU, ACC, xEmptyReg, _X_Y_Z_W);
if (_XYZW_SS) { SSE_SS[2](mVU, Fs, Ft, -1, -1); SSE_SS[0](mVU, Fs, ACC, tempFt, -1); } if (_XYZW_SS) { SSE_SS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg); SSE_SS[0](mVU, Fs, ACC, tempFt, xEmptyReg); }
else { SSE_PS[2](mVU, Fs, Ft, -1, -1); SSE_PS[0](mVU, Fs, ACC, tempFt, -1); } else { SSE_PS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg); SSE_PS[0](mVU, Fs, ACC, tempFt, xEmptyReg); }
if (_XYZW_SS2) { SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W)); } if (_XYZW_SS2) { xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
mVUupdateFlags(mVU, Fs, tempFt); mVUupdateFlags(mVU, Fs, tempFt);
@ -267,18 +274,18 @@ void mVU_FMACc(microVU* mVU, int recPass, int opCase, const char* opName, int cl
void mVU_FMACd(microVU* mVU, int recPass, int opCase, const char* opName, int clampType) { void mVU_FMACd(microVU* mVU, int recPass, int opCase, const char* opName, int clampType) {
pass1 { setupPass1(mVU, opCase, 0, 0); } pass1 { setupPass1(mVU, opCase, 0, 0); }
pass2 { pass2 {
int Fs, Ft, Fd, tempFt; xmm Fs, Ft, Fd, tempFt;
setupFtReg(mVU, Ft, tempFt, opCase); setupFtReg(mVU, Ft, tempFt, opCase);
Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W); Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
Fd = mVU->regAlloc->allocReg(32, _Fd_, _X_Y_Z_W); Fd = mVU->regAlloc->allocReg(32, _Fd_, _X_Y_Z_W);
if (clampType & cFt) mVUclamp2(mVU, Ft, -1, _X_Y_Z_W); if (clampType & cFt) mVUclamp2(mVU, Ft, xEmptyReg, _X_Y_Z_W);
if (clampType & cFs) mVUclamp2(mVU, Fs, -1, _X_Y_Z_W); if (clampType & cFs) mVUclamp2(mVU, Fs, xEmptyReg, _X_Y_Z_W);
if (clampType & cACC) mVUclamp2(mVU, Fd, -1, _X_Y_Z_W); if (clampType & cACC) mVUclamp2(mVU, Fd, xEmptyReg, _X_Y_Z_W);
if (_XYZW_SS) { SSE_SS[2](mVU, Fs, Ft, -1, -1); SSE_SS[1](mVU, Fd, Fs, tempFt, -1); } if (_XYZW_SS) { SSE_SS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg); SSE_SS[1](mVU, Fd, Fs, tempFt, xEmptyReg); }
else { SSE_PS[2](mVU, Fs, Ft, -1, -1); SSE_PS[1](mVU, Fd, Fs, tempFt, -1); } else { SSE_PS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg); SSE_PS[1](mVU, Fd, Fs, tempFt, xEmptyReg); }
mVUupdateFlags(mVU, Fd, Fs, tempFt); mVUupdateFlags(mVU, Fd, Fs, tempFt);
@ -295,8 +302,8 @@ mVUop(mVU_ABS) {
pass1 { mVUanalyzeFMAC2(mVU, _Fs_, _Ft_); } pass1 { mVUanalyzeFMAC2(mVU, _Fs_, _Ft_); }
pass2 { pass2 {
if (!_Ft_) return; if (!_Ft_) return;
int Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf))); xmm Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
SSE_ANDPS_M128_to_XMM(Fs, (uptr)mVUglob.absclip); xAND.PS(Fs, ptr128[mVUglob.absclip]);
mVU->regAlloc->clearNeeded(Fs); mVU->regAlloc->clearNeeded(Fs);
} }
pass3 { mVUlog("ABS"); mVUlogFtFs(); } pass3 { mVUlog("ABS"); mVUlogFtFs(); }
@ -306,11 +313,11 @@ mVUop(mVU_ABS) {
mVUop(mVU_OPMULA) { mVUop(mVU_OPMULA) {
pass1 { mVUanalyzeFMAC1(mVU, 0, _Fs_, _Ft_); } pass1 { mVUanalyzeFMAC1(mVU, 0, _Fs_, _Ft_); }
pass2 { pass2 {
int Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W); xmm Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W);
int Fs = mVU->regAlloc->allocReg(_Fs_, 32, _X_Y_Z_W); xmm Fs = mVU->regAlloc->allocReg(_Fs_, 32, _X_Y_Z_W);
SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY xPSHUF.D(Fs, Fs, 0xC9); // WXZY
SSE2_PSHUFD_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ xPSHUF.D(Ft, Ft, 0xD2); // WYXZ
SSE_MULPS(mVU, Fs, Ft); SSE_MULPS(mVU, Fs, Ft);
mVU->regAlloc->clearNeeded(Ft); mVU->regAlloc->clearNeeded(Ft);
mVUupdateFlags(mVU, Fs); mVUupdateFlags(mVU, Fs);
@ -324,12 +331,12 @@ mVUop(mVU_OPMULA) {
mVUop(mVU_OPMSUB) { mVUop(mVU_OPMSUB) {
pass1 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, _Ft_); } pass1 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, _Ft_); }
pass2 { pass2 {
int Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf); xmm Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf);
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, 0xf); xmm Fs = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
int ACC = mVU->regAlloc->allocReg(32, _Fd_, _X_Y_Z_W); xmm ACC = mVU->regAlloc->allocReg(32, _Fd_, _X_Y_Z_W);
SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY xPSHUF.D(Fs, Fs, 0xC9); // WXZY
SSE2_PSHUFD_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ xPSHUF.D(Ft, Ft, 0xD2); // WYXZ
SSE_MULPS(mVU, Fs, Ft); SSE_MULPS(mVU, Fs, Ft);
SSE_SUBPS(mVU, ACC, Fs); SSE_SUBPS(mVU, ACC, Fs);
mVU->regAlloc->clearNeeded(Fs); mVU->regAlloc->clearNeeded(Fs);
@ -343,24 +350,24 @@ mVUop(mVU_OPMSUB) {
} }
// FTOI0/FTOI4/FTOI12/FTOI15 Opcodes // FTOI0/FTOI4/FTOI12/FTOI15 Opcodes
static void mVU_FTOIx(mP, uptr addr, const char* opName) { static void mVU_FTOIx(mP, const float (*addr)[4], const char* opName) {
pass1 { mVUanalyzeFMAC2(mVU, _Fs_, _Ft_); } pass1 { mVUanalyzeFMAC2(mVU, _Fs_, _Ft_); }
pass2 { pass2 {
if (!_Ft_) return; if (!_Ft_) return;
int Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf))); xmm Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
int t1 = mVU->regAlloc->allocReg(); xmm t1 = mVU->regAlloc->allocReg();
int t2 = mVU->regAlloc->allocReg(); xmm t2 = mVU->regAlloc->allocReg();
// Note: For help understanding this algorithm see recVUMI_FTOI_Saturate() // Note: For help understanding this algorithm see recVUMI_FTOI_Saturate()
SSE_MOVAPS_XMM_to_XMM(t1, Fs); xMOVAPS(t1, Fs);
if (addr) { SSE_MULPS_M128_to_XMM(Fs, addr); } if (addr) { xMUL.PS(Fs, ptr128[addr]); }
SSE2_CVTTPS2DQ_XMM_to_XMM(Fs, Fs); xCVTTPS2DQ(Fs, Fs);
SSE2_PXOR_M128_to_XMM(t1, (uptr)mVUglob.signbit); xPXOR(t1, ptr128[mVUglob.signbit]);
SSE2_PSRAD_I8_to_XMM (t1, 31); xPSRA.D(t1, 31);
SSE_MOVAPS_XMM_to_XMM(t2, Fs); xMOVAPS(t2, Fs);
SSE2_PCMPEQD_M128_to_XMM(t2, (uptr)mVUglob.signbit); xPCMP.EQD(t2, ptr128[mVUglob.signbit]);
SSE_ANDPS_XMM_to_XMM (t1, t2); xAND.PS(t1, t2);
SSE2_PADDD_XMM_to_XMM(Fs, t1); xPADD.D(Fs, t1);
mVU->regAlloc->clearNeeded(Fs); mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->clearNeeded(t1); mVU->regAlloc->clearNeeded(t1);
@ -370,14 +377,14 @@ static void mVU_FTOIx(mP, uptr addr, const char* opName) {
} }
// ITOF0/ITOF4/ITOF12/ITOF15 Opcodes // ITOF0/ITOF4/ITOF12/ITOF15 Opcodes
static void mVU_ITOFx(mP, uptr addr, const char* opName) { static void mVU_ITOFx(mP, const float (*addr)[4], const char* opName) {
pass1 { mVUanalyzeFMAC2(mVU, _Fs_, _Ft_); } pass1 { mVUanalyzeFMAC2(mVU, _Fs_, _Ft_); }
pass2 { pass2 {
if (!_Ft_) return; if (!_Ft_) return;
int Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf))); xmm Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
SSE2_CVTDQ2PS_XMM_to_XMM(Fs, Fs); xCVTDQ2PS(Fs, Fs);
if (addr) { SSE_MULPS_M128_to_XMM(Fs, addr); } if (addr) { xMUL.PS(Fs, ptr128[addr]); }
//mVUclamp2(Fs, xmmT1, 15); // Clamp (not sure if this is needed) //mVUclamp2(Fs, xmmT1, 15); // Clamp (not sure if this is needed)
mVU->regAlloc->clearNeeded(Fs); mVU->regAlloc->clearNeeded(Fs);
@ -389,34 +396,34 @@ static void mVU_ITOFx(mP, uptr addr, const char* opName) {
mVUop(mVU_CLIP) { mVUop(mVU_CLIP) {
pass1 { mVUanalyzeFMAC4(mVU, _Fs_, _Ft_); } pass1 { mVUanalyzeFMAC4(mVU, _Fs_, _Ft_); }
pass2 { pass2 {
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, 0xf); xmm Fs = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
int Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0x1); xmm Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0x1);
int t1 = mVU->regAlloc->allocReg(); xmm t1 = mVU->regAlloc->allocReg();
mVUunpack_xyzw(Ft, Ft, 0); mVUunpack_xyzw(Ft, Ft, 0);
mVUallocCFLAGa(mVU, gprT1, cFLAG.lastWrite); mVUallocCFLAGa(mVU, gprT1, cFLAG.lastWrite);
SHL32ItoR(gprT1, 6); xSHL(gprT1, 6);
SSE_ANDPS_M128_to_XMM(Ft, (uptr)mVUglob.absclip); xAND.PS(Ft, ptr128[&mVUglob.absclip[0]]);
SSE_MOVAPS_XMM_to_XMM(t1, Ft); xMOVAPS(t1, Ft);
SSE_ORPS_M128_to_XMM(t1, (uptr)mVUglob.signbit); xPOR(t1, ptr128[&mVUglob.signbit[0]]);
SSE_CMPNLEPS_XMM_to_XMM(t1, Fs); // -w, -z, -y, -x xCMPNLE.PS(t1, Fs); // -w, -z, -y, -x
SSE_CMPLTPS_XMM_to_XMM(Ft, Fs); // +w, +z, +y, +x xCMPLT.PS(Ft, Fs); // +w, +z, +y, +x
SSE_MOVAPS_XMM_to_XMM(Fs, Ft); // Fs = +w, +z, +y, +x xMOVAPS(Fs, Ft); // Fs = +w, +z, +y, +x
SSE_UNPCKLPS_XMM_to_XMM(Ft, t1); // Ft = -y,+y,-x,+x xUNPCK.LPS(Ft, t1); // Ft = -y,+y,-x,+x
SSE_UNPCKHPS_XMM_to_XMM(Fs, t1); // Fs = -w,+w,-z,+z xUNPCK.HPS(Fs, t1); // Fs = -w,+w,-z,+z
SSE_MOVMSKPS_XMM_to_R32(gprT2, Fs); // -w,+w,-z,+z xMOVMSKPS(gprT2, Fs); // -w,+w,-z,+z
AND32ItoR(gprT2, 0x3); xAND(gprT2, 0x3);
SHL32ItoR(gprT2, 4); xSHL(gprT2, 4);
OR32RtoR (gprT1, gprT2); xOR(gprT1, gprT2);
SSE_MOVMSKPS_XMM_to_R32(gprT2, Ft); // -y,+y,-x,+x xMOVMSKPS(gprT2, Ft); // -y,+y,-x,+x
AND32ItoR(gprT2, 0xf); xAND(gprT2, 0xf);
OR32RtoR (gprT1, gprT2); xOR(gprT1, gprT2);
AND32ItoR(gprT1, 0xffffff); xAND(gprT1, 0xffffff);
mVUallocCFLAGb(mVU, gprT1, cFLAG.write); mVUallocCFLAGb(mVU, gprT1, cFLAG.write);
mVU->regAlloc->clearNeeded(Fs); mVU->regAlloc->clearNeeded(Fs);
@ -512,12 +519,12 @@ mVUop(mVU_MINIx) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIx", 0); }
mVUop(mVU_MINIy) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIy", 0); } mVUop(mVU_MINIy) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIy", 0); }
mVUop(mVU_MINIz) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIz", 0); } mVUop(mVU_MINIz) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIz", 0); }
mVUop(mVU_MINIw) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIw", 0); } mVUop(mVU_MINIw) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIw", 0); }
mVUop(mVU_FTOI0) { mVU_FTOIx(mX, (uptr)0, "FTOI0"); } mVUop(mVU_FTOI0) { mVU_FTOIx(mX, NULL, "FTOI0"); }
mVUop(mVU_FTOI4) { mVU_FTOIx(mX, (uptr)mVUglob.FTOI_4, "FTOI4"); } mVUop(mVU_FTOI4) { mVU_FTOIx(mX, &mVUglob.FTOI_4, "FTOI4"); }
mVUop(mVU_FTOI12) { mVU_FTOIx(mX, (uptr)mVUglob.FTOI_12, "FTOI12"); } mVUop(mVU_FTOI12) { mVU_FTOIx(mX, &mVUglob.FTOI_12, "FTOI12"); }
mVUop(mVU_FTOI15) { mVU_FTOIx(mX, (uptr)mVUglob.FTOI_15, "FTOI15"); } mVUop(mVU_FTOI15) { mVU_FTOIx(mX, &mVUglob.FTOI_15, "FTOI15"); }
mVUop(mVU_ITOF0) { mVU_ITOFx(mX, (uptr)0, "ITOF0"); } mVUop(mVU_ITOF0) { mVU_ITOFx(mX, NULL, "ITOF0"); }
mVUop(mVU_ITOF4) { mVU_ITOFx(mX, (uptr)mVUglob.ITOF_4, "ITOF4"); } mVUop(mVU_ITOF4) { mVU_ITOFx(mX, &mVUglob.ITOF_4, "ITOF4"); }
mVUop(mVU_ITOF12) { mVU_ITOFx(mX, (uptr)mVUglob.ITOF_12, "ITOF12"); } mVUop(mVU_ITOF12) { mVU_ITOFx(mX, &mVUglob.ITOF_12, "ITOF12"); }
mVUop(mVU_ITOF15) { mVU_ITOFx(mX, (uptr)mVUglob.ITOF_15, "ITOF15"); } mVUop(mVU_ITOF15) { mVU_ITOFx(mX, &mVUglob.ITOF_15, "ITOF15"); }
mVUop(mVU_NOP) { pass3 { mVUlog("NOP"); } } mVUop(mVU_NOP) { pass3 { mVUlog("NOP"); } }
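If I read the scale tables right (an assumption on my part: FTOI_4/12/15 and ITOF_4/12/15 presumably hold 2^n and 1/2^n broadcast across four lanes), these opcodes are fixed-point conversions with 0, 4, 12 or 15 fraction bits. A scalar sketch that omits the overflow-saturation handling done above:

#include <cstdint>

// Illustration only: fixed-point <-> float conversion with n fraction bits.
static int32_t vuFtoi(float f, int n)   { return (int32_t)(f * (float)(1 << n)); } // truncates
static float   vuItof(int32_t i, int n) { return (float)i / (float)(1 << n); }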
View File
@ -33,7 +33,7 @@ typedef void (__fastcall *nVifrecCall)(uptr dest, uptr src);
#include "newVif_BlockBuffer.h" #include "newVif_BlockBuffer.h"
#include "newVif_HashBucket.h" #include "newVif_HashBucket.h"
extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0); extern void mVUmergeRegs(xRegisterSSE dest, xRegisterSSE src, int xyzw, bool modXYZW = 0);
extern void _nVifUnpack (int idx, u8 *data, u32 size, bool isFill); extern void _nVifUnpack (int idx, u8 *data, u32 size, bool isFill);
extern void dVifUnpack (int idx, u8 *data, u32 size, bool isFill); extern void dVifUnpack (int idx, u8 *data, u32 size, bool isFill);
extern void dVifReset (int idx); extern void dVifReset (int idx);
View File
@ -84,7 +84,7 @@ _f void VifUnpackSSE_Dynarec::SetMasks(int cS) const {
void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const { void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
pxAssumeDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking."); pxAssumeDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking.");
int t = regX.Id ? 0 : 1; // Get Temp Reg xRegisterSSE t = regX == xmm0 ? xmm1 : xmm0; // Get Temp Reg
int cc = aMin(vCL, 3); int cc = aMin(vCL, 3);
u32 m0 = (vB.mask >> (cc * 8)) & 0xff; u32 m0 = (vB.mask >> (cc * 8)) & 0xff;
u32 m1 = m0 & 0xaa; u32 m1 = m0 & 0xaa;
@ -95,18 +95,18 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
makeMergeMask(m3); makeMergeMask(m3);
makeMergeMask(m4); makeMergeMask(m4);
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect
if (doMask&&m2) { mergeVectors(regX.Id, xmmRow.Id, t, m2); } // Merge Row if (doMask&&m2) { mergeVectors(regX, xmmRow, t, m2); } // Merge Row
if (doMask&&m3) { mergeVectors(regX.Id, xmmCol0.Id+cc, t, m3); } // Merge Col if (doMask&&m3) { mergeVectors(regX, xRegisterSSE(xmmCol0.Id+cc), t, m3); } // Merge Col
if (doMask&&m4) { mergeVectors(regX.Id, xmmTemp.Id, t, m4); } // Merge Write Protect if (doMask&&m4) { mergeVectors(regX, xmmTemp, t, m4); } // Merge Write Protect
if (doMode) { if (doMode) {
u32 m5 = (~m1>>1) & ~m0; u32 m5 = (~m1>>1) & ~m0;
if (!doMask) m5 = 0xf; if (!doMask) m5 = 0xf;
else makeMergeMask(m5); else makeMergeMask(m5);
if (m5 < 0xf) { if (m5 < 0xf) {
xPXOR(xmmTemp, xmmTemp); xPXOR(xmmTemp, xmmTemp);
mergeVectors(xmmTemp.Id, xmmRow.Id, t, m5); mergeVectors(xmmTemp, xmmRow, t, m5);
xPADD.D(regX, xmmTemp); xPADD.D(regX, xmmTemp);
if (doMode==2) mergeVectors(xmmRow.Id, regX.Id, t, m5); if (doMode==2) mergeVectors(xmmRow, regX, t, m5);
} }
else if (m5 == 0xf) { else if (m5 == 0xf) {
xPADD.D(regX, xmmRow); xPADD.D(regX, xmmRow);
View File
@ -25,13 +25,13 @@
static __pagealigned u8 nVifUpkExec[__pagesize*4]; static __pagealigned u8 nVifUpkExec[__pagesize*4];
// Merges xmm vectors without modifying source reg // Merges xmm vectors without modifying source reg
void mergeVectors(int dest, int src, int temp, int xyzw) { void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xyzw) {
if (x86caps.hasStreamingSIMD4Extensions || (xyzw==15) if (x86caps.hasStreamingSIMD4Extensions || (xyzw==15)
|| (xyzw==12) || (xyzw==11) || (xyzw==8) || (xyzw==3)) { || (xyzw==12) || (xyzw==11) || (xyzw==8) || (xyzw==3)) {
mVUmergeRegs(dest, src, xyzw); mVUmergeRegs(dest, src, xyzw);
} }
else { else {
SSE_MOVAPS_XMM_to_XMM(temp, src); xMOVAPS(temp, src);
mVUmergeRegs(dest, temp, xyzw); mVUmergeRegs(dest, temp, xyzw);
} }
} }
@ -48,9 +48,9 @@ void loadRowCol(nVifStruct& v) {
xPSHUF.D(xmm1, xmm1, _v0); xPSHUF.D(xmm1, xmm1, _v0);
xPSHUF.D(xmm2, xmm2, _v0); xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm6, xmm6, _v0); xPSHUF.D(xmm6, xmm6, _v0);
mVUmergeRegs(XMM6, XMM0, 8); mVUmergeRegs(xmm6, xmm0, 8);
mVUmergeRegs(XMM6, XMM1, 4); mVUmergeRegs(xmm6, xmm1, 4);
mVUmergeRegs(XMM6, XMM2, 2); mVUmergeRegs(xmm6, xmm2, 2);
xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]); xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]);
xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]); xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]);
xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]); xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]);
@ -221,13 +221,13 @@ void VifUnpackSSE_Base::xUPK_V4_5() const {
xMOVAPS (destReg, workReg); // x|x|x|R xMOVAPS (destReg, workReg); // x|x|x|R
xPSRL.D (workReg, 8); // ABG xPSRL.D (workReg, 8); // ABG
xPSLL.D (workReg, 3); // AB|G5.000 xPSLL.D (workReg, 3); // AB|G5.000
mVUmergeRegs(destReg.Id, workReg.Id, 0x4); // x|x|G|R mVUmergeRegs(destReg, workReg, 0x4);// x|x|G|R
xPSRL.D (workReg, 8); // AB xPSRL.D (workReg, 8); // AB
xPSLL.D (workReg, 3); // A|B5.000 xPSLL.D (workReg, 3); // A|B5.000
mVUmergeRegs(destReg.Id, workReg.Id, 0x2); // x|B|G|R mVUmergeRegs(destReg, workReg, 0x2);// x|B|G|R
xPSRL.D (workReg, 8); // A xPSRL.D (workReg, 8); // A
xPSLL.D (workReg, 7); // A.0000000 xPSLL.D (workReg, 7); // A.0000000
mVUmergeRegs(destReg.Id, workReg.Id, 0x1); // A|B|G|R mVUmergeRegs(destReg, workReg, 0x1);// A|B|G|R
xPSLL.D (destReg, 24); // can optimize to xPSLL.D (destReg, 24); // can optimize to
xPSRL.D (destReg, 24); // single AND... xPSRL.D (destReg, 24); // single AND...
} }
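A hypothetical scalar view of what the V4-5 unpack above computes (assuming the source element is a 16-bit RGB5A1 value that the earlier, unshown lines have already broadcast per element):

#include <cstdint>

// Illustration only: expand RGB5A1 into four byte channels, as the shifted
// lane merges above do per element.
static void unpackV4_5(uint16_t c, uint8_t out[4])
{
    out[0] = (uint8_t)(( c        & 0x1f) << 3); // R
    out[1] = (uint8_t)(((c >> 5)  & 0x1f) << 3); // G
    out[2] = (uint8_t)(((c >> 10) & 0x1f) << 3); // B
    out[3] = (uint8_t)(( c >> 15)         << 7); // A
}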
View File
@ -24,7 +24,7 @@
using namespace x86Emitter; using namespace x86Emitter;
extern void mergeVectors(int dest, int src, int temp, int xyzw); extern void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xyzw);
extern void loadRowCol(nVifStruct& v); extern void loadRowCol(nVifStruct& v);
// -------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------