microVU: converted all code to the new emitter style. If anything breaks, blame the guy below me.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3406 96395faa-99c1-11dd-bbfe-3dabce05a288
sudonim1 2010-07-06 20:05:21 +00:00
parent a4afe629e5
commit b53a92e019
19 changed files with 1242 additions and 1264 deletions

@ -363,6 +363,8 @@ template< typename T > void xWrite( T val );
bool operator==( const xRegisterSSE& src ) const { return this->Id == src.Id; }
bool operator!=( const xRegisterSSE& src ) const { return this->Id != src.Id; }
void operator=( xRegisterSSE src ) { Id = src.Id; }
xRegisterSSE& operator++()
{
++Id &= (iREGCNT_XMM-1);
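For example, assuming iREGCNT_XMM is 8 (the usual 32-bit build), the pre-increment wraps the register id: an xmm with Id == 7 becomes Id == (7 + 1) & 7 == 0, so repeatedly applying ++reg cycles through xmm0..xmm7 and back.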

@ -289,6 +289,8 @@ void EmitSibMagic( uint regfield, const xIndirectVoid& info )
int displacement_size = (info.Displacement == 0) ? 0 :
( ( info.IsByteSizeDisp() ) ? 1 : 2 );
assert(!info.Base.IsEmpty() || !info.Index.IsEmpty() || displacement_size == 2);
if( !NeedsSibMagic( info ) )
{
// Use ModRm-only encoding, with the rm field holding an index/base register, if

@ -29,8 +29,8 @@ using namespace x86Emitter;
#include "R5900OpcodeTables.h"
#include "x86emitter/x86emitter.h"
#include "SamplProf.h"
#include "microVU_IR.h"
#include "microVU_Misc.h"
#include "microVU_IR.h"
struct microBlockLink {
microBlock* block;

@ -23,148 +23,164 @@
// Flag Allocators
//------------------------------------------------------------------
#define getFlagReg(regX, fInst) { \
switch (fInst) { \
case 0: regX = gprF0; break; \
case 1: regX = gprF1; break; \
case 2: regX = gprF2; break; \
case 3: regX = gprF3; break; \
default: \
Console.Error("microVU Error: fInst = %d", fInst); \
regX = gprF0; \
break; \
} \
_f static x32 getFlagReg(int fInst)
{
if (fInst >= 0 && fInst < 4)
{
return gprF[fInst];
}
else
{
Console.Error("microVU Error: fInst = %d", fInst);
return gprF[0];
}
}
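For context, a minimal before/after sketch of a caller (hypothetical lines, not from this commit): the old macro wrote the chosen flag register into its first argument, while the new function returns a typed x32 that can be used directly as an emitter operand.

// old style: macro stores the flag register into an int, then a raw mov macro
int fsReg;
getFlagReg(fsReg, sFLAG.write);
MOV32RtoR(gprT1, fsReg);

// new style: the returned x32 plugs straight into the overloaded emitter
xMOV(gprT1, getFlagReg(sFLAG.write));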
#define setBitSFLAG(bitTest, bitSet) { \
TEST32ItoR(regT, bitTest); \
pjmp = JZ8(0); \
OR32ItoR(reg, bitSet); \
x86SetJ8(pjmp); \
_f void setBitSFLAG(x32 reg, x32 regT, int bitTest, int bitSet)
{
xTEST(regT, bitTest);
xForwardJZ8 skip;
xOR(reg, bitSet);
skip.SetTarget();
}
#define setBitFSEQ(bitX) { \
TEST32ItoR(gprT1, bitX); \
pjmp = JZ8(0); \
OR32ItoR(gprT1, bitX); \
x86SetJ8(pjmp); \
_f void setBitFSEQ(x32 reg, int bitX)
{
xTEST(reg, bitX);
xForwardJump8 skip(Jcc_Zero);
xOR(reg, bitX);
skip.SetTarget();
}
_f void mVUallocSFLAGa(int reg, int fInstance) {
getFlagReg(fInstance, fInstance);
MOV32RtoR(reg, fInstance);
_f void mVUallocSFLAGa(x32 reg, int fInstance)
{
xMOV(reg, getFlagReg(fInstance));
}
_f void mVUallocSFLAGb(int reg, int fInstance) {
getFlagReg(fInstance, fInstance);
MOV32RtoR(fInstance, reg);
_f void mVUallocSFLAGb(x32 reg, int fInstance)
{
xMOV(getFlagReg(fInstance), reg);
}
// Normalize Status Flag
_f void mVUallocSFLAGc(int reg, int regT, int fInstance) {
u8 *pjmp;
XOR32RtoR(reg, reg);
_f void mVUallocSFLAGc(x32 reg, x32 regT, int fInstance)
{
xXOR(reg, reg);
mVUallocSFLAGa(regT, fInstance);
setBitSFLAG(0x0f00, 0x0001); // Z Bit
setBitSFLAG(0xf000, 0x0002); // S Bit
setBitSFLAG(0x000f, 0x0040); // ZS Bit
setBitSFLAG(0x00f0, 0x0080); // SS Bit
AND32ItoR(regT, 0xffff0000); // DS/DI/OS/US/D/I/O/U Bits
SHR32ItoR(regT, 14);
OR32RtoR(reg, regT);
setBitSFLAG(reg, regT, 0x0f00, 0x0001); // Z Bit
setBitSFLAG(reg, regT, 0xf000, 0x0002); // S Bit
setBitSFLAG(reg, regT, 0x000f, 0x0040); // ZS Bit
setBitSFLAG(reg, regT, 0x00f0, 0x0080); // SS Bit
xAND(regT, 0xffff0000); // DS/DI/OS/US/D/I/O/U Bits
xSHR(regT, 14);
xOR(reg, regT);
}
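A quick worked example of the final shift/merge (values are illustrative): if the upper halfword of regT has bits 16 and 17 set (regT == 0x00030000), then xAND(regT, 0xffff0000) leaves 0x00030000, xSHR(regT, 14) gives 0x0000000c, and xOR(reg, regT) merges those bits into positions 2-3 of the normalized status value, alongside the Z/S/ZS/SS bits set above.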
// Denormalizes Status Flag
_f void mVUallocSFLAGd(uptr memAddr, bool setAllflags) {
_f void mVUallocSFLAGd(u32* memAddr, bool setAllflags) {
// Cannot use EBP (gprF1) here; as this function is used by mVU0 macro and
// Cannot use EBP (gprF[1]) here; as this function is used by mVU0 macro and
// the EErec needs EBP preserved.
MOV32MtoR(gprF0, memAddr);
MOV32RtoR(gprF3, gprF0);
SHR32ItoR(gprF3, 3);
AND32ItoR(gprF3, 0x18);
xMOV(gprF[0], ptr32[memAddr]);
xMOV(gprF[3], gprF[0]);
xSHR(gprF[3], 3);
xAND(gprF[3], 0x18);
MOV32RtoR(gprF2, gprF0);
SHL32ItoR(gprF2, 11);
AND32ItoR(gprF2, 0x1800);
OR32RtoR (gprF3, gprF2);
xMOV(gprF[2], gprF[0]);
xSHL(gprF[2], 11);
xAND(gprF[2], 0x1800);
xOR (gprF[3], gprF[2]);
SHL32ItoR(gprF0, 14);
AND32ItoR(gprF0, 0x3cf0000);
OR32RtoR (gprF3, gprF0);
xSHL(gprF[0], 14);
xAND(gprF[0], 0x3cf0000);
xOR (gprF[3], gprF[0]);
if (setAllflags) {
// this code should be run in mVU micro mode only, so writing to
// EBP (gprF1) is ok (and needed for vuMicro optimizations).
// EBP (gprF[1]) is ok (and needed for vuMicro optimizations).
MOV32RtoR(gprF0, gprF3);
MOV32RtoR(gprF1, gprF3);
MOV32RtoR(gprF2, gprF3);
xMOV(gprF[0], gprF[3]);
xMOV(gprF[1], gprF[3]);
xMOV(gprF[2], gprF[3]);
}
}
_f void mVUallocMFLAGa(mV, int reg, int fInstance) {
MOVZX32M16toR(reg, (uptr)&mVU->macFlag[fInstance]);
_f void mVUallocMFLAGa(mV, x32 reg, int fInstance)
{
xMOVZX(reg, ptr16[&mVU->macFlag[fInstance]]);
}
_f void mVUallocMFLAGb(mV, int reg, int fInstance) {
//AND32ItoR(reg, 0xffff);
if (fInstance < 4) MOV32RtoM((uptr)&mVU->macFlag[fInstance], reg); // microVU
else MOV32RtoM((uptr)&mVU->regs->VI[REG_MAC_FLAG].UL, reg); // macroVU
_f void mVUallocMFLAGb(mV, x32 reg, int fInstance)
{
//xAND(reg, 0xffff);
if (fInstance < 4) xMOV(ptr32[&mVU->macFlag[fInstance]], reg); // microVU
else xMOV(ptr32[&mVU->regs->VI[REG_MAC_FLAG].UL], reg); // macroVU
}
_f void mVUallocCFLAGa(mV, int reg, int fInstance) {
if (fInstance < 4) MOV32MtoR(reg, (uptr)&mVU->clipFlag[fInstance]); // microVU
else MOV32MtoR(reg, (uptr)&mVU->regs->VI[REG_CLIP_FLAG].UL); // macroVU
_f void mVUallocCFLAGa(mV, x32 reg, int fInstance)
{
if (fInstance < 4) xMOV(reg, ptr32[&mVU->clipFlag[fInstance]]); // microVU
else xMOV(reg, ptr32[&mVU->regs->VI[REG_CLIP_FLAG].UL]); // macroVU
}
_f void mVUallocCFLAGb(mV, int reg, int fInstance) {
if (fInstance < 4) MOV32RtoM((uptr)&mVU->clipFlag[fInstance], reg); // microVU
else MOV32RtoM((uptr)&mVU->regs->VI[REG_CLIP_FLAG].UL, reg); // macroVU
_f void mVUallocCFLAGb(mV, x32 reg, int fInstance)
{
if (fInstance < 4) xMOV(ptr32[&mVU->clipFlag[fInstance]], reg); // microVU
else xMOV(ptr32[&mVU->regs->VI[REG_CLIP_FLAG].UL], reg); // macroVU
}
//------------------------------------------------------------------
// VI Reg Allocators
//------------------------------------------------------------------
_f void mVUallocVIa(mV, int GPRreg, int _reg_) {
if (!_reg_) { XOR32RtoR(GPRreg, GPRreg); }
else { MOVZX32M16toR(GPRreg, (uptr)&mVU->regs->VI[_reg_].UL); }
_f void mVUallocVIa(mV, x32 GPRreg, int _reg_, bool signext = false)
{
if (!_reg_)
xXOR(GPRreg, GPRreg);
else
if (signext)
xMOVSX(GPRreg, ptr16[&mVU->regs->VI[_reg_].SL]);
else
xMOVZX(GPRreg, ptr16[&mVU->regs->VI[_reg_].UL]);
}
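Hypothetical call sites to show the new optional parameter (not lines from this commit): by default the VI register is loaded zero-extended, and passing true selects the sign-extending load instead.

mVUallocVIa(mVU, gprT1, _Is_);        // xMOVZX from VI[_Is_].UL (unsigned 16-bit)
mVUallocVIa(mVU, gprT1, _Is_, true);  // xMOVSX from VI[_Is_].SL (sign-extended)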
_f void mVUallocVIb(mV, int GPRreg, int _reg_) {
_f void mVUallocVIb(mV, x32 GPRreg, int _reg_)
{
if (mVUlow.backupVI) { // Backs up reg to memory (used when VI is modified b4 a branch)
MOVZX32M16toR(gprT3, (uptr)&mVU->regs->VI[_reg_].UL);
MOV32RtoM((uptr)&mVU->VIbackup, gprT3);
xMOVZX(edx, ptr16[&mVU->regs->VI[_reg_].UL]);
xMOV(ptr32[&mVU->VIbackup], edx);
}
if (_reg_ == 0) { return; }
else if (_reg_ < 16) { MOV16RtoM((uptr)&mVU->regs->VI[_reg_].UL, GPRreg); }
else if (_reg_ < 16) { xMOV(ptr16[&mVU->regs->VI[_reg_].UL], xRegister16(GPRreg.Id)); }
}
//------------------------------------------------------------------
// P/Q Reg Allocators
//------------------------------------------------------------------
_f void getPreg(mV, int reg) {
_f void getPreg(mV, xmm reg)
{
mVUunpack_xyzw(reg, xmmPQ, (2 + mVUinfo.readP));
/*if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2(reg, xmmT1, 15);*/
}
_f void getQreg(int reg, int qInstance) {
_f void getQreg(xmm reg, int qInstance)
{
mVUunpack_xyzw(reg, xmmPQ, qInstance);
/*if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2<vuIndex>(reg, xmmT1, 15);*/
}
_f void writeQreg(int reg, int qInstance) {
_f void writeQreg(xmm reg, int qInstance)
{
if (qInstance) {
if (!x86caps.hasStreamingSIMD4Extensions) {
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1);
SSE_MOVSS_XMM_to_XMM(xmmPQ, reg);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1);
xPSHUF.D(xmmPQ, xmmPQ, 0xe1);
xMOVSS(xmmPQ, reg);
xPSHUF.D(xmmPQ, xmmPQ, 0xe1);
}
else SSE4_INSERTPS_XMM_to_XMM(xmmPQ, reg, _MM_MK_INSERTPS_NDX(0, 1, 0));
else xINSERTPS(xmmPQ, reg, _MM_MK_INSERTPS_NDX(0, 1, 0));
}
else SSE_MOVSS_XMM_to_XMM(xmmPQ, reg);
else xMOVSS(xmmPQ, reg);
}
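As a rough illustration of the non-SSE4 path, here is the same lane-1 write expressed with plain intrinsics (a sketch for clarity only; the two Q instances live in lanes 0 and 1 of xmmPQ):

#include <xmmintrin.h>

__m128 writeQ1_sketch(__m128 pq, __m128 q)
{
    pq = _mm_shuffle_ps(pq, pq, 0xe1); // swap lanes 0 and 1 (the xPSHUF.D above)
    pq = _mm_move_ss(pq, q);           // overwrite lane 0 with the new Q value
    pq = _mm_shuffle_ps(pq, pq, 0xe1); // swap back: the value ends up in lane 1
    return pq;
}

The SSE4 path does the same in one instruction: xINSERTPS with _MM_MK_INSERTPS_NDX(0, 1, 0) copies source lane 0 into destination lane 1 with no zero mask.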

@ -55,34 +55,33 @@ _f void mVUendProgram(mV, microFlagCycles* mFC, int isEbit) {
}
// Save P/Q Regs
if (qInst) { SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe5); }
SSE_MOVSS_XMM_to_M32((uptr)&mVU->regs->VI[REG_Q].UL, xmmPQ);
if (qInst) { xPSHUF.D(xmmPQ, xmmPQ, 0xe5); }
xMOVSS(ptr32[&mVU->regs->VI[REG_Q].UL], xmmPQ);
if (isVU1) {
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, pInst ? 3 : 2);
SSE_MOVSS_XMM_to_M32((uptr)&mVU->regs->VI[REG_P].UL, xmmPQ);
xPSHUF.D(xmmPQ, xmmPQ, pInst ? 3 : 2);
xMOVSS(ptr32[&mVU->regs->VI[REG_P].UL], xmmPQ);
}
// Save Flag Instances
#if 1 // CHECK_MACROVU0 - Always on now
getFlagReg(fStatus, fStatus);
MOV32RtoM((uptr)&mVU->regs->VI[REG_STATUS_FLAG].UL, fStatus);
xMOV(ptr32[&mVU->regs->VI[REG_STATUS_FLAG].UL], getFlagReg(fStatus));
#else
mVUallocSFLAGc(gprT1, gprT2, fStatus);
MOV32RtoM((uptr)&mVU->regs->VI[REG_STATUS_FLAG].UL, gprT1);
mVUallocSFLAGc(gprT1, fStatus);
xMOV(ptr32[&mVU->regs->VI[REG_STATUS_FLAG].UL], gprT1);
#endif
mVUallocMFLAGa(mVU, gprT1, fMac);
mVUallocCFLAGa(mVU, gprT2, fClip);
MOV32RtoM((uptr)&mVU->regs->VI[REG_MAC_FLAG].UL, gprT1);
MOV32RtoM((uptr)&mVU->regs->VI[REG_CLIP_FLAG].UL, gprT2);
xMOV(ptr32[&mVU->regs->VI[REG_MAC_FLAG].UL], gprT1);
xMOV(ptr32[&mVU->regs->VI[REG_CLIP_FLAG].UL], gprT2);
if (isEbit || isVU1) { // Clear 'is busy' Flags
AND32ItoM((uptr)&VU0.VI[REG_VPU_STAT].UL, (isVU1 ? ~0x100 : ~0x001)); // VBS0/VBS1 flag
AND32ItoM((uptr)&mVU->regs->vifRegs->stat, ~VIF1_STAT_VEW); // Clear VU 'is busy' signal for vif
xAND(ptr32[&VU0.VI[REG_VPU_STAT].UL], (isVU1 ? ~0x100 : ~0x001)); // VBS0/VBS1 flag
xAND(ptr32[&mVU->regs->vifRegs->stat], ~VIF1_STAT_VEW); // Clear VU 'is busy' signal for vif
}
if (isEbit != 2) { // Save PC, and Jump to Exit Point
MOV32ItoM((uptr)&mVU->regs->VI[REG_TPC].UL, xPC);
JMP32((uptr)mVU->exitFunct - ((uptr)x86Ptr + 5));
xMOV(ptr32[&mVU->regs->VI[REG_TPC].UL], xPC);
xJMP(mVU->exitFunct);
}
}
@ -93,7 +92,7 @@ _f void mVUsetupBranch(mV, microFlagCycles& mFC) {
mVUsetupFlags(mVU, mFC); // Shuffle Flag Instances
// Shuffle P/Q regs since every block starts at instance #0
if (mVU->p || mVU->q) { SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, shufflePQ); }
if (mVU->p || mVU->q) { xPSHUF.D(xmmPQ, xmmPQ, shufflePQ); }
}
void normBranchCompile(microVU* mVU, u32 branchPC) {
@ -109,15 +108,15 @@ void normJumpCompile(mV, microFlagCycles& mFC, bool isEvilJump) {
mVUsetupBranch(mVU, mFC);
mVUbackupRegs(mVU);
if (isEvilJump) MOV32MtoR(gprT2, (uptr)&mVU->evilBranch);
else MOV32MtoR(gprT2, (uptr)&mVU->branch);
MOV32ItoR(gprT3, (u32)&mVUpBlock->pStateEnd);
if (isEvilJump) xMOV(gprT2, ptr32[&mVU->evilBranch]);
else xMOV(gprT2, ptr32[&mVU->branch]);
xMOV(gprT3, (uptr)&mVUpBlock->pStateEnd);
if (!mVU->index) xCALL(mVUcompileJIT<0>); //(u32 startPC, uptr pState)
else xCALL(mVUcompileJIT<1>);
mVUrestoreRegs(mVU);
JMPR(gprT1); // Jump to rec-code address
xJMP(gprT1); // Jump to rec-code address
}
void normBranch(mV, microFlagCycles& mFC) {
@ -132,7 +131,7 @@ void normBranch(mV, microFlagCycles& mFC) {
void condBranch(mV, microFlagCycles& mFC, int JMPcc) {
mVUsetupBranch(mVU, mFC);
xCMP(ptr16[(u16*)&mVU->branch], 0);
xCMP(ptr16[&mVU->branch], 0);
incPC(3);
if (mVUup.eBit) { // Conditional Branch With E-Bit Set
mVUendProgram(mVU, &mFC, 2);
@ -190,8 +189,8 @@ void normJump(mV, microFlagCycles& mFC) {
if (mVUup.eBit) { // E-bit Jump
mVUendProgram(mVU, &mFC, 2);
MOV32MtoR(gprT1, (uptr)&mVU->branch);
MOV32RtoM((uptr)&mVU->regs->VI[REG_TPC].UL, gprT1);
xMOV(gprT1, ptr32[&mVU->branch]);
xMOV(ptr32[&mVU->regs->VI[REG_TPC].UL], gprT1);
xJMP(mVU->exitFunct);
}
else normJumpCompile(mVU, mFC, 0);

@ -34,16 +34,16 @@ const __aligned16 u32 sse4_maxvals[2][4] = {
// gotten a NaN value, then something went wrong; and the NaN's sign
is not to be trusted. Games usually like positive values better,
and it's faster... so just always make NaNs into positive infinity.
void mVUclamp1(int reg, int regT1, int xyzw, bool bClampE = 0) {
void mVUclamp1(xmm reg, xmm regT1, int xyzw, bool bClampE = 0) {
if ((!clampE && CHECK_VU_OVERFLOW) || (clampE && bClampE)) {
switch (xyzw) {
case 1: case 2: case 4: case 8:
SSE_MINSS_M32_to_XMM(reg, (uptr)mVUglob.maxvals);
SSE_MAXSS_M32_to_XMM(reg, (uptr)mVUglob.minvals);
xMIN.SS(reg, ptr32[mVUglob.maxvals]);
xMAX.SS(reg, ptr32[mVUglob.minvals]);
break;
default:
SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals);
SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals);
xMIN.PS(reg, ptr32[mVUglob.maxvals]);
xMAX.PS(reg, ptr32[mVUglob.minvals]);
break;
}
}
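A conceptual scalar rendering of the single-lane case (illustrative only; it assumes mVUglob.maxvals / mVUglob.minvals hold the bit patterns of +FLT_MAX / -FLT_MAX, and mirrors the x86 rule that MIN/MAX return the second operand when the first is NaN):

#include <cfloat>

// MINSS/MAXSS semantics: (a < b) ? a : b, so a NaN in 'a' yields 'b'
static float minss(float a, float b) { return (a < b) ? a : b; }
static float maxss(float a, float b) { return (a > b) ? a : b; }

// clamp1: +/-Inf saturate to +/-FLT_MAX, and a NaN always comes out as +FLT_MAX
static float clamp1_sketch(float x) { return maxss(minss(x, FLT_MAX), -FLT_MAX); }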
@ -54,44 +54,41 @@ void mVUclamp1(int reg, int regT1, int xyzw, bool bClampE = 0) {
// Note 2: Using regalloc here seems to contaminate some regs in certain games.
// Must be some specific case I've overlooked (or I used regalloc improperly on an opcode)
// so we just use a temporary mem location for our backup for now... (non-sse4 version only)
void mVUclamp2(microVU* mVU, int reg, int regT1, int xyzw, bool bClampE = 0) {
void mVUclamp2(microVU* mVU, xmm reg, xmm regT1in, int xyzw, bool bClampE = 0) {
if ((!clampE && CHECK_VU_SIGN_OVERFLOW) || (clampE && bClampE && CHECK_VU_SIGN_OVERFLOW)) {
if (x86caps.hasStreamingSIMD4Extensions) {
int i = (xyzw==1||xyzw==2||xyzw==4||xyzw==8) ? 0: 1;
SSE4_PMINSD_M128_to_XMM(reg, (uptr)&sse4_maxvals[i][0]);
SSE4_PMINUD_M128_to_XMM(reg, (uptr)&sse4_minvals[i][0]);
xPMIN.SD(reg, ptr128[&sse4_maxvals[i][0]]);
xPMIN.UD(reg, ptr128[&sse4_minvals[i][0]]);
return;
}
int regT1b = 0;
if (regT1 < 0) {
regT1b = 1; regT1=(reg+1)%8;
SSE_MOVAPS_XMM_to_M128((uptr)mVU->xmmCTemp, regT1);
//regT1 = mVU->regAlloc->allocReg();
}
//xmm regT1 = regT1b ? mVU->regAlloc->allocReg() : regT1in;
xmm regT1 = regT1in.IsEmpty() ? xmm((reg.Id + 1) % 8) : regT1in;
if (regT1 != regT1in) xMOVAPS(ptr128[mVU->xmmCTemp], regT1);
switch (xyzw) {
case 1: case 2: case 4: case 8:
SSE_MOVAPS_XMM_to_XMM(regT1, reg);
SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit);
SSE_MINSS_M32_to_XMM (reg, (uptr)mVUglob.maxvals);
SSE_MAXSS_M32_to_XMM (reg, (uptr)mVUglob.minvals);
SSE_ORPS_XMM_to_XMM (reg, regT1);
xMOVAPS(regT1, reg);
xAND.PS(regT1, ptr128[mVUglob.signbit]);
xMIN.SS(reg, ptr128[mVUglob.maxvals]);
xMAX.SS(reg, ptr128[mVUglob.minvals]);
xOR.PS (reg, regT1);
break;
default:
SSE_MOVAPS_XMM_to_XMM(regT1, reg);
SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit);
SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals);
SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals);
SSE_ORPS_XMM_to_XMM (reg, regT1);
xMOVAPS(regT1, reg);
xAND.PS(regT1, ptr128[mVUglob.signbit]);
xMIN.PS(reg, ptr128[mVUglob.maxvals]);
xMAX.PS(reg, ptr128[mVUglob.minvals]);
xOR.PS (reg, regT1);
break;
}
//if (regT1b) mVU->regAlloc->clearNeeded(regT1);
if (regT1b) SSE_MOVAPS_M128_to_XMM(regT1, (uptr)mVU->xmmCTemp);
//if (regT1 != regT1in) mVU->regAlloc->clearNeeded(regT1);
if (regT1 != regT1in) xMOVAPS(regT1, ptr128[mVU->xmmCTemp]);
}
else mVUclamp1(reg, regT1, xyzw, bClampE);
else mVUclamp1(reg, regT1in, xyzw, bClampE);
}
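The SSE2 fallback above preserves the operand's sign through the clamp; a scalar sketch of the idea (illustrative only, assuming mVUglob.signbit is a 0x80000000 per-lane mask and maxvals/minvals are +/-FLT_MAX as before):

#include <cfloat>
#include <cstring>

static float clamp2_sketch(float x)
{
    unsigned bits;
    std::memcpy(&bits, &x, sizeof(bits));
    const unsigned sign = bits & 0x80000000u;   // xAND.PS(regT1, signbit): keep only the sign
    float mag = (x < FLT_MAX) ? x : FLT_MAX;    // xMIN.SS (NaN falls through to +FLT_MAX)
    mag = (mag > -FLT_MAX) ? mag : -FLT_MAX;    // xMAX.SS
    std::memcpy(&bits, &mag, sizeof(bits));
    bits |= sign;                               // xOR.PS(reg, regT1): restore the original sign
    std::memcpy(&x, &bits, sizeof(x));
    return x;
}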
// Used for operand clamping on every SSE instruction (add/sub/mul/div)
void mVUclamp3(microVU* mVU, int reg, int regT1, int xyzw) {
void mVUclamp3(microVU* mVU, xmm reg, xmm regT1, int xyzw) {
if (clampE) mVUclamp2(mVU, reg, regT1, xyzw, 1);
}
@ -101,6 +98,6 @@ void mVUclamp3(microVU* mVU, int reg, int regT1, int xyzw) {
// emulated opcodes (causing crashes). Since we're clamping the operands
// with mVUclamp3, we should almost never be getting a NaN result,
// but this clamp is just a precaution just-in-case.
void mVUclamp4(int reg, int regT1, int xyzw) {
void mVUclamp4(xmm reg, xmm regT1, int xyzw) {
if (clampE && !CHECK_VU_SIGN_OVERFLOW) mVUclamp1(reg, regT1, xyzw, 1);
}

@ -126,7 +126,7 @@ void doIbit(mV) {
}
else tempI = curI;
MOV32ItoM((uptr)&mVU->regs->VI[REG_I].UL, tempI);
xMOV(ptr32[&mVU->regs->VI[REG_I].UL], tempI);
incPC(1);
}
}
@ -134,21 +134,27 @@ void doIbit(mV) {
void doSwapOp(mV) {
if (mVUinfo.backupVF && !mVUlow.noWriteVF) {
DevCon.WriteLn(Color_Green, "microVU%d: Backing Up VF Reg [%04x]", getIndex, xPC);
int t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg);
int t2 = mVU->regAlloc->allocReg();
SSE_MOVAPS_XMM_to_XMM(t2, t1);
mVU->regAlloc->clearNeeded(t1);
xmm t2 = mVU->regAlloc->allocReg();
{
xmm t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg);
xMOVAPS(t2, t1);
mVU->regAlloc->clearNeeded(t1);
}
mVUopL(mVU, 1);
t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg, mVUlow.VF_write.reg, 0xf, 0);
SSE_XORPS_XMM_to_XMM(t2, t1);
SSE_XORPS_XMM_to_XMM(t1, t2);
SSE_XORPS_XMM_to_XMM(t2, t1);
mVU->regAlloc->clearNeeded(t1);
{
xmm t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg, mVUlow.VF_write.reg, 0xf, 0);
xXOR.PS(t2, t1);
xXOR.PS(t1, t2);
xXOR.PS(t2, t1);
mVU->regAlloc->clearNeeded(t1);
}
incPC(1);
doUpperOp();
t1 = mVU->regAlloc->allocReg(-1, mVUlow.VF_write.reg, 0xf);
SSE_MOVAPS_XMM_to_XMM(t1, t2);
mVU->regAlloc->clearNeeded(t1);
{
xmm t1 = mVU->regAlloc->allocReg(-1, mVUlow.VF_write.reg, 0xf);
xMOVAPS(t1, t2);
mVU->regAlloc->clearNeeded(t1);
}
mVU->regAlloc->clearNeeded(t2);
}
else { mVUopL(mVU, 1); incPC(1); doUpperOp(); }
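The three xXOR.PS instructions in the middle block above are the classic xor-swap: they exchange the contents of t1 and t2 without needing a third register. In scalar form:

unsigned a = 0x1111, b = 0x2222;
b ^= a;  a ^= b;  b ^= a;   // now a == 0x2222 and b == 0x1111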
@ -165,9 +171,9 @@ _f void mVUcheckBadOp(mV) {
// Prints msg when exiting block early if 1st op was a bad opcode (Dawn of Mana Level 2)
_f void handleBadOp(mV, int count) {
if (mVUinfo.isBadOp && count == 0) {
MOV32ItoR(gprT2, (uptr)mVU);
if (!isVU1) CALLFunc((uptr)mVUbadOp0);
else CALLFunc((uptr)mVUbadOp1);
xMOV(ecx, (uptr)mVU);
if (!isVU1) xCALL(mVUbadOp0);
else xCALL(mVUbadOp1);
}
}
@ -302,7 +308,7 @@ _f bool doEarlyExit(microVU* mVU) {
_f void mVUsavePipelineState(microVU* mVU) {
u32* lpS = (u32*)&mVU->prog.lpState.vi15;
for (int i = 0; i < (sizeof(microRegInfo)-4)/4; i++, lpS++) {
MOV32ItoM((uptr)lpS, lpS[0]);
xMOV(ptr32[lpS], lpS[0]);
}
}
@ -311,18 +317,19 @@ void mVUtestCycles(microVU* mVU) {
iPC = mVUstartPC;
mVUdebugNOW(0);
if (doEarlyExit(mVU)) {
CMP32ItoM((uptr)&mVU->cycles, 0);
u32* jmp32 = JG32(0);
xCMP(ptr32[&mVU->cycles], 0);
xForwardJG32 skip;
// FIXME: uh... actually kind of a pain with xForwardJump
//if (!isVU1) { TEST32ItoM((uptr)&mVU->regs->flags, VUFLAG_MFLAGSET); vu0jmp = JZ32(0); }
MOV32ItoR(gprT2, (uptr)mVU);
if (isVU1) CALLFunc((uptr)mVUwarning1);
//else CALLFunc((uptr)mVUwarning0); // VU0 is allowed early exit for COP2 Interlock Simulation
mVUsavePipelineState(mVU);
mVUendProgram(mVU, NULL, 0);
//if (!isVU1) x86SetJ32(vu0jmp);
x86SetJ32(jmp32);
xMOV(ecx, (uptr)mVU);
if (isVU1) xCALL(mVUwarning1);
//else xCALL(mVUwarning0); // VU0 is allowed early exit for COP2 Interlock Simulation
mVUsavePipelineState(mVU);
mVUendProgram(mVU, NULL, 0);
//if (!isVU1) vu0jmp.SetTarget();
skip.SetTarget();
}
SUB32ItoM((uptr)&mVU->cycles, mVUcycles);
xSUB(ptr32[&mVU->cycles], mVUcycles);
}
// Initialize VI Constants (vi15 propagates through blocks)
@ -410,7 +417,7 @@ _r void* mVUcompile(microVU* mVU, u32 startPC, uptr pState) {
u32 x = 0;
for (; x < endCount; x++) {
if (mVUinfo.isEOB) { handleBadOp(mVU, x); x = 0xffff; }
if (mVUup.mBit) { OR32ItoM((uptr)&mVU->regs->flags, VUFLAG_MFLAGSET); }
if (mVUup.mBit) { xOR(ptr32[&mVU->regs->flags], VUFLAG_MFLAGSET); }
if (mVUlow.isNOP) { incPC(1); doUpperOp(); doIbit(mVU); }
else if (!mVUinfo.swapOps) { incPC(1); doUpperOp(); doLowerOp(); }
else { doSwapOp(mVU); }

@ -43,25 +43,25 @@ void mVUdispatcherA(mV) {
// Load Regs
#if 1 // CHECK_MACROVU0 - Always on now
MOV32MtoR(gprF0, (uptr)&mVU->regs->VI[REG_STATUS_FLAG].UL);
MOV32RtoR(gprF1, gprF0);
MOV32RtoR(gprF2, gprF0);
MOV32RtoR(gprF3, gprF0);
xMOV(gprF[0], ptr32[&mVU->regs->VI[REG_STATUS_FLAG].UL]);
xMOV(gprF[1], gprF[0]);
xMOV(gprF[2], gprF[0]);
xMOV(gprF[3], gprF[0]);
#else
mVUallocSFLAGd((uptr)&mVU->regs->VI[REG_STATUS_FLAG].UL, 1);
#endif
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&mVU->regs->VI[REG_MAC_FLAG].UL);
SSE_SHUFPS_XMM_to_XMM (xmmT1, xmmT1, 0);
SSE_MOVAPS_XMM_to_M128((uptr)mVU->macFlag, xmmT1);
xMOVAPS(xmmT1, ptr128[&mVU->regs->VI[REG_MAC_FLAG].UL]);
xSHUF.PS(xmmT1, xmmT1, 0);
xMOVAPS(ptr128[&mVU->macFlag[0]], xmmT1);
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&mVU->regs->VI[REG_CLIP_FLAG].UL);
SSE_SHUFPS_XMM_to_XMM (xmmT1, xmmT1, 0);
SSE_MOVAPS_XMM_to_M128((uptr)mVU->clipFlag, xmmT1);
xMOVAPS(xmmT1, ptr128[&mVU->regs->VI[REG_CLIP_FLAG].UL]);
xSHUF.PS(xmmT1, xmmT1, 0);
xMOVAPS(ptr128[&mVU->clipFlag[0]], xmmT1);
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&mVU->regs->VI[REG_P].UL);
SSE_MOVAPS_M128_to_XMM(xmmPQ, (uptr)&mVU->regs->VI[REG_Q].UL);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmT1, 0); // wzyx = PPQQ
xMOVAPS(xmmT1, ptr128[&mVU->regs->VI[REG_P].UL]);
xMOVAPS(xmmPQ, ptr128[&mVU->regs->VI[REG_Q].UL]);
xSHUF.PS(xmmPQ, xmmT1, 0); // wzyx = PPQQ
// Jump to Recompiled Code Block
xJMP(eax);

@ -20,12 +20,10 @@
// Sets FDIV Flags at the proper time
_f void mVUdivSet(mV) {
int flagReg1, flagReg2;
if (mVUinfo.doDivFlag) {
getFlagReg(flagReg1, sFLAG.write);
if (!sFLAG.doFlag) { getFlagReg(flagReg2, sFLAG.lastWrite); MOV32RtoR(flagReg1, flagReg2); }
AND32ItoR(flagReg1, 0xfff3ffff);
OR32MtoR (flagReg1, (uptr)&mVU->divFlag);
if (!sFLAG.doFlag) { xMOV(getFlagReg(sFLAG.write), getFlagReg(sFLAG.lastWrite)); }
xAND(getFlagReg(sFLAG.write), 0xfff3ffff);
xOR (getFlagReg(sFLAG.write), ptr32[&mVU->divFlag]);
}
}
@ -159,9 +157,8 @@ _f void mVUsetFlags(mV, microFlagCycles& mFC) {
iPC = endPC;
}
#define getFlagReg1(x) ((x == 3) ? gprF3 : ((x == 2) ? gprF2 : ((x == 1) ? gprF1 : gprF0)))
#define getFlagReg2(x) ((bStatus[0] == x) ? getFlagReg1(x) : gprT1)
#define getFlagReg3(x) ((gFlag == x) ? gprT1 : getFlagReg1(x))
#define getFlagReg2(x) ((bStatus[0] == x) ? getFlagReg(x) : gprT1)
#define getFlagReg3(x) ((gFlag == x) ? gprT1 : getFlagReg(x))
#define getFlagReg4(x) ((gFlag == x) ? gprT1 : gprT2)
#define shuffleMac ((bMac [3]<<6)|(bMac [2]<<4)|(bMac [1]<<2)|bMac [0])
#define shuffleClip ((bClip[3]<<6)|(bClip[2]<<4)|(bClip[1]<<2)|bClip[0])
@ -175,52 +172,52 @@ _f void mVUsetupFlags(mV, microFlagCycles& mFC) {
// DevCon::Status("sortRegs = %d", params sortRegs);
// Note: Emitter will optimize out mov(reg1, reg1) cases...
if (sortRegs == 1) {
MOV32RtoR(gprF0, getFlagReg1(bStatus[0]));
MOV32RtoR(gprF1, getFlagReg1(bStatus[1]));
MOV32RtoR(gprF2, getFlagReg1(bStatus[2]));
MOV32RtoR(gprF3, getFlagReg1(bStatus[3]));
xMOV(gprF[0], getFlagReg(bStatus[0]));
xMOV(gprF[1], getFlagReg(bStatus[1]));
xMOV(gprF[2], getFlagReg(bStatus[2]));
xMOV(gprF[3], getFlagReg(bStatus[3]));
}
else if (sortRegs == 2) {
MOV32RtoR(gprT1, getFlagReg1(bStatus[3]));
MOV32RtoR(gprF0, getFlagReg1(bStatus[0]));
MOV32RtoR(gprF1, getFlagReg2(bStatus[1]));
MOV32RtoR(gprF2, getFlagReg2(bStatus[2]));
MOV32RtoR(gprF3, gprT1);
xMOV(gprT1, getFlagReg(bStatus[3]));
xMOV(gprF[0], getFlagReg(bStatus[0]));
xMOV(gprF[1], getFlagReg2(bStatus[1]));
xMOV(gprF[2], getFlagReg2(bStatus[2]));
xMOV(gprF[3], gprT1);
}
else if (sortRegs == 3) {
int gFlag = (bStatus[0] == bStatus[1]) ? bStatus[2] : bStatus[1];
MOV32RtoR(gprT1, getFlagReg1(gFlag));
MOV32RtoR(gprT2, getFlagReg1(bStatus[3]));
MOV32RtoR(gprF0, getFlagReg1(bStatus[0]));
MOV32RtoR(gprF1, getFlagReg3(bStatus[1]));
MOV32RtoR(gprF2, getFlagReg4(bStatus[2]));
MOV32RtoR(gprF3, gprT2);
xMOV(gprT1, getFlagReg(gFlag));
xMOV(gprT2, getFlagReg(bStatus[3]));
xMOV(gprF[0], getFlagReg(bStatus[0]));
xMOV(gprF[1], getFlagReg3(bStatus[1]));
xMOV(gprF[2], getFlagReg4(bStatus[2]));
xMOV(gprF[3], gprT2);
}
else {
MOV32RtoR(gprT1, getFlagReg1(bStatus[0]));
MOV32RtoR(gprT2, getFlagReg1(bStatus[1]));
MOV32RtoR(gprT3, getFlagReg1(bStatus[2]));
MOV32RtoR(gprF3, getFlagReg1(bStatus[3]));
MOV32RtoR(gprF0, gprT1);
MOV32RtoR(gprF1, gprT2);
MOV32RtoR(gprF2, gprT3);
xMOV(gprT1, getFlagReg(bStatus[0]));
xMOV(gprT2, getFlagReg(bStatus[1]));
xMOV(gprT3, getFlagReg(bStatus[2]));
xMOV(gprF[3], getFlagReg(bStatus[3]));
xMOV(gprF[0], gprT1);
xMOV(gprF[1], gprT2);
xMOV(gprF[2], gprT3);
}
}
if (__Mac) {
int bMac[4];
sortFlag(mFC.xMac, bMac, mFC.cycles);
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)mVU->macFlag);
SSE_SHUFPS_XMM_to_XMM (xmmT1, xmmT1, shuffleMac);
SSE_MOVAPS_XMM_to_M128((uptr)mVU->macFlag, xmmT1);
xMOVAPS(xmmT1, ptr128[mVU->macFlag]);
xSHUF.PS(xmmT1, xmmT1, shuffleMac);
xMOVAPS(ptr128[mVU->macFlag], xmmT1);
}
if (__Clip) {
int bClip[4];
sortFlag(mFC.xClip, bClip, mFC.cycles);
SSE_MOVAPS_M128_to_XMM(xmmT2, (uptr)mVU->clipFlag);
SSE_SHUFPS_XMM_to_XMM (xmmT2, xmmT2, shuffleClip);
SSE_MOVAPS_XMM_to_M128((uptr)mVU->clipFlag, xmmT2);
xMOVAPS(xmmT2, ptr128[mVU->clipFlag]);
xSHUF.PS(xmmT2, xmmT2, shuffleClip);
xMOVAPS(ptr128[mVU->clipFlag], xmmT2);
}
}

@ -163,11 +163,6 @@ struct microIR {
// Reg Alloc
//------------------------------------------------------------------
void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW);
void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW);
void mVUloadReg(int reg, uptr offset, int xyzw);
void mVUloadIreg(int reg, int xyzw, VURegs* vuRegs);
struct microMapXMM {
int VFreg; // VF Reg Number Stored (-1 = Temp; 0 = vf0 and will not be written back; 32 = ACC; 33 = I reg)
int xyzw; // xyzw to write back (0 = Don't write back anything AND cached vfReg has all vectors valid)
@ -209,18 +204,18 @@ public:
}
void reset() {
for (int i = 0; i < xmmTotal; i++) {
clearReg(i);
clearReg(xmm(i));
}
counter = 0;
}
void flushAll(bool clearState = 1) {
for (int i = 0; i < xmmTotal; i++) {
writeBackReg(i);
if (clearState) clearReg(i);
writeBackReg(xmm(i));
if (clearState) clearReg(xmm(i));
}
}
void clearReg(int reg) {
microMapXMM& clear( xmmMap[reg] );
void clearReg(xmm reg) {
microMapXMM& clear( xmmMap[reg.Id] );
clear.VFreg = -1;
clear.count = 0;
clear.xyzw = 0;
@ -228,23 +223,23 @@ public:
}
void clearRegVF(int VFreg) {
for (int i = 0; i < xmmTotal; i++) {
if (xmmMap[i].VFreg == VFreg) clearReg(i);
if (xmmMap[i].VFreg == VFreg) clearReg(xmm(i));
}
}
void writeBackReg(int reg, bool invalidateRegs = 1) {
microMapXMM& write( xmmMap[reg] );
void writeBackReg(xmm reg, bool invalidateRegs = 1) {
microMapXMM& write( xmmMap[reg.Id] );
if ((write.VFreg > 0) && write.xyzw) { // Reg was modified and not Temp or vf0
if (write.VFreg == 33) SSE_MOVSS_XMM_to_M32((uptr)&vuRegs->VI[REG_I].UL, reg);
else if (write.VFreg == 32) mVUsaveReg(reg, (uptr)&vuRegs->ACC.UL[0], write.xyzw, 1);
else mVUsaveReg(reg, (uptr)&vuRegs->VF[write.VFreg].UL[0], write.xyzw, 1);
if (write.VFreg == 33) xMOVSS(ptr32[&vuRegs->VI[REG_I].UL], reg);
else if (write.VFreg == 32) mVUsaveReg(reg, ptr[&vuRegs->ACC.UL[0]], write.xyzw, 1);
else mVUsaveReg(reg, ptr[&vuRegs->VF[write.VFreg].UL[0]], write.xyzw, 1);
if (invalidateRegs) {
for (int i = 0; i < xmmTotal; i++) {
microMapXMM& imap (xmmMap[i]);
if ((i == reg) || imap.isNeeded) continue;
if ((i == reg.Id) || imap.isNeeded) continue;
if (imap.VFreg == write.VFreg) {
if (imap.xyzw && imap.xyzw < 0xf) DevCon.Error("microVU Error: writeBackReg() [%d]", imap.VFreg);
clearReg(i); // Invalidate any Cached Regs of same vf Reg
clearReg(xmm(i)); // Invalidate any Cached Regs of same vf Reg
}
}
}
@ -257,27 +252,28 @@ public:
}
clearReg(reg); // Clear Reg
}
void clearNeeded(int reg) {
if ((reg < 0) || (reg >= xmmTotal)) return;
void clearNeeded(xmm reg)
{
if ((reg.Id < 0) || (reg.Id >= xmmTotal)) return;
microMapXMM& clear (xmmMap[reg]);
microMapXMM& clear (xmmMap[reg.Id]);
clear.isNeeded = 0;
if (clear.xyzw) { // Reg was modified
if (clear.VFreg > 0) {
int mergeRegs = 0;
if (clear.xyzw < 0xf) { mergeRegs = 1; } // Try to merge partial writes
for (int i = 0; i < xmmTotal; i++) { // Invalidate any other read-only regs of same vfReg
if (i == reg) continue;
if (i == reg.Id) continue;
microMapXMM& imap (xmmMap[i]);
if (imap.VFreg == clear.VFreg) {
if (imap.xyzw && imap.xyzw < 0xf) DevCon.Error("microVU Error: clearNeeded() [%d]", imap.VFreg);
if (mergeRegs == 1) {
mVUmergeRegs(i, reg, clear.xyzw, 1);
mVUmergeRegs(xmm(i), reg, clear.xyzw, 1);
imap.xyzw = 0xf;
imap.count = counter;
mergeRegs = 2;
}
else clearReg(i);
else clearReg(xmm(i));
}
}
if (mergeRegs == 2) clearReg(reg); // Clear Current Reg if Merged
@ -286,10 +282,11 @@ public:
else clearReg(reg); // If Reg was temp or vf0, then invalidate itself
}
}
int allocReg(int vfLoadReg = -1, int vfWriteReg = -1, int xyzw = 0, bool cloneWrite = 1) {
xmm allocReg(int vfLoadReg = -1, int vfWriteReg = -1, int xyzw = 0, bool cloneWrite = 1) {
counter++;
if (vfLoadReg >= 0) { // Search For Cached Regs
for (int i = 0; i < xmmTotal; i++) {
xmm xmmi(i);
microMapXMM& imap (xmmMap[i]);
if ((imap.VFreg == vfLoadReg) && (!imap.xyzw // Reg Was Not Modified
|| (imap.VFreg && (imap.xyzw==0xf)))) { // Reg Had All Vectors Modified and != VF0
@ -297,49 +294,51 @@ public:
if (vfWriteReg >= 0) { // Reg will be modified
if (cloneWrite) { // Clone Reg so as not to use the same Cached Reg
z = findFreeReg();
writeBackReg(z);
if (z!=i && xyzw==8) SSE_MOVAPS_XMM_to_XMM (z, i);
else if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1);
else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2);
else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3);
else if (z != i) SSE_MOVAPS_XMM_to_XMM (z, i);
xmm xmmz(z);
writeBackReg(xmmz);
if (z!=i && xyzw==8) xMOVAPS (xmmz, xmmi);
else if (xyzw == 4) xPSHUF.D(xmmz, xmmi, 1);
else if (xyzw == 2) xPSHUF.D(xmmz, xmmi, 2);
else if (xyzw == 1) xPSHUF.D(xmmz, xmmi, 3);
else if (z != i) xMOVAPS (xmmz, xmmi);
imap.count = counter; // Reg i was used, so update counter
}
else { // Don't clone reg, but shuffle to adjust for SS ops
if ((vfLoadReg != vfWriteReg) || (xyzw != 0xf)) { writeBackReg(z); }
if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1);
else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2);
else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3);
if ((vfLoadReg != vfWriteReg) || (xyzw != 0xf)) { writeBackReg(xmmi); }
if (xyzw == 4) xPSHUF.D(xmmi, xmmi, 1);
else if (xyzw == 2) xPSHUF.D(xmmi, xmmi, 2);
else if (xyzw == 1) xPSHUF.D(xmmi, xmmi, 3);
}
xmmMap[z].VFreg = vfWriteReg;
xmmMap[z].xyzw = xyzw;
}
xmmMap[z].count = counter;
xmmMap[z].isNeeded = 1;
return z;
return xmm(z);
}
}
}
int x = findFreeReg();
writeBackReg(x);
xmm xmmx(x);
writeBackReg(xmmx);
if (vfWriteReg >= 0) { // Reg Will Be Modified (allow partial reg loading)
if ((vfLoadReg == 0) && !(xyzw & 1)) { SSE2_PXOR_XMM_to_XMM(x, x); }
else if (vfLoadReg == 33) mVUloadIreg(x, xyzw, vuRegs);
else if (vfLoadReg == 32) mVUloadReg (x, (uptr)&vuRegs->ACC.UL[0], xyzw);
else if (vfLoadReg >= 0) mVUloadReg (x, (uptr)&vuRegs->VF[vfLoadReg].UL[0], xyzw);
if ((vfLoadReg == 0) && !(xyzw & 1)) { xPXOR(xmmx, xmmx); }
else if (vfLoadReg == 33) mVUloadIreg(xmmx, xyzw, vuRegs);
else if (vfLoadReg == 32) mVUloadReg (xmmx, ptr[&vuRegs->ACC.UL[0]], xyzw);
else if (vfLoadReg >= 0) mVUloadReg (xmmx, ptr[&vuRegs->VF[vfLoadReg].UL[0]], xyzw);
xmmMap[x].VFreg = vfWriteReg;
xmmMap[x].xyzw = xyzw;
}
else { // Reg Will Not Be Modified (always load full reg for caching)
if (vfLoadReg == 33) mVUloadIreg(x, 0xf, vuRegs);
else if (vfLoadReg == 32) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->ACC.UL[0]);
else if (vfLoadReg >= 0) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->VF[vfLoadReg].UL[0]);
if (vfLoadReg == 33) mVUloadIreg(xmmx, 0xf, vuRegs);
else if (vfLoadReg == 32) xMOVAPS(xmmx, ptr128[&vuRegs->ACC.UL[0]]);
else if (vfLoadReg >= 0) xMOVAPS(xmmx, ptr128[&vuRegs->VF[vfLoadReg].UL[0]]);
xmmMap[x].VFreg = vfLoadReg;
xmmMap[x].xyzw = 0;
}
xmmMap[x].count = counter;
xmmMap[x].isNeeded = 1;
return x;
return xmmx;
}
};

File diff suppressed because it is too large

@ -38,7 +38,7 @@ void setupMacroOp(int mode, const char* opName) {
iFlushCall(FLUSH_EVERYTHING);
microVU0.regAlloc->reset();
if (mode & 0x01) { // Q-Reg will be Read
SSE_MOVSS_M32_to_XMM(xmmPQ, (uptr)&microVU0.regs->VI[REG_Q].UL);
xMOVSSZX(xmmPQ, ptr32[&microVU0.regs->VI[REG_Q].UL]);
}
if (mode & 0x08) { // Clip Instruction
microVU0.prog.IRinfo.info[0].cFlag.write = 0xff;
@ -51,16 +51,16 @@ void setupMacroOp(int mode, const char* opName) {
microVU0.prog.IRinfo.info[0].sFlag.lastWrite = 0;
microVU0.prog.IRinfo.info[0].mFlag.doFlag = 1;
microVU0.prog.IRinfo.info[0].mFlag.write = 0xff;
MOV32MtoR(gprF0, (uptr)&microVU0.regs->VI[REG_STATUS_FLAG].UL);
xMOV(gprF[0], ptr32[&microVU0.regs->VI[REG_STATUS_FLAG].UL]);
}
}
void endMacroOp(int mode) {
if (mode & 0x02) { // Q-Reg was Written To
SSE_MOVSS_XMM_to_M32((uptr)&microVU0.regs->VI[REG_Q].UL, xmmPQ);
xMOVSS(ptr32[&microVU0.regs->VI[REG_Q].UL], xmmPQ);
}
if (mode & 0x10) { // Status/Mac Flags were Updated
MOV32RtoM((uptr)&microVU0.regs->VI[REG_STATUS_FLAG].UL, gprF0);
xMOV(ptr32[&microVU0.regs->VI[REG_STATUS_FLAG].UL], gprF[0]);
}
microVU0.regAlloc->flushAll();
microVU0.cop2 = 0;
@ -253,11 +253,11 @@ void COP2_Interlock(bool mBitSync) {
}
void TEST_FBRST_RESET(FnType_Void* resetFunct, int vuIndex) {
TEST32ItoR(EAX, (vuIndex) ? 0x200 : 0x002);
j8Ptr[0] = JZ8(0);
xCALL(resetFunct);
MOV32MtoR(EAX, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]);
x86SetJ8(j8Ptr[0]);
xTEST(eax, (vuIndex) ? 0x200 : 0x002);
xForwardJZ8 skip;
xCALL(resetFunct);
xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
skip.SetTarget();
}
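For reference, this is the forward-jump conversion used throughout the commit: the old code emitted JZ8(0) with a dummy displacement and patched it later via x86SetJ8, while an xForwardJZ8 object records the patch site when constructed and SetTarget() fills in the displacement to the current emit position. A hypothetical sketch:

xForwardJZ8 skip;      // emits jz with a placeholder 8-bit displacement
xCALL(someFunc);       // code to be skipped when the zero flag is set (someFunc is illustrative)
skip.SetTarget();      // patches the jz to land here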
static void recCFC2() {
@ -269,19 +269,19 @@ static void recCFC2() {
iFlushCall(FLUSH_EVERYTHING);
if (_Rd_ == REG_STATUS_FLAG) { // Normalize Status Flag
MOV32MtoR(gprF0, (uptr)&microVU0.regs->VI[REG_STATUS_FLAG].UL);
mVUallocSFLAGc(EAX, gprF0, 0);
xMOV(gprF[0], ptr32[&microVU0.regs->VI[REG_STATUS_FLAG].UL]);
mVUallocSFLAGc(eax, gprF[0], 0);
}
else MOV32MtoR(EAX, (uptr)&microVU0.regs->VI[_Rd_].UL);
else xMOV(eax, ptr32[&microVU0.regs->VI[_Rd_].UL]);
// FixMe: Should R-Reg have upper 9 bits 0?
MOV32RtoM((uptr)&cpuRegs.GPR.r[_Rt_].UL[0], EAX);
xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]], eax);
if (_Rd_ >= 16) {
CDQ(); // Sign Extend
MOV32RtoM ((uptr)&cpuRegs.GPR.r[_Rt_].UL[1], EDX);
xCDQ(); // Sign Extend
xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[1]], edx);
}
else MOV32ItoM((uptr)&cpuRegs.GPR.r[_Rt_].UL[1], 0);
else xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[1]], 0);
// FixMe: I think this is needed, but not sure how it works
_eeOnWriteReg(_Rt_, 1);
@ -298,36 +298,36 @@ static void recCTC2() {
case REG_MAC_FLAG: case REG_TPC:
case REG_VPU_STAT: break; // Read Only Regs
case REG_R:
MOV32MtoR(EAX, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]);
OR32ItoR (EAX, 0x3f800000);
MOV32RtoM((uptr)&microVU0.regs->VI[REG_R].UL, EAX);
xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
xOR (eax, 0x3f800000);
xMOV(ptr32[&microVU0.regs->VI[REG_R].UL], eax);
break;
case REG_STATUS_FLAG:
if (_Rt_) { // Denormalizes flag into gprF1
mVUallocSFLAGd((uptr)&cpuRegs.GPR.r[_Rt_].UL[0], 0);
MOV32RtoM((uptr)&microVU0.regs->VI[_Rd_].UL, gprF1);
mVUallocSFLAGd(&cpuRegs.GPR.r[_Rt_].UL[0], 0);
xMOV(ptr32[&microVU0.regs->VI[_Rd_].UL], gprF[1]);
}
else MOV32ItoM((uptr)&microVU0.regs->VI[_Rd_].UL, 0);
else xMOV(ptr32[&microVU0.regs->VI[_Rd_].UL], 0);
break;
case REG_CMSAR1: // Execute VU1 Micro SubRoutine
if (_Rt_) {
MOV32MtoR(ECX, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]);
xMOV(ecx, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
}
else XOR32RtoR(ECX,ECX);
else xXOR(ecx, ecx);
xCALL(vu1ExecMicro);
break;
case REG_FBRST:
if (!_Rt_) {
MOV32ItoM((uptr)&microVU0.regs->VI[REG_FBRST].UL, 0);
xMOV(ptr32[&microVU0.regs->VI[REG_FBRST].UL], 0);
return;
}
else MOV32MtoR(EAX, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]);
else xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
TEST_FBRST_RESET(vu0ResetRegs, 0);
TEST_FBRST_RESET(vu1ResetRegs, 1);
AND32ItoR(EAX, 0x0C0C);
MOV32RtoM((uptr)&microVU0.regs->VI[REG_FBRST].UL, EAX);
xAND(eax, 0x0C0C);
xMOV(ptr32[&microVU0.regs->VI[REG_FBRST].UL], eax);
break;
default:
// Executing vu0 block here fixes the intro of Ratchet and Clank
@ -349,8 +349,8 @@ static void recQMFC2() {
// FixMe: For some reason this line is needed or else games break:
_eeOnWriteReg(_Rt_, 0);
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&microVU0.regs->VF[_Rd_].UL[0]);
SSE_MOVAPS_XMM_to_M128((uptr)&cpuRegs.GPR.r[_Rt_].UL[0], xmmT1);
xMOVAPS(xmmT1, ptr128[&microVU0.regs->VF[_Rd_]]);
xMOVAPS(ptr128[&cpuRegs.GPR.r[_Rt_]], xmmT1);
}
static void recQMTC2() {
@ -360,8 +360,8 @@ static void recQMTC2() {
if (!_Rd_) return;
iFlushCall(FLUSH_EVERYTHING);
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]);
SSE_MOVAPS_XMM_to_M128((uptr)&microVU0.regs->VF[_Rd_].UL[0], xmmT1);
xMOVAPS(xmmT1, ptr128[&cpuRegs.GPR.r[_Rt_]]);
xMOVAPS(ptr128[&microVU0.regs->VF[_Rd_]], xmmT1);
}
//------------------------------------------------------------------

@ -15,6 +15,8 @@
#pragma once
using namespace x86Emitter;
//------------------------------------------------------------------
// Global Variables
//------------------------------------------------------------------
@ -32,6 +34,9 @@ struct mVU_Globals {
extern const __aligned(32) mVU_Globals mVUglob;
typedef xRegisterSSE xmm;
typedef xRegister32 x32;
//------------------------------------------------------------------
// Helper Macros
//------------------------------------------------------------------
@ -87,23 +92,21 @@ extern const __aligned(32) mVU_Globals mVUglob;
#define offsetSS ((_X) ? (0) : ((_Y) ? (4) : ((_Z) ? 8: 12)))
#define offsetReg ((_X) ? (0) : ((_Y) ? (1) : ((_Z) ? 2: 3)))
#define xmmT1 0 // Used for regAlloc
#define xmmT2 1 // Used for regAlloc
#define xmmT3 2 // Used for regAlloc
#define xmmT4 3 // Used for regAlloc
#define xmmT5 4 // Used for regAlloc
#define xmmT6 5 // Used for regAlloc
#define xmmT7 6 // Used for regAlloc
#define xmmPQ 7 // Holds the Value and Backup Values of P and Q regs
const xmm
xmmT1 = xmm(0), // Used for regAlloc
xmmT2 = xmm(1), // Used for regAlloc
xmmT3 = xmm(2), // Used for regAlloc
xmmT4 = xmm(3), // Used for regAlloc
xmmT5 = xmm(4), // Used for regAlloc
xmmT6 = xmm(5), // Used for regAlloc
xmmT7 = xmm(6), // Used for regAlloc
xmmPQ = xmm(7); // Holds the Value and Backup Values of P and Q regs
#define gprT1 0 // Temp Reg
#define gprT2 1 // Temp Reg
#define gprT3 2 // Temp Reg
#define gprF0 3 // Status Flag 0
#define gprESP 4 // Don't use?
#define gprF1 5 // Status Flag 1
#define gprF2 6 // Status Flag 2
#define gprF3 7 // Status Flag 3
const x32
gprT1 = x32(0), // eax - Temp Reg
gprT2 = x32(1), // ecx - Temp Reg
gprT3 = x32(2), // edx - Temp Reg
gprF[4] = {x32(3), x32(5), x32(6), x32(7)}; // ebx, ebp, esi, edi - Status Flags
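Note that gprF[] maps flag instances 0-3 onto ebx, ebp, esi and edi, deliberately skipping esp (the old gprESP define, index 4). With typed constants the emitter's overloads pick the operand size, so a hypothetical call site (someVI is illustrative) reads:

xMOV(gprF[fInst], gprT1);                       // 32-bit reg-to-reg move (was MOV32RtoR)
xMOV(ptr16[&someVI], xRegister16(gprT1.Id));    // explicit 16-bit store, as in mVUallocVIb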
// Function Params
#define mP microVU* mVU, int recPass
@ -192,7 +195,7 @@ typedef u32 (__fastcall *mVUCall)(void*, void*);
#define branchAddrN ((xPC + 16 + (_Imm11_ * 8)) & (mVU->microMemSize-8))
#define shufflePQ (((mVU->p) ? 0xb0 : 0xe0) | ((mVU->q) ? 0x01 : 0x04))
#define cmpOffset(x) ((u8*)&(((u8*)x)[it[0].start]))
#define Rmem (uptr)&mVU->regs->VI[REG_R].UL
#define Rmem &mVU->regs->VI[REG_R].UL
#define aWrap(x, m) ((x > m) ? 0 : x)
#define shuffleSS(x) ((x==1)?(0x27):((x==2)?(0xc6):((x==4)?(0xe1):(0xe4))))
#define _1mb (0x100000)
@ -295,8 +298,13 @@ typedef u32 (__fastcall *mVUCall)(void*, void*);
#define mVUdebugNOW(isEndPC) { \
if (mVUdebugNow) { \
MOV32ItoR(gprT2, xPC); \
if (isEndPC) { CALLFunc((uptr)mVUprintPC2); } \
else { CALLFunc((uptr)mVUprintPC1); } \
xMOV(gprT2, xPC); \
if (isEndPC) { xCALL(mVUprintPC2); } \
else { xCALL(mVUprintPC1); } \
} \
}
void mVUmergeRegs(xmm dest, xmm src, int xyzw, bool modXYZW=false);
void mVUsaveReg(xmm reg, xAddressVoid ptr, int xyzw, bool modXYZW);
void mVUloadReg(xmm reg, xAddressVoid ptr, int xyzw);
void mVUloadIreg(xmm reg, int xyzw, VURegs* vuRegs);

@ -1,6 +1,6 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2010 PCSX2 Dev Team
*
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
@ -19,247 +19,190 @@
// Micro VU - Reg Loading/Saving/Shuffling/Unpacking/Merging...
//------------------------------------------------------------------
void mVUunpack_xyzw(int dstreg, int srcreg, int xyzw) {
void mVUunpack_xyzw(xmm dstreg, xmm srcreg, int xyzw)
{
switch ( xyzw ) {
case 0: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x00); break;
case 1: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x55); break;
case 2: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xaa); break;
case 3: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xff); break;
case 0: xPSHUF.D(dstreg, srcreg, 0x00); break;
case 1: xPSHUF.D(dstreg, srcreg, 0x55); break;
case 2: xPSHUF.D(dstreg, srcreg, 0xaa); break;
case 3: xPSHUF.D(dstreg, srcreg, 0xff); break;
}
}
void mVUloadReg(int reg, uptr offset, int xyzw) {
void mVUloadReg(xmm reg, xAddressVoid ptr, int xyzw)
{
switch( xyzw ) {
case 8: SSE_MOVSS_M32_to_XMM(reg, offset); break; // X
case 4: SSE_MOVSS_M32_to_XMM(reg, offset+4); break; // Y
case 2: SSE_MOVSS_M32_to_XMM(reg, offset+8); break; // Z
case 1: SSE_MOVSS_M32_to_XMM(reg, offset+12); break; // W
default: SSE_MOVAPS_M128_to_XMM(reg, offset); break;
case 8: xMOVSSZX(reg, ptr32[ptr]); break; // X
case 4: xMOVSSZX(reg, ptr32[ptr+4]); break; // Y
case 2: xMOVSSZX(reg, ptr32[ptr+8]); break; // Z
case 1: xMOVSSZX(reg, ptr32[ptr+12]); break; // W
default: xMOVAPS(reg, ptr128[ptr]); break;
}
}
void mVUloadReg2(int reg, int gprReg, uptr offset, int xyzw) {
switch( xyzw ) {
case 8: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset); break; // X
case 4: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset+4); break; // Y
case 2: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset+8); break; // Z
case 1: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset+12); break; // W
default: SSE_MOVAPSRmtoR(reg, gprReg, offset); break;
}
}
void mVUloadIreg(int reg, int xyzw, VURegs* vuRegs) {
SSE_MOVSS_M32_to_XMM(reg, (uptr)&vuRegs->VI[REG_I].UL);
if (!_XYZWss(xyzw)) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0);
void mVUloadIreg(xmm reg, int xyzw, VURegs* vuRegs)
{
xMOVSSZX(reg, ptr32[&vuRegs->VI[REG_I].UL]);
if (!_XYZWss(xyzw)) xSHUF.PS(reg, reg, 0);
}
// Modifies the Source Reg!
void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW) {
/*SSE_MOVAPS_M128_to_XMM(xmmT2, offset);
void mVUsaveReg(xmm reg, xAddressVoid ptr, int xyzw, bool modXYZW)
{
/*xMOVAPS(xmmT2, ptr128[ptr]);
if (modXYZW && (xyzw == 8 || xyzw == 4 || xyzw == 2 || xyzw == 1)) {
mVUunpack_xyzw<vuIndex>(reg, reg, 0);
mVUunpack_xyzw(reg, reg, 0);
}
mVUmergeRegs(xmmT2, reg, xyzw);
SSE_MOVAPS_XMM_to_M128(offset, xmmT2);
xMOVAPS(ptr128[ptr], xmmT2);
return;*/
switch ( xyzw ) {
case 5: if (x86caps.hasStreamingSIMD4Extensions) {
SSE4_EXTRACTPS_XMM_to_M32(offset+4, reg, 1);
SSE4_EXTRACTPS_XMM_to_M32(offset+12, reg, 3);
xEXTRACTPS(ptr32[ptr+4], reg, 1);
xEXTRACTPS(ptr32[ptr+12], reg, 3);
}
else {
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xe1); //WZXY
SSE_MOVSS_XMM_to_M32(offset+4, reg);
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW
SSE_MOVSS_XMM_to_M32(offset+12, reg);
xPSHUF.D(reg, reg, 0xe1); //WZXY
xMOVSS(ptr32[ptr+4], reg);
xPSHUF.D(reg, reg, 0xff); //WWWW
xMOVSS(ptr32[ptr+12], reg);
}
break; // YW
case 6: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xc9);
SSE_MOVLPS_XMM_to_M64(offset+4, reg);
case 6: xPSHUF.D(reg, reg, 0xc9);
xMOVL.PS(ptr64[ptr+4], reg);
break; // YZ
case 7: if (x86caps.hasStreamingSIMD4Extensions) {
SSE_MOVHPS_XMM_to_M64(offset+8, reg);
SSE4_EXTRACTPS_XMM_to_M32(offset+4, reg, 1);
xMOVH.PS(ptr64[ptr+8], reg);
xEXTRACTPS(ptr32[ptr+4], reg, 1);
}
else {
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0x93); //ZYXW
SSE_MOVHPS_XMM_to_M64(offset+4, reg);
SSE_MOVSS_XMM_to_M32(offset+12, reg);
xPSHUF.D(reg, reg, 0x93); //ZYXW
xMOVH.PS(ptr64[ptr+4], reg);
xMOVSS(ptr32[ptr+12], reg);
}
break; // YZW
case 9: if (x86caps.hasStreamingSIMD4Extensions) {
SSE_MOVSS_XMM_to_M32(offset, reg);
SSE4_EXTRACTPS_XMM_to_M32(offset+12, reg, 3);
xMOVSS(ptr32[ptr], reg);
xEXTRACTPS(ptr32[ptr+12], reg, 3);
}
else {
SSE_MOVSS_XMM_to_M32(offset, reg);
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW
SSE_MOVSS_XMM_to_M32(offset+12, reg);
xMOVSS(ptr32[ptr], reg);
xPSHUF.D(reg, reg, 0xff); //WWWW
xMOVSS(ptr32[ptr+12], reg);
}
break; // XW
case 10: if (x86caps.hasStreamingSIMD4Extensions) {
SSE_MOVSS_XMM_to_M32(offset, reg);
SSE4_EXTRACTPS_XMM_to_M32(offset+8, reg, 2);
xMOVSS(ptr32[ptr], reg);
xEXTRACTPS(ptr32[ptr+8], reg, 2);
}
else {
SSE_MOVSS_XMM_to_M32(offset, reg);
SSE_MOVHLPS_XMM_to_XMM(reg, reg);
SSE_MOVSS_XMM_to_M32(offset+8, reg);
xMOVSS(ptr32[ptr], reg);
xMOVHL.PS(reg, reg);
xMOVSS(ptr32[ptr+8], reg);
}
break; //XZ
case 11: SSE_MOVSS_XMM_to_M32(offset, reg);
SSE_MOVHPS_XMM_to_M64(offset+8, reg);
case 11: xMOVSS(ptr32[ptr], reg);
xMOVH.PS(ptr64[ptr+8], reg);
break; //XZW
case 13: if (x86caps.hasStreamingSIMD4Extensions) {
SSE_MOVLPS_XMM_to_M64(offset, reg);
SSE4_EXTRACTPS_XMM_to_M32(offset+12, reg, 3);
xMOVL.PS(ptr64[ptr], reg);
xEXTRACTPS(ptr32[ptr+12], reg, 3);
}
else {
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0x4b); //YXZW
SSE_MOVHPS_XMM_to_M64(offset, reg);
SSE_MOVSS_XMM_to_M32(offset+12, reg);
xPSHUF.D(reg, reg, 0x4b); //YXZW
xMOVH.PS(ptr64[ptr], reg);
xMOVSS(ptr32[ptr+12], reg);
}
break; // XYW
case 14: if (x86caps.hasStreamingSIMD4Extensions) {
SSE_MOVLPS_XMM_to_M64(offset, reg);
SSE4_EXTRACTPS_XMM_to_M32(offset+8, reg, 2);
xMOVL.PS(ptr64[ptr], reg);
xEXTRACTPS(ptr32[ptr+8], reg, 2);
}
else {
SSE_MOVLPS_XMM_to_M64(offset, reg);
SSE_MOVHLPS_XMM_to_XMM(reg, reg);
SSE_MOVSS_XMM_to_M32(offset+8, reg);
xMOVL.PS(ptr64[ptr], reg);
xMOVHL.PS(reg, reg);
xMOVSS(ptr32[ptr+8], reg);
}
break; // XYZ
case 4: if (!modXYZW) mVUunpack_xyzw(reg, reg, 1);
SSE_MOVSS_XMM_to_M32(offset+4, reg);
xMOVSS(ptr32[ptr+4], reg);
break; // Y
case 2: if (!modXYZW) mVUunpack_xyzw(reg, reg, 2);
SSE_MOVSS_XMM_to_M32(offset+8, reg);
xMOVSS(ptr32[ptr+8], reg);
break; // Z
case 1: if (!modXYZW) mVUunpack_xyzw(reg, reg, 3);
SSE_MOVSS_XMM_to_M32(offset+12, reg);
xMOVSS(ptr32[ptr+12], reg);
break; // W
case 8: SSE_MOVSS_XMM_to_M32(offset, reg); break; // X
case 12: SSE_MOVLPS_XMM_to_M64(offset, reg); break; // XY
case 3: SSE_MOVHPS_XMM_to_M64(offset+8, reg); break; // ZW
default: SSE_MOVAPS_XMM_to_M128(offset, reg); break; // XYZW
}
}
// Modifies the Source Reg!
void mVUsaveReg2(int reg, int gprReg, u32 offset, int xyzw) {
/*SSE_MOVAPSRmtoR(xmmT2, gprReg, offset);
if (xyzw == 8 || xyzw == 4 || xyzw == 2 || xyzw == 1) {
mVUunpack_xyzw<vuIndex>(reg, reg, 0);
}
mVUmergeRegs(xmmT2, reg, xyzw);
SSE_MOVAPSRtoRm(gprReg, xmmT2, offset);
return;*/
switch ( xyzw ) {
case 5: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xe1); //WZXY
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+4);
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12);
break; // YW
case 6: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xc9);
SSE_MOVLPS_XMM_to_Rm(gprReg, reg, offset+4);
break; // YZ
case 7: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0x93); //ZYXW
SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset+4);
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12);
break; // YZW
case 9: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset);
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12);
break; // XW
case 10: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset);
SSE_MOVHLPS_XMM_to_XMM(reg, reg);
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+8);
break; //XZ
case 11: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset);
SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset+8);
break; //XZW
case 13: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0x4b); //YXZW
SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset);
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12);
break; // XYW
case 14: SSE_MOVLPS_XMM_to_Rm(gprReg, reg, offset);
SSE_MOVHLPS_XMM_to_XMM(reg, reg);
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+8);
break; // XYZ
case 8: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset); break; // X
case 4: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+4); break; // Y
case 2: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+8); break; // Z
case 1: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12); break; // W
case 12: SSE_MOVLPS_XMM_to_Rm(gprReg, reg, offset); break; // XY
case 3: SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset+8); break; // ZW
default: SSE_MOVAPSRtoRm(gprReg, reg, offset); break; // XYZW
case 8: xMOVSS(ptr32[ptr], reg); break; // X
case 12: xMOVL.PS(ptr64[ptr], reg); break; // XY
case 3: xMOVH.PS(ptr64[ptr+8], reg); break; // ZW
default: xMOVAPS(ptr128[ptr], reg); break; // XYZW
}
}
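The xyzw argument throughout these helpers is a 4-bit field with X = 8, Y = 4, Z = 2, W = 1 (so 0xf means all four lanes). For example, xyzw == 6 (Y|Z) takes the case-6 path above: one xPSHUF.D rotates Y and Z into the low quadword, then a single xMOVL.PS stores 8 bytes at ptr+4.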
// Modifies the Source Reg! (ToDo: Optimize modXYZW = 1 cases)
void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0) {
void mVUmergeRegs(xmm dest, xmm src, int xyzw, bool modXYZW)
{
xyzw &= 0xf;
if ( (dest != src) && (xyzw != 0) ) {
if (x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf)) {
if (modXYZW) {
if (xyzw == 1) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 3, 0)); return; }
else if (xyzw == 2) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 2, 0)); return; }
else if (xyzw == 4) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 1, 0)); return; }
if (xyzw == 1) { xINSERTPS(dest, src, _MM_MK_INSERTPS_NDX(0, 3, 0)); return; }
else if (xyzw == 2) { xINSERTPS(dest, src, _MM_MK_INSERTPS_NDX(0, 2, 0)); return; }
else if (xyzw == 4) { xINSERTPS(dest, src, _MM_MK_INSERTPS_NDX(0, 1, 0)); return; }
}
xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw);
xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
xBLEND.PS(dest, src, xyzw);
}
else {
switch (xyzw) {
case 1: if (modXYZW) mVUunpack_xyzw(src, src, 0);
SSE_MOVHLPS_XMM_to_XMM(src, dest); // src = Sw Sz Dw Dz
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4); // 11 00 01 00
xMOVHL.PS(src, dest); // src = Sw Sz Dw Dz
xSHUF.PS(dest, src, 0xc4); // 11 00 01 00
break;
case 2: if (modXYZW) mVUunpack_xyzw(src, src, 0);
SSE_MOVHLPS_XMM_to_XMM(src, dest);
SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64);
xMOVHL.PS(src, dest);
xSHUF.PS(dest, src, 0x64);
break;
case 3: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
case 3: xSHUF.PS(dest, src, 0xe4);
break;
case 4: if (modXYZW) mVUunpack_xyzw(src, src, 0);
SSE_MOVSS_XMM_to_XMM(src, dest);
SSE2_MOVSD_XMM_to_XMM(dest, src);
xMOVSS(src, dest);
xMOVSD(dest, src);
break;
case 5: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8);
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0xd8);
case 5: xSHUF.PS(dest, src, 0xd8);
xPSHUF.D(dest, dest, 0xd8);
break;
case 6: SSE_SHUFPS_XMM_to_XMM(dest, src, 0x9c);
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0x78);
case 6: xSHUF.PS(dest, src, 0x9c);
xPSHUF.D(dest, dest, 0x78);
break;
case 7: SSE_MOVSS_XMM_to_XMM(src, dest);
SSE_MOVAPS_XMM_to_XMM(dest, src);
case 7: xMOVSS(src, dest);
xMOVAPS(dest, src);
break;
case 8: SSE_MOVSS_XMM_to_XMM(dest, src);
case 8: xMOVSS(dest, src);
break;
case 9: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc9);
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0xd2);
case 9: xSHUF.PS(dest, src, 0xc9);
xPSHUF.D(dest, dest, 0xd2);
break;
case 10: SSE_SHUFPS_XMM_to_XMM(dest, src, 0x8d);
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0x72);
case 10: xSHUF.PS(dest, src, 0x8d);
xPSHUF.D(dest, dest, 0x72);
break;
case 11: SSE_MOVSS_XMM_to_XMM(dest, src);
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
case 11: xMOVSS(dest, src);
xSHUF.PS(dest, src, 0xe4);
break;
case 12: SSE2_MOVSD_XMM_to_XMM(dest, src);
case 12: xMOVSD(dest, src);
break;
case 13: SSE_MOVHLPS_XMM_to_XMM(dest, src);
SSE_SHUFPS_XMM_to_XMM(src, dest, 0x64);
SSE_MOVAPS_XMM_to_XMM(dest, src);
case 13: xMOVHL.PS(dest, src);
xSHUF.PS(src, dest, 0x64);
xMOVAPS(dest, src);
break;
case 14: SSE_MOVHLPS_XMM_to_XMM(dest, src);
SSE_SHUFPS_XMM_to_XMM(src, dest, 0xc4);
SSE_MOVAPS_XMM_to_XMM(dest, src);
case 14: xMOVHL.PS(dest, src);
xSHUF.PS(src, dest, 0xc4);
xMOVAPS(dest, src);
break;
default: SSE_MOVAPS_XMM_to_XMM(dest, src);
default: xMOVAPS(dest, src);
break;
}
}
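The immediate fix-up before xBLEND.PS is needed because the VU-style mask puts X in bit 3 and W in bit 0, while BLENDPS's imm8 selects lane x with bit 0 and lane w with bit 3; the expression simply bit-reverses the low nibble. A small helper showing the same conversion (hypothetical, for illustration):

static int vuMaskToBlendImm(int xyzw)
{
    // reverse the 4-bit mask: X (value 8) ends up in bit 0, W (value 1) in bit 3
    return ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
}
// e.g. vuMaskToBlendImm(0xc /*XY*/) == 0x3, so xBLEND.PS copies lanes x and y from src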
@ -271,33 +214,35 @@ void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0) {
//------------------------------------------------------------------
// Transforms the Address in gprReg to valid VU0/VU1 Address
_f void mVUaddrFix(mV, int gprReg) {
_f void mVUaddrFix(mV, x32 gprReg)
{
if (isVU1) {
AND32ItoR(gprReg, 0x3ff); // wrap around
SHL32ItoR(gprReg, 4);
xAND(gprReg, 0x3ff); // wrap around
xSHL(gprReg, 4);
}
else {
u8 *jmpA, *jmpB;
CMP32ItoR(gprReg, 0x400);
jmpA = JL8(0); // if addr >= 0x4000, reads VU1's VF regs and VI regs
AND32ItoR(gprReg, 0x43f); // ToDo: there's a potential problem if VU0 overrides VU1's VF0/VI0 regs!
jmpB = JMP8(0);
x86SetJ8(jmpA);
AND32ItoR(gprReg, 0xff); // if addr < 0x4000, wrap around
x86SetJ8(jmpB);
SHL32ItoR(gprReg, 4); // multiply by 16 (shift left by 4)
xCMP(gprReg, 0x400);
xForwardJL8 jmpA; // if addr >= 0x4000, reads VU1's VF regs and VI regs
xAND(gprReg, 0x43f); // ToDo: there's a potential problem if VU0 overrides VU1's VF0/VI0 regs!
xForwardJump8 jmpB;
jmpA.SetTarget();
xAND(gprReg, 0xff); // if addr < 0x4000, wrap around
jmpB.SetTarget();
xSHL(gprReg, 4); // multiply by 16 (shift left by 4)
}
}
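A scalar sketch of the transform being emitted (constants taken from the code above): the incoming value is a quadword index, which is wrapped to the unit's data memory and then scaled to a byte offset.

static u32 vuAddrFix_sketch(u32 idx, bool isVU1)
{
    if (isVU1)        return (idx & 0x3ff) << 4;  // VU1: wrap to 1024 quadwords (16KB), *16
    if (idx >= 0x400) return (idx & 0x43f) << 4;  // VU0: 0x400+ maps into VU1's VF/VI reg area
    return (idx & 0xff) << 4;                     // VU0: wrap to 256 quadwords (4KB), *16
}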
// Backup Volatile Regs (EAX, ECX, EDX, MM0~7, XMM0~7, are all volatile according to 32bit Win/Linux ABI)
_f void mVUbackupRegs(microVU* mVU) {
_f void mVUbackupRegs(microVU* mVU)
{
mVU->regAlloc->flushAll();
SSE_MOVAPS_XMM_to_M128((uptr)&mVU->xmmPQb[0], xmmPQ);
xMOVAPS(ptr128[&mVU->xmmPQb[0]], xmmPQ);
}
// Restore Volatile Regs
_f void mVUrestoreRegs(microVU* mVU) {
SSE_MOVAPS_M128_to_XMM(xmmPQ, (uptr)&mVU->xmmPQb[0]);
_f void mVUrestoreRegs(microVU* mVU)
{
xMOVAPS(xmmPQ, ptr128[&mVU->xmmPQb[0]]);
}
//------------------------------------------------------------------
@ -306,7 +251,7 @@ _f void mVUrestoreRegs(microVU* mVU) {
struct SSEMaskPair { u32 mask1[4], mask2[4]; };
static const __aligned16 SSEMaskPair MIN_MAX =
static const __aligned16 SSEMaskPair MIN_MAX =
{
{0xffffffff, 0x80000000, 0xffffffff, 0x80000000},
{0x00000000, 0x40000000, 0x00000000, 0x40000000}
@ -314,121 +259,117 @@ static const __aligned16 SSEMaskPair MIN_MAX =
// Warning: Modifies t1 and t2
void MIN_MAX_PS(microVU* mVU, int to, int from, int t1, int t2, bool min) {
bool t1b = 0, t2b = 0;
if (t1 < 0) { t1 = mVU->regAlloc->allocReg(); t1b = 1; }
if (t2 < 0) { t2 = mVU->regAlloc->allocReg(); t2b = 1; }
void MIN_MAX_PS(microVU* mVU, xmm to, xmm from, xmm t1in, xmm t2in, bool min)
{
xmm t1 = t1in.IsEmpty() ? mVU->regAlloc->allocReg() : t1in;
xmm t2 = t2in.IsEmpty() ? mVU->regAlloc->allocReg() : t2in;
// ZW
SSE2_PSHUFD_XMM_to_XMM(t1, to, 0xfa);
SSE2_PAND_M128_to_XMM (t1, (uptr)MIN_MAX.mask1);
SSE2_POR_M128_to_XMM (t1, (uptr)MIN_MAX.mask2);
SSE2_PSHUFD_XMM_to_XMM(t2, from, 0xfa);
SSE2_PAND_M128_to_XMM (t2, (uptr)MIN_MAX.mask1);
SSE2_POR_M128_to_XMM (t2, (uptr)MIN_MAX.mask2);
if (min) SSE2_MINPD_XMM_to_XMM(t1, t2);
else SSE2_MAXPD_XMM_to_XMM(t1, t2);
xPSHUF.D(t1, to, 0xfa);
xPAND (t1, ptr128[MIN_MAX.mask1]);
xPOR (t1, ptr128[MIN_MAX.mask2]);
xPSHUF.D(t2, from, 0xfa);
xPAND (t2, ptr128[MIN_MAX.mask1]);
xPOR (t2, ptr128[MIN_MAX.mask2]);
if (min) xMIN.PD(t1, t2);
else xMAX.PD(t1, t2);
// XY
SSE2_PSHUFD_XMM_to_XMM(t2, from, 0x50);
SSE2_PAND_M128_to_XMM (t2, (uptr)MIN_MAX.mask1);
SSE2_POR_M128_to_XMM (t2, (uptr)MIN_MAX.mask2);
SSE2_PSHUFD_XMM_to_XMM(to, to, 0x50);
SSE2_PAND_M128_to_XMM (to, (uptr)MIN_MAX.mask1);
SSE2_POR_M128_to_XMM (to, (uptr)MIN_MAX.mask2);
if (min) SSE2_MINPD_XMM_to_XMM(to, t2);
else SSE2_MAXPD_XMM_to_XMM(to, t2);
xPSHUF.D(t2, from, 0x50);
xPAND (t2, ptr128[MIN_MAX.mask1]);
xPOR (t2, ptr128[MIN_MAX.mask2]);
xPSHUF.D(to, to, 0x50);
xPAND (to, ptr128[MIN_MAX.mask1]);
xPOR (to, ptr128[MIN_MAX.mask2]);
if (min) xMIN.PD(to, t2);
else xMAX.PD(to, t2);
SSE_SHUFPS_XMM_to_XMM(to, t1, 0x88);
if (t1b) mVU->regAlloc->clearNeeded(t1);
if (t2b) mVU->regAlloc->clearNeeded(t2);
xSHUF.PS(to, t1, 0x88);
if (t1 != t1in) mVU->regAlloc->clearNeeded(t1);
if (t2 != t2in) mVU->regAlloc->clearNeeded(t2);
}
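A note on the mask trick (my reading of it, kept hedged): for each input lane f, the PSHUF.D / PAND / POR sequence builds a 64-bit lane whose low dword is the raw bits of f and whose high dword is sign(f) | 0x40000000, i.e. a well-formed double with the float's bits packed into the low mantissa. Comparing those lanes with xMIN.PD / xMAX.PD therefore orders them purely by the floats' bit patterns, which reproduces the VU's min/max behaviour (no NaN or denormal special-casing) without hitting MINPS/MAXPS edge cases; the final xSHUF.PS(to, t1, 0x88) just gathers the four low dwords back into one register.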
// Warning: Modifies to's upper 3 vectors, and t1
void MIN_MAX_SS(mV, int to, int from, int t1, bool min) {
bool t1b = 0;
if (t1 < 0) { t1 = mVU->regAlloc->allocReg(); t1b = 1; }
SSE_SHUFPS_XMM_to_XMM (to, from, 0);
SSE2_PAND_M128_to_XMM (to, (uptr)MIN_MAX.mask1);
SSE2_POR_M128_to_XMM (to, (uptr)MIN_MAX.mask2);
SSE2_PSHUFD_XMM_to_XMM(t1, to, 0xee);
if (min) SSE2_MINPD_XMM_to_XMM(to, t1);
else SSE2_MAXPD_XMM_to_XMM(to, t1);
if (t1b) mVU->regAlloc->clearNeeded(t1);
void MIN_MAX_SS(mV, xmm to, xmm from, xmm t1in, bool min)
{
xmm t1 = t1in.IsEmpty() ? mVU->regAlloc->allocReg() : t1in;
xSHUF.PS(to, from, 0);
xPAND (to, ptr128[MIN_MAX.mask1]);
xPOR (to, ptr128[MIN_MAX.mask2]);
xPSHUF.D(t1, to, 0xee);
if (min) xMIN.PD(to, t1);
else xMAX.PD(to, t1);
if (t1 != t1in) mVU->regAlloc->clearNeeded(t1);
}
// Warning: Modifies all vectors in 'to' and 'from', and Modifies xmmT1 and xmmT2
void ADD_SS(microVU* mVU, int to, int from, int t1, int t2) {
void ADD_SS(microVU* mVU, xmm to, xmm from, xmm t1in, xmm t2in)
{
xmm t1 = t1in.IsEmpty() ? mVU->regAlloc->allocReg() : t1in;
xmm t2 = t2in.IsEmpty() ? mVU->regAlloc->allocReg() : t2in;
u8 *localptr[8];
bool t1b = 0, t2b = 0;
if (t1 < 0) { t1 = mVU->regAlloc->allocReg(); t1b = 1; }
if (t2 < 0) { t2 = mVU->regAlloc->allocReg(); t2b = 1; }
xMOVAPS(t1, to);
xMOVAPS(t2, from);
xMOVD(ecx, to);
xSHR(ecx, 23);
xMOVD(eax, from);
xSHR(eax, 23);
xAND(ecx, 0xff);
xAND(eax, 0xff);
xSUB(ecx, eax); //ecx = exponent difference
SSE_MOVAPS_XMM_to_XMM(t1, to);
SSE_MOVAPS_XMM_to_XMM(t2, from);
SSE2_MOVD_XMM_to_R(gprT2, to);
SHR32ItoR(gprT2, 23);
SSE2_MOVD_XMM_to_R(gprT1, from);
SHR32ItoR(gprT1, 23);
AND32ItoR(gprT2, 0xff);
AND32ItoR(gprT1, 0xff);
SUB32RtoR(gprT2, gprT1); //gprT2 = exponent difference
xCMP(ecx, 25);
xForwardJGE8 case2;
xCMP(ecx, 0);
xForwardJG8 case3;
xForwardJE8 toend1;
xCMP(ecx, -25);
xForwardJLE8 case4;
CMP32ItoR(gprT2, 25);
localptr[0] = JGE8(0);
CMP32ItoR(gprT2, 0);
localptr[1] = JG8(0);
localptr[2] = JE8(0);
CMP32ItoR(gprT2, -25);
localptr[3] = JLE8(0);
NEG32R(gprT2);
DEC32R(gprT2);
MOV32ItoR(gprT1, 0xffffffff);
SHL32CLtoR(gprT1);
SSE2_PCMPEQB_XMM_to_XMM(to, to);
SSE2_MOVD_R_to_XMM(from, gprT1);
SSE_MOVSS_XMM_to_XMM(to, from);
SSE2_PCMPEQB_XMM_to_XMM(from, from);
localptr[4] = JMP8(0);
// negative small
xNOT(ecx); // -ecx - 1
xMOV(eax, 0xffffffff);
xSHL(eax, cl);
xPCMP.EQB(to, to);
xMOVDZX(from, eax);
xMOVSS(to, from);
xPCMP.EQB(from, from);
xForwardJump8 toend2;
x86SetJ8(localptr[0]);
MOV32ItoR(gprT1, 0x80000000);
SSE2_PCMPEQB_XMM_to_XMM(from, from);
SSE2_MOVD_R_to_XMM(to, gprT1);
SSE_MOVSS_XMM_to_XMM(from, to);
SSE2_PCMPEQB_XMM_to_XMM(to, to);
localptr[5] = JMP8(0);
case2.SetTarget(); // positive large
xMOV(eax, 0x80000000);
xPCMP.EQB(from, from);
xMOVDZX(to, eax);
xMOVSS(from, to);
xPCMP.EQB(to, to);
xForwardJump8 toend3;
x86SetJ8(localptr[1]);
DEC32R(gprT2);
MOV32ItoR(gprT1, 0xffffffff);
SHL32CLtoR(gprT1);
SSE2_PCMPEQB_XMM_to_XMM(from, from);
SSE2_MOVD_R_to_XMM(to, gprT1);
SSE_MOVSS_XMM_to_XMM(from, to);
SSE2_PCMPEQB_XMM_to_XMM(to, to);
localptr[6] = JMP8(0);
case3.SetTarget(); // positive small
xDEC(ecx);
xMOV(eax, 0xffffffff);
xSHL(eax, cl);
xPCMP.EQB(from, from);
xMOVDZX(to, eax);
xMOVSS(from, to);
xPCMP.EQB(to, to);
xForwardJump8 toend4;
x86SetJ8(localptr[3]);
MOV32ItoR(gprT1, 0x80000000);
SSE2_PCMPEQB_XMM_to_XMM(to, to);
SSE2_MOVD_R_to_XMM(from, gprT1);
SSE_MOVSS_XMM_to_XMM(to, from);
SSE2_PCMPEQB_XMM_to_XMM(from, from);
localptr[7] = JMP8(0);
case4.SetTarget(); // negative large
xMOV(eax, 0x80000000);
xPCMP.EQB(to, to);
xMOVDZX(from, eax);
xMOVSS(to, from);
xPCMP.EQB(from, from);
x86SetJ8(localptr[2]);
x86SetJ8(localptr[4]);
x86SetJ8(localptr[5]);
x86SetJ8(localptr[6]);
x86SetJ8(localptr[7]);
toend1.SetTarget();
toend2.SetTarget();
toend3.SetTarget();
toend4.SetTarget();
SSE_ANDPS_XMM_to_XMM(to, t1); // to contains mask
SSE_ANDPS_XMM_to_XMM(from, t2); // from contains mask
SSE_ADDSS_XMM_to_XMM(to, from);
if (t1b) mVU->regAlloc->clearNeeded(t1);
if (t2b) mVU->regAlloc->clearNeeded(t2);
xAND.PS(to, t1); // to contains mask
xAND.PS(from, t2); // from contains mask
xADD.SS(to, from);
if (t1 != t1in) mVU->regAlloc->clearNeeded(t1);
if (t2 != t2in) mVU->regAlloc->clearNeeded(t2);
}
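// A hedged scalar sketch of the exponent-masking trick above (not commit code;
// the helper name is made up): the VU adder has no guard/round bits, so before
// the x86 ADDSS the operand with the smaller exponent has the mantissa bits it
// could never contribute cleared.  A difference of 25 or more means the smaller
// operand vanishes entirely (only its sign is kept), and equal exponents add
// unmodified -- exactly the five cases the jump ladder above selects between.
#include <cstdint>
#include <cstring>

static float vuAddScalar(float a, float b)
{
    uint32_t ia, ib;
    std::memcpy(&ia, &a, sizeof(ia));
    std::memcpy(&ib, &b, sizeof(ib));
    int diff = (int)((ia >> 23) & 0xff) - (int)((ib >> 23) & 0xff);

    if      (diff >=  25) ib &= 0x80000000u;                  // b contributes nothing
    else if (diff >    0) ib &= 0xffffffffu << (diff - 1);    // drop b's unusable low bits
    else if (diff <= -25) ia &= 0x80000000u;                  // a contributes nothing
    else if (diff <    0) ia &= 0xffffffffu << (-diff - 1);   // drop a's unusable low bits
    // diff == 0: both operands are added as-is

    float fa, fb;
    std::memcpy(&fa, &ia, sizeof(fa));
    std::memcpy(&fb, &ib, sizeof(fb));
    return fa + fb;
}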
#define clampOp(opX, isPS) { \
@ -438,53 +379,68 @@ void ADD_SS(microVU* mVU, int to, int from, int t1, int t2) {
mVUclamp4(to, t1, (isPS)?0xf:0x8); \
}
void SSE_MAXPS(mV, int to, int from, int t1 = -1, int t2 = -1) {
if (CHECK_VU_MINMAXHACK) { SSE_MAXPS_XMM_to_XMM(to, from); }
void SSE_MAXPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
if (CHECK_VU_MINMAXHACK) { xMAX.PS(to, from); }
else { MIN_MAX_PS(mVU, to, from, t1, t2, 0); }
}
void SSE_MINPS(mV, int to, int from, int t1 = -1, int t2 = -1) {
if (CHECK_VU_MINMAXHACK) { SSE_MINPS_XMM_to_XMM(to, from); }
void SSE_MINPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
if (CHECK_VU_MINMAXHACK) { xMIN.PS(to, from); }
else { MIN_MAX_PS(mVU, to, from, t1, t2, 1); }
}
void SSE_MAXSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
if (CHECK_VU_MINMAXHACK) { SSE_MAXSS_XMM_to_XMM(to, from); }
void SSE_MAXSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
if (CHECK_VU_MINMAXHACK) { xMAX.SS(to, from); }
else { MIN_MAX_SS(mVU, to, from, t1, 0); }
}
void SSE_MINSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
if (CHECK_VU_MINMAXHACK) { SSE_MINSS_XMM_to_XMM(to, from); }
void SSE_MINSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
if (CHECK_VU_MINMAXHACK) { xMIN.SS(to, from); }
else { MIN_MAX_SS(mVU, to, from, t1, 1); }
}
void SSE_ADD2SS(mV, int to, int from, int t1 = -1, int t2 = -1) {
if (!CHECK_VUADDSUBHACK) { clampOp(SSE_ADDSS_XMM_to_XMM, 0); }
void SSE_ADD2SS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
if (!CHECK_VUADDSUBHACK) { clampOp(xADD.SS, 0); }
else { ADD_SS(mVU, to, from, t1, t2); }
}
void SSE_ADD2PS(mV, int to, int from, int t1 = -1, int t2 = -1) {
clampOp(SSE_ADDPS_XMM_to_XMM, 1);
// FIXME: why do we need two identical definitions with different names?
void SSE_ADD2PS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
clampOp(xADD.PS, 1);
}
void SSE_ADDPS(mV, int to, int from, int t1 = -1, int t2 = -1) {
clampOp(SSE_ADDPS_XMM_to_XMM, 1);
void SSE_ADDPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
clampOp(xADD.PS, 1);
}
void SSE_ADDSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
clampOp(SSE_ADDSS_XMM_to_XMM, 0);
void SSE_ADDSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
clampOp(xADD.SS, 0);
}
void SSE_SUBPS(mV, int to, int from, int t1 = -1, int t2 = -1) {
clampOp(SSE_SUBPS_XMM_to_XMM, 1);
void SSE_SUBPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
clampOp(xSUB.PS, 1);
}
void SSE_SUBSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
clampOp(SSE_SUBSS_XMM_to_XMM, 0);
void SSE_SUBSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
clampOp(xSUB.SS, 0);
}
void SSE_MULPS(mV, int to, int from, int t1 = -1, int t2 = -1) {
clampOp(SSE_MULPS_XMM_to_XMM, 1);
void SSE_MULPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
clampOp(xMUL.PS, 1);
}
void SSE_MULSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
clampOp(SSE_MULSS_XMM_to_XMM, 0);
void SSE_MULSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
clampOp(xMUL.SS, 0);
}
void SSE_DIVPS(mV, int to, int from, int t1 = -1, int t2 = -1) {
clampOp(SSE_DIVPS_XMM_to_XMM, 1);
void SSE_DIVPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
clampOp(xDIV.PS, 1);
}
void SSE_DIVSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
clampOp(SSE_DIVSS_XMM_to_XMM, 0);
void SSE_DIVSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
{
clampOp(xDIV.SS, 0);
}
//------------------------------------------------------------------
@ -493,7 +449,7 @@ void SSE_DIVSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
static __pagealigned u8 mVUsearchXMM[__pagesize];
// Generates a custom optimized block-search function
// Note: Structs must be 16-byte aligned! (GCC doesn't guarantee this)
void mVUcustomSearch() {
HostSys::MemProtectStatic(mVUsearchXMM, Protect_ReadWrite, false);

View File

@ -21,54 +21,61 @@
#define AND_XYZW ((_XYZW_SS && modXYZW) ? (1) : (mFLAG.doFlag ? (_X_Y_Z_W) : (flipMask[_X_Y_Z_W])))
#define ADD_XYZW ((_XYZW_SS && modXYZW) ? (_X ? 3 : (_Y ? 2 : (_Z ? 1 : 0))) : 0)
#define SHIFT_XYZW(gprReg) { if (_XYZW_SS && modXYZW && !_W) { SHL32ItoR(gprReg, ADD_XYZW); } }
#define SHIFT_XYZW(gprReg) { if (_XYZW_SS && modXYZW && !_W) { xSHL(gprReg, ADD_XYZW); } }
// Note: If modXYZW is true, then it adjusts XYZW for Single Scalar operations
static void mVUupdateFlags(mV, int reg, int regT1 = -1, int regT2 = -1, bool modXYZW = 1) {
int sReg, mReg = gprT1, regT1b = 0, regT2b = 0;
static void mVUupdateFlags(mV, xmm reg, xmm regT1in = xEmptyReg, xmm regT2 = xEmptyReg, bool modXYZW = 1) {
x32 mReg = gprT1;
bool regT2b = false;
static const u16 flipMask[16] = {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15};
//SysPrintf("Status = %d; Mac = %d\n", sFLAG.doFlag, mFLAG.doFlag);
if (!sFLAG.doFlag && !mFLAG.doFlag) { return; }
if ((mFLAG.doFlag && !(_XYZW_SS && modXYZW))) {
if (regT2 < 0) { regT2 = mVU->regAlloc->allocReg(); regT2b = 1; }
SSE2_PSHUFD_XMM_to_XMM(regT2, reg, 0x1B); // Flip wzyx to xyzw
xmm regT1 = regT1in.IsEmpty() ? mVU->regAlloc->allocReg() : regT1in;
if ((mFLAG.doFlag && !(_XYZW_SS && modXYZW)))
{
if (regT2.IsEmpty())
{
regT2 = mVU->regAlloc->allocReg();
regT2b = true;
}
xPSHUF.D(regT2, reg, 0x1B); // Flip wzyx to xyzw
}
else regT2 = reg;
else
regT2 = reg;
if (sFLAG.doFlag) {
getFlagReg(sReg, sFLAG.write); // Set sReg to valid GPR by Cur Flag Instance
mVUallocSFLAGa(sReg, sFLAG.lastWrite); // Get Prev Status Flag
if (sFLAG.doNonSticky) AND32ItoR(sReg, 0xfffc00ff); // Clear O,U,S,Z flags
mVUallocSFLAGa(getFlagReg(sFLAG.write), sFLAG.lastWrite); // Get Prev Status Flag
if (sFLAG.doNonSticky) xAND(getFlagReg(sFLAG.write), 0xfffc00ff); // Clear O,U,S,Z flags
}
if (regT1 < 0) { regT1 = mVU->regAlloc->allocReg(); regT1b = 1; }
//-------------------------Check for Signed flags------------------------------
SSE_MOVMSKPS_XMM_to_R32(mReg, regT2); // Move the Sign Bits of the t2reg
SSE_XORPS_XMM_to_XMM (regT1, regT1); // Clear regT1
SSE_CMPEQPS_XMM_to_XMM (regT1, regT2); // Set all F's if each vector is zero
SSE_MOVMSKPS_XMM_to_R32(gprT2, regT1); // Used for Zero Flag Calculation
xMOVMSKPS(mReg, regT2); // Move the Sign Bits of the t2reg
xXOR.PS (regT1, regT1); // Clear regT1
xCMPEQ.PS(regT1, regT2); // Set all F's if each vector is zero
xMOVMSKPS(gprT2, regT1); // Used for Zero Flag Calculation
AND32ItoR(mReg, AND_XYZW); // Grab "Is Signed" bits from the previous calculation
SHL32ItoR(mReg, 4 + ADD_XYZW);
xAND(mReg, AND_XYZW); // Grab "Is Signed" bits from the previous calculation
xSHL(mReg, 4 + ADD_XYZW);
//-------------------------Check for Zero flags------------------------------
AND32ItoR(gprT2, AND_XYZW); // Grab "Is Zero" bits from the previous calculation
xAND(gprT2, AND_XYZW); // Grab "Is Zero" bits from the previous calculation
if (mFLAG.doFlag) { SHIFT_XYZW(gprT2); }
OR32RtoR(mReg, gprT2);
xOR(mReg, gprT2);
//-------------------------Write back flags------------------------------
if (mFLAG.doFlag) mVUallocMFLAGb(mVU, mReg, mFLAG.write); // Set Mac Flag
if (sFLAG.doFlag) {
OR32RtoR (sReg, mReg);
xOR(getFlagReg(sFLAG.write), mReg);
if (sFLAG.doNonSticky) {
SHL32ItoR(mReg, 8);
OR32RtoR (sReg, mReg);
xSHL(mReg, 8);
xOR(getFlagReg(sFLAG.write), mReg);
}
}
if (regT1b) mVU->regAlloc->clearNeeded(regT1);
if (regT1 != regT1in) mVU->regAlloc->clearNeeded(regT1);
if (regT2b) mVU->regAlloc->clearNeeded(regT2);
}
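// Rough scalar model of the mac-flag value computed above (a hedged sketch, not
// commit code; mVU keeps the status flag in its own packed layout, so only the
// mac-flag nibbles are modelled): sign bits land in bits 4-7 and zero bits in
// bits 0-3, limited to the fields the instruction writes.  The same value is
// then OR'd into the status word, with the non-sticky copy shifted left by 8.
#include <cstdint>
#include <cstring>

static uint32_t vuMacFlags(const float xyzw[4], int writeMask /* _X_Y_Z_W, x = bit 3 */)
{
    uint32_t sign = 0, zero = 0;
    for (int i = 0; i < 4; i++) {
        uint32_t raw;
        std::memcpy(&raw, &xyzw[i], sizeof(raw));
        uint32_t bit = 1u << (3 - i);          // x -> bit 3 ... w -> bit 0
        if (raw & 0x80000000u)      sign |= bit;
        if (!(raw & 0x7fffffffu))   zero |= bit;
    }
    return ((sign & writeMask) << 4) | (zero & writeMask);
}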
@ -76,7 +83,7 @@ static void mVUupdateFlags(mV, int reg, int regT1 = -1, int regT2 = -1, bool mod
// Helper Macros and Functions
//------------------------------------------------------------------
static void (*SSE_PS[]) (microVU*, int, int, int, int) = {
static void (*SSE_PS[]) (microVU*, xmm, xmm, xmm, xmm) = {
SSE_ADDPS, // 0
SSE_SUBPS, // 1
SSE_MULPS, // 2
@ -85,7 +92,7 @@ static void (*SSE_PS[]) (microVU*, int, int, int, int) = {
SSE_ADD2PS // 5
};
static void (*SSE_SS[]) (microVU*, int, int, int, int) = {
static void (*SSE_SS[]) (microVU*, xmm, xmm, xmm, xmm) = {
SSE_ADDSS, // 0
SSE_SUBSS, // 1
SSE_MULSS, // 2
@ -122,9 +129,9 @@ void setupPass1(microVU* mVU, int opCase, bool isACC, bool noFlagUpdate) {
bool doSafeSub(microVU* mVU, int opCase, int opType, bool isACC) {
opCase1 {
if ((opType == 1) && (_Ft_ == _Fs_)) {
int Fs = mVU->regAlloc->allocReg(-1, isACC ? 32 : _Fd_, _X_Y_Z_W);
SSE2_PXOR_XMM_to_XMM(Fs, Fs); // Set to Positive 0
mVUupdateFlags(mVU, Fs, -1);
xmm Fs = mVU->regAlloc->allocReg(-1, isACC ? 32 : _Fd_, _X_Y_Z_W);
xPXOR(Fs, Fs); // Set to Positive 0
mVUupdateFlags(mVU, Fs);
mVU->regAlloc->clearNeeded(Fs);
return 1;
}
@ -133,11 +140,11 @@ bool doSafeSub(microVU* mVU, int opCase, int opType, bool isACC) {
}
// Sets Up Ft Reg for Normal, BC, I, and Q Cases
void setupFtReg(microVU* mVU, int& Ft, int& tempFt, int opCase) {
void setupFtReg(microVU* mVU, xmm& Ft, xmm& tempFt, int opCase) {
opCase1 {
if (_XYZW_SS2) { Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W); tempFt = Ft; }
else if (clampE) { Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf); tempFt = Ft; }
else { Ft = mVU->regAlloc->allocReg(_Ft_); tempFt = -1; }
else { Ft = mVU->regAlloc->allocReg(_Ft_); tempFt = xEmptyReg; }
}
opCase2 {
tempFt = mVU->regAlloc->allocReg(_Ft_);
@ -148,7 +155,7 @@ void setupFtReg(microVU* mVU, int& Ft, int& tempFt, int opCase) {
}
opCase3 { Ft = mVU->regAlloc->allocReg(33, 0, _X_Y_Z_W); tempFt = Ft; }
opCase4 {
if (!clampE && _XYZW_SS && !mVUinfo.readQ) { Ft = xmmPQ; tempFt = -1; }
if (!clampE && _XYZW_SS && !mVUinfo.readQ) { Ft = xmmPQ; tempFt = xEmptyReg; }
else { Ft = mVU->regAlloc->allocReg(); tempFt = Ft; getQreg(Ft, mVUinfo.readQ); }
}
}
@ -159,27 +166,27 @@ void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, co
pass2 {
if (doSafeSub(mVU, opCase, opType, isACC)) return;
int Fs, Ft, ACC, tempFt;
xmm Fs, Ft, ACC, tempFt;
setupFtReg(mVU, Ft, tempFt, opCase);
if (isACC) {
Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
ACC = mVU->regAlloc->allocReg((_X_Y_Z_W == 0xf) ? -1 : 32, 32, 0xf, 0);
if (_XYZW_SS2) SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W));
if (_XYZW_SS2) xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W));
}
else { Fs = mVU->regAlloc->allocReg(_Fs_, _Fd_, _X_Y_Z_W); }
if (clampType & cFt) mVUclamp2(mVU, Ft, -1, _X_Y_Z_W);
if (clampType & cFs) mVUclamp2(mVU, Fs, -1, _X_Y_Z_W);
if (clampType & cFt) mVUclamp2(mVU, Ft, xEmptyReg, _X_Y_Z_W);
if (clampType & cFs) mVUclamp2(mVU, Fs, xEmptyReg, _X_Y_Z_W);
if (_XYZW_SS) SSE_SS[opType](mVU, Fs, Ft, -1, -1);
else SSE_PS[opType](mVU, Fs, Ft, -1, -1);
if (_XYZW_SS) SSE_SS[opType](mVU, Fs, Ft, xEmptyReg, xEmptyReg);
else SSE_PS[opType](mVU, Fs, Ft, xEmptyReg, xEmptyReg);
if (isACC) {
if (_XYZW_SS) SSE_MOVSS_XMM_to_XMM(ACC, Fs);
if (_XYZW_SS) xMOVSS(ACC, Fs);
else mVUmergeRegs(ACC, Fs, _X_Y_Z_W);
mVUupdateFlags(mVU, ACC, Fs, tempFt);
if (_XYZW_SS2) SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W));
if (_XYZW_SS2) xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W));
mVU->regAlloc->clearNeeded(ACC);
}
else mVUupdateFlags(mVU, Fs, tempFt);
@ -195,30 +202,30 @@ void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, co
void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, const char* opName, int clampType) {
pass1 { setupPass1(mVU, opCase, 1, 0); }
pass2 {
int Fs, Ft, ACC, tempFt;
xmm Fs, Ft, ACC, tempFt;
setupFtReg(mVU, Ft, tempFt, opCase);
Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
ACC = mVU->regAlloc->allocReg(32, 32, 0xf, 0);
if (_XYZW_SS2) { SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
if (_XYZW_SS2) { xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
if (clampType & cFt) mVUclamp2(mVU, Ft, -1, _X_Y_Z_W);
if (clampType & cFs) mVUclamp2(mVU, Fs, -1, _X_Y_Z_W);
if (clampType & cFt) mVUclamp2(mVU, Ft, xEmptyReg, _X_Y_Z_W);
if (clampType & cFs) mVUclamp2(mVU, Fs, xEmptyReg, _X_Y_Z_W);
if (_XYZW_SS) SSE_SS[2](mVU, Fs, Ft, -1, -1);
else SSE_PS[2](mVU, Fs, Ft, -1, -1);
if (_XYZW_SS) SSE_SS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg);
else SSE_PS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg);
if (_XYZW_SS || _X_Y_Z_W == 0xf) {
if (_XYZW_SS) SSE_SS[opType](mVU, ACC, Fs, tempFt, -1);
else SSE_PS[opType](mVU, ACC, Fs, tempFt, -1);
if (_XYZW_SS) SSE_SS[opType](mVU, ACC, Fs, tempFt, xEmptyReg);
else SSE_PS[opType](mVU, ACC, Fs, tempFt, xEmptyReg);
mVUupdateFlags(mVU, ACC, Fs, tempFt);
if (_XYZW_SS && _X_Y_Z_W != 8) SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W));
if (_XYZW_SS && _X_Y_Z_W != 8) xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W));
}
else {
int tempACC = mVU->regAlloc->allocReg();
SSE_MOVAPS_XMM_to_XMM(tempACC, ACC);
SSE_PS[opType](mVU, tempACC, Fs, tempFt, -1);
xmm tempACC = mVU->regAlloc->allocReg();
xMOVAPS(tempACC, ACC);
SSE_PS[opType](mVU, tempACC, Fs, tempFt, xEmptyReg);
mVUmergeRegs(ACC, tempACC, _X_Y_Z_W);
mVUupdateFlags(mVU, ACC, Fs, tempFt);
mVU->regAlloc->clearNeeded(tempACC);
@ -236,22 +243,22 @@ void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, const char* op
void mVU_FMACc(microVU* mVU, int recPass, int opCase, const char* opName, int clampType) {
pass1 { setupPass1(mVU, opCase, 0, 0); }
pass2 {
int Fs, Ft, ACC, tempFt;
xmm Fs, Ft, ACC, tempFt;
setupFtReg(mVU, Ft, tempFt, opCase);
ACC = mVU->regAlloc->allocReg(32);
Fs = mVU->regAlloc->allocReg(_Fs_, _Fd_, _X_Y_Z_W);
if (_XYZW_SS2) { SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
if (_XYZW_SS2) { xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
if (clampType & cFt) mVUclamp2(mVU, Ft, -1, _X_Y_Z_W);
if (clampType & cFs) mVUclamp2(mVU, Fs, -1, _X_Y_Z_W);
if (clampType & cACC) mVUclamp2(mVU, ACC, -1, _X_Y_Z_W);
if (clampType & cFt) mVUclamp2(mVU, Ft, xEmptyReg, _X_Y_Z_W);
if (clampType & cFs) mVUclamp2(mVU, Fs, xEmptyReg, _X_Y_Z_W);
if (clampType & cACC) mVUclamp2(mVU, ACC, xEmptyReg, _X_Y_Z_W);
if (_XYZW_SS) { SSE_SS[2](mVU, Fs, Ft, -1, -1); SSE_SS[0](mVU, Fs, ACC, tempFt, -1); }
else { SSE_PS[2](mVU, Fs, Ft, -1, -1); SSE_PS[0](mVU, Fs, ACC, tempFt, -1); }
if (_XYZW_SS) { SSE_SS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg); SSE_SS[0](mVU, Fs, ACC, tempFt, xEmptyReg); }
else { SSE_PS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg); SSE_PS[0](mVU, Fs, ACC, tempFt, xEmptyReg); }
if (_XYZW_SS2) { SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
if (_XYZW_SS2) { xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
mVUupdateFlags(mVU, Fs, tempFt);
@ -267,18 +274,18 @@ void mVU_FMACc(microVU* mVU, int recPass, int opCase, const char* opName, int cl
void mVU_FMACd(microVU* mVU, int recPass, int opCase, const char* opName, int clampType) {
pass1 { setupPass1(mVU, opCase, 0, 0); }
pass2 {
int Fs, Ft, Fd, tempFt;
xmm Fs, Ft, Fd, tempFt;
setupFtReg(mVU, Ft, tempFt, opCase);
Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
Fd = mVU->regAlloc->allocReg(32, _Fd_, _X_Y_Z_W);
if (clampType & cFt) mVUclamp2(mVU, Ft, -1, _X_Y_Z_W);
if (clampType & cFs) mVUclamp2(mVU, Fs, -1, _X_Y_Z_W);
if (clampType & cACC) mVUclamp2(mVU, Fd, -1, _X_Y_Z_W);
if (clampType & cFt) mVUclamp2(mVU, Ft, xEmptyReg, _X_Y_Z_W);
if (clampType & cFs) mVUclamp2(mVU, Fs, xEmptyReg, _X_Y_Z_W);
if (clampType & cACC) mVUclamp2(mVU, Fd, xEmptyReg, _X_Y_Z_W);
if (_XYZW_SS) { SSE_SS[2](mVU, Fs, Ft, -1, -1); SSE_SS[1](mVU, Fd, Fs, tempFt, -1); }
else { SSE_PS[2](mVU, Fs, Ft, -1, -1); SSE_PS[1](mVU, Fd, Fs, tempFt, -1); }
if (_XYZW_SS) { SSE_SS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg); SSE_SS[1](mVU, Fd, Fs, tempFt, xEmptyReg); }
else { SSE_PS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg); SSE_PS[1](mVU, Fd, Fs, tempFt, xEmptyReg); }
mVUupdateFlags(mVU, Fd, Fs, tempFt);
@ -295,8 +302,8 @@ mVUop(mVU_ABS) {
pass1 { mVUanalyzeFMAC2(mVU, _Fs_, _Ft_); }
pass2 {
if (!_Ft_) return;
int Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
SSE_ANDPS_M128_to_XMM(Fs, (uptr)mVUglob.absclip);
xmm Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
xAND.PS(Fs, ptr128[mVUglob.absclip]);
mVU->regAlloc->clearNeeded(Fs);
}
pass3 { mVUlog("ABS"); mVUlogFtFs(); }
@ -306,11 +313,11 @@ mVUop(mVU_ABS) {
mVUop(mVU_OPMULA) {
pass1 { mVUanalyzeFMAC1(mVU, 0, _Fs_, _Ft_); }
pass2 {
int Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W);
int Fs = mVU->regAlloc->allocReg(_Fs_, 32, _X_Y_Z_W);
xmm Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W);
xmm Fs = mVU->regAlloc->allocReg(_Fs_, 32, _X_Y_Z_W);
SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY
SSE2_PSHUFD_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ
xPSHUF.D(Fs, Fs, 0xC9); // WXZY
xPSHUF.D(Ft, Ft, 0xD2); // WYXZ
SSE_MULPS(mVU, Fs, Ft);
mVU->regAlloc->clearNeeded(Ft);
mVUupdateFlags(mVU, Fs);
@ -324,12 +331,12 @@ mVUop(mVU_OPMULA) {
mVUop(mVU_OPMSUB) {
pass1 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, _Ft_); }
pass2 {
int Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf);
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
int ACC = mVU->regAlloc->allocReg(32, _Fd_, _X_Y_Z_W);
xmm Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf);
xmm Fs = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
xmm ACC = mVU->regAlloc->allocReg(32, _Fd_, _X_Y_Z_W);
SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY
SSE2_PSHUFD_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ
xPSHUF.D(Fs, Fs, 0xC9); // WXZY
xPSHUF.D(Ft, Ft, 0xD2); // WYXZ
SSE_MULPS(mVU, Fs, Ft);
SSE_SUBPS(mVU, ACC, Fs);
mVU->regAlloc->clearNeeded(Fs);
@ -343,24 +350,24 @@ mVUop(mVU_OPMSUB) {
}
// FTOI0/FTOI4/FTOI12/FTOI15 Opcodes
static void mVU_FTOIx(mP, uptr addr, const char* opName) {
static void mVU_FTOIx(mP, const float (*addr)[4], const char* opName) {
pass1 { mVUanalyzeFMAC2(mVU, _Fs_, _Ft_); }
pass2 {
if (!_Ft_) return;
int Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
int t1 = mVU->regAlloc->allocReg();
int t2 = mVU->regAlloc->allocReg();
xmm Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
xmm t1 = mVU->regAlloc->allocReg();
xmm t2 = mVU->regAlloc->allocReg();
// Note: For help understanding this algorithm see recVUMI_FTOI_Saturate()
SSE_MOVAPS_XMM_to_XMM(t1, Fs);
if (addr) { SSE_MULPS_M128_to_XMM(Fs, addr); }
SSE2_CVTTPS2DQ_XMM_to_XMM(Fs, Fs);
SSE2_PXOR_M128_to_XMM(t1, (uptr)mVUglob.signbit);
SSE2_PSRAD_I8_to_XMM (t1, 31);
SSE_MOVAPS_XMM_to_XMM(t2, Fs);
SSE2_PCMPEQD_M128_to_XMM(t2, (uptr)mVUglob.signbit);
SSE_ANDPS_XMM_to_XMM (t1, t2);
SSE2_PADDD_XMM_to_XMM(Fs, t1);
xMOVAPS(t1, Fs);
if (addr) { xMUL.PS(Fs, ptr128[addr]); }
xCVTTPS2DQ(Fs, Fs);
xPXOR(t1, ptr128[mVUglob.signbit]);
xPSRA.D(t1, 31);
xMOVAPS(t2, Fs);
xPCMP.EQD(t2, ptr128[mVUglob.signbit]);
xAND.PS(t1, t2);
xPADD.D(Fs, t1);
mVU->regAlloc->clearNeeded(Fs);
mVU->regAlloc->clearNeeded(t1);
@ -370,14 +377,14 @@ static void mVU_FTOIx(mP, uptr addr, const char* opName) {
}
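// Hedged scalar sketch of the saturation above (not commit code; the helper and
// its scale argument are made up -- FTOI0 skips the multiply entirely):
// CVTTPS2DQ reports any out-of-range input as 0x80000000, so only positive
// overflows need patching, which the signbit/PCMPEQD/PADDD sequence does by
// adding 0xffffffff to turn 0x80000000 into 0x7fffffff.
#include <cstdint>
#include <cstring>

static int32_t vuFtoiScalar(float f, float scale)
{
    float scaled = f * scale;
    uint32_t result = (scaled >= 2147483648.0f || scaled < -2147483648.0f || scaled != scaled)
                    ? 0x80000000u : (uint32_t)(int32_t)scaled;   // CVTTPS2DQ behavior
    uint32_t raw;
    std::memcpy(&raw, &f, sizeof(raw));                          // sign of the unscaled input, as above
    if (result == 0x80000000u && !(raw & 0x80000000u))
        result += 0xffffffffu;                                   // positive overflow -> 0x7fffffff
    return (int32_t)result;
}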
// ITOF0/ITOF4/ITOF12/ITOF15 Opcodes
static void mVU_ITOFx(mP, uptr addr, const char* opName) {
static void mVU_ITOFx(mP, const float (*addr)[4], const char* opName) {
pass1 { mVUanalyzeFMAC2(mVU, _Fs_, _Ft_); }
pass2 {
if (!_Ft_) return;
int Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
xmm Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
SSE2_CVTDQ2PS_XMM_to_XMM(Fs, Fs);
if (addr) { SSE_MULPS_M128_to_XMM(Fs, addr); }
xCVTDQ2PS(Fs, Fs);
if (addr) { xMUL.PS(Fs, ptr128[addr]); }
//mVUclamp2(Fs, xmmT1, 15); // Clamp (not sure if this is needed)
mVU->regAlloc->clearNeeded(Fs);
@ -389,34 +396,34 @@ static void mVU_ITOFx(mP, uptr addr, const char* opName) {
mVUop(mVU_CLIP) {
pass1 { mVUanalyzeFMAC4(mVU, _Fs_, _Ft_); }
pass2 {
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
int Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0x1);
int t1 = mVU->regAlloc->allocReg();
xmm Fs = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
xmm Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0x1);
xmm t1 = mVU->regAlloc->allocReg();
mVUunpack_xyzw(Ft, Ft, 0);
mVUallocCFLAGa(mVU, gprT1, cFLAG.lastWrite);
SHL32ItoR(gprT1, 6);
xSHL(gprT1, 6);
SSE_ANDPS_M128_to_XMM(Ft, (uptr)mVUglob.absclip);
SSE_MOVAPS_XMM_to_XMM(t1, Ft);
SSE_ORPS_M128_to_XMM(t1, (uptr)mVUglob.signbit);
xAND.PS(Ft, ptr128[&mVUglob.absclip[0]]);
xMOVAPS(t1, Ft);
xPOR(t1, ptr128[&mVUglob.signbit[0]]);
SSE_CMPNLEPS_XMM_to_XMM(t1, Fs); // -w, -z, -y, -x
SSE_CMPLTPS_XMM_to_XMM(Ft, Fs); // +w, +z, +y, +x
xCMPNLE.PS(t1, Fs); // -w, -z, -y, -x
xCMPLT.PS(Ft, Fs); // +w, +z, +y, +x
SSE_MOVAPS_XMM_to_XMM(Fs, Ft); // Fs = +w, +z, +y, +x
SSE_UNPCKLPS_XMM_to_XMM(Ft, t1); // Ft = -y,+y,-x,+x
SSE_UNPCKHPS_XMM_to_XMM(Fs, t1); // Fs = -w,+w,-z,+z
xMOVAPS(Fs, Ft); // Fs = +w, +z, +y, +x
xUNPCK.LPS(Ft, t1); // Ft = -y,+y,-x,+x
xUNPCK.HPS(Fs, t1); // Fs = -w,+w,-z,+z
SSE_MOVMSKPS_XMM_to_R32(gprT2, Fs); // -w,+w,-z,+z
AND32ItoR(gprT2, 0x3);
SHL32ItoR(gprT2, 4);
OR32RtoR (gprT1, gprT2);
xMOVMSKPS(gprT2, Fs); // -w,+w,-z,+z
xAND(gprT2, 0x3);
xSHL(gprT2, 4);
xOR(gprT1, gprT2);
SSE_MOVMSKPS_XMM_to_R32(gprT2, Ft); // -y,+y,-x,+x
AND32ItoR(gprT2, 0xf);
OR32RtoR (gprT1, gprT2);
AND32ItoR(gprT1, 0xffffff);
xMOVMSKPS(gprT2, Ft); // -y,+y,-x,+x
xAND(gprT2, 0xf);
xOR(gprT1, gprT2);
xAND(gprT1, 0xffffff);
mVUallocCFLAGb(mVU, gprT1, cFLAG.write);
mVU->regAlloc->clearNeeded(Fs);
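// Hedged scalar sketch of the judgement bits computed above (not commit code):
// x/y/z of Fs are compared against +|Ft.w| and -|Ft.w|, the six results land in
// bits 0-5 (+x,-x,+y,-y,+z,-z), and the previous clip flag is shifted left by 6
// so the register keeps the last four judgements (24 bits).
#include <cmath>
#include <cstdint>

static uint32_t vuClipScalar(uint32_t prevClip, const float fs[3] /* x,y,z */, float ftw)
{
    float w = std::fabs(ftw);
    uint32_t bits = 0;
    for (int i = 0; i < 3; i++) {
        if (fs[i] > +w) bits |= 1u << (i * 2);       // +x / +y / +z
        if (fs[i] < -w) bits |= 1u << (i * 2 + 1);   // -x / -y / -z
    }
    return ((prevClip << 6) | bits) & 0xffffff;
}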
@ -512,12 +519,12 @@ mVUop(mVU_MINIx) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIx", 0); }
mVUop(mVU_MINIy) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIy", 0); }
mVUop(mVU_MINIz) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIz", 0); }
mVUop(mVU_MINIw) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIw", 0); }
mVUop(mVU_FTOI0) { mVU_FTOIx(mX, (uptr)0, "FTOI0"); }
mVUop(mVU_FTOI4) { mVU_FTOIx(mX, (uptr)mVUglob.FTOI_4, "FTOI4"); }
mVUop(mVU_FTOI12) { mVU_FTOIx(mX, (uptr)mVUglob.FTOI_12, "FTOI12"); }
mVUop(mVU_FTOI15) { mVU_FTOIx(mX, (uptr)mVUglob.FTOI_15, "FTOI15"); }
mVUop(mVU_ITOF0) { mVU_ITOFx(mX, (uptr)0, "ITOF0"); }
mVUop(mVU_ITOF4) { mVU_ITOFx(mX, (uptr)mVUglob.ITOF_4, "ITOF4"); }
mVUop(mVU_ITOF12) { mVU_ITOFx(mX, (uptr)mVUglob.ITOF_12, "ITOF12"); }
mVUop(mVU_ITOF15) { mVU_ITOFx(mX, (uptr)mVUglob.ITOF_15, "ITOF15"); }
mVUop(mVU_FTOI0) { mVU_FTOIx(mX, NULL, "FTOI0"); }
mVUop(mVU_FTOI4) { mVU_FTOIx(mX, &mVUglob.FTOI_4, "FTOI4"); }
mVUop(mVU_FTOI12) { mVU_FTOIx(mX, &mVUglob.FTOI_12, "FTOI12"); }
mVUop(mVU_FTOI15) { mVU_FTOIx(mX, &mVUglob.FTOI_15, "FTOI15"); }
mVUop(mVU_ITOF0) { mVU_ITOFx(mX, NULL, "ITOF0"); }
mVUop(mVU_ITOF4) { mVU_ITOFx(mX, &mVUglob.ITOF_4, "ITOF4"); }
mVUop(mVU_ITOF12) { mVU_ITOFx(mX, &mVUglob.ITOF_12, "ITOF12"); }
mVUop(mVU_ITOF15) { mVU_ITOFx(mX, &mVUglob.ITOF_15, "ITOF15"); }
mVUop(mVU_NOP) { pass3 { mVUlog("NOP"); } }

View File

@ -33,7 +33,7 @@ typedef void (__fastcall *nVifrecCall)(uptr dest, uptr src);
#include "newVif_BlockBuffer.h"
#include "newVif_HashBucket.h"
extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0);
extern void mVUmergeRegs(xRegisterSSE dest, xRegisterSSE src, int xyzw, bool modXYZW = 0);
extern void _nVifUnpack (int idx, u8 *data, u32 size, bool isFill);
extern void dVifUnpack (int idx, u8 *data, u32 size, bool isFill);
extern void dVifReset (int idx);

View File

@ -84,7 +84,7 @@ _f void VifUnpackSSE_Dynarec::SetMasks(int cS) const {
void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
pxAssumeDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking.");
int t = regX.Id ? 0 : 1; // Get Temp Reg
xRegisterSSE t = regX == xmm0 ? xmm1 : xmm0; // Get Temp Reg
int cc = aMin(vCL, 3);
u32 m0 = (vB.mask >> (cc * 8)) & 0xff;
u32 m1 = m0 & 0xaa;
@ -95,18 +95,18 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
makeMergeMask(m3);
makeMergeMask(m4);
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect
if (doMask&&m2) { mergeVectors(regX.Id, xmmRow.Id, t, m2); } // Merge Row
if (doMask&&m3) { mergeVectors(regX.Id, xmmCol0.Id+cc, t, m3); } // Merge Col
if (doMask&&m4) { mergeVectors(regX.Id, xmmTemp.Id, t, m4); } // Merge Write Protect
if (doMask&&m2) { mergeVectors(regX, xmmRow, t, m2); } // Merge Row
if (doMask&&m3) { mergeVectors(regX, xRegisterSSE(xmmCol0.Id+cc), t, m3); } // Merge Col
if (doMask&&m4) { mergeVectors(regX, xmmTemp, t, m4); } // Merge Write Protect
if (doMode) {
u32 m5 = (~m1>>1) & ~m0;
if (!doMask) m5 = 0xf;
else makeMergeMask(m5);
if (m5 < 0xf) {
xPXOR(xmmTemp, xmmTemp);
mergeVectors(xmmTemp.Id, xmmRow.Id, t, m5);
mergeVectors(xmmTemp, xmmRow, t, m5);
xPADD.D(regX, xmmTemp);
if (doMode==2) mergeVectors(xmmRow.Id, regX.Id, t, m5);
if (doMode==2) mergeVectors(xmmRow, regX, t, m5);
}
else if (m5 == 0xf) {
xPADD.D(regX, xmmRow);

View File

@ -25,13 +25,13 @@
static __pagealigned u8 nVifUpkExec[__pagesize*4];
// Merges xmm vectors without modifying source reg
void mergeVectors(int dest, int src, int temp, int xyzw) {
void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xyzw) {
if (x86caps.hasStreamingSIMD4Extensions || (xyzw==15)
|| (xyzw==12) || (xyzw==11) || (xyzw==8) || (xyzw==3)) {
mVUmergeRegs(dest, src, xyzw);
}
else {
SSE_MOVAPS_XMM_to_XMM(temp, src);
xMOVAPS(temp, src);
mVUmergeRegs(dest, temp, xyzw);
}
}
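// For reference, a hedged scalar model of the merge semantics assumed here (not
// commit code): xyzw is a 4-bit field mask (x = 8, y = 4, z = 2, w = 1); set
// bits pick which fields of 'dest' are replaced by the matching fields of 'src'.
static void mergeVectorsScalar(float dest[4], const float src[4], int xyzw)
{
    for (int i = 0; i < 4; i++)
        if (xyzw & (8 >> i))    // bit 3 selects x (element 0) ... bit 0 selects w (element 3)
            dest[i] = src[i];
}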
@ -48,9 +48,9 @@ void loadRowCol(nVifStruct& v) {
xPSHUF.D(xmm1, xmm1, _v0);
xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm6, xmm6, _v0);
mVUmergeRegs(XMM6, XMM0, 8);
mVUmergeRegs(XMM6, XMM1, 4);
mVUmergeRegs(XMM6, XMM2, 2);
mVUmergeRegs(xmm6, xmm0, 8);
mVUmergeRegs(xmm6, xmm1, 4);
mVUmergeRegs(xmm6, xmm2, 2);
xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]);
xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]);
xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]);
@ -221,13 +221,13 @@ void VifUnpackSSE_Base::xUPK_V4_5() const {
xMOVAPS (destReg, workReg); // x|x|x|R
xPSRL.D (workReg, 8); // ABG
xPSLL.D (workReg, 3); // AB|G5.000
mVUmergeRegs(destReg.Id, workReg.Id, 0x4); // x|x|G|R
mVUmergeRegs(destReg, workReg, 0x4);// x|x|G|R
xPSRL.D (workReg, 8); // AB
xPSLL.D (workReg, 3); // A|B5.000
mVUmergeRegs(destReg.Id, workReg.Id, 0x2); // x|B|G|R
mVUmergeRegs(destReg, workReg, 0x2);// x|B|G|R
xPSRL.D (workReg, 8); // A
xPSLL.D (workReg, 7); // A.0000000
mVUmergeRegs(destReg.Id, workReg.Id, 0x1); // A|B|G|R
mVUmergeRegs(destReg, workReg, 0x1);// A|B|G|R
xPSLL.D (destReg, 24); // can optimize to
xPSRL.D (destReg, 24); // single AND...
}
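// Hedged scalar sketch of the V4-5 expansion above (not commit code): each
// 16-bit RGB5A1 source value becomes four dwords, the 5-bit colour fields
// shifted up to bits 3-7 and the 1-bit alpha to bit 7; the trailing
// PSLL.D/PSRL.D by 24 then masks every dword down to its low 8 bits.
#include <cstdint>

static void unpackV4_5Scalar(uint32_t rgb5a1, uint32_t out[4])
{
    out[0] = (rgb5a1 & 0x1f) << 3;          // R
    out[1] = ((rgb5a1 >> 5)  & 0x1f) << 3;  // G
    out[2] = ((rgb5a1 >> 10) & 0x1f) << 3;  // B
    out[3] = ((rgb5a1 >> 15) & 0x01) << 7;  // A
}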

View File

@ -24,7 +24,7 @@
using namespace x86Emitter;
extern void mergeVectors(int dest, int src, int temp, int xyzw);
extern void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xyzw);
extern void loadRowCol(nVifStruct& v);
// --------------------------------------------------------------------------------------