mirror of https://github.com/PCSX2/pcsx2.git
microVU: converted all code to the new emitter style. If anything breaks, blame the guy below me.
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3406 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
a4afe629e5
commit
b53a92e019
|
@ -363,6 +363,8 @@ template< typename T > void xWrite( T val );
|
|||
bool operator==( const xRegisterSSE& src ) const { return this->Id == src.Id; }
|
||||
bool operator!=( const xRegisterSSE& src ) const { return this->Id != src.Id; }
|
||||
|
||||
void operator=( xRegisterSSE src ) { Id = src.Id; }
|
||||
|
||||
xRegisterSSE& operator++()
|
||||
{
|
||||
++Id &= (iREGCNT_XMM-1);
|
||||
|
|
|
@ -289,6 +289,8 @@ void EmitSibMagic( uint regfield, const xIndirectVoid& info )
|
|||
int displacement_size = (info.Displacement == 0) ? 0 :
|
||||
( ( info.IsByteSizeDisp() ) ? 1 : 2 );
|
||||
|
||||
assert(!info.Base.IsEmpty() || !info.Index.IsEmpty() || displacement_size == 2);
|
||||
|
||||
if( !NeedsSibMagic( info ) )
|
||||
{
|
||||
// Use ModRm-only encoding, with the rm field holding an index/base register, if
|
||||
|
|
|
@ -29,8 +29,8 @@ using namespace x86Emitter;
|
|||
#include "R5900OpcodeTables.h"
|
||||
#include "x86emitter/x86emitter.h"
|
||||
#include "SamplProf.h"
|
||||
#include "microVU_IR.h"
|
||||
#include "microVU_Misc.h"
|
||||
#include "microVU_IR.h"
|
||||
|
||||
struct microBlockLink {
|
||||
microBlock* block;
|
||||
|
|
|
@ -23,148 +23,164 @@
|
|||
// Flag Allocators
|
||||
//------------------------------------------------------------------
|
||||
|
||||
#define getFlagReg(regX, fInst) { \
|
||||
switch (fInst) { \
|
||||
case 0: regX = gprF0; break; \
|
||||
case 1: regX = gprF1; break; \
|
||||
case 2: regX = gprF2; break; \
|
||||
case 3: regX = gprF3; break; \
|
||||
default: \
|
||||
Console.Error("microVU Error: fInst = %d", fInst); \
|
||||
regX = gprF0; \
|
||||
break; \
|
||||
} \
|
||||
_f static x32 getFlagReg(int fInst)
|
||||
{
|
||||
if (fInst >= 0 && fInst < 4)
|
||||
{
|
||||
return gprF[fInst];
|
||||
}
|
||||
else
|
||||
{
|
||||
Console.Error("microVU Error: fInst = %d", fInst);
|
||||
return gprF[0];
|
||||
}
|
||||
}
|
||||
|
||||
#define setBitSFLAG(bitTest, bitSet) { \
|
||||
TEST32ItoR(regT, bitTest); \
|
||||
pjmp = JZ8(0); \
|
||||
OR32ItoR(reg, bitSet); \
|
||||
x86SetJ8(pjmp); \
|
||||
_f void setBitSFLAG(x32 reg, x32 regT, int bitTest, int bitSet)
|
||||
{
|
||||
xTEST(regT, bitTest);
|
||||
xForwardJZ8 skip;
|
||||
xOR(reg, bitSet);
|
||||
skip.SetTarget();
|
||||
}
|
||||
|
||||
#define setBitFSEQ(bitX) { \
|
||||
TEST32ItoR(gprT1, bitX); \
|
||||
pjmp = JZ8(0); \
|
||||
OR32ItoR(gprT1, bitX); \
|
||||
x86SetJ8(pjmp); \
|
||||
_f void setBitFSEQ(x32 reg, int bitX)
|
||||
{
|
||||
xTEST(reg, bitX);
|
||||
xForwardJump8 skip(Jcc_Zero);
|
||||
xOR(reg, bitX);
|
||||
skip.SetTarget();
|
||||
}
|
||||
|
||||
_f void mVUallocSFLAGa(int reg, int fInstance) {
|
||||
getFlagReg(fInstance, fInstance);
|
||||
MOV32RtoR(reg, fInstance);
|
||||
_f void mVUallocSFLAGa(x32 reg, int fInstance)
|
||||
{
|
||||
xMOV(reg, getFlagReg(fInstance));
|
||||
}
|
||||
|
||||
_f void mVUallocSFLAGb(int reg, int fInstance) {
|
||||
getFlagReg(fInstance, fInstance);
|
||||
MOV32RtoR(fInstance, reg);
|
||||
_f void mVUallocSFLAGb(x32 reg, int fInstance)
|
||||
{
|
||||
xMOV(getFlagReg(fInstance), reg);
|
||||
}
|
||||
|
||||
// Normalize Status Flag
|
||||
_f void mVUallocSFLAGc(int reg, int regT, int fInstance) {
|
||||
u8 *pjmp;
|
||||
XOR32RtoR(reg, reg);
|
||||
_f void mVUallocSFLAGc(x32 reg, x32 regT, int fInstance)
|
||||
{
|
||||
xXOR(reg, reg);
|
||||
mVUallocSFLAGa(regT, fInstance);
|
||||
setBitSFLAG(0x0f00, 0x0001); // Z Bit
|
||||
setBitSFLAG(0xf000, 0x0002); // S Bit
|
||||
setBitSFLAG(0x000f, 0x0040); // ZS Bit
|
||||
setBitSFLAG(0x00f0, 0x0080); // SS Bit
|
||||
AND32ItoR(regT, 0xffff0000); // DS/DI/OS/US/D/I/O/U Bits
|
||||
SHR32ItoR(regT, 14);
|
||||
OR32RtoR(reg, regT);
|
||||
setBitSFLAG(reg, regT, 0x0f00, 0x0001); // Z Bit
|
||||
setBitSFLAG(reg, regT, 0xf000, 0x0002); // S Bit
|
||||
setBitSFLAG(reg, regT, 0x000f, 0x0040); // ZS Bit
|
||||
setBitSFLAG(reg, regT, 0x00f0, 0x0080); // SS Bit
|
||||
xAND(regT, 0xffff0000); // DS/DI/OS/US/D/I/O/U Bits
|
||||
xSHR(regT, 14);
|
||||
xOR(reg, regT);
|
||||
}
|
||||
|
||||
// Denormalizes Status Flag
|
||||
_f void mVUallocSFLAGd(uptr memAddr, bool setAllflags) {
|
||||
_f void mVUallocSFLAGd(u32* memAddr, bool setAllflags) {
|
||||
|
||||
// Cannot use EBP (gprF1) here; as this function is used by mVU0 macro and
|
||||
// Cannot use EBP (gprF[1]) here; as this function is used by mVU0 macro and
|
||||
// the EErec needs EBP preserved.
|
||||
|
||||
MOV32MtoR(gprF0, memAddr);
|
||||
MOV32RtoR(gprF3, gprF0);
|
||||
SHR32ItoR(gprF3, 3);
|
||||
AND32ItoR(gprF3, 0x18);
|
||||
xMOV(gprF[0], ptr32[memAddr]);
|
||||
xMOV(gprF[3], gprF[0]);
|
||||
xSHR(gprF[3], 3);
|
||||
xAND(gprF[3], 0x18);
|
||||
|
||||
MOV32RtoR(gprF2, gprF0);
|
||||
SHL32ItoR(gprF2, 11);
|
||||
AND32ItoR(gprF2, 0x1800);
|
||||
OR32RtoR (gprF3, gprF2);
|
||||
xMOV(gprF[2], gprF[0]);
|
||||
xSHL(gprF[2], 11);
|
||||
xAND(gprF[2], 0x1800);
|
||||
xOR (gprF[3], gprF[2]);
|
||||
|
||||
SHL32ItoR(gprF0, 14);
|
||||
AND32ItoR(gprF0, 0x3cf0000);
|
||||
OR32RtoR (gprF3, gprF0);
|
||||
xSHL(gprF[0], 14);
|
||||
xAND(gprF[0], 0x3cf0000);
|
||||
xOR (gprF[3], gprF[0]);
|
||||
|
||||
if (setAllflags) {
|
||||
|
||||
// this code should be run in mVU micro mode only, so writing to
|
||||
// EBP (gprF1) is ok (and needed for vuMicro optimizations).
|
||||
// EBP (gprF[1]) is ok (and needed for vuMicro optimizations).
|
||||
|
||||
MOV32RtoR(gprF0, gprF3);
|
||||
MOV32RtoR(gprF1, gprF3);
|
||||
MOV32RtoR(gprF2, gprF3);
|
||||
xMOV(gprF[0], gprF[3]);
|
||||
xMOV(gprF[1], gprF[3]);
|
||||
xMOV(gprF[2], gprF[3]);
|
||||
}
|
||||
}
|
||||
|
||||
_f void mVUallocMFLAGa(mV, int reg, int fInstance) {
|
||||
MOVZX32M16toR(reg, (uptr)&mVU->macFlag[fInstance]);
|
||||
_f void mVUallocMFLAGa(mV, x32 reg, int fInstance)
|
||||
{
|
||||
xMOVZX(reg, ptr16[&mVU->macFlag[fInstance]]);
|
||||
}
|
||||
|
||||
_f void mVUallocMFLAGb(mV, int reg, int fInstance) {
|
||||
//AND32ItoR(reg, 0xffff);
|
||||
if (fInstance < 4) MOV32RtoM((uptr)&mVU->macFlag[fInstance], reg); // microVU
|
||||
else MOV32RtoM((uptr)&mVU->regs->VI[REG_MAC_FLAG].UL, reg); // macroVU
|
||||
_f void mVUallocMFLAGb(mV, x32 reg, int fInstance)
|
||||
{
|
||||
//xAND(reg, 0xffff);
|
||||
if (fInstance < 4) xMOV(ptr32[&mVU->macFlag[fInstance]], reg); // microVU
|
||||
else xMOV(ptr32[&mVU->regs->VI[REG_MAC_FLAG].UL], reg); // macroVU
|
||||
}
|
||||
|
||||
_f void mVUallocCFLAGa(mV, int reg, int fInstance) {
|
||||
if (fInstance < 4) MOV32MtoR(reg, (uptr)&mVU->clipFlag[fInstance]); // microVU
|
||||
else MOV32MtoR(reg, (uptr)&mVU->regs->VI[REG_CLIP_FLAG].UL); // macroVU
|
||||
_f void mVUallocCFLAGa(mV, x32 reg, int fInstance)
|
||||
{
|
||||
if (fInstance < 4) xMOV(reg, ptr32[&mVU->clipFlag[fInstance]]); // microVU
|
||||
else xMOV(reg, ptr32[&mVU->regs->VI[REG_CLIP_FLAG].UL]); // macroVU
|
||||
}
|
||||
|
||||
_f void mVUallocCFLAGb(mV, int reg, int fInstance) {
|
||||
if (fInstance < 4) MOV32RtoM((uptr)&mVU->clipFlag[fInstance], reg); // microVU
|
||||
else MOV32RtoM((uptr)&mVU->regs->VI[REG_CLIP_FLAG].UL, reg); // macroVU
|
||||
_f void mVUallocCFLAGb(mV, x32 reg, int fInstance)
|
||||
{
|
||||
if (fInstance < 4) xMOV(ptr32[&mVU->clipFlag[fInstance]], reg); // microVU
|
||||
else xMOV(ptr32[&mVU->regs->VI[REG_CLIP_FLAG].UL], reg); // macroVU
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------
|
||||
// VI Reg Allocators
|
||||
//------------------------------------------------------------------
|
||||
|
||||
_f void mVUallocVIa(mV, int GPRreg, int _reg_) {
|
||||
if (!_reg_) { XOR32RtoR(GPRreg, GPRreg); }
|
||||
else { MOVZX32M16toR(GPRreg, (uptr)&mVU->regs->VI[_reg_].UL); }
|
||||
_f void mVUallocVIa(mV, x32 GPRreg, int _reg_, bool signext = false)
|
||||
{
|
||||
if (!_reg_)
|
||||
xXOR(GPRreg, GPRreg);
|
||||
else
|
||||
if (signext)
|
||||
xMOVSX(GPRreg, ptr16[&mVU->regs->VI[_reg_].SL]);
|
||||
else
|
||||
xMOVZX(GPRreg, ptr16[&mVU->regs->VI[_reg_].UL]);
|
||||
}
|
||||
|
||||
_f void mVUallocVIb(mV, int GPRreg, int _reg_) {
|
||||
_f void mVUallocVIb(mV, x32 GPRreg, int _reg_)
|
||||
{
|
||||
if (mVUlow.backupVI) { // Backs up reg to memory (used when VI is modified b4 a branch)
|
||||
MOVZX32M16toR(gprT3, (uptr)&mVU->regs->VI[_reg_].UL);
|
||||
MOV32RtoM((uptr)&mVU->VIbackup, gprT3);
|
||||
xMOVZX(edx, ptr16[&mVU->regs->VI[_reg_].UL]);
|
||||
xMOV(ptr32[&mVU->VIbackup], edx);
|
||||
}
|
||||
if (_reg_ == 0) { return; }
|
||||
else if (_reg_ < 16) { MOV16RtoM((uptr)&mVU->regs->VI[_reg_].UL, GPRreg); }
|
||||
else if (_reg_ < 16) { xMOV(ptr16[&mVU->regs->VI[_reg_].UL], xRegister16(GPRreg.Id)); }
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------
|
||||
// P/Q Reg Allocators
|
||||
//------------------------------------------------------------------
|
||||
|
||||
_f void getPreg(mV, int reg) {
|
||||
_f void getPreg(mV, xmm reg)
|
||||
{
|
||||
mVUunpack_xyzw(reg, xmmPQ, (2 + mVUinfo.readP));
|
||||
/*if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2(reg, xmmT1, 15);*/
|
||||
}
|
||||
|
||||
_f void getQreg(int reg, int qInstance) {
|
||||
_f void getQreg(xmm reg, int qInstance)
|
||||
{
|
||||
mVUunpack_xyzw(reg, xmmPQ, qInstance);
|
||||
/*if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2<vuIndex>(reg, xmmT1, 15);*/
|
||||
}
|
||||
|
||||
_f void writeQreg(int reg, int qInstance) {
|
||||
_f void writeQreg(xmm reg, int qInstance)
|
||||
{
|
||||
if (qInstance) {
|
||||
if (!x86caps.hasStreamingSIMD4Extensions) {
|
||||
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1);
|
||||
SSE_MOVSS_XMM_to_XMM(xmmPQ, reg);
|
||||
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe1);
|
||||
xPSHUF.D(xmmPQ, xmmPQ, 0xe1);
|
||||
xMOVSS(xmmPQ, reg);
|
||||
xPSHUF.D(xmmPQ, xmmPQ, 0xe1);
|
||||
}
|
||||
else SSE4_INSERTPS_XMM_to_XMM(xmmPQ, reg, _MM_MK_INSERTPS_NDX(0, 1, 0));
|
||||
else xINSERTPS(xmmPQ, reg, _MM_MK_INSERTPS_NDX(0, 1, 0));
|
||||
}
|
||||
else SSE_MOVSS_XMM_to_XMM(xmmPQ, reg);
|
||||
else xMOVSS(xmmPQ, reg);
|
||||
}
|
||||
|
|
|
@ -55,34 +55,33 @@ _f void mVUendProgram(mV, microFlagCycles* mFC, int isEbit) {
|
|||
}
|
||||
|
||||
// Save P/Q Regs
|
||||
if (qInst) { SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe5); }
|
||||
SSE_MOVSS_XMM_to_M32((uptr)&mVU->regs->VI[REG_Q].UL, xmmPQ);
|
||||
if (qInst) { xPSHUF.D(xmmPQ, xmmPQ, 0xe5); }
|
||||
xMOVSS(ptr32[&mVU->regs->VI[REG_Q].UL], xmmPQ);
|
||||
if (isVU1) {
|
||||
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, pInst ? 3 : 2);
|
||||
SSE_MOVSS_XMM_to_M32((uptr)&mVU->regs->VI[REG_P].UL, xmmPQ);
|
||||
xPSHUF.D(xmmPQ, xmmPQ, pInst ? 3 : 2);
|
||||
xMOVSS(ptr32[&mVU->regs->VI[REG_P].UL], xmmPQ);
|
||||
}
|
||||
|
||||
// Save Flag Instances
|
||||
#if 1 // CHECK_MACROVU0 - Always on now
|
||||
getFlagReg(fStatus, fStatus);
|
||||
MOV32RtoM((uptr)&mVU->regs->VI[REG_STATUS_FLAG].UL, fStatus);
|
||||
xMOV(ptr32[&mVU->regs->VI[REG_STATUS_FLAG].UL], getFlagReg(fStatus));
|
||||
#else
|
||||
mVUallocSFLAGc(gprT1, gprT2, fStatus);
|
||||
MOV32RtoM((uptr)&mVU->regs->VI[REG_STATUS_FLAG].UL, gprT1);
|
||||
mVUallocSFLAGc(gprT1, fStatus);
|
||||
xMOV(ptr32[&mVU->regs->VI[REG_STATUS_FLAG].UL], gprT1);
|
||||
#endif
|
||||
mVUallocMFLAGa(mVU, gprT1, fMac);
|
||||
mVUallocCFLAGa(mVU, gprT2, fClip);
|
||||
MOV32RtoM((uptr)&mVU->regs->VI[REG_MAC_FLAG].UL, gprT1);
|
||||
MOV32RtoM((uptr)&mVU->regs->VI[REG_CLIP_FLAG].UL, gprT2);
|
||||
xMOV(ptr32[&mVU->regs->VI[REG_MAC_FLAG].UL], gprT1);
|
||||
xMOV(ptr32[&mVU->regs->VI[REG_CLIP_FLAG].UL], gprT2);
|
||||
|
||||
if (isEbit || isVU1) { // Clear 'is busy' Flags
|
||||
AND32ItoM((uptr)&VU0.VI[REG_VPU_STAT].UL, (isVU1 ? ~0x100 : ~0x001)); // VBS0/VBS1 flag
|
||||
AND32ItoM((uptr)&mVU->regs->vifRegs->stat, ~VIF1_STAT_VEW); // Clear VU 'is busy' signal for vif
|
||||
xAND(ptr32[&VU0.VI[REG_VPU_STAT].UL], (isVU1 ? ~0x100 : ~0x001)); // VBS0/VBS1 flag
|
||||
xAND(ptr32[&mVU->regs->vifRegs->stat], ~VIF1_STAT_VEW); // Clear VU 'is busy' signal for vif
|
||||
}
|
||||
|
||||
if (isEbit != 2) { // Save PC, and Jump to Exit Point
|
||||
MOV32ItoM((uptr)&mVU->regs->VI[REG_TPC].UL, xPC);
|
||||
JMP32((uptr)mVU->exitFunct - ((uptr)x86Ptr + 5));
|
||||
xMOV(ptr32[&mVU->regs->VI[REG_TPC].UL], xPC);
|
||||
xJMP(mVU->exitFunct);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -93,7 +92,7 @@ _f void mVUsetupBranch(mV, microFlagCycles& mFC) {
|
|||
mVUsetupFlags(mVU, mFC); // Shuffle Flag Instances
|
||||
|
||||
// Shuffle P/Q regs since every block starts at instance #0
|
||||
if (mVU->p || mVU->q) { SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, shufflePQ); }
|
||||
if (mVU->p || mVU->q) { xPSHUF.D(xmmPQ, xmmPQ, shufflePQ); }
|
||||
}
|
||||
|
||||
void normBranchCompile(microVU* mVU, u32 branchPC) {
|
||||
|
@ -109,15 +108,15 @@ void normJumpCompile(mV, microFlagCycles& mFC, bool isEvilJump) {
|
|||
mVUsetupBranch(mVU, mFC);
|
||||
mVUbackupRegs(mVU);
|
||||
|
||||
if (isEvilJump) MOV32MtoR(gprT2, (uptr)&mVU->evilBranch);
|
||||
else MOV32MtoR(gprT2, (uptr)&mVU->branch);
|
||||
MOV32ItoR(gprT3, (u32)&mVUpBlock->pStateEnd);
|
||||
if (isEvilJump) xMOV(gprT2, ptr32[&mVU->evilBranch]);
|
||||
else xMOV(gprT2, ptr32[&mVU->branch]);
|
||||
xMOV(gprT3, (uptr)&mVUpBlock->pStateEnd);
|
||||
|
||||
if (!mVU->index) xCALL(mVUcompileJIT<0>); //(u32 startPC, uptr pState)
|
||||
else xCALL(mVUcompileJIT<1>);
|
||||
|
||||
mVUrestoreRegs(mVU);
|
||||
JMPR(gprT1); // Jump to rec-code address
|
||||
xJMP(gprT1); // Jump to rec-code address
|
||||
}
|
||||
|
||||
void normBranch(mV, microFlagCycles& mFC) {
|
||||
|
@ -132,7 +131,7 @@ void normBranch(mV, microFlagCycles& mFC) {
|
|||
|
||||
void condBranch(mV, microFlagCycles& mFC, int JMPcc) {
|
||||
mVUsetupBranch(mVU, mFC);
|
||||
xCMP(ptr16[(u16*)&mVU->branch], 0);
|
||||
xCMP(ptr16[&mVU->branch], 0);
|
||||
incPC(3);
|
||||
if (mVUup.eBit) { // Conditional Branch With E-Bit Set
|
||||
mVUendProgram(mVU, &mFC, 2);
|
||||
|
@ -190,8 +189,8 @@ void normJump(mV, microFlagCycles& mFC) {
|
|||
|
||||
if (mVUup.eBit) { // E-bit Jump
|
||||
mVUendProgram(mVU, &mFC, 2);
|
||||
MOV32MtoR(gprT1, (uptr)&mVU->branch);
|
||||
MOV32RtoM((uptr)&mVU->regs->VI[REG_TPC].UL, gprT1);
|
||||
xMOV(gprT1, ptr32[&mVU->branch]);
|
||||
xMOV(ptr32[&mVU->regs->VI[REG_TPC].UL], gprT1);
|
||||
xJMP(mVU->exitFunct);
|
||||
}
|
||||
else normJumpCompile(mVU, mFC, 0);
|
||||
|
|
|
@ -34,16 +34,16 @@ const __aligned16 u32 sse4_maxvals[2][4] = {
|
|||
// gotten a NaN value, then something went wrong; and the NaN's sign
|
||||
// is not to be trusted. Games like positive values better usually,
|
||||
// and it's faster... so just always make NaNs into positive infinity.
|
||||
void mVUclamp1(int reg, int regT1, int xyzw, bool bClampE = 0) {
|
||||
void mVUclamp1(xmm reg, xmm regT1, int xyzw, bool bClampE = 0) {
|
||||
if ((!clampE && CHECK_VU_OVERFLOW) || (clampE && bClampE)) {
|
||||
switch (xyzw) {
|
||||
case 1: case 2: case 4: case 8:
|
||||
SSE_MINSS_M32_to_XMM(reg, (uptr)mVUglob.maxvals);
|
||||
SSE_MAXSS_M32_to_XMM(reg, (uptr)mVUglob.minvals);
|
||||
xMIN.SS(reg, ptr32[mVUglob.maxvals]);
|
||||
xMAX.SS(reg, ptr32[mVUglob.minvals]);
|
||||
break;
|
||||
default:
|
||||
SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals);
|
||||
xMIN.PS(reg, ptr32[mVUglob.maxvals]);
|
||||
xMAX.PS(reg, ptr32[mVUglob.minvals]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -54,44 +54,41 @@ void mVUclamp1(int reg, int regT1, int xyzw, bool bClampE = 0) {
|
|||
// Note 2: Using regalloc here seems to contaminate some regs in certain games.
|
||||
// Must be some specific case I've overlooked (or I used regalloc improperly on an opcode)
|
||||
// so we just use a temporary mem location for our backup for now... (non-sse4 version only)
|
||||
void mVUclamp2(microVU* mVU, int reg, int regT1, int xyzw, bool bClampE = 0) {
|
||||
void mVUclamp2(microVU* mVU, xmm reg, xmm regT1in, int xyzw, bool bClampE = 0) {
|
||||
if ((!clampE && CHECK_VU_SIGN_OVERFLOW) || (clampE && bClampE && CHECK_VU_SIGN_OVERFLOW)) {
|
||||
if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
int i = (xyzw==1||xyzw==2||xyzw==4||xyzw==8) ? 0: 1;
|
||||
SSE4_PMINSD_M128_to_XMM(reg, (uptr)&sse4_maxvals[i][0]);
|
||||
SSE4_PMINUD_M128_to_XMM(reg, (uptr)&sse4_minvals[i][0]);
|
||||
xPMIN.SD(reg, ptr128[&sse4_maxvals[i][0]]);
|
||||
xPMIN.UD(reg, ptr128[&sse4_minvals[i][0]]);
|
||||
return;
|
||||
}
|
||||
int regT1b = 0;
|
||||
if (regT1 < 0) {
|
||||
regT1b = 1; regT1=(reg+1)%8;
|
||||
SSE_MOVAPS_XMM_to_M128((uptr)mVU->xmmCTemp, regT1);
|
||||
//regT1 = mVU->regAlloc->allocReg();
|
||||
}
|
||||
//xmm regT1 = regT1b ? mVU->regAlloc->allocReg() : regT1in;
|
||||
xmm regT1 = regT1in.IsEmpty() ? xmm((reg.Id + 1) % 8) : regT1in;
|
||||
if (regT1 != regT1in) xMOVAPS(ptr128[mVU->xmmCTemp], regT1);
|
||||
switch (xyzw) {
|
||||
case 1: case 2: case 4: case 8:
|
||||
SSE_MOVAPS_XMM_to_XMM(regT1, reg);
|
||||
SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit);
|
||||
SSE_MINSS_M32_to_XMM (reg, (uptr)mVUglob.maxvals);
|
||||
SSE_MAXSS_M32_to_XMM (reg, (uptr)mVUglob.minvals);
|
||||
SSE_ORPS_XMM_to_XMM (reg, regT1);
|
||||
xMOVAPS(regT1, reg);
|
||||
xAND.PS(regT1, ptr128[mVUglob.signbit]);
|
||||
xMIN.SS(reg, ptr128[mVUglob.maxvals]);
|
||||
xMAX.SS(reg, ptr128[mVUglob.minvals]);
|
||||
xOR.PS (reg, regT1);
|
||||
break;
|
||||
default:
|
||||
SSE_MOVAPS_XMM_to_XMM(regT1, reg);
|
||||
SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit);
|
||||
SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals);
|
||||
SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals);
|
||||
SSE_ORPS_XMM_to_XMM (reg, regT1);
|
||||
xMOVAPS(regT1, reg);
|
||||
xAND.PS(regT1, ptr128[mVUglob.signbit]);
|
||||
xMIN.PS(reg, ptr128[mVUglob.maxvals]);
|
||||
xMAX.PS(reg, ptr128[mVUglob.minvals]);
|
||||
xOR.PS (reg, regT1);
|
||||
break;
|
||||
}
|
||||
//if (regT1b) mVU->regAlloc->clearNeeded(regT1);
|
||||
if (regT1b) SSE_MOVAPS_M128_to_XMM(regT1, (uptr)mVU->xmmCTemp);
|
||||
//if (regT1 != regT1in) mVU->regAlloc->clearNeeded(regT1);
|
||||
if (regT1 != regT1in) xMOVAPS(regT1, ptr128[mVU->xmmCTemp]);
|
||||
}
|
||||
else mVUclamp1(reg, regT1, xyzw, bClampE);
|
||||
else mVUclamp1(reg, regT1in, xyzw, bClampE);
|
||||
}
|
||||
|
||||
// Used for operand clamping on every SSE instruction (add/sub/mul/div)
|
||||
void mVUclamp3(microVU* mVU, int reg, int regT1, int xyzw) {
|
||||
void mVUclamp3(microVU* mVU, xmm reg, xmm regT1, int xyzw) {
|
||||
if (clampE) mVUclamp2(mVU, reg, regT1, xyzw, 1);
|
||||
}
|
||||
|
||||
|
@ -101,6 +98,6 @@ void mVUclamp3(microVU* mVU, int reg, int regT1, int xyzw) {
|
|||
// emulated opcodes (causing crashes). Since we're clamping the operands
|
||||
// with mVUclamp3, we should almost never be getting a NaN result,
|
||||
// but this clamp is a precaution, just in case.
|
||||
void mVUclamp4(int reg, int regT1, int xyzw) {
|
||||
void mVUclamp4(xmm reg, xmm regT1, int xyzw) {
|
||||
if (clampE && !CHECK_VU_SIGN_OVERFLOW) mVUclamp1(reg, regT1, xyzw, 1);
|
||||
}
|
||||
|
|
|
@ -126,7 +126,7 @@ void doIbit(mV) {
|
|||
}
|
||||
else tempI = curI;
|
||||
|
||||
MOV32ItoM((uptr)&mVU->regs->VI[REG_I].UL, tempI);
|
||||
xMOV(ptr32[&mVU->regs->VI[REG_I].UL], tempI);
|
||||
incPC(1);
|
||||
}
|
||||
}
|
||||
|
@ -134,21 +134,27 @@ void doIbit(mV) {
|
|||
void doSwapOp(mV) {
|
||||
if (mVUinfo.backupVF && !mVUlow.noWriteVF) {
|
||||
DevCon.WriteLn(Color_Green, "microVU%d: Backing Up VF Reg [%04x]", getIndex, xPC);
|
||||
int t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg);
|
||||
int t2 = mVU->regAlloc->allocReg();
|
||||
SSE_MOVAPS_XMM_to_XMM(t2, t1);
|
||||
mVU->regAlloc->clearNeeded(t1);
|
||||
xmm t2 = mVU->regAlloc->allocReg();
|
||||
{
|
||||
xmm t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg);
|
||||
xMOVAPS(t2, t1);
|
||||
mVU->regAlloc->clearNeeded(t1);
|
||||
}
|
||||
mVUopL(mVU, 1);
|
||||
t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg, mVUlow.VF_write.reg, 0xf, 0);
|
||||
SSE_XORPS_XMM_to_XMM(t2, t1);
|
||||
SSE_XORPS_XMM_to_XMM(t1, t2);
|
||||
SSE_XORPS_XMM_to_XMM(t2, t1);
|
||||
mVU->regAlloc->clearNeeded(t1);
|
||||
{
|
||||
xmm t1 = mVU->regAlloc->allocReg(mVUlow.VF_write.reg, mVUlow.VF_write.reg, 0xf, 0);
|
||||
xXOR.PS(t2, t1);
|
||||
xXOR.PS(t1, t2);
|
||||
xXOR.PS(t2, t1);
|
||||
mVU->regAlloc->clearNeeded(t1);
|
||||
}
|
||||
incPC(1);
|
||||
doUpperOp();
|
||||
t1 = mVU->regAlloc->allocReg(-1, mVUlow.VF_write.reg, 0xf);
|
||||
SSE_MOVAPS_XMM_to_XMM(t1, t2);
|
||||
mVU->regAlloc->clearNeeded(t1);
|
||||
{
|
||||
xmm t1 = mVU->regAlloc->allocReg(-1, mVUlow.VF_write.reg, 0xf);
|
||||
xMOVAPS(t1, t2);
|
||||
mVU->regAlloc->clearNeeded(t1);
|
||||
}
|
||||
mVU->regAlloc->clearNeeded(t2);
|
||||
}
|
||||
else { mVUopL(mVU, 1); incPC(1); doUpperOp(); }
|
||||
|
@ -165,9 +171,9 @@ _f void mVUcheckBadOp(mV) {
|
|||
// Prints msg when exiting block early if 1st op was a bad opcode (Dawn of Mana Level 2)
|
||||
_f void handleBadOp(mV, int count) {
|
||||
if (mVUinfo.isBadOp && count == 0) {
|
||||
MOV32ItoR(gprT2, (uptr)mVU);
|
||||
if (!isVU1) CALLFunc((uptr)mVUbadOp0);
|
||||
else CALLFunc((uptr)mVUbadOp1);
|
||||
xMOV(ecx, (uptr)mVU);
|
||||
if (!isVU1) xCALL(mVUbadOp0);
|
||||
else xCALL(mVUbadOp1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -302,7 +308,7 @@ _f bool doEarlyExit(microVU* mVU) {
|
|||
_f void mVUsavePipelineState(microVU* mVU) {
|
||||
u32* lpS = (u32*)&mVU->prog.lpState.vi15;
|
||||
for (int i = 0; i < (sizeof(microRegInfo)-4)/4; i++, lpS++) {
|
||||
MOV32ItoM((uptr)lpS, lpS[0]);
|
||||
xMOV(ptr32[lpS], lpS[0]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -311,18 +317,19 @@ void mVUtestCycles(microVU* mVU) {
|
|||
iPC = mVUstartPC;
|
||||
mVUdebugNOW(0);
|
||||
if (doEarlyExit(mVU)) {
|
||||
CMP32ItoM((uptr)&mVU->cycles, 0);
|
||||
u32* jmp32 = JG32(0);
|
||||
xCMP(ptr32[&mVU->cycles], 0);
|
||||
xForwardJG32 skip;
|
||||
// FIXME: uh... actually kind of a pain with xForwardJump
|
||||
//if (!isVU1) { TEST32ItoM((uptr)&mVU->regs->flags, VUFLAG_MFLAGSET); vu0jmp = JZ32(0); }
|
||||
MOV32ItoR(gprT2, (uptr)mVU);
|
||||
if (isVU1) CALLFunc((uptr)mVUwarning1);
|
||||
//else CALLFunc((uptr)mVUwarning0); // VU0 is allowed early exit for COP2 Interlock Simulation
|
||||
mVUsavePipelineState(mVU);
|
||||
mVUendProgram(mVU, NULL, 0);
|
||||
//if (!isVU1) x86SetJ32(vu0jmp);
|
||||
x86SetJ32(jmp32);
|
||||
xMOV(ecx, (uptr)mVU);
|
||||
if (isVU1) xCALL(mVUwarning1);
|
||||
//else xCALL(mVUwarning0); // VU0 is allowed early exit for COP2 Interlock Simulation
|
||||
mVUsavePipelineState(mVU);
|
||||
mVUendProgram(mVU, NULL, 0);
|
||||
//if (!isVU1) vu0jmp.SetTarget();
|
||||
skip.SetTarget();
|
||||
}
|
||||
SUB32ItoM((uptr)&mVU->cycles, mVUcycles);
|
||||
xSUB(ptr32[&mVU->cycles], mVUcycles);
|
||||
}
|
||||
|
||||
// Initialize VI Constants (vi15 propagates through blocks)
|
||||
|
@ -410,7 +417,7 @@ _r void* mVUcompile(microVU* mVU, u32 startPC, uptr pState) {
|
|||
u32 x = 0;
|
||||
for (; x < endCount; x++) {
|
||||
if (mVUinfo.isEOB) { handleBadOp(mVU, x); x = 0xffff; }
|
||||
if (mVUup.mBit) { OR32ItoM((uptr)&mVU->regs->flags, VUFLAG_MFLAGSET); }
|
||||
if (mVUup.mBit) { xOR(ptr32[&mVU->regs->flags], VUFLAG_MFLAGSET); }
|
||||
if (mVUlow.isNOP) { incPC(1); doUpperOp(); doIbit(mVU); }
|
||||
else if (!mVUinfo.swapOps) { incPC(1); doUpperOp(); doLowerOp(); }
|
||||
else { doSwapOp(mVU); }
|
||||
|
|
|
@ -43,25 +43,25 @@ void mVUdispatcherA(mV) {
|
|||
|
||||
// Load Regs
|
||||
#if 1 // CHECK_MACROVU0 - Always on now
|
||||
MOV32MtoR(gprF0, (uptr)&mVU->regs->VI[REG_STATUS_FLAG].UL);
|
||||
MOV32RtoR(gprF1, gprF0);
|
||||
MOV32RtoR(gprF2, gprF0);
|
||||
MOV32RtoR(gprF3, gprF0);
|
||||
xMOV(gprF[0], ptr32[&mVU->regs->VI[REG_STATUS_FLAG].UL]);
|
||||
xMOV(gprF[1], gprF[0]);
|
||||
xMOV(gprF[2], gprF[0]);
|
||||
xMOV(gprF[3], gprF[0]);
|
||||
#else
|
||||
mVUallocSFLAGd((uptr)&mVU->regs->VI[REG_STATUS_FLAG].UL, 1);
|
||||
#endif
|
||||
|
||||
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&mVU->regs->VI[REG_MAC_FLAG].UL);
|
||||
SSE_SHUFPS_XMM_to_XMM (xmmT1, xmmT1, 0);
|
||||
SSE_MOVAPS_XMM_to_M128((uptr)mVU->macFlag, xmmT1);
|
||||
xMOVAPS(xmmT1, ptr128[&mVU->regs->VI[REG_MAC_FLAG].UL]);
|
||||
xSHUF.PS(xmmT1, xmmT1, 0);
|
||||
xMOVAPS(ptr128[&mVU->macFlag[0]], xmmT1);
|
||||
|
||||
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&mVU->regs->VI[REG_CLIP_FLAG].UL);
|
||||
SSE_SHUFPS_XMM_to_XMM (xmmT1, xmmT1, 0);
|
||||
SSE_MOVAPS_XMM_to_M128((uptr)mVU->clipFlag, xmmT1);
|
||||
xMOVAPS(xmmT1, ptr128[&mVU->regs->VI[REG_CLIP_FLAG].UL]);
|
||||
xSHUF.PS(xmmT1, xmmT1, 0);
|
||||
xMOVAPS(ptr128[&mVU->clipFlag[0]], xmmT1);
|
||||
|
||||
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&mVU->regs->VI[REG_P].UL);
|
||||
SSE_MOVAPS_M128_to_XMM(xmmPQ, (uptr)&mVU->regs->VI[REG_Q].UL);
|
||||
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmT1, 0); // wzyx = PPQQ
|
||||
xMOVAPS(xmmT1, ptr128[&mVU->regs->VI[REG_P].UL]);
|
||||
xMOVAPS(xmmPQ, ptr128[&mVU->regs->VI[REG_Q].UL]);
|
||||
xSHUF.PS(xmmPQ, xmmT1, 0); // wzyx = PPQQ
|
||||
|
||||
// Jump to Recompiled Code Block
|
||||
xJMP(eax);
|
||||
|
|
|
@ -20,12 +20,10 @@
|
|||
|
||||
// Sets FDIV Flags at the proper time
|
||||
_f void mVUdivSet(mV) {
|
||||
int flagReg1, flagReg2;
|
||||
if (mVUinfo.doDivFlag) {
|
||||
getFlagReg(flagReg1, sFLAG.write);
|
||||
if (!sFLAG.doFlag) { getFlagReg(flagReg2, sFLAG.lastWrite); MOV32RtoR(flagReg1, flagReg2); }
|
||||
AND32ItoR(flagReg1, 0xfff3ffff);
|
||||
OR32MtoR (flagReg1, (uptr)&mVU->divFlag);
|
||||
if (!sFLAG.doFlag) { xMOV(getFlagReg(sFLAG.write), getFlagReg(sFLAG.lastWrite)); }
|
||||
xAND(getFlagReg(sFLAG.write), 0xfff3ffff);
|
||||
xOR (getFlagReg(sFLAG.write), ptr32[&mVU->divFlag]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -159,9 +157,8 @@ _f void mVUsetFlags(mV, microFlagCycles& mFC) {
|
|||
iPC = endPC;
|
||||
}
|
||||
|
||||
#define getFlagReg1(x) ((x == 3) ? gprF3 : ((x == 2) ? gprF2 : ((x == 1) ? gprF1 : gprF0)))
|
||||
#define getFlagReg2(x) ((bStatus[0] == x) ? getFlagReg1(x) : gprT1)
|
||||
#define getFlagReg3(x) ((gFlag == x) ? gprT1 : getFlagReg1(x))
|
||||
#define getFlagReg2(x) ((bStatus[0] == x) ? getFlagReg(x) : gprT1)
|
||||
#define getFlagReg3(x) ((gFlag == x) ? gprT1 : getFlagReg(x))
|
||||
#define getFlagReg4(x) ((gFlag == x) ? gprT1 : gprT2)
|
||||
#define shuffleMac ((bMac [3]<<6)|(bMac [2]<<4)|(bMac [1]<<2)|bMac [0])
|
||||
#define shuffleClip ((bClip[3]<<6)|(bClip[2]<<4)|(bClip[1]<<2)|bClip[0])
|
||||
|
@ -175,52 +172,52 @@ _f void mVUsetupFlags(mV, microFlagCycles& mFC) {
|
|||
// DevCon::Status("sortRegs = %d", params sortRegs);
|
||||
// Note: Emitter will optimize out mov(reg1, reg1) cases...
|
||||
if (sortRegs == 1) {
|
||||
MOV32RtoR(gprF0, getFlagReg1(bStatus[0]));
|
||||
MOV32RtoR(gprF1, getFlagReg1(bStatus[1]));
|
||||
MOV32RtoR(gprF2, getFlagReg1(bStatus[2]));
|
||||
MOV32RtoR(gprF3, getFlagReg1(bStatus[3]));
|
||||
xMOV(gprF[0], getFlagReg(bStatus[0]));
|
||||
xMOV(gprF[1], getFlagReg(bStatus[1]));
|
||||
xMOV(gprF[2], getFlagReg(bStatus[2]));
|
||||
xMOV(gprF[3], getFlagReg(bStatus[3]));
|
||||
}
|
||||
else if (sortRegs == 2) {
|
||||
MOV32RtoR(gprT1, getFlagReg1(bStatus[3]));
|
||||
MOV32RtoR(gprF0, getFlagReg1(bStatus[0]));
|
||||
MOV32RtoR(gprF1, getFlagReg2(bStatus[1]));
|
||||
MOV32RtoR(gprF2, getFlagReg2(bStatus[2]));
|
||||
MOV32RtoR(gprF3, gprT1);
|
||||
xMOV(gprT1, getFlagReg(bStatus[3]));
|
||||
xMOV(gprF[0], getFlagReg(bStatus[0]));
|
||||
xMOV(gprF[1], getFlagReg2(bStatus[1]));
|
||||
xMOV(gprF[2], getFlagReg2(bStatus[2]));
|
||||
xMOV(gprF[3], gprT1);
|
||||
}
|
||||
else if (sortRegs == 3) {
|
||||
int gFlag = (bStatus[0] == bStatus[1]) ? bStatus[2] : bStatus[1];
|
||||
MOV32RtoR(gprT1, getFlagReg1(gFlag));
|
||||
MOV32RtoR(gprT2, getFlagReg1(bStatus[3]));
|
||||
MOV32RtoR(gprF0, getFlagReg1(bStatus[0]));
|
||||
MOV32RtoR(gprF1, getFlagReg3(bStatus[1]));
|
||||
MOV32RtoR(gprF2, getFlagReg4(bStatus[2]));
|
||||
MOV32RtoR(gprF3, gprT2);
|
||||
xMOV(gprT1, getFlagReg(gFlag));
|
||||
xMOV(gprT2, getFlagReg(bStatus[3]));
|
||||
xMOV(gprF[0], getFlagReg(bStatus[0]));
|
||||
xMOV(gprF[1], getFlagReg3(bStatus[1]));
|
||||
xMOV(gprF[2], getFlagReg4(bStatus[2]));
|
||||
xMOV(gprF[3], gprT2);
|
||||
}
|
||||
else {
|
||||
MOV32RtoR(gprT1, getFlagReg1(bStatus[0]));
|
||||
MOV32RtoR(gprT2, getFlagReg1(bStatus[1]));
|
||||
MOV32RtoR(gprT3, getFlagReg1(bStatus[2]));
|
||||
MOV32RtoR(gprF3, getFlagReg1(bStatus[3]));
|
||||
MOV32RtoR(gprF0, gprT1);
|
||||
MOV32RtoR(gprF1, gprT2);
|
||||
MOV32RtoR(gprF2, gprT3);
|
||||
xMOV(gprT1, getFlagReg(bStatus[0]));
|
||||
xMOV(gprT2, getFlagReg(bStatus[1]));
|
||||
xMOV(gprT3, getFlagReg(bStatus[2]));
|
||||
xMOV(gprF[3], getFlagReg(bStatus[3]));
|
||||
xMOV(gprF[0], gprT1);
|
||||
xMOV(gprF[1], gprT2);
|
||||
xMOV(gprF[2], gprT3);
|
||||
}
|
||||
}
|
||||
|
||||
if (__Mac) {
|
||||
int bMac[4];
|
||||
sortFlag(mFC.xMac, bMac, mFC.cycles);
|
||||
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)mVU->macFlag);
|
||||
SSE_SHUFPS_XMM_to_XMM (xmmT1, xmmT1, shuffleMac);
|
||||
SSE_MOVAPS_XMM_to_M128((uptr)mVU->macFlag, xmmT1);
|
||||
xMOVAPS(xmmT1, ptr128[mVU->macFlag]);
|
||||
xSHUF.PS(xmmT1, xmmT1, shuffleMac);
|
||||
xMOVAPS(ptr128[mVU->macFlag], xmmT1);
|
||||
}
|
||||
|
||||
if (__Clip) {
|
||||
int bClip[4];
|
||||
sortFlag(mFC.xClip, bClip, mFC.cycles);
|
||||
SSE_MOVAPS_M128_to_XMM(xmmT2, (uptr)mVU->clipFlag);
|
||||
SSE_SHUFPS_XMM_to_XMM (xmmT2, xmmT2, shuffleClip);
|
||||
SSE_MOVAPS_XMM_to_M128((uptr)mVU->clipFlag, xmmT2);
|
||||
xMOVAPS(xmmT2, ptr128[mVU->clipFlag]);
|
||||
xSHUF.PS(xmmT2, xmmT2, shuffleClip);
|
||||
xMOVAPS(ptr128[mVU->clipFlag], xmmT2);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -163,11 +163,6 @@ struct microIR {
|
|||
// Reg Alloc
|
||||
//------------------------------------------------------------------
|
||||
|
||||
void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW);
|
||||
void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW);
|
||||
void mVUloadReg(int reg, uptr offset, int xyzw);
|
||||
void mVUloadIreg(int reg, int xyzw, VURegs* vuRegs);
|
||||
|
||||
struct microMapXMM {
|
||||
int VFreg; // VF Reg Number Stored (-1 = Temp; 0 = vf0 and will not be written back; 32 = ACC; 33 = I reg)
|
||||
int xyzw; // xyzw to write back (0 = Don't write back anything AND cached vfReg has all vectors valid)
|
||||
|
@ -209,18 +204,18 @@ public:
|
|||
}
|
||||
void reset() {
|
||||
for (int i = 0; i < xmmTotal; i++) {
|
||||
clearReg(i);
|
||||
clearReg(xmm(i));
|
||||
}
|
||||
counter = 0;
|
||||
}
|
||||
void flushAll(bool clearState = 1) {
|
||||
for (int i = 0; i < xmmTotal; i++) {
|
||||
writeBackReg(i);
|
||||
if (clearState) clearReg(i);
|
||||
writeBackReg(xmm(i));
|
||||
if (clearState) clearReg(xmm(i));
|
||||
}
|
||||
}
|
||||
void clearReg(int reg) {
|
||||
microMapXMM& clear( xmmMap[reg] );
|
||||
void clearReg(xmm reg) {
|
||||
microMapXMM& clear( xmmMap[reg.Id] );
|
||||
clear.VFreg = -1;
|
||||
clear.count = 0;
|
||||
clear.xyzw = 0;
|
||||
|
@ -228,23 +223,23 @@ public:
|
|||
}
|
||||
void clearRegVF(int VFreg) {
|
||||
for (int i = 0; i < xmmTotal; i++) {
|
||||
if (xmmMap[i].VFreg == VFreg) clearReg(i);
|
||||
if (xmmMap[i].VFreg == VFreg) clearReg(xmm(i));
|
||||
}
|
||||
}
|
||||
void writeBackReg(int reg, bool invalidateRegs = 1) {
|
||||
microMapXMM& write( xmmMap[reg] );
|
||||
void writeBackReg(xmm reg, bool invalidateRegs = 1) {
|
||||
microMapXMM& write( xmmMap[reg.Id] );
|
||||
|
||||
if ((write.VFreg > 0) && write.xyzw) { // Reg was modified and not Temp or vf0
|
||||
if (write.VFreg == 33) SSE_MOVSS_XMM_to_M32((uptr)&vuRegs->VI[REG_I].UL, reg);
|
||||
else if (write.VFreg == 32) mVUsaveReg(reg, (uptr)&vuRegs->ACC.UL[0], write.xyzw, 1);
|
||||
else mVUsaveReg(reg, (uptr)&vuRegs->VF[write.VFreg].UL[0], write.xyzw, 1);
|
||||
if (write.VFreg == 33) xMOVSS(ptr32[&vuRegs->VI[REG_I].UL], reg);
|
||||
else if (write.VFreg == 32) mVUsaveReg(reg, ptr[&vuRegs->ACC.UL[0]], write.xyzw, 1);
|
||||
else mVUsaveReg(reg, ptr[&vuRegs->VF[write.VFreg].UL[0]], write.xyzw, 1);
|
||||
if (invalidateRegs) {
|
||||
for (int i = 0; i < xmmTotal; i++) {
|
||||
microMapXMM& imap (xmmMap[i]);
|
||||
if ((i == reg) || imap.isNeeded) continue;
|
||||
if ((i == reg.Id) || imap.isNeeded) continue;
|
||||
if (imap.VFreg == write.VFreg) {
|
||||
if (imap.xyzw && imap.xyzw < 0xf) DevCon.Error("microVU Error: writeBackReg() [%d]", imap.VFreg);
|
||||
clearReg(i); // Invalidate any Cached Regs of same vf Reg
|
||||
clearReg(xmm(i)); // Invalidate any Cached Regs of same vf Reg
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -257,27 +252,28 @@ public:
|
|||
}
|
||||
clearReg(reg); // Clear Reg
|
||||
}
|
||||
void clearNeeded(int reg) {
|
||||
if ((reg < 0) || (reg >= xmmTotal)) return;
|
||||
void clearNeeded(xmm reg)
|
||||
{
|
||||
if ((reg.Id < 0) || (reg.Id >= xmmTotal)) return;
|
||||
|
||||
microMapXMM& clear (xmmMap[reg]);
|
||||
microMapXMM& clear (xmmMap[reg.Id]);
|
||||
clear.isNeeded = 0;
|
||||
if (clear.xyzw) { // Reg was modified
|
||||
if (clear.VFreg > 0) {
|
||||
int mergeRegs = 0;
|
||||
if (clear.xyzw < 0xf) { mergeRegs = 1; } // Try to merge partial writes
|
||||
for (int i = 0; i < xmmTotal; i++) { // Invalidate any other read-only regs of same vfReg
|
||||
if (i == reg) continue;
|
||||
if (i == reg.Id) continue;
|
||||
microMapXMM& imap (xmmMap[i]);
|
||||
if (imap.VFreg == clear.VFreg) {
|
||||
if (imap.xyzw && imap.xyzw < 0xf) DevCon.Error("microVU Error: clearNeeded() [%d]", imap.VFreg);
|
||||
if (mergeRegs == 1) {
|
||||
mVUmergeRegs(i, reg, clear.xyzw, 1);
|
||||
mVUmergeRegs(xmm(i), reg, clear.xyzw, 1);
|
||||
imap.xyzw = 0xf;
|
||||
imap.count = counter;
|
||||
mergeRegs = 2;
|
||||
}
|
||||
else clearReg(i);
|
||||
else clearReg(xmm(i));
|
||||
}
|
||||
}
|
||||
if (mergeRegs == 2) clearReg(reg); // Clear Current Reg if Merged
|
||||
|
@ -286,10 +282,11 @@ public:
|
|||
else clearReg(reg); // If Reg was temp or vf0, then invalidate itself
|
||||
}
|
||||
}
|
||||
int allocReg(int vfLoadReg = -1, int vfWriteReg = -1, int xyzw = 0, bool cloneWrite = 1) {
|
||||
xmm allocReg(int vfLoadReg = -1, int vfWriteReg = -1, int xyzw = 0, bool cloneWrite = 1) {
|
||||
counter++;
|
||||
if (vfLoadReg >= 0) { // Search For Cached Regs
|
||||
for (int i = 0; i < xmmTotal; i++) {
|
||||
xmm xmmi(i);
|
||||
microMapXMM& imap (xmmMap[i]);
|
||||
if ((imap.VFreg == vfLoadReg) && (!imap.xyzw // Reg Was Not Modified
|
||||
|| (imap.VFreg && (imap.xyzw==0xf)))) { // Reg Had All Vectors Modified and != VF0
|
||||
|
@ -297,49 +294,51 @@ public:
|
|||
if (vfWriteReg >= 0) { // Reg will be modified
|
||||
if (cloneWrite) { // Clone Reg so as not to use the same Cached Reg
|
||||
z = findFreeReg();
|
||||
writeBackReg(z);
|
||||
if (z!=i && xyzw==8) SSE_MOVAPS_XMM_to_XMM (z, i);
|
||||
else if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1);
|
||||
else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2);
|
||||
else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3);
|
||||
else if (z != i) SSE_MOVAPS_XMM_to_XMM (z, i);
|
||||
xmm xmmz(z);
|
||||
writeBackReg(xmmz);
|
||||
if (z!=i && xyzw==8) xMOVAPS (xmmz, xmmi);
|
||||
else if (xyzw == 4) xPSHUF.D(xmmz, xmmi, 1);
|
||||
else if (xyzw == 2) xPSHUF.D(xmmz, xmmi, 2);
|
||||
else if (xyzw == 1) xPSHUF.D(xmmz, xmmi, 3);
|
||||
else if (z != i) xMOVAPS (xmmz, xmmi);
|
||||
imap.count = counter; // Reg i was used, so update counter
|
||||
}
|
||||
else { // Don't clone reg, but shuffle to adjust for SS ops
|
||||
if ((vfLoadReg != vfWriteReg) || (xyzw != 0xf)) { writeBackReg(z); }
|
||||
if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1);
|
||||
else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2);
|
||||
else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3);
|
||||
if ((vfLoadReg != vfWriteReg) || (xyzw != 0xf)) { writeBackReg(xmmi); }
|
||||
if (xyzw == 4) xPSHUF.D(xmmi, xmmi, 1);
|
||||
else if (xyzw == 2) xPSHUF.D(xmmi, xmmi, 2);
|
||||
else if (xyzw == 1) xPSHUF.D(xmmi, xmmi, 3);
|
||||
}
|
||||
xmmMap[z].VFreg = vfWriteReg;
|
||||
xmmMap[z].xyzw = xyzw;
|
||||
}
|
||||
xmmMap[z].count = counter;
|
||||
xmmMap[z].isNeeded = 1;
|
||||
return z;
|
||||
return xmm(z);
|
||||
}
|
||||
}
|
||||
}
|
||||
int x = findFreeReg();
|
||||
writeBackReg(x);
|
||||
xmm xmmx(x);
|
||||
writeBackReg(xmmx);
|
||||
|
||||
if (vfWriteReg >= 0) { // Reg Will Be Modified (allow partial reg loading)
|
||||
if ((vfLoadReg == 0) && !(xyzw & 1)) { SSE2_PXOR_XMM_to_XMM(x, x); }
|
||||
else if (vfLoadReg == 33) mVUloadIreg(x, xyzw, vuRegs);
|
||||
else if (vfLoadReg == 32) mVUloadReg (x, (uptr)&vuRegs->ACC.UL[0], xyzw);
|
||||
else if (vfLoadReg >= 0) mVUloadReg (x, (uptr)&vuRegs->VF[vfLoadReg].UL[0], xyzw);
|
||||
if ((vfLoadReg == 0) && !(xyzw & 1)) { xPXOR(xmmx, xmmx); }
|
||||
else if (vfLoadReg == 33) mVUloadIreg(xmmx, xyzw, vuRegs);
|
||||
else if (vfLoadReg == 32) mVUloadReg (xmmx, ptr[&vuRegs->ACC.UL[0]], xyzw);
|
||||
else if (vfLoadReg >= 0) mVUloadReg (xmmx, ptr[&vuRegs->VF[vfLoadReg].UL[0]], xyzw);
|
||||
xmmMap[x].VFreg = vfWriteReg;
|
||||
xmmMap[x].xyzw = xyzw;
|
||||
}
|
||||
else { // Reg Will Not Be Modified (always load full reg for caching)
|
||||
if (vfLoadReg == 33) mVUloadIreg(x, 0xf, vuRegs);
|
||||
else if (vfLoadReg == 32) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->ACC.UL[0]);
|
||||
else if (vfLoadReg >= 0) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->VF[vfLoadReg].UL[0]);
|
||||
if (vfLoadReg == 33) mVUloadIreg(xmmx, 0xf, vuRegs);
|
||||
else if (vfLoadReg == 32) xMOVAPS(xmmx, ptr128[&vuRegs->ACC.UL[0]]);
|
||||
else if (vfLoadReg >= 0) xMOVAPS(xmmx, ptr128[&vuRegs->VF[vfLoadReg].UL[0]]);
|
||||
xmmMap[x].VFreg = vfLoadReg;
|
||||
xmmMap[x].xyzw = 0;
|
||||
}
|
||||
xmmMap[x].count = counter;
|
||||
xmmMap[x].isNeeded = 1;
|
||||
return x;
|
||||
return xmmx;
|
||||
}
|
||||
};
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -38,7 +38,7 @@ void setupMacroOp(int mode, const char* opName) {
|
|||
iFlushCall(FLUSH_EVERYTHING);
|
||||
microVU0.regAlloc->reset();
|
||||
if (mode & 0x01) { // Q-Reg will be Read
|
||||
SSE_MOVSS_M32_to_XMM(xmmPQ, (uptr)&microVU0.regs->VI[REG_Q].UL);
|
||||
xMOVSSZX(xmmPQ, ptr32[&microVU0.regs->VI[REG_Q].UL]);
|
||||
}
|
||||
if (mode & 0x08) { // Clip Instruction
|
||||
microVU0.prog.IRinfo.info[0].cFlag.write = 0xff;
|
||||
|
@ -51,16 +51,16 @@ void setupMacroOp(int mode, const char* opName) {
|
|||
microVU0.prog.IRinfo.info[0].sFlag.lastWrite = 0;
|
||||
microVU0.prog.IRinfo.info[0].mFlag.doFlag = 1;
|
||||
microVU0.prog.IRinfo.info[0].mFlag.write = 0xff;
|
||||
MOV32MtoR(gprF0, (uptr)&microVU0.regs->VI[REG_STATUS_FLAG].UL);
|
||||
xMOV(gprF[0], ptr32[&microVU0.regs->VI[REG_STATUS_FLAG].UL]);
|
||||
}
|
||||
}
|
||||
|
||||
void endMacroOp(int mode) {
|
||||
if (mode & 0x02) { // Q-Reg was Written To
|
||||
SSE_MOVSS_XMM_to_M32((uptr)&microVU0.regs->VI[REG_Q].UL, xmmPQ);
|
||||
xMOVSS(ptr32[&microVU0.regs->VI[REG_Q].UL], xmmPQ);
|
||||
}
|
||||
if (mode & 0x10) { // Status/Mac Flags were Updated
|
||||
MOV32RtoM((uptr)&microVU0.regs->VI[REG_STATUS_FLAG].UL, gprF0);
|
||||
xMOV(ptr32[&microVU0.regs->VI[REG_STATUS_FLAG].UL], gprF[0]);
|
||||
}
|
||||
microVU0.regAlloc->flushAll();
|
||||
microVU0.cop2 = 0;
|
||||
|
@ -253,11 +253,11 @@ void COP2_Interlock(bool mBitSync) {
|
|||
}
|
||||
|
||||
// Emits code that tests one reset bit of the value currently held in eax:
// mask 0x002 when vuIndex is 0, mask 0x200 otherwise.
// If the bit is clear, the emitted 8-bit forward jump (JZ) skips to the end.
// If the bit is set, the emitted code calls resetFunct and then reloads eax
// from cpuRegs.GPR.r[_Rt_].UL[0].
// NOTE(review): the reload is presumably there because the call may clobber
// eax — confirm against the calling convention of resetFunct.
void TEST_FBRST_RESET(FnType_Void* resetFunct, int vuIndex) {
|
||||
TEST32ItoR(EAX, (vuIndex) ? 0x200 : 0x002);
|
||||
j8Ptr[0] = JZ8(0);
|
||||
xCALL(resetFunct);
|
||||
MOV32MtoR(EAX, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]);
|
||||
x86SetJ8(j8Ptr[0]);
|
||||
xTEST(eax, (vuIndex) ? 0x200 : 0x002);
|
||||
xForwardJZ8 skip;
|
||||
xCALL(resetFunct);
|
||||
xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
|
||||
skip.SetTarget();
|
||||
}
|
||||
|
||||
static void recCFC2() {
|
||||
|
@ -269,19 +269,19 @@ static void recCFC2() {
|
|||
iFlushCall(FLUSH_EVERYTHING);
|
||||
|
||||
if (_Rd_ == REG_STATUS_FLAG) { // Normalize Status Flag
|
||||
MOV32MtoR(gprF0, (uptr)&microVU0.regs->VI[REG_STATUS_FLAG].UL);
|
||||
mVUallocSFLAGc(EAX, gprF0, 0);
|
||||
xMOV(gprF[0], ptr32[&microVU0.regs->VI[REG_STATUS_FLAG].UL]);
|
||||
mVUallocSFLAGc(eax, gprF[0], 0);
|
||||
}
|
||||
else MOV32MtoR(EAX, (uptr)&microVU0.regs->VI[_Rd_].UL);
|
||||
else xMOV(eax, ptr32[&microVU0.regs->VI[_Rd_].UL]);
|
||||
|
||||
// FixMe: Should R-Reg have upper 9 bits 0?
|
||||
MOV32RtoM((uptr)&cpuRegs.GPR.r[_Rt_].UL[0], EAX);
|
||||
xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]], eax);
|
||||
|
||||
if (_Rd_ >= 16) {
|
||||
CDQ(); // Sign Extend
|
||||
MOV32RtoM ((uptr)&cpuRegs.GPR.r[_Rt_].UL[1], EDX);
|
||||
xCDQ(); // Sign Extend
|
||||
xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[1]], edx);
|
||||
}
|
||||
else MOV32ItoM((uptr)&cpuRegs.GPR.r[_Rt_].UL[1], 0);
|
||||
else xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[1]], 0);
|
||||
|
||||
// FixMe: I think this is needed, but not sure how it works
|
||||
_eeOnWriteReg(_Rt_, 1);
|
||||
|
@ -298,36 +298,36 @@ static void recCTC2() {
|
|||
case REG_MAC_FLAG: case REG_TPC:
|
||||
case REG_VPU_STAT: break; // Read Only Regs
|
||||
case REG_R:
|
||||
MOV32MtoR(EAX, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]);
|
||||
OR32ItoR (EAX, 0x3f800000);
|
||||
MOV32RtoM((uptr)&microVU0.regs->VI[REG_R].UL, EAX);
|
||||
xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
|
||||
xOR (eax, 0x3f800000);
|
||||
xMOV(ptr32[&microVU0.regs->VI[REG_R].UL], eax);
|
||||
break;
|
||||
case REG_STATUS_FLAG:
|
||||
if (_Rt_) { // Denormalizes flag into gprF1
|
||||
mVUallocSFLAGd((uptr)&cpuRegs.GPR.r[_Rt_].UL[0], 0);
|
||||
MOV32RtoM((uptr)&microVU0.regs->VI[_Rd_].UL, gprF1);
|
||||
mVUallocSFLAGd(&cpuRegs.GPR.r[_Rt_].UL[0], 0);
|
||||
xMOV(ptr32[&microVU0.regs->VI[_Rd_].UL], gprF[1]);
|
||||
}
|
||||
else MOV32ItoM((uptr)&microVU0.regs->VI[_Rd_].UL, 0);
|
||||
else xMOV(ptr32[&microVU0.regs->VI[_Rd_].UL], 0);
|
||||
break;
|
||||
case REG_CMSAR1: // Execute VU1 Micro SubRoutine
|
||||
if (_Rt_) {
|
||||
MOV32MtoR(ECX, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]);
|
||||
xMOV(ecx, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
|
||||
}
|
||||
else XOR32RtoR(ECX,ECX);
|
||||
else xXOR(ecx, ecx);
|
||||
xCALL(vu1ExecMicro);
|
||||
break;
|
||||
case REG_FBRST:
|
||||
if (!_Rt_) {
|
||||
MOV32ItoM((uptr)&microVU0.regs->VI[REG_FBRST].UL, 0);
|
||||
xMOV(ptr32[&microVU0.regs->VI[REG_FBRST].UL], 0);
|
||||
return;
|
||||
}
|
||||
else MOV32MtoR(EAX, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]);
|
||||
else xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
|
||||
|
||||
TEST_FBRST_RESET(vu0ResetRegs, 0);
|
||||
TEST_FBRST_RESET(vu1ResetRegs, 1);
|
||||
|
||||
AND32ItoR(EAX, 0x0C0C);
|
||||
MOV32RtoM((uptr)&microVU0.regs->VI[REG_FBRST].UL, EAX);
|
||||
xAND(eax, 0x0C0C);
|
||||
xMOV(ptr32[&microVU0.regs->VI[REG_FBRST].UL], eax);
|
||||
break;
|
||||
default:
|
||||
// Executing vu0 block here fixes the intro of Ratchet and Clank
|
||||
|
@ -349,8 +349,8 @@ static void recQMFC2() {
|
|||
// FixMe: For some reason this line is needed or else games break:
|
||||
_eeOnWriteReg(_Rt_, 0);
|
||||
|
||||
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&microVU0.regs->VF[_Rd_].UL[0]);
|
||||
SSE_MOVAPS_XMM_to_M128((uptr)&cpuRegs.GPR.r[_Rt_].UL[0], xmmT1);
|
||||
xMOVAPS(xmmT1, ptr128[&microVU0.regs->VF[_Rd_]]);
|
||||
xMOVAPS(ptr128[&cpuRegs.GPR.r[_Rt_]], xmmT1);
|
||||
}
|
||||
|
||||
static void recQMTC2() {
|
||||
|
@ -360,8 +360,8 @@ static void recQMTC2() {
|
|||
if (!_Rd_) return;
|
||||
iFlushCall(FLUSH_EVERYTHING);
|
||||
|
||||
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]);
|
||||
SSE_MOVAPS_XMM_to_M128((uptr)&microVU0.regs->VF[_Rd_].UL[0], xmmT1);
|
||||
xMOVAPS(xmmT1, ptr128[&cpuRegs.GPR.r[_Rt_]]);
|
||||
xMOVAPS(ptr128[&microVU0.regs->VF[_Rd_]], xmmT1);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------
|
||||
|
|
|
@ -15,6 +15,8 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
using namespace x86Emitter;
|
||||
|
||||
//------------------------------------------------------------------
|
||||
// Global Variables
|
||||
//------------------------------------------------------------------
|
||||
|
@ -32,6 +34,9 @@ struct mVU_Globals {
|
|||
|
||||
extern const __aligned(32) mVU_Globals mVUglob;
|
||||
|
||||
typedef xRegisterSSE xmm;
|
||||
typedef xRegister32 x32;
|
||||
|
||||
//------------------------------------------------------------------
|
||||
// Helper Macros
|
||||
//------------------------------------------------------------------
|
||||
|
@ -87,23 +92,21 @@ extern const __aligned(32) mVU_Globals mVUglob;
|
|||
#define offsetSS ((_X) ? (0) : ((_Y) ? (4) : ((_Z) ? 8: 12)))
|
||||
#define offsetReg ((_X) ? (0) : ((_Y) ? (1) : ((_Z) ? 2: 3)))
|
||||
|
||||
#define xmmT1 0 // Used for regAlloc
|
||||
#define xmmT2 1 // Used for regAlloc
|
||||
#define xmmT3 2 // Used for regAlloc
|
||||
#define xmmT4 3 // Used for regAlloc
|
||||
#define xmmT5 4 // Used for regAlloc
|
||||
#define xmmT6 5 // Used for regAlloc
|
||||
#define xmmT7 6 // Used for regAlloc
|
||||
#define xmmPQ 7 // Holds the Value and Backup Values of P and Q regs
|
||||
const xmm
|
||||
xmmT1 = xmm(0), // Used for regAlloc
|
||||
xmmT2 = xmm(1), // Used for regAlloc
|
||||
xmmT3 = xmm(2), // Used for regAlloc
|
||||
xmmT4 = xmm(3), // Used for regAlloc
|
||||
xmmT5 = xmm(4), // Used for regAlloc
|
||||
xmmT6 = xmm(5), // Used for regAlloc
|
||||
xmmT7 = xmm(6), // Used for regAlloc
|
||||
xmmPQ = xmm(7); // Holds the Value and Backup Values of P and Q regs
|
||||
|
||||
#define gprT1 0 // Temp Reg
|
||||
#define gprT2 1 // Temp Reg
|
||||
#define gprT3 2 // Temp Reg
|
||||
#define gprF0 3 // Status Flag 0
|
||||
#define gprESP 4 // Don't use?
|
||||
#define gprF1 5 // Status Flag 1
|
||||
#define gprF2 6 // Status Flag 2
|
||||
#define gprF3 7 // Status Flag 3
|
||||
const x32
|
||||
gprT1 = x32(0), // eax - Temp Reg
|
||||
gprT2 = x32(1), // ecx - Temp Reg
|
||||
gprT3 = x32(2), // edx - Temp Reg
|
||||
gprF[4] = {x32(3), x32(5), x32(6), x32(7)}; // ebx, ebp, esi, edi - Status Flags
|
||||
|
||||
// Function Params
|
||||
#define mP microVU* mVU, int recPass
|
||||
|
@ -192,7 +195,7 @@ typedef u32 (__fastcall *mVUCall)(void*, void*);
|
|||
#define branchAddrN ((xPC + 16 + (_Imm11_ * 8)) & (mVU->microMemSize-8))
|
||||
#define shufflePQ (((mVU->p) ? 0xb0 : 0xe0) | ((mVU->q) ? 0x01 : 0x04))
|
||||
#define cmpOffset(x) ((u8*)&(((u8*)x)[it[0].start]))
|
||||
#define Rmem (uptr)&mVU->regs->VI[REG_R].UL
|
||||
#define Rmem &mVU->regs->VI[REG_R].UL
|
||||
#define aWrap(x, m) ((x > m) ? 0 : x)
|
||||
#define shuffleSS(x) ((x==1)?(0x27):((x==2)?(0xc6):((x==4)?(0xe1):(0xe4))))
|
||||
#define _1mb (0x100000)
|
||||
|
@ -295,8 +298,13 @@ typedef u32 (__fastcall *mVUCall)(void*, void*);
|
|||
|
||||
#define mVUdebugNOW(isEndPC) { \
|
||||
if (mVUdebugNow) { \
|
||||
MOV32ItoR(gprT2, xPC); \
|
||||
if (isEndPC) { CALLFunc((uptr)mVUprintPC2); } \
|
||||
else { CALLFunc((uptr)mVUprintPC1); } \
|
||||
xMOV(gprT2, xPC); \
|
||||
if (isEndPC) { xCALL(mVUprintPC2); } \
|
||||
else { xCALL(mVUprintPC1); } \
|
||||
} \
|
||||
}
|
||||
|
||||
void mVUmergeRegs(xmm dest, xmm src, int xyzw, bool modXYZW=false);
|
||||
void mVUsaveReg(xmm reg, xAddressVoid ptr, int xyzw, bool modXYZW);
|
||||
void mVUloadReg(xmm reg, xAddressVoid ptr, int xyzw);
|
||||
void mVUloadIreg(xmm reg, int xyzw, VURegs* vuRegs);
|
|
@ -19,247 +19,190 @@
|
|||
// Micro VU - Reg Loading/Saving/Shuffling/Unpacking/Merging...
|
||||
//------------------------------------------------------------------
|
||||
|
||||
// Emits a PSHUFD that replicates one 32-bit lane of srcreg into all four
// lanes of dstreg.
// Here xyzw is a lane index (0..3), not a component bitmask: the shuffle
// immediates 0x00 / 0x55 / 0xaa / 0xff select source lane 0 / 1 / 2 / 3 for
// every destination lane.
// Any other xyzw value matches no case, so no instruction is emitted.
void mVUunpack_xyzw(int dstreg, int srcreg, int xyzw) {
|
||||
void mVUunpack_xyzw(xmm dstreg, xmm srcreg, int xyzw)
|
||||
{
|
||||
switch ( xyzw ) {
|
||||
case 0: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x00); break;
|
||||
case 1: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x55); break;
|
||||
case 2: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xaa); break;
|
||||
case 3: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xff); break;
|
||||
case 0: xPSHUF.D(dstreg, srcreg, 0x00); break;
|
||||
case 1: xPSHUF.D(dstreg, srcreg, 0x55); break;
|
||||
case 2: xPSHUF.D(dstreg, srcreg, 0xaa); break;
|
||||
case 3: xPSHUF.D(dstreg, srcreg, 0xff); break;
|
||||
}
|
||||
}
|
||||
|
||||
// Emits a load from memory into the SSE register reg, selected by xyzw:
//   8 -> scalar 32-bit load of X from the base address
//   4 -> scalar 32-bit load of Y from base + 4
//   2 -> scalar 32-bit load of Z from base + 8
//   1 -> scalar 32-bit load of W from base + 12
//   anything else -> full 128-bit MOVAPS load from the base address
// For the single-component cases the value is loaded with a scalar MOVSS, so
// it is placed in the register's low lane rather than in its "natural" lane.
void mVUloadReg(int reg, uptr offset, int xyzw) {
|
||||
void mVUloadReg(xmm reg, xAddressVoid ptr, int xyzw)
|
||||
{
|
||||
switch( xyzw ) {
|
||||
case 8: SSE_MOVSS_M32_to_XMM(reg, offset); break; // X
|
||||
case 4: SSE_MOVSS_M32_to_XMM(reg, offset+4); break; // Y
|
||||
case 2: SSE_MOVSS_M32_to_XMM(reg, offset+8); break; // Z
|
||||
case 1: SSE_MOVSS_M32_to_XMM(reg, offset+12); break; // W
|
||||
default: SSE_MOVAPS_M128_to_XMM(reg, offset); break;
|
||||
case 8: xMOVSSZX(reg, ptr32[ptr]); break; // X
|
||||
case 4: xMOVSSZX(reg, ptr32[ptr+4]); break; // Y
|
||||
case 2: xMOVSSZX(reg, ptr32[ptr+8]); break; // Z
|
||||
case 1: xMOVSSZX(reg, ptr32[ptr+12]); break; // W
|
||||
default: xMOVAPS(reg, ptr128[ptr]); break;
|
||||
}
|
||||
}
|
||||
|
||||
void mVUloadReg2(int reg, int gprReg, uptr offset, int xyzw) {
|
||||
switch( xyzw ) {
|
||||
case 8: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset); break; // X
|
||||
case 4: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset+4); break; // Y
|
||||
case 2: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset+8); break; // Z
|
||||
case 1: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset+12); break; // W
|
||||
default: SSE_MOVAPSRmtoR(reg, gprReg, offset); break;
|
||||
}
|
||||
}
|
||||
|
||||
// Emits a scalar 32-bit load of vuRegs->VI[REG_I].UL into reg.
// Unless _XYZWss(xyzw) is true, a SHUFPS with immediate 0 follows, which
// copies the low lane into all four lanes of reg.
// NOTE(review): _XYZWss is defined outside this view; presumably it is true
// when xyzw selects exactly one component — confirm.
void mVUloadIreg(int reg, int xyzw, VURegs* vuRegs) {
|
||||
SSE_MOVSS_M32_to_XMM(reg, (uptr)&vuRegs->VI[REG_I].UL);
|
||||
if (!_XYZWss(xyzw)) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0);
|
||||
void mVUloadIreg(xmm reg, int xyzw, VURegs* vuRegs)
|
||||
{
|
||||
xMOVSSZX(reg, ptr32[&vuRegs->VI[REG_I].UL]);
|
||||
if (!_XYZWss(xyzw)) xSHUF.PS(reg, reg, 0);
|
||||
}
|
||||
|
||||
// Modifies the Source Reg!
|
||||
void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW) {
|
||||
/*SSE_MOVAPS_M128_to_XMM(xmmT2, offset);
|
||||
void mVUsaveReg(xmm reg, xAddressVoid ptr, int xyzw, bool modXYZW)
|
||||
{
|
||||
/*xMOVAPS(xmmT2, ptr128[ptr]);
|
||||
if (modXYZW && (xyzw == 8 || xyzw == 4 || xyzw == 2 || xyzw == 1)) {
|
||||
mVUunpack_xyzw<vuIndex>(reg, reg, 0);
|
||||
mVUunpack_xyzw(reg, reg, 0);
|
||||
}
|
||||
mVUmergeRegs(xmmT2, reg, xyzw);
|
||||
|
||||
SSE_MOVAPS_XMM_to_M128(offset, xmmT2);
|
||||
xMOVAPS(ptr128[ptr], xmmT2);
|
||||
return;*/
|
||||
|
||||
switch ( xyzw ) {
|
||||
case 5: if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
SSE4_EXTRACTPS_XMM_to_M32(offset+4, reg, 1);
|
||||
SSE4_EXTRACTPS_XMM_to_M32(offset+12, reg, 3);
|
||||
xEXTRACTPS(ptr32[ptr+4], reg, 1);
|
||||
xEXTRACTPS(ptr32[ptr+12], reg, 3);
|
||||
}
|
||||
else {
|
||||
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xe1); //WZXY
|
||||
SSE_MOVSS_XMM_to_M32(offset+4, reg);
|
||||
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW
|
||||
SSE_MOVSS_XMM_to_M32(offset+12, reg);
|
||||
xPSHUF.D(reg, reg, 0xe1); //WZXY
|
||||
xMOVSS(ptr32[ptr+4], reg);
|
||||
xPSHUF.D(reg, reg, 0xff); //WWWW
|
||||
xMOVSS(ptr32[ptr+12], reg);
|
||||
}
|
||||
break; // YW
|
||||
case 6: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xc9);
|
||||
SSE_MOVLPS_XMM_to_M64(offset+4, reg);
|
||||
case 6: xPSHUF.D(reg, reg, 0xc9);
|
||||
xMOVL.PS(ptr64[ptr+4], reg);
|
||||
break; // YZ
|
||||
case 7: if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
SSE_MOVHPS_XMM_to_M64(offset+8, reg);
|
||||
SSE4_EXTRACTPS_XMM_to_M32(offset+4, reg, 1);
|
||||
xMOVH.PS(ptr64[ptr+8], reg);
|
||||
xEXTRACTPS(ptr32[ptr+4], reg, 1);
|
||||
}
|
||||
else {
|
||||
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0x93); //ZYXW
|
||||
SSE_MOVHPS_XMM_to_M64(offset+4, reg);
|
||||
SSE_MOVSS_XMM_to_M32(offset+12, reg);
|
||||
xPSHUF.D(reg, reg, 0x93); //ZYXW
|
||||
xMOVH.PS(ptr64[ptr+4], reg);
|
||||
xMOVSS(ptr32[ptr+12], reg);
|
||||
}
|
||||
break; // YZW
|
||||
case 9: if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
SSE_MOVSS_XMM_to_M32(offset, reg);
|
||||
SSE4_EXTRACTPS_XMM_to_M32(offset+12, reg, 3);
|
||||
xMOVSS(ptr32[ptr], reg);
|
||||
xEXTRACTPS(ptr32[ptr+12], reg, 3);
|
||||
}
|
||||
else {
|
||||
SSE_MOVSS_XMM_to_M32(offset, reg);
|
||||
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW
|
||||
SSE_MOVSS_XMM_to_M32(offset+12, reg);
|
||||
xMOVSS(ptr32[ptr], reg);
|
||||
xPSHUF.D(reg, reg, 0xff); //WWWW
|
||||
xMOVSS(ptr32[ptr+12], reg);
|
||||
}
|
||||
break; // XW
|
||||
case 10: if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
SSE_MOVSS_XMM_to_M32(offset, reg);
|
||||
SSE4_EXTRACTPS_XMM_to_M32(offset+8, reg, 2);
|
||||
xMOVSS(ptr32[ptr], reg);
|
||||
xEXTRACTPS(ptr32[ptr+8], reg, 2);
|
||||
}
|
||||
else {
|
||||
SSE_MOVSS_XMM_to_M32(offset, reg);
|
||||
SSE_MOVHLPS_XMM_to_XMM(reg, reg);
|
||||
SSE_MOVSS_XMM_to_M32(offset+8, reg);
|
||||
xMOVSS(ptr32[ptr], reg);
|
||||
xMOVHL.PS(reg, reg);
|
||||
xMOVSS(ptr32[ptr+8], reg);
|
||||
}
|
||||
break; //XZ
|
||||
case 11: SSE_MOVSS_XMM_to_M32(offset, reg);
|
||||
SSE_MOVHPS_XMM_to_M64(offset+8, reg);
|
||||
case 11: xMOVSS(ptr32[ptr], reg);
|
||||
xMOVH.PS(ptr64[ptr+8], reg);
|
||||
break; //XZW
|
||||
case 13: if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
SSE_MOVLPS_XMM_to_M64(offset, reg);
|
||||
SSE4_EXTRACTPS_XMM_to_M32(offset+12, reg, 3);
|
||||
xMOVL.PS(ptr64[ptr], reg);
|
||||
xEXTRACTPS(ptr32[ptr+12], reg, 3);
|
||||
}
|
||||
else {
|
||||
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0x4b); //YXZW
|
||||
SSE_MOVHPS_XMM_to_M64(offset, reg);
|
||||
SSE_MOVSS_XMM_to_M32(offset+12, reg);
|
||||
xPSHUF.D(reg, reg, 0x4b); //YXZW
|
||||
xMOVH.PS(ptr64[ptr], reg);
|
||||
xMOVSS(ptr32[ptr+12], reg);
|
||||
}
|
||||
break; // XYW
|
||||
case 14: if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
SSE_MOVLPS_XMM_to_M64(offset, reg);
|
||||
SSE4_EXTRACTPS_XMM_to_M32(offset+8, reg, 2);
|
||||
xMOVL.PS(ptr64[ptr], reg);
|
||||
xEXTRACTPS(ptr32[ptr+8], reg, 2);
|
||||
}
|
||||
else {
|
||||
SSE_MOVLPS_XMM_to_M64(offset, reg);
|
||||
SSE_MOVHLPS_XMM_to_XMM(reg, reg);
|
||||
SSE_MOVSS_XMM_to_M32(offset+8, reg);
|
||||
xMOVL.PS(ptr64[ptr], reg);
|
||||
xMOVHL.PS(reg, reg);
|
||||
xMOVSS(ptr32[ptr+8], reg);
|
||||
}
|
||||
break; // XYZ
|
||||
case 4: if (!modXYZW) mVUunpack_xyzw(reg, reg, 1);
|
||||
SSE_MOVSS_XMM_to_M32(offset+4, reg);
|
||||
xMOVSS(ptr32[ptr+4], reg);
|
||||
break; // Y
|
||||
case 2: if (!modXYZW) mVUunpack_xyzw(reg, reg, 2);
|
||||
SSE_MOVSS_XMM_to_M32(offset+8, reg);
|
||||
xMOVSS(ptr32[ptr+8], reg);
|
||||
break; // Z
|
||||
case 1: if (!modXYZW) mVUunpack_xyzw(reg, reg, 3);
|
||||
SSE_MOVSS_XMM_to_M32(offset+12, reg);
|
||||
xMOVSS(ptr32[ptr+12], reg);
|
||||
break; // W
|
||||
case 8: SSE_MOVSS_XMM_to_M32(offset, reg); break; // X
|
||||
case 12: SSE_MOVLPS_XMM_to_M64(offset, reg); break; // XY
|
||||
case 3: SSE_MOVHPS_XMM_to_M64(offset+8, reg); break; // ZW
|
||||
default: SSE_MOVAPS_XMM_to_M128(offset, reg); break; // XYZW
|
||||
}
|
||||
}
|
||||
|
||||
// Modifies the Source Reg!
|
||||
void mVUsaveReg2(int reg, int gprReg, u32 offset, int xyzw) {
|
||||
/*SSE_MOVAPSRmtoR(xmmT2, gprReg, offset);
|
||||
if (xyzw == 8 || xyzw == 4 || xyzw == 2 || xyzw == 1) {
|
||||
mVUunpack_xyzw<vuIndex>(reg, reg, 0);
|
||||
}
|
||||
mVUmergeRegs(xmmT2, reg, xyzw);
|
||||
SSE_MOVAPSRtoRm(gprReg, xmmT2, offset);
|
||||
return;*/
|
||||
|
||||
switch ( xyzw ) {
|
||||
case 5: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xe1); //WZXY
|
||||
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+4);
|
||||
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW
|
||||
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12);
|
||||
break; // YW
|
||||
case 6: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xc9);
|
||||
SSE_MOVLPS_XMM_to_Rm(gprReg, reg, offset+4);
|
||||
break; // YZ
|
||||
case 7: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0x93); //ZYXW
|
||||
SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset+4);
|
||||
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12);
|
||||
break; // YZW
|
||||
case 9: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset);
|
||||
SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW
|
||||
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12);
|
||||
break; // XW
|
||||
case 10: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset);
|
||||
SSE_MOVHLPS_XMM_to_XMM(reg, reg);
|
||||
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+8);
|
||||
break; //XZ
|
||||
case 11: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset);
|
||||
SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset+8);
|
||||
break; //XZW
|
||||
case 13: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0x4b); //YXZW
|
||||
SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset);
|
||||
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12);
|
||||
break; // XYW
|
||||
case 14: SSE_MOVLPS_XMM_to_Rm(gprReg, reg, offset);
|
||||
SSE_MOVHLPS_XMM_to_XMM(reg, reg);
|
||||
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+8);
|
||||
break; // XYZ
|
||||
case 8: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset); break; // X
|
||||
case 4: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+4); break; // Y
|
||||
case 2: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+8); break; // Z
|
||||
case 1: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12); break; // W
|
||||
case 12: SSE_MOVLPS_XMM_to_Rm(gprReg, reg, offset); break; // XY
|
||||
case 3: SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset+8); break; // ZW
|
||||
default: SSE_MOVAPSRtoRm(gprReg, reg, offset); break; // XYZW
|
||||
case 8: xMOVSS(ptr32[ptr], reg); break; // X
|
||||
case 12: xMOVL.PS(ptr64[ptr], reg); break; // XY
|
||||
case 3: xMOVH.PS(ptr64[ptr+8], reg); break; // ZW
|
||||
default: xMOVAPS(ptr128[ptr], reg); break; // XYZW
|
||||
}
|
||||
}
|
||||
|
||||
// Modifies the Source Reg! (ToDo: Optimize modXYZW = 1 cases)
|
||||
void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0) {
|
||||
void mVUmergeRegs(xmm dest, xmm src, int xyzw, bool modXYZW)
|
||||
{
|
||||
xyzw &= 0xf;
|
||||
if ( (dest != src) && (xyzw != 0) ) {
|
||||
if (x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf)) {
|
||||
if (modXYZW) {
|
||||
if (xyzw == 1) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 3, 0)); return; }
|
||||
else if (xyzw == 2) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 2, 0)); return; }
|
||||
else if (xyzw == 4) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 1, 0)); return; }
|
||||
if (xyzw == 1) { xINSERTPS(dest, src, _MM_MK_INSERTPS_NDX(0, 3, 0)); return; }
|
||||
else if (xyzw == 2) { xINSERTPS(dest, src, _MM_MK_INSERTPS_NDX(0, 2, 0)); return; }
|
||||
else if (xyzw == 4) { xINSERTPS(dest, src, _MM_MK_INSERTPS_NDX(0, 1, 0)); return; }
|
||||
}
|
||||
xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
|
||||
SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw);
|
||||
xBLEND.PS(dest, src, xyzw);
|
||||
}
|
||||
else {
|
||||
switch (xyzw) {
|
||||
case 1: if (modXYZW) mVUunpack_xyzw(src, src, 0);
|
||||
SSE_MOVHLPS_XMM_to_XMM(src, dest); // src = Sw Sz Dw Dz
|
||||
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4); // 11 00 01 00
|
||||
xMOVHL.PS(src, dest); // src = Sw Sz Dw Dz
|
||||
xSHUF.PS(dest, src, 0xc4); // 11 00 01 00
|
||||
break;
|
||||
case 2: if (modXYZW) mVUunpack_xyzw(src, src, 0);
|
||||
SSE_MOVHLPS_XMM_to_XMM(src, dest);
|
||||
SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64);
|
||||
xMOVHL.PS(src, dest);
|
||||
xSHUF.PS(dest, src, 0x64);
|
||||
break;
|
||||
case 3: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
|
||||
case 3: xSHUF.PS(dest, src, 0xe4);
|
||||
break;
|
||||
case 4: if (modXYZW) mVUunpack_xyzw(src, src, 0);
|
||||
SSE_MOVSS_XMM_to_XMM(src, dest);
|
||||
SSE2_MOVSD_XMM_to_XMM(dest, src);
|
||||
xMOVSS(src, dest);
|
||||
xMOVSD(dest, src);
|
||||
break;
|
||||
case 5: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8);
|
||||
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0xd8);
|
||||
case 5: xSHUF.PS(dest, src, 0xd8);
|
||||
xPSHUF.D(dest, dest, 0xd8);
|
||||
break;
|
||||
case 6: SSE_SHUFPS_XMM_to_XMM(dest, src, 0x9c);
|
||||
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0x78);
|
||||
case 6: xSHUF.PS(dest, src, 0x9c);
|
||||
xPSHUF.D(dest, dest, 0x78);
|
||||
break;
|
||||
case 7: SSE_MOVSS_XMM_to_XMM(src, dest);
|
||||
SSE_MOVAPS_XMM_to_XMM(dest, src);
|
||||
case 7: xMOVSS(src, dest);
|
||||
xMOVAPS(dest, src);
|
||||
break;
|
||||
case 8: SSE_MOVSS_XMM_to_XMM(dest, src);
|
||||
case 8: xMOVSS(dest, src);
|
||||
break;
|
||||
case 9: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc9);
|
||||
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0xd2);
|
||||
case 9: xSHUF.PS(dest, src, 0xc9);
|
||||
xPSHUF.D(dest, dest, 0xd2);
|
||||
break;
|
||||
case 10: SSE_SHUFPS_XMM_to_XMM(dest, src, 0x8d);
|
||||
SSE2_PSHUFD_XMM_to_XMM(dest, dest, 0x72);
|
||||
case 10: xSHUF.PS(dest, src, 0x8d);
|
||||
xPSHUF.D(dest, dest, 0x72);
|
||||
break;
|
||||
case 11: SSE_MOVSS_XMM_to_XMM(dest, src);
|
||||
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
|
||||
case 11: xMOVSS(dest, src);
|
||||
xSHUF.PS(dest, src, 0xe4);
|
||||
break;
|
||||
case 12: SSE2_MOVSD_XMM_to_XMM(dest, src);
|
||||
case 12: xMOVSD(dest, src);
|
||||
break;
|
||||
case 13: SSE_MOVHLPS_XMM_to_XMM(dest, src);
|
||||
SSE_SHUFPS_XMM_to_XMM(src, dest, 0x64);
|
||||
SSE_MOVAPS_XMM_to_XMM(dest, src);
|
||||
case 13: xMOVHL.PS(dest, src);
|
||||
xSHUF.PS(src, dest, 0x64);
|
||||
xMOVAPS(dest, src);
|
||||
break;
|
||||
case 14: SSE_MOVHLPS_XMM_to_XMM(dest, src);
|
||||
SSE_SHUFPS_XMM_to_XMM(src, dest, 0xc4);
|
||||
SSE_MOVAPS_XMM_to_XMM(dest, src);
|
||||
case 14: xMOVHL.PS(dest, src);
|
||||
xSHUF.PS(src, dest, 0xc4);
|
||||
xMOVAPS(dest, src);
|
||||
break;
|
||||
default: SSE_MOVAPS_XMM_to_XMM(dest, src);
|
||||
default: xMOVAPS(dest, src);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -271,33 +214,35 @@ void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0) {
|
|||
//------------------------------------------------------------------
|
||||
|
||||
// Transforms the Address in gprReg to valid VU0/VU1 Address
|
||||
// Emits code that converts the index held in gprReg, in place, into a byte
// offset (index * 16).
//   isVU1:  index is masked with 0x3ff, then shifted left by 4.
//   else:   the index is compared against 0x400 at run time; values below it
//           (the JL branch) are masked with 0xff, values at or above it are
//           masked with 0x43f; either way the result is then shifted left by 4.
// The inline comments speak of 0x4000 because they refer to the address after
// the multiply by 16, while the compare is made on the pre-shift value 0x400.
_f void mVUaddrFix(mV, int gprReg) {
|
||||
_f void mVUaddrFix(mV, x32 gprReg)
|
||||
{
|
||||
if (isVU1) {
|
||||
AND32ItoR(gprReg, 0x3ff); // wrap around
|
||||
SHL32ItoR(gprReg, 4);
|
||||
xAND(gprReg, 0x3ff); // wrap around
|
||||
xSHL(gprReg, 4);
|
||||
}
|
||||
else {
|
||||
u8 *jmpA, *jmpB;
|
||||
CMP32ItoR(gprReg, 0x400);
|
||||
jmpA = JL8(0); // if addr >= 0x4000, reads VU1's VF regs and VI regs
|
||||
AND32ItoR(gprReg, 0x43f); // ToDo: theres a potential problem if VU0 overrides VU1's VF0/VI0 regs!
|
||||
jmpB = JMP8(0);
|
||||
x86SetJ8(jmpA);
|
||||
AND32ItoR(gprReg, 0xff); // if addr < 0x4000, wrap around
|
||||
x86SetJ8(jmpB);
|
||||
SHL32ItoR(gprReg, 4); // multiply by 16 (shift left by 4)
|
||||
xCMP(gprReg, 0x400);
|
||||
xForwardJL8 jmpA; // if addr >= 0x4000, reads VU1's VF regs and VI regs
|
||||
xAND(gprReg, 0x43f); // ToDo: theres a potential problem if VU0 overrides VU1's VF0/VI0 regs!
|
||||
xForwardJump8 jmpB;
|
||||
jmpA.SetTarget();
|
||||
xAND(gprReg, 0xff); // if addr < 0x4000, wrap around
|
||||
jmpB.SetTarget();
|
||||
xSHL(gprReg, 4); // multiply by 16 (shift left by 4)
|
||||
}
|
||||
}
|
||||
|
||||
// Backup Volatile Regs (EAX, ECX, EDX, MM0~7, XMM0~7, are all volatile according to 32bit Win/Linux ABI)
|
||||
// Flushes every allocated register through mVU->regAlloc->flushAll() (default
// arguments, so map entries are cleared as well), then emits a 128-bit store
// of xmmPQ to mVU->xmmPQb[0].
_f void mVUbackupRegs(microVU* mVU) {
|
||||
_f void mVUbackupRegs(microVU* mVU)
|
||||
{
|
||||
mVU->regAlloc->flushAll();
|
||||
SSE_MOVAPS_XMM_to_M128((uptr)&mVU->xmmPQb[0], xmmPQ);
|
||||
xMOVAPS(ptr128[&mVU->xmmPQb[0]], xmmPQ);
|
||||
}
|
||||
|
||||
// Restore Volatile Regs
|
||||
// Emits a 128-bit load that restores xmmPQ from mVU->xmmPQb[0], the slot
// written by mVUbackupRegs(). Nothing else is restored here.
_f void mVUrestoreRegs(microVU* mVU) {
|
||||
SSE_MOVAPS_M128_to_XMM(xmmPQ, (uptr)&mVU->xmmPQb[0]);
|
||||
_f void mVUrestoreRegs(microVU* mVU)
|
||||
{
|
||||
xMOVAPS(xmmPQ, ptr128[&mVU->xmmPQb[0]]);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------
|
||||
|
@ -314,121 +259,117 @@ static const __aligned16 SSEMaskPair MIN_MAX =
|
|||
|
||||
|
||||
// Warning: Modifies t1 and t2
|
||||
void MIN_MAX_PS(microVU* mVU, int to, int from, int t1, int t2, bool min) {
|
||||
bool t1b = 0, t2b = 0;
|
||||
if (t1 < 0) { t1 = mVU->regAlloc->allocReg(); t1b = 1; }
|
||||
if (t2 < 0) { t2 = mVU->regAlloc->allocReg(); t2b = 1; }
|
||||
|
||||
void MIN_MAX_PS(microVU* mVU, xmm to, xmm from, xmm t1in, xmm t2in, bool min)
|
||||
{
|
||||
xmm t1 = t1in.IsEmpty() ? mVU->regAlloc->allocReg() : t1in;
|
||||
xmm t2 = t2in.IsEmpty() ? mVU->regAlloc->allocReg() : t2in;
|
||||
// ZW
|
||||
SSE2_PSHUFD_XMM_to_XMM(t1, to, 0xfa);
|
||||
SSE2_PAND_M128_to_XMM (t1, (uptr)MIN_MAX.mask1);
|
||||
SSE2_POR_M128_to_XMM (t1, (uptr)MIN_MAX.mask2);
|
||||
SSE2_PSHUFD_XMM_to_XMM(t2, from, 0xfa);
|
||||
SSE2_PAND_M128_to_XMM (t2, (uptr)MIN_MAX.mask1);
|
||||
SSE2_POR_M128_to_XMM (t2, (uptr)MIN_MAX.mask2);
|
||||
if (min) SSE2_MINPD_XMM_to_XMM(t1, t2);
|
||||
else SSE2_MAXPD_XMM_to_XMM(t1, t2);
|
||||
xPSHUF.D(t1, to, 0xfa);
|
||||
xPAND (t1, ptr128[MIN_MAX.mask1]);
|
||||
xPOR (t1, ptr128[MIN_MAX.mask2]);
|
||||
xPSHUF.D(t2, from, 0xfa);
|
||||
xPAND (t2, ptr128[MIN_MAX.mask1]);
|
||||
xPOR (t2, ptr128[MIN_MAX.mask2]);
|
||||
if (min) xMIN.PD(t1, t2);
|
||||
else xMAX.PD(t1, t2);
|
||||
|
||||
// XY
|
||||
SSE2_PSHUFD_XMM_to_XMM(t2, from, 0x50);
|
||||
SSE2_PAND_M128_to_XMM (t2, (uptr)MIN_MAX.mask1);
|
||||
SSE2_POR_M128_to_XMM (t2, (uptr)MIN_MAX.mask2);
|
||||
SSE2_PSHUFD_XMM_to_XMM(to, to, 0x50);
|
||||
SSE2_PAND_M128_to_XMM (to, (uptr)MIN_MAX.mask1);
|
||||
SSE2_POR_M128_to_XMM (to, (uptr)MIN_MAX.mask2);
|
||||
if (min) SSE2_MINPD_XMM_to_XMM(to, t2);
|
||||
else SSE2_MAXPD_XMM_to_XMM(to, t2);
|
||||
xPSHUF.D(t2, from, 0x50);
|
||||
xPAND (t2, ptr128[MIN_MAX.mask1]);
|
||||
xPOR (t2, ptr128[MIN_MAX.mask2]);
|
||||
xPSHUF.D(to, to, 0x50);
|
||||
xPAND (to, ptr128[MIN_MAX.mask1]);
|
||||
xPOR (to, ptr128[MIN_MAX.mask2]);
|
||||
if (min) xMIN.PD(to, t2);
|
||||
else xMAX.PD(to, t2);
|
||||
|
||||
SSE_SHUFPS_XMM_to_XMM(to, t1, 0x88);
|
||||
if (t1b) mVU->regAlloc->clearNeeded(t1);
|
||||
if (t2b) mVU->regAlloc->clearNeeded(t2);
|
||||
xSHUF.PS(to, t1, 0x88);
|
||||
if (t1 != t1in) mVU->regAlloc->clearNeeded(t1);
|
||||
if (t2 != t2in) mVU->regAlloc->clearNeeded(t2);
|
||||
}
|
||||
|
||||
// Warning: Modifies to's upper 3 vectors, and t1
|
||||
void MIN_MAX_SS(mV, int to, int from, int t1, bool min) {
|
||||
bool t1b = 0;
|
||||
if (t1 < 0) { t1 = mVU->regAlloc->allocReg(); t1b = 1; }
|
||||
SSE_SHUFPS_XMM_to_XMM (to, from, 0);
|
||||
SSE2_PAND_M128_to_XMM (to, (uptr)MIN_MAX.mask1);
|
||||
SSE2_POR_M128_to_XMM (to, (uptr)MIN_MAX.mask2);
|
||||
SSE2_PSHUFD_XMM_to_XMM(t1, to, 0xee);
|
||||
if (min) SSE2_MINPD_XMM_to_XMM(to, t1);
|
||||
else SSE2_MAXPD_XMM_to_XMM(to, t1);
|
||||
if (t1b) mVU->regAlloc->clearNeeded(t1);
|
||||
void MIN_MAX_SS(mV, xmm to, xmm from, xmm t1in, bool min)
|
||||
{
|
||||
xmm t1 = t1in.IsEmpty() ? mVU->regAlloc->allocReg() : t1in;
|
||||
xSHUF.PS(to, from, 0);
|
||||
xPAND (to, ptr128[MIN_MAX.mask1]);
|
||||
xPOR (to, ptr128[MIN_MAX.mask2]);
|
||||
xPSHUF.D(t1, to, 0xee);
|
||||
if (min) xMIN.PD(to, t1);
|
||||
else xMAX.PD(to, t1);
|
||||
if (t1 != t1in) mVU->regAlloc->clearNeeded(t1);
|
||||
}
|
||||
|
||||
// Warning: Modifies all vectors in 'to' and 'from', and Modifies xmmT1 and xmmT2
|
||||
void ADD_SS(microVU* mVU, int to, int from, int t1, int t2) {
|
||||
void ADD_SS(microVU* mVU, xmm to, xmm from, xmm t1in, xmm t2in)
|
||||
{
|
||||
xmm t1 = t1in.IsEmpty() ? mVU->regAlloc->allocReg() : t1in;
|
||||
xmm t2 = t2in.IsEmpty() ? mVU->regAlloc->allocReg() : t2in;
|
||||
|
||||
u8 *localptr[8];
|
||||
bool t1b = 0, t2b = 0;
|
||||
if (t1 < 0) { t1 = mVU->regAlloc->allocReg(); t1b = 1; }
|
||||
if (t2 < 0) { t2 = mVU->regAlloc->allocReg(); t2b = 1; }
|
||||
xMOVAPS(t1, to);
|
||||
xMOVAPS(t2, from);
|
||||
xMOVD(ecx, to);
|
||||
xSHR(ecx, 23);
|
||||
xMOVD(eax, from);
|
||||
xSHR(eax, 23);
|
||||
xAND(ecx, 0xff);
|
||||
xAND(eax, 0xff);
|
||||
xSUB(ecx, eax); //ecx = exponent difference
|
||||
|
||||
SSE_MOVAPS_XMM_to_XMM(t1, to);
|
||||
SSE_MOVAPS_XMM_to_XMM(t2, from);
|
||||
SSE2_MOVD_XMM_to_R(gprT2, to);
|
||||
SHR32ItoR(gprT2, 23);
|
||||
SSE2_MOVD_XMM_to_R(gprT1, from);
|
||||
SHR32ItoR(gprT1, 23);
|
||||
AND32ItoR(gprT2, 0xff);
|
||||
AND32ItoR(gprT1, 0xff);
|
||||
SUB32RtoR(gprT2, gprT1); //gprT2 = exponent difference
|
||||
xCMP(ecx, 25);
|
||||
xForwardJGE8 case2;
|
||||
xCMP(ecx, 0);
|
||||
xForwardJG8 case3;
|
||||
xForwardJE8 toend1;
|
||||
xCMP(ecx, -25);
|
||||
xForwardJLE8 case4;
|
||||
|
||||
CMP32ItoR(gprT2, 25);
|
||||
localptr[0] = JGE8(0);
|
||||
CMP32ItoR(gprT2, 0);
|
||||
localptr[1] = JG8(0);
|
||||
localptr[2] = JE8(0);
|
||||
CMP32ItoR(gprT2, -25);
|
||||
localptr[3] = JLE8(0);
|
||||
NEG32R(gprT2);
|
||||
DEC32R(gprT2);
|
||||
MOV32ItoR(gprT1, 0xffffffff);
|
||||
SHL32CLtoR(gprT1);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(to, to);
|
||||
SSE2_MOVD_R_to_XMM(from, gprT1);
|
||||
SSE_MOVSS_XMM_to_XMM(to, from);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(from, from);
|
||||
localptr[4] = JMP8(0);
|
||||
// negative small
|
||||
xNOT(ecx); // -ecx - 1
|
||||
xMOV(eax, 0xffffffff);
|
||||
xSHL(eax, cl);
|
||||
xPCMP.EQB(to, to);
|
||||
xMOVDZX(from, eax);
|
||||
xMOVSS(to, from);
|
||||
xPCMP.EQB(from, from);
|
||||
xForwardJump8 toend2;
|
||||
|
||||
x86SetJ8(localptr[0]);
|
||||
MOV32ItoR(gprT1, 0x80000000);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(from, from);
|
||||
SSE2_MOVD_R_to_XMM(to, gprT1);
|
||||
SSE_MOVSS_XMM_to_XMM(from, to);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(to, to);
|
||||
localptr[5] = JMP8(0);
|
||||
case2.SetTarget(); // positive large
|
||||
xMOV(eax, 0x80000000);
|
||||
xPCMP.EQB(from, from);
|
||||
xMOVDZX(to, eax);
|
||||
xMOVSS(from, to);
|
||||
xPCMP.EQB(to, to);
|
||||
xForwardJump8 toend3;
|
||||
|
||||
x86SetJ8(localptr[1]);
|
||||
DEC32R(gprT2);
|
||||
MOV32ItoR(gprT1, 0xffffffff);
|
||||
SHL32CLtoR(gprT1);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(from, from);
|
||||
SSE2_MOVD_R_to_XMM(to, gprT1);
|
||||
SSE_MOVSS_XMM_to_XMM(from, to);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(to, to);
|
||||
localptr[6] = JMP8(0);
|
||||
case3.SetTarget(); // positive small
|
||||
xDEC(ecx);
|
||||
xMOV(eax, 0xffffffff);
|
||||
xSHL(eax, cl);
|
||||
xPCMP.EQB(from, from);
|
||||
xMOVDZX(to, eax);
|
||||
xMOVSS(from, to);
|
||||
xPCMP.EQB(to, to);
|
||||
xForwardJump8 toend4;
|
||||
|
||||
x86SetJ8(localptr[3]);
|
||||
MOV32ItoR(gprT1, 0x80000000);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(to, to);
|
||||
SSE2_MOVD_R_to_XMM(from, gprT1);
|
||||
SSE_MOVSS_XMM_to_XMM(to, from);
|
||||
SSE2_PCMPEQB_XMM_to_XMM(from, from);
|
||||
localptr[7] = JMP8(0);
|
||||
case4.SetTarget(); // negative large
|
||||
xMOV(eax, 0x80000000);
|
||||
xPCMP.EQB(to, to);
|
||||
xMOVDZX(from, eax);
|
||||
xMOVSS(to, from);
|
||||
xPCMP.EQB(from, from);
|
||||
|
||||
x86SetJ8(localptr[2]);
|
||||
x86SetJ8(localptr[4]);
|
||||
x86SetJ8(localptr[5]);
|
||||
x86SetJ8(localptr[6]);
|
||||
x86SetJ8(localptr[7]);
|
||||
toend1.SetTarget();
|
||||
toend2.SetTarget();
|
||||
toend3.SetTarget();
|
||||
toend4.SetTarget();
|
||||
|
||||
SSE_ANDPS_XMM_to_XMM(to, t1); // to contains mask
|
||||
SSE_ANDPS_XMM_to_XMM(from, t2); // from contains mask
|
||||
SSE_ADDSS_XMM_to_XMM(to, from);
|
||||
if (t1b) mVU->regAlloc->clearNeeded(t1);
|
||||
if (t2b) mVU->regAlloc->clearNeeded(t2);
|
||||
xAND.PS(to, t1); // to contains mask
|
||||
xAND.PS(from, t2); // from contains mask
|
||||
xADD.SS(to, from);
|
||||
if (t1 != t1in) mVU->regAlloc->clearNeeded(t1);
|
||||
if (t2 != t2in) mVU->regAlloc->clearNeeded(t2);
|
||||
}
|
||||
|
||||
#define clampOp(opX, isPS) { \
|
||||
|
@ -438,53 +379,68 @@ void ADD_SS(microVU* mVU, int to, int from, int t1, int t2) {
|
|||
mVUclamp4(to, t1, (isPS)?0xf:0x8); \
|
||||
}
|
||||
|
||||
void SSE_MAXPS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
if (CHECK_VU_MINMAXHACK) { SSE_MAXPS_XMM_to_XMM(to, from); }
|
||||
void SSE_MAXPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
if (CHECK_VU_MINMAXHACK) { xMAX.PS(to, from); }
|
||||
else { MIN_MAX_PS(mVU, to, from, t1, t2, 0); }
|
||||
}
|
||||
void SSE_MINPS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
if (CHECK_VU_MINMAXHACK) { SSE_MINPS_XMM_to_XMM(to, from); }
|
||||
void SSE_MINPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
if (CHECK_VU_MINMAXHACK) { xMIN.PS(to, from); }
|
||||
else { MIN_MAX_PS(mVU, to, from, t1, t2, 1); }
|
||||
}
|
||||
void SSE_MAXSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
if (CHECK_VU_MINMAXHACK) { SSE_MAXSS_XMM_to_XMM(to, from); }
|
||||
void SSE_MAXSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
if (CHECK_VU_MINMAXHACK) { xMAX.SS(to, from); }
|
||||
else { MIN_MAX_SS(mVU, to, from, t1, 0); }
|
||||
}
|
||||
void SSE_MINSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
if (CHECK_VU_MINMAXHACK) { SSE_MINSS_XMM_to_XMM(to, from); }
|
||||
void SSE_MINSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
if (CHECK_VU_MINMAXHACK) { xMIN.SS(to, from); }
|
||||
else { MIN_MAX_SS(mVU, to, from, t1, 1); }
|
||||
}
|
||||
void SSE_ADD2SS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
if (!CHECK_VUADDSUBHACK) { clampOp(SSE_ADDSS_XMM_to_XMM, 0); }
|
||||
void SSE_ADD2SS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
if (!CHECK_VUADDSUBHACK) { clampOp(xADD.SS, 0); }
|
||||
else { ADD_SS(mVU, to, from, t1, t2); }
|
||||
}
|
||||
|
||||
void SSE_ADD2PS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
clampOp(SSE_ADDPS_XMM_to_XMM, 1);
|
||||
// FIXME: why do we need two identical definitions with different names?
|
||||
void SSE_ADD2PS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
clampOp(xADD.PS, 1);
|
||||
}
|
||||
void SSE_ADDPS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
clampOp(SSE_ADDPS_XMM_to_XMM, 1);
|
||||
void SSE_ADDPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
clampOp(xADD.PS, 1);
|
||||
}
|
||||
void SSE_ADDSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
clampOp(SSE_ADDSS_XMM_to_XMM, 0);
|
||||
void SSE_ADDSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
clampOp(xADD.SS, 0);
|
||||
}
|
||||
void SSE_SUBPS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
clampOp(SSE_SUBPS_XMM_to_XMM, 1);
|
||||
void SSE_SUBPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
clampOp(xSUB.PS, 1);
|
||||
}
|
||||
void SSE_SUBSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
clampOp(SSE_SUBSS_XMM_to_XMM, 0);
|
||||
void SSE_SUBSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
clampOp(xSUB.SS, 0);
|
||||
}
|
||||
void SSE_MULPS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
clampOp(SSE_MULPS_XMM_to_XMM, 1);
|
||||
void SSE_MULPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
clampOp(xMUL.PS, 1);
|
||||
}
|
||||
void SSE_MULSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
clampOp(SSE_MULSS_XMM_to_XMM, 0);
|
||||
void SSE_MULSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
clampOp(xMUL.SS, 0);
|
||||
}
|
||||
void SSE_DIVPS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
clampOp(SSE_DIVPS_XMM_to_XMM, 1);
|
||||
void SSE_DIVPS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
clampOp(xDIV.PS, 1);
|
||||
}
|
||||
void SSE_DIVSS(mV, int to, int from, int t1 = -1, int t2 = -1) {
|
||||
clampOp(SSE_DIVSS_XMM_to_XMM, 0);
|
||||
void SSE_DIVSS(mV, xmm to, xmm from, xmm t1 = xEmptyReg, xmm t2 = xEmptyReg)
|
||||
{
|
||||
clampOp(xDIV.SS, 0);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------
|
||||
|
|
|
@ -21,54 +21,61 @@
|
|||
|
||||
#define AND_XYZW ((_XYZW_SS && modXYZW) ? (1) : (mFLAG.doFlag ? (_X_Y_Z_W) : (flipMask[_X_Y_Z_W])))
|
||||
#define ADD_XYZW ((_XYZW_SS && modXYZW) ? (_X ? 3 : (_Y ? 2 : (_Z ? 1 : 0))) : 0)
|
||||
#define SHIFT_XYZW(gprReg) { if (_XYZW_SS && modXYZW && !_W) { SHL32ItoR(gprReg, ADD_XYZW); } }
|
||||
#define SHIFT_XYZW(gprReg) { if (_XYZW_SS && modXYZW && !_W) { xSHL(gprReg, ADD_XYZW); } }
|
||||
|
||||
// Note: If modXYZW is true, then it adjusts XYZW for Single Scalar operations
|
||||
static void mVUupdateFlags(mV, int reg, int regT1 = -1, int regT2 = -1, bool modXYZW = 1) {
|
||||
int sReg, mReg = gprT1, regT1b = 0, regT2b = 0;
|
||||
static void mVUupdateFlags(mV, xmm reg, xmm regT1in = xEmptyReg, xmm regT2 = xEmptyReg, bool modXYZW = 1) {
|
||||
x32 mReg = gprT1;
|
||||
bool regT2b = false;
|
||||
static const u16 flipMask[16] = {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15};
|
||||
|
||||
//SysPrintf("Status = %d; Mac = %d\n", sFLAG.doFlag, mFLAG.doFlag);
|
||||
if (!sFLAG.doFlag && !mFLAG.doFlag) { return; }
|
||||
if ((mFLAG.doFlag && !(_XYZW_SS && modXYZW))) {
|
||||
if (regT2 < 0) { regT2 = mVU->regAlloc->allocReg(); regT2b = 1; }
|
||||
SSE2_PSHUFD_XMM_to_XMM(regT2, reg, 0x1B); // Flip wzyx to xyzw
|
||||
|
||||
xmm regT1 = regT1in.IsEmpty() ? mVU->regAlloc->allocReg() : regT1in;
|
||||
if ((mFLAG.doFlag && !(_XYZW_SS && modXYZW)))
|
||||
{
|
||||
if (regT2.IsEmpty())
|
||||
{
|
||||
regT2 = mVU->regAlloc->allocReg();
|
||||
regT2b = true;
|
||||
}
|
||||
xPSHUF.D(regT2, reg, 0x1B); // Flip wzyx to xyzw
|
||||
}
|
||||
else regT2 = reg;
|
||||
else
|
||||
regT2 = reg;
|
||||
if (sFLAG.doFlag) {
|
||||
getFlagReg(sReg, sFLAG.write); // Set sReg to valid GPR by Cur Flag Instance
|
||||
mVUallocSFLAGa(sReg, sFLAG.lastWrite); // Get Prev Status Flag
|
||||
if (sFLAG.doNonSticky) AND32ItoR(sReg, 0xfffc00ff); // Clear O,U,S,Z flags
|
||||
mVUallocSFLAGa(getFlagReg(sFLAG.write), sFLAG.lastWrite); // Get Prev Status Flag
|
||||
if (sFLAG.doNonSticky) xAND(getFlagReg(sFLAG.write), 0xfffc00ff); // Clear O,U,S,Z flags
|
||||
}
|
||||
if (regT1 < 0) { regT1 = mVU->regAlloc->allocReg(); regT1b = 1; }
|
||||
|
||||
//-------------------------Check for Signed flags------------------------------
|
||||
|
||||
SSE_MOVMSKPS_XMM_to_R32(mReg, regT2); // Move the Sign Bits of the t2reg
|
||||
SSE_XORPS_XMM_to_XMM (regT1, regT1); // Clear regT1
|
||||
SSE_CMPEQPS_XMM_to_XMM (regT1, regT2); // Set all F's if each vector is zero
|
||||
SSE_MOVMSKPS_XMM_to_R32(gprT2, regT1); // Used for Zero Flag Calculation
|
||||
xMOVMSKPS(mReg, regT2); // Move the Sign Bits of the t2reg
|
||||
xXOR.PS (regT1, regT1); // Clear regT1
|
||||
xCMPEQ.PS(regT1, regT2); // Set all F's if each vector is zero
|
||||
xMOVMSKPS(gprT2, regT1); // Used for Zero Flag Calculation
|
||||
|
||||
AND32ItoR(mReg, AND_XYZW); // Grab "Is Signed" bits from the previous calculation
|
||||
SHL32ItoR(mReg, 4 + ADD_XYZW);
|
||||
xAND(mReg, AND_XYZW); // Grab "Is Signed" bits from the previous calculation
|
||||
xSHL(mReg, 4 + ADD_XYZW);
|
||||
|
||||
//-------------------------Check for Zero flags------------------------------
|
||||
|
||||
AND32ItoR(gprT2, AND_XYZW); // Grab "Is Zero" bits from the previous calculation
|
||||
xAND(gprT2, AND_XYZW); // Grab "Is Zero" bits from the previous calculation
|
||||
if (mFLAG.doFlag) { SHIFT_XYZW(gprT2); }
|
||||
OR32RtoR(mReg, gprT2);
|
||||
xOR(mReg, gprT2);
|
||||
|
||||
//-------------------------Write back flags------------------------------
|
||||
|
||||
if (mFLAG.doFlag) mVUallocMFLAGb(mVU, mReg, mFLAG.write); // Set Mac Flag
|
||||
if (sFLAG.doFlag) {
|
||||
OR32RtoR (sReg, mReg);
|
||||
xOR(getFlagReg(sFLAG.write), mReg);
|
||||
if (sFLAG.doNonSticky) {
|
||||
SHL32ItoR(mReg, 8);
|
||||
OR32RtoR (sReg, mReg);
|
||||
xSHL(mReg, 8);
|
||||
xOR(getFlagReg(sFLAG.write), mReg);
|
||||
}
|
||||
}
|
||||
if (regT1b) mVU->regAlloc->clearNeeded(regT1);
|
||||
if (regT1 != regT1in) mVU->regAlloc->clearNeeded(regT1);
|
||||
if (regT2b) mVU->regAlloc->clearNeeded(regT2);
|
||||
}
|
||||
|
||||
|
@ -76,7 +83,7 @@ static void mVUupdateFlags(mV, int reg, int regT1 = -1, int regT2 = -1, bool mod
|
|||
// Helper Macros and Functions
|
||||
//------------------------------------------------------------------
|
||||
|
||||
static void (*SSE_PS[]) (microVU*, int, int, int, int) = {
|
||||
static void (*SSE_PS[]) (microVU*, xmm, xmm, xmm, xmm) = {
|
||||
SSE_ADDPS, // 0
|
||||
SSE_SUBPS, // 1
|
||||
SSE_MULPS, // 2
|
||||
|
@ -85,7 +92,7 @@ static void (*SSE_PS[]) (microVU*, int, int, int, int) = {
|
|||
SSE_ADD2PS // 5
|
||||
};
|
||||
|
||||
static void (*SSE_SS[]) (microVU*, int, int, int, int) = {
|
||||
static void (*SSE_SS[]) (microVU*, xmm, xmm, xmm, xmm) = {
|
||||
SSE_ADDSS, // 0
|
||||
SSE_SUBSS, // 1
|
||||
SSE_MULSS, // 2
|
||||
|
@ -122,9 +129,9 @@ void setupPass1(microVU* mVU, int opCase, bool isACC, bool noFlagUpdate) {
|
|||
bool doSafeSub(microVU* mVU, int opCase, int opType, bool isACC) {
|
||||
opCase1 {
|
||||
if ((opType == 1) && (_Ft_ == _Fs_)) {
|
||||
int Fs = mVU->regAlloc->allocReg(-1, isACC ? 32 : _Fd_, _X_Y_Z_W);
|
||||
SSE2_PXOR_XMM_to_XMM(Fs, Fs); // Set to Positive 0
|
||||
mVUupdateFlags(mVU, Fs, -1);
|
||||
xmm Fs = mVU->regAlloc->allocReg(-1, isACC ? 32 : _Fd_, _X_Y_Z_W);
|
||||
xPXOR(Fs, Fs); // Set to Positive 0
|
||||
mVUupdateFlags(mVU, Fs);
|
||||
mVU->regAlloc->clearNeeded(Fs);
|
||||
return 1;
|
||||
}
|
||||
|
@ -133,11 +140,11 @@ bool doSafeSub(microVU* mVU, int opCase, int opType, bool isACC) {
|
|||
}
|
||||
|
||||
// Sets Up Ft Reg for Normal, BC, I, and Q Cases
|
||||
void setupFtReg(microVU* mVU, int& Ft, int& tempFt, int opCase) {
|
||||
void setupFtReg(microVU* mVU, xmm& Ft, xmm& tempFt, int opCase) {
|
||||
opCase1 {
|
||||
if (_XYZW_SS2) { Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W); tempFt = Ft; }
|
||||
else if (clampE) { Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf); tempFt = Ft; }
|
||||
else { Ft = mVU->regAlloc->allocReg(_Ft_); tempFt = -1; }
|
||||
else { Ft = mVU->regAlloc->allocReg(_Ft_); tempFt = xEmptyReg; }
|
||||
}
|
||||
opCase2 {
|
||||
tempFt = mVU->regAlloc->allocReg(_Ft_);
|
||||
|
@ -148,7 +155,7 @@ void setupFtReg(microVU* mVU, int& Ft, int& tempFt, int opCase) {
|
|||
}
|
||||
opCase3 { Ft = mVU->regAlloc->allocReg(33, 0, _X_Y_Z_W); tempFt = Ft; }
|
||||
opCase4 {
|
||||
if (!clampE && _XYZW_SS && !mVUinfo.readQ) { Ft = xmmPQ; tempFt = -1; }
|
||||
if (!clampE && _XYZW_SS && !mVUinfo.readQ) { Ft = xmmPQ; tempFt = xEmptyReg; }
|
||||
else { Ft = mVU->regAlloc->allocReg(); tempFt = Ft; getQreg(Ft, mVUinfo.readQ); }
|
||||
}
|
||||
}
|
||||
|
@ -159,27 +166,27 @@ void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, co
|
|||
pass2 {
|
||||
if (doSafeSub(mVU, opCase, opType, isACC)) return;
|
||||
|
||||
int Fs, Ft, ACC, tempFt;
|
||||
xmm Fs, Ft, ACC, tempFt;
|
||||
setupFtReg(mVU, Ft, tempFt, opCase);
|
||||
|
||||
if (isACC) {
|
||||
Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
|
||||
ACC = mVU->regAlloc->allocReg((_X_Y_Z_W == 0xf) ? -1 : 32, 32, 0xf, 0);
|
||||
if (_XYZW_SS2) SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W));
|
||||
if (_XYZW_SS2) xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W));
|
||||
}
|
||||
else { Fs = mVU->regAlloc->allocReg(_Fs_, _Fd_, _X_Y_Z_W); }
|
||||
|
||||
if (clampType & cFt) mVUclamp2(mVU, Ft, -1, _X_Y_Z_W);
|
||||
if (clampType & cFs) mVUclamp2(mVU, Fs, -1, _X_Y_Z_W);
|
||||
if (clampType & cFt) mVUclamp2(mVU, Ft, xEmptyReg, _X_Y_Z_W);
|
||||
if (clampType & cFs) mVUclamp2(mVU, Fs, xEmptyReg, _X_Y_Z_W);
|
||||
|
||||
if (_XYZW_SS) SSE_SS[opType](mVU, Fs, Ft, -1, -1);
|
||||
else SSE_PS[opType](mVU, Fs, Ft, -1, -1);
|
||||
if (_XYZW_SS) SSE_SS[opType](mVU, Fs, Ft, xEmptyReg, xEmptyReg);
|
||||
else SSE_PS[opType](mVU, Fs, Ft, xEmptyReg, xEmptyReg);
|
||||
|
||||
if (isACC) {
|
||||
if (_XYZW_SS) SSE_MOVSS_XMM_to_XMM(ACC, Fs);
|
||||
if (_XYZW_SS) xMOVSS(ACC, Fs);
|
||||
else mVUmergeRegs(ACC, Fs, _X_Y_Z_W);
|
||||
mVUupdateFlags(mVU, ACC, Fs, tempFt);
|
||||
if (_XYZW_SS2) SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W));
|
||||
if (_XYZW_SS2) xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W));
|
||||
mVU->regAlloc->clearNeeded(ACC);
|
||||
}
|
||||
else mVUupdateFlags(mVU, Fs, tempFt);
|
||||
|
@ -195,30 +202,30 @@ void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, co
|
|||
void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, const char* opName, int clampType) {
|
||||
pass1 { setupPass1(mVU, opCase, 1, 0); }
|
||||
pass2 {
|
||||
int Fs, Ft, ACC, tempFt;
|
||||
xmm Fs, Ft, ACC, tempFt;
|
||||
setupFtReg(mVU, Ft, tempFt, opCase);
|
||||
|
||||
Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
|
||||
ACC = mVU->regAlloc->allocReg(32, 32, 0xf, 0);
|
||||
|
||||
if (_XYZW_SS2) { SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
|
||||
if (_XYZW_SS2) { xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
|
||||
|
||||
if (clampType & cFt) mVUclamp2(mVU, Ft, -1, _X_Y_Z_W);
|
||||
if (clampType & cFs) mVUclamp2(mVU, Fs, -1, _X_Y_Z_W);
|
||||
if (clampType & cFt) mVUclamp2(mVU, Ft, xEmptyReg, _X_Y_Z_W);
|
||||
if (clampType & cFs) mVUclamp2(mVU, Fs, xEmptyReg, _X_Y_Z_W);
|
||||
|
||||
if (_XYZW_SS) SSE_SS[2](mVU, Fs, Ft, -1, -1);
|
||||
else SSE_PS[2](mVU, Fs, Ft, -1, -1);
|
||||
if (_XYZW_SS) SSE_SS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg);
|
||||
else SSE_PS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg);
|
||||
|
||||
if (_XYZW_SS || _X_Y_Z_W == 0xf) {
|
||||
if (_XYZW_SS) SSE_SS[opType](mVU, ACC, Fs, tempFt, -1);
|
||||
else SSE_PS[opType](mVU, ACC, Fs, tempFt, -1);
|
||||
if (_XYZW_SS) SSE_SS[opType](mVU, ACC, Fs, tempFt, xEmptyReg);
|
||||
else SSE_PS[opType](mVU, ACC, Fs, tempFt, xEmptyReg);
|
||||
mVUupdateFlags(mVU, ACC, Fs, tempFt);
|
||||
if (_XYZW_SS && _X_Y_Z_W != 8) SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W));
|
||||
if (_XYZW_SS && _X_Y_Z_W != 8) xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W));
|
||||
}
|
||||
else {
|
||||
int tempACC = mVU->regAlloc->allocReg();
|
||||
SSE_MOVAPS_XMM_to_XMM(tempACC, ACC);
|
||||
SSE_PS[opType](mVU, tempACC, Fs, tempFt, -1);
|
||||
xmm tempACC = mVU->regAlloc->allocReg();
|
||||
xMOVAPS(tempACC, ACC);
|
||||
SSE_PS[opType](mVU, tempACC, Fs, tempFt, xEmptyReg);
|
||||
mVUmergeRegs(ACC, tempACC, _X_Y_Z_W);
|
||||
mVUupdateFlags(mVU, ACC, Fs, tempFt);
|
||||
mVU->regAlloc->clearNeeded(tempACC);
|
||||
|
@ -236,22 +243,22 @@ void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, const char* op
|
|||
void mVU_FMACc(microVU* mVU, int recPass, int opCase, const char* opName, int clampType) {
|
||||
pass1 { setupPass1(mVU, opCase, 0, 0); }
|
||||
pass2 {
|
||||
int Fs, Ft, ACC, tempFt;
|
||||
xmm Fs, Ft, ACC, tempFt;
|
||||
setupFtReg(mVU, Ft, tempFt, opCase);
|
||||
|
||||
ACC = mVU->regAlloc->allocReg(32);
|
||||
Fs = mVU->regAlloc->allocReg(_Fs_, _Fd_, _X_Y_Z_W);
|
||||
|
||||
if (_XYZW_SS2) { SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
|
||||
if (_XYZW_SS2) { xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
|
||||
|
||||
if (clampType & cFt) mVUclamp2(mVU, Ft, -1, _X_Y_Z_W);
|
||||
if (clampType & cFs) mVUclamp2(mVU, Fs, -1, _X_Y_Z_W);
|
||||
if (clampType & cACC) mVUclamp2(mVU, ACC, -1, _X_Y_Z_W);
|
||||
if (clampType & cFt) mVUclamp2(mVU, Ft, xEmptyReg, _X_Y_Z_W);
|
||||
if (clampType & cFs) mVUclamp2(mVU, Fs, xEmptyReg, _X_Y_Z_W);
|
||||
if (clampType & cACC) mVUclamp2(mVU, ACC, xEmptyReg, _X_Y_Z_W);
|
||||
|
||||
if (_XYZW_SS) { SSE_SS[2](mVU, Fs, Ft, -1, -1); SSE_SS[0](mVU, Fs, ACC, tempFt, -1); }
|
||||
else { SSE_PS[2](mVU, Fs, Ft, -1, -1); SSE_PS[0](mVU, Fs, ACC, tempFt, -1); }
|
||||
if (_XYZW_SS) { SSE_SS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg); SSE_SS[0](mVU, Fs, ACC, tempFt, xEmptyReg); }
|
||||
else { SSE_PS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg); SSE_PS[0](mVU, Fs, ACC, tempFt, xEmptyReg); }
|
||||
|
||||
if (_XYZW_SS2) { SSE2_PSHUFD_XMM_to_XMM(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
|
||||
if (_XYZW_SS2) { xPSHUF.D(ACC, ACC, shuffleSS(_X_Y_Z_W)); }
|
||||
|
||||
mVUupdateFlags(mVU, Fs, tempFt);
|
||||
|
||||
|
@ -267,18 +274,18 @@ void mVU_FMACc(microVU* mVU, int recPass, int opCase, const char* opName, int cl
|
|||
void mVU_FMACd(microVU* mVU, int recPass, int opCase, const char* opName, int clampType) {
|
||||
pass1 { setupPass1(mVU, opCase, 0, 0); }
|
||||
pass2 {
|
||||
int Fs, Ft, Fd, tempFt;
|
||||
xmm Fs, Ft, Fd, tempFt;
|
||||
setupFtReg(mVU, Ft, tempFt, opCase);
|
||||
|
||||
Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
|
||||
Fd = mVU->regAlloc->allocReg(32, _Fd_, _X_Y_Z_W);
|
||||
|
||||
if (clampType & cFt) mVUclamp2(mVU, Ft, -1, _X_Y_Z_W);
|
||||
if (clampType & cFs) mVUclamp2(mVU, Fs, -1, _X_Y_Z_W);
|
||||
if (clampType & cACC) mVUclamp2(mVU, Fd, -1, _X_Y_Z_W);
|
||||
if (clampType & cFt) mVUclamp2(mVU, Ft, xEmptyReg, _X_Y_Z_W);
|
||||
if (clampType & cFs) mVUclamp2(mVU, Fs, xEmptyReg, _X_Y_Z_W);
|
||||
if (clampType & cACC) mVUclamp2(mVU, Fd, xEmptyReg, _X_Y_Z_W);
|
||||
|
||||
if (_XYZW_SS) { SSE_SS[2](mVU, Fs, Ft, -1, -1); SSE_SS[1](mVU, Fd, Fs, tempFt, -1); }
|
||||
else { SSE_PS[2](mVU, Fs, Ft, -1, -1); SSE_PS[1](mVU, Fd, Fs, tempFt, -1); }
|
||||
if (_XYZW_SS) { SSE_SS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg); SSE_SS[1](mVU, Fd, Fs, tempFt, xEmptyReg); }
|
||||
else { SSE_PS[2](mVU, Fs, Ft, xEmptyReg, xEmptyReg); SSE_PS[1](mVU, Fd, Fs, tempFt, xEmptyReg); }
|
||||
|
||||
mVUupdateFlags(mVU, Fd, Fs, tempFt);
|
||||
|
||||
|
@ -295,8 +302,8 @@ mVUop(mVU_ABS) {
|
|||
pass1 { mVUanalyzeFMAC2(mVU, _Fs_, _Ft_); }
|
||||
pass2 {
|
||||
if (!_Ft_) return;
|
||||
int Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
|
||||
SSE_ANDPS_M128_to_XMM(Fs, (uptr)mVUglob.absclip);
|
||||
xmm Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
|
||||
xAND.PS(Fs, ptr128[mVUglob.absclip]);
|
||||
mVU->regAlloc->clearNeeded(Fs);
|
||||
}
|
||||
pass3 { mVUlog("ABS"); mVUlogFtFs(); }
|
||||
|
@ -306,11 +313,11 @@ mVUop(mVU_ABS) {
|
|||
mVUop(mVU_OPMULA) {
|
||||
pass1 { mVUanalyzeFMAC1(mVU, 0, _Fs_, _Ft_); }
|
||||
pass2 {
|
||||
int Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W);
|
||||
int Fs = mVU->regAlloc->allocReg(_Fs_, 32, _X_Y_Z_W);
|
||||
xmm Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W);
|
||||
xmm Fs = mVU->regAlloc->allocReg(_Fs_, 32, _X_Y_Z_W);
|
||||
|
||||
SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY
|
||||
SSE2_PSHUFD_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ
|
||||
xPSHUF.D(Fs, Fs, 0xC9); // WXZY
|
||||
xPSHUF.D(Ft, Ft, 0xD2); // WYXZ
|
||||
SSE_MULPS(mVU, Fs, Ft);
|
||||
mVU->regAlloc->clearNeeded(Ft);
|
||||
mVUupdateFlags(mVU, Fs);
|
||||
|
@ -324,12 +331,12 @@ mVUop(mVU_OPMULA) {
|
|||
mVUop(mVU_OPMSUB) {
|
||||
pass1 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, _Ft_); }
|
||||
pass2 {
|
||||
int Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf);
|
||||
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
|
||||
int ACC = mVU->regAlloc->allocReg(32, _Fd_, _X_Y_Z_W);
|
||||
xmm Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf);
|
||||
xmm Fs = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
|
||||
xmm ACC = mVU->regAlloc->allocReg(32, _Fd_, _X_Y_Z_W);
|
||||
|
||||
SSE2_PSHUFD_XMM_to_XMM(Fs, Fs, 0xC9); // WXZY
|
||||
SSE2_PSHUFD_XMM_to_XMM(Ft, Ft, 0xD2); // WYXZ
|
||||
xPSHUF.D(Fs, Fs, 0xC9); // WXZY
|
||||
xPSHUF.D(Ft, Ft, 0xD2); // WYXZ
|
||||
SSE_MULPS(mVU, Fs, Ft);
|
||||
SSE_SUBPS(mVU, ACC, Fs);
|
||||
mVU->regAlloc->clearNeeded(Fs);
|
||||
|
@ -343,24 +350,24 @@ mVUop(mVU_OPMSUB) {
|
|||
}
|
||||
|
||||
// FTOI0/FTIO4/FTIO12/FTIO15 Opcodes
|
||||
static void mVU_FTOIx(mP, uptr addr, const char* opName) {
|
||||
static void mVU_FTOIx(mP, const float (*addr)[4], const char* opName) {
|
||||
pass1 { mVUanalyzeFMAC2(mVU, _Fs_, _Ft_); }
|
||||
pass2 {
|
||||
if (!_Ft_) return;
|
||||
int Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
|
||||
int t1 = mVU->regAlloc->allocReg();
|
||||
int t2 = mVU->regAlloc->allocReg();
|
||||
xmm Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
|
||||
xmm t1 = mVU->regAlloc->allocReg();
|
||||
xmm t2 = mVU->regAlloc->allocReg();
|
||||
|
||||
// Note: For help understanding this algorithm see recVUMI_FTOI_Saturate()
|
||||
SSE_MOVAPS_XMM_to_XMM(t1, Fs);
|
||||
if (addr) { SSE_MULPS_M128_to_XMM(Fs, addr); }
|
||||
SSE2_CVTTPS2DQ_XMM_to_XMM(Fs, Fs);
|
||||
SSE2_PXOR_M128_to_XMM(t1, (uptr)mVUglob.signbit);
|
||||
SSE2_PSRAD_I8_to_XMM (t1, 31);
|
||||
SSE_MOVAPS_XMM_to_XMM(t2, Fs);
|
||||
SSE2_PCMPEQD_M128_to_XMM(t2, (uptr)mVUglob.signbit);
|
||||
SSE_ANDPS_XMM_to_XMM (t1, t2);
|
||||
SSE2_PADDD_XMM_to_XMM(Fs, t1);
|
||||
xMOVAPS(t1, Fs);
|
||||
if (addr) { xMUL.PS(Fs, ptr128[addr]); }
|
||||
xCVTTPS2DQ(Fs, Fs);
|
||||
xPXOR(t1, ptr128[mVUglob.signbit]);
|
||||
xPSRA.D(t1, 31);
|
||||
xMOVAPS(t2, Fs);
|
||||
xPCMP.EQD(t2, ptr128[mVUglob.signbit]);
|
||||
xAND.PS(t1, t2);
|
||||
xPADD.D(Fs, t1);
|
||||
|
||||
mVU->regAlloc->clearNeeded(Fs);
|
||||
mVU->regAlloc->clearNeeded(t1);
|
||||
|
@ -370,14 +377,14 @@ static void mVU_FTOIx(mP, uptr addr, const char* opName) {
|
|||
}
|
||||
|
||||
// ITOF0/ITOF4/ITOF12/ITOF15 Opcodes
|
||||
static void mVU_ITOFx(mP, uptr addr, const char* opName) {
|
||||
static void mVU_ITOFx(mP, const float (*addr)[4], const char* opName) {
|
||||
pass1 { mVUanalyzeFMAC2(mVU, _Fs_, _Ft_); }
|
||||
pass2 {
|
||||
if (!_Ft_) return;
|
||||
int Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
|
||||
xmm Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
|
||||
|
||||
SSE2_CVTDQ2PS_XMM_to_XMM(Fs, Fs);
|
||||
if (addr) { SSE_MULPS_M128_to_XMM(Fs, addr); }
|
||||
xCVTDQ2PS(Fs, Fs);
|
||||
if (addr) { xMUL.PS(Fs, ptr128[addr]); }
|
||||
//mVUclamp2(Fs, xmmT1, 15); // Clamp (not sure if this is needed)
|
||||
|
||||
mVU->regAlloc->clearNeeded(Fs);
|
||||
|
@ -389,34 +396,34 @@ static void mVU_ITOFx(mP, uptr addr, const char* opName) {
|
|||
mVUop(mVU_CLIP) {
|
||||
pass1 { mVUanalyzeFMAC4(mVU, _Fs_, _Ft_); }
|
||||
pass2 {
|
||||
int Fs = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
|
||||
int Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0x1);
|
||||
int t1 = mVU->regAlloc->allocReg();
|
||||
xmm Fs = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
|
||||
xmm Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0x1);
|
||||
xmm t1 = mVU->regAlloc->allocReg();
|
||||
|
||||
mVUunpack_xyzw(Ft, Ft, 0);
|
||||
mVUallocCFLAGa(mVU, gprT1, cFLAG.lastWrite);
|
||||
SHL32ItoR(gprT1, 6);
|
||||
xSHL(gprT1, 6);
|
||||
|
||||
SSE_ANDPS_M128_to_XMM(Ft, (uptr)mVUglob.absclip);
|
||||
SSE_MOVAPS_XMM_to_XMM(t1, Ft);
|
||||
SSE_ORPS_M128_to_XMM(t1, (uptr)mVUglob.signbit);
|
||||
xAND.PS(Ft, ptr128[&mVUglob.absclip[0]]);
|
||||
xMOVAPS(t1, Ft);
|
||||
xPOR(t1, ptr128[&mVUglob.signbit[0]]);
|
||||
|
||||
SSE_CMPNLEPS_XMM_to_XMM(t1, Fs); // -w, -z, -y, -x
|
||||
SSE_CMPLTPS_XMM_to_XMM(Ft, Fs); // +w, +z, +y, +x
|
||||
xCMPNLE.PS(t1, Fs); // -w, -z, -y, -x
|
||||
xCMPLT.PS(Ft, Fs); // +w, +z, +y, +x
|
||||
|
||||
SSE_MOVAPS_XMM_to_XMM(Fs, Ft); // Fs = +w, +z, +y, +x
|
||||
SSE_UNPCKLPS_XMM_to_XMM(Ft, t1); // Ft = -y,+y,-x,+x
|
||||
SSE_UNPCKHPS_XMM_to_XMM(Fs, t1); // Fs = -w,+w,-z,+z
|
||||
xMOVAPS(Fs, Ft); // Fs = +w, +z, +y, +x
|
||||
xUNPCK.LPS(Ft, t1); // Ft = -y,+y,-x,+x
|
||||
xUNPCK.HPS(Fs, t1); // Fs = -w,+w,-z,+z
|
||||
|
||||
SSE_MOVMSKPS_XMM_to_R32(gprT2, Fs); // -w,+w,-z,+z
|
||||
AND32ItoR(gprT2, 0x3);
|
||||
SHL32ItoR(gprT2, 4);
|
||||
OR32RtoR (gprT1, gprT2);
|
||||
xMOVMSKPS(gprT2, Fs); // -w,+w,-z,+z
|
||||
xAND(gprT2, 0x3);
|
||||
xSHL(gprT2, 4);
|
||||
xOR(gprT1, gprT2);
|
||||
|
||||
SSE_MOVMSKPS_XMM_to_R32(gprT2, Ft); // -y,+y,-x,+x
|
||||
AND32ItoR(gprT2, 0xf);
|
||||
OR32RtoR (gprT1, gprT2);
|
||||
AND32ItoR(gprT1, 0xffffff);
|
||||
xMOVMSKPS(gprT2, Ft); // -y,+y,-x,+x
|
||||
xAND(gprT2, 0xf);
|
||||
xOR(gprT1, gprT2);
|
||||
xAND(gprT1, 0xffffff);
|
||||
|
||||
mVUallocCFLAGb(mVU, gprT1, cFLAG.write);
|
||||
mVU->regAlloc->clearNeeded(Fs);
|
||||
|
@ -512,12 +519,12 @@ mVUop(mVU_MINIx) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIx", 0); }
|
|||
// MINI y/z/w opcode handlers: each one forwards to mVU_FMACa with the same
// arguments (mVU, recPass, 2, 4, 0, <name>, 0); only the name string differs.
// NOTE(review): the meaning of the numeric arguments is defined by mVU_FMACa,
// which is not visible in this part of the file -- confirm there before changing them.
mVUop(mVU_MINIy) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIy", 0); }
|
||||
mVUop(mVU_MINIz) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIz", 0); }
|
||||
mVUop(mVU_MINIw) { mVU_FMACa(mVU, recPass, 2, 4, 0, "MINIw", 0); }
|
||||
// FTOI* / ITOF* opcode handlers: each one forwards to mVU_FTOIx or mVU_ITOFx
// with mX, an address argument and the opcode's name string.
//  - The "0" variants pass a null address ((uptr)0 in the first group, NULL in
//    the second group).
//  - The 4 / 12 / 15 variants pass the matching mVUglob.FTOI_n / mVUglob.ITOF_n entry.
// NOTE(review): this text is a rendered diff. The first eight definitions (address
// cast to uptr) appear to be the pre-change lines and the second eight (plain
// pointer / NULL) their replacements -- only one group exists in the real source.
// NOTE(review): presumably the address selects a conversion-scale constant and the
// string is used for logging; both are decided inside mVU_FTOIx / mVU_ITOFx, which
// are not visible here -- confirm.
mVUop(mVU_FTOI0) { mVU_FTOIx(mX, (uptr)0, "FTOI0"); }
|
||||
mVUop(mVU_FTOI4) { mVU_FTOIx(mX, (uptr)mVUglob.FTOI_4, "FTOI4"); }
|
||||
mVUop(mVU_FTOI12) { mVU_FTOIx(mX, (uptr)mVUglob.FTOI_12, "FTOI12"); }
|
||||
mVUop(mVU_FTOI15) { mVU_FTOIx(mX, (uptr)mVUglob.FTOI_15, "FTOI15"); }
|
||||
mVUop(mVU_ITOF0) { mVU_ITOFx(mX, (uptr)0, "ITOF0"); }
|
||||
mVUop(mVU_ITOF4) { mVU_ITOFx(mX, (uptr)mVUglob.ITOF_4, "ITOF4"); }
|
||||
mVUop(mVU_ITOF12) { mVU_ITOFx(mX, (uptr)mVUglob.ITOF_12, "ITOF12"); }
|
||||
mVUop(mVU_ITOF15) { mVU_ITOFx(mX, (uptr)mVUglob.ITOF_15, "ITOF15"); }
|
||||
mVUop(mVU_FTOI0) { mVU_FTOIx(mX, NULL, "FTOI0"); }
|
||||
mVUop(mVU_FTOI4) { mVU_FTOIx(mX, &mVUglob.FTOI_4, "FTOI4"); }
|
||||
mVUop(mVU_FTOI12) { mVU_FTOIx(mX, &mVUglob.FTOI_12, "FTOI12"); }
|
||||
mVUop(mVU_FTOI15) { mVU_FTOIx(mX, &mVUglob.FTOI_15, "FTOI15"); }
|
||||
mVUop(mVU_ITOF0) { mVU_ITOFx(mX, NULL, "ITOF0"); }
|
||||
mVUop(mVU_ITOF4) { mVU_ITOFx(mX, &mVUglob.ITOF_4, "ITOF4"); }
|
||||
mVUop(mVU_ITOF12) { mVU_ITOFx(mX, &mVUglob.ITOF_12, "ITOF12"); }
|
||||
mVUop(mVU_ITOF15) { mVU_ITOFx(mX, &mVUglob.ITOF_15, "ITOF15"); }
|
||||
// NOP opcode handler: its only statement is the mVUlog("NOP") call guarded by
// pass3; nothing is written for any other pass guard.
mVUop(mVU_NOP) { pass3 { mVUlog("NOP"); } }
|
||||
|
|
|
@ -33,7 +33,7 @@ typedef void (__fastcall *nVifrecCall)(uptr dest, uptr src);
|
|||
#include "newVif_BlockBuffer.h"
|
||||
#include "newVif_HashBucket.h"
|
||||
|
||||
extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0);
|
||||
extern void mVUmergeRegs(xRegisterSSE dest, xRegisterSSE src, int xyzw, bool modXYZW = 0);
|
||||
extern void _nVifUnpack (int idx, u8 *data, u32 size, bool isFill);
|
||||
extern void dVifUnpack (int idx, u8 *data, u32 size, bool isFill);
|
||||
extern void dVifReset (int idx);
|
||||
|
|
|
@ -84,7 +84,7 @@ _f void VifUnpackSSE_Dynarec::SetMasks(int cS) const {
|
|||
|
||||
void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
|
||||
pxAssumeDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking.");
|
||||
int t = regX.Id ? 0 : 1; // Get Temp Reg
|
||||
xRegisterSSE t = regX == xmm0 ? xmm1 : xmm0; // Get Temp Reg
|
||||
int cc = aMin(vCL, 3);
|
||||
u32 m0 = (vB.mask >> (cc * 8)) & 0xff;
|
||||
u32 m1 = m0 & 0xaa;
|
||||
|
@ -95,18 +95,18 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
|
|||
makeMergeMask(m3);
|
||||
makeMergeMask(m4);
|
||||
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect
|
||||
if (doMask&&m2) { mergeVectors(regX.Id, xmmRow.Id, t, m2); } // Merge Row
|
||||
if (doMask&&m3) { mergeVectors(regX.Id, xmmCol0.Id+cc, t, m3); } // Merge Col
|
||||
if (doMask&&m4) { mergeVectors(regX.Id, xmmTemp.Id, t, m4); } // Merge Write Protect
|
||||
if (doMask&&m2) { mergeVectors(regX, xmmRow, t, m2); } // Merge Row
|
||||
if (doMask&&m3) { mergeVectors(regX, xRegisterSSE(xmmCol0.Id+cc), t, m3); } // Merge Col
|
||||
if (doMask&&m4) { mergeVectors(regX, xmmTemp, t, m4); } // Merge Write Protect
|
||||
if (doMode) {
|
||||
u32 m5 = (~m1>>1) & ~m0;
|
||||
if (!doMask) m5 = 0xf;
|
||||
else makeMergeMask(m5);
|
||||
if (m5 < 0xf) {
|
||||
xPXOR(xmmTemp, xmmTemp);
|
||||
mergeVectors(xmmTemp.Id, xmmRow.Id, t, m5);
|
||||
mergeVectors(xmmTemp, xmmRow, t, m5);
|
||||
xPADD.D(regX, xmmTemp);
|
||||
if (doMode==2) mergeVectors(xmmRow.Id, regX.Id, t, m5);
|
||||
if (doMode==2) mergeVectors(xmmRow, regX, t, m5);
|
||||
}
|
||||
else if (m5 == 0xf) {
|
||||
xPADD.D(regX, xmmRow);
|
||||
|
|
|
@ -25,13 +25,13 @@
|
|||
static __pagealigned u8 nVifUpkExec[__pagesize*4];
|
||||
|
||||
// Merges xmm vectors without modifying source reg
|
||||
// mergeVectors: merges `src` into `dest` under the lane mask `xyzw` by calling
// mVUmergeRegs, going through `temp` when the direct call is not used.
//   dest - register that receives the merged result
//   src  - register supplying the lanes to merge
//   temp - scratch register; overwritten with a copy of src on the fallback path
//   xyzw - lane mask forwarded unchanged to mVUmergeRegs
// NOTE(review): this text is a rendered diff. The `int` signature and the
// SSE_MOVAPS_XMM_to_XMM call appear to be the pre-change lines; the xRegisterSSE
// signature and the xMOVAPS call appear to be their replacements.
void mergeVectors(int dest, int src, int temp, int xyzw) {
|
||||
void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xyzw) {
|
||||
// Direct path: taken when x86caps.hasStreamingSIMD4Extensions is set, or when
// the mask is one of 15, 12, 11, 8 or 3.
// NOTE(review): presumably these are the cases in which mVUmergeRegs leaves its
// source operand untouched -- confirm against mVUmergeRegs.
if (x86caps.hasStreamingSIMD4Extensions || (xyzw==15)
|
||||
|| (xyzw==12) || (xyzw==11) || (xyzw==8) || (xyzw==3)) {
|
||||
mVUmergeRegs(dest, src, xyzw);
|
||||
}
|
||||
else {
|
||||
// Fallback path: copy src into temp and merge from the copy, so mVUmergeRegs
// is never handed src itself.
SSE_MOVAPS_XMM_to_XMM(temp, src);
|
||||
xMOVAPS(temp, src);
|
||||
mVUmergeRegs(dest, temp, xyzw);
|
||||
}
|
||||
}
|
||||
|
@ -48,9 +48,9 @@ void loadRowCol(nVifStruct& v) {
|
|||
xPSHUF.D(xmm1, xmm1, _v0);
|
||||
xPSHUF.D(xmm2, xmm2, _v0);
|
||||
xPSHUF.D(xmm6, xmm6, _v0);
|
||||
mVUmergeRegs(XMM6, XMM0, 8);
|
||||
mVUmergeRegs(XMM6, XMM1, 4);
|
||||
mVUmergeRegs(XMM6, XMM2, 2);
|
||||
mVUmergeRegs(xmm6, xmm0, 8);
|
||||
mVUmergeRegs(xmm6, xmm1, 4);
|
||||
mVUmergeRegs(xmm6, xmm2, 2);
|
||||
xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]);
|
||||
xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]);
|
||||
xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]);
|
||||
|
@ -221,13 +221,13 @@ void VifUnpackSSE_Base::xUPK_V4_5() const {
|
|||
xMOVAPS (destReg, workReg); // x|x|x|R
|
||||
xPSRL.D (workReg, 8); // ABG
|
||||
xPSLL.D (workReg, 3); // AB|G5.000
|
||||
mVUmergeRegs(destReg.Id, workReg.Id, 0x4); // x|x|G|R
|
||||
mVUmergeRegs(destReg, workReg, 0x4);// x|x|G|R
|
||||
xPSRL.D (workReg, 8); // AB
|
||||
xPSLL.D (workReg, 3); // A|B5.000
|
||||
mVUmergeRegs(destReg.Id, workReg.Id, 0x2); // x|B|G|R
|
||||
mVUmergeRegs(destReg, workReg, 0x2);// x|B|G|R
|
||||
xPSRL.D (workReg, 8); // A
|
||||
xPSLL.D (workReg, 7); // A.0000000
|
||||
mVUmergeRegs(destReg.Id, workReg.Id, 0x1); // A|B|G|R
|
||||
mVUmergeRegs(destReg, workReg, 0x1);// A|B|G|R
|
||||
xPSLL.D (destReg, 24); // can optimize to
|
||||
xPSRL.D (destReg, 24); // single AND...
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
|
||||
using namespace x86Emitter;
|
||||
|
||||
extern void mergeVectors(int dest, int src, int temp, int xyzw);
|
||||
extern void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xyzw);
|
||||
extern void loadRowCol(nVifStruct& v);
|
||||
|
||||
// --------------------------------------------------------------------------------------
|
||||
|
|
Loading…
Reference in New Issue