diff --git a/common/include/x86emitter/x86types.h b/common/include/x86emitter/x86types.h
index 951b0f21b7..1b0db38900 100644
--- a/common/include/x86emitter/x86types.h
+++ b/common/include/x86emitter/x86types.h
@@ -157,9 +157,12 @@ template< typename T > void xWrite( T val );
class ModSibBase;
extern void xSetPtr( void* ptr );
- extern u8* xGetPtr();
extern void xAlignPtr( uint bytes );
extern void xAdvancePtr( uint bytes );
+ extern void xAlignCallTarget();
+
+ extern u8* xGetPtr();
+ extern u8* xGetAlignedCallTarget();
extern JccComparisonType xInvertCond( JccComparisonType src );
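Taken together, the new declarations split "where is the write cursor" (xGetPtr) from "give me an aligned entry point" (xGetAlignedCallTarget, which may advance the cursor first). A minimal usage sketch against just these declarations; emitStubBody() is a hypothetical placeholder for real code generation, not a PCSX2 function:

    // Sketch only: assumes the x86emitter headers are included and 'buffer'
    // is writable/executable JIT memory.
    extern void emitStubBody();

    u8* emitTwoStubs( u8* buffer )
    {
        xSetPtr( buffer );                    // park the write cursor
        u8* stubA = xGetAlignedCallTarget();  // may pad forward before the entry point
        emitStubBody();                       // (placeholder) first stub body
        u8* stubB = xGetAlignedCallTarget();  // re-align before the next entry point
        emitStubBody();                       // (placeholder) second stub body
        (void)stubA; (void)stubB;             // callers would keep these as call targets
        return xGetPtr();                     // raw cursor: end of emitted code
    }

The iR5900-32.cpp hunks below apply exactly this pattern to the recompiler's dispatcher stubs.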
diff --git a/common/src/x86emitter/x86emitter.cpp b/common/src/x86emitter/x86emitter.cpp
index f540415b49..6de0c6a7e2 100644
--- a/common/src/x86emitter/x86emitter.cpp
+++ b/common/src/x86emitter/x86emitter.cpp
@@ -395,6 +395,32 @@ __emitinline void xAlignPtr( uint bytes )
x86Ptr = (u8*)( ( (uptr)x86Ptr + bytes - 1) & ~(bytes - 1) );
}
+// Performs best-case alignment for the target CPU, for use prior to starting a new
+// function. This is not meant to be used prior to jump targets, since it doesn't
+// add padding (additionally, the speed benefit from jump alignment is minimal, and
+// often a loss).
+__emitinline void xAlignCallTarget()
+{
+ // Core2/i7 CPUs prefer unaligned addresses. Checking for SSSE3 is a decent filter.
+ // (also align in debug modes for disasm convenience)
+
+ if( IsDebugBuild || !x86caps.hasSupplementalStreamingSIMD3Extensions )
+ {
+ // - P4s and earlier prefer 16 byte alignment.
+ // - AMD Athlons and Phenoms prefer 8 byte alignment, but I don't have an easy
+ // heuristic for it yet.
+ // - AMD Phenom IIs are unknown (either prefer 8 byte, or unaligned).
+
+ xAlignPtr( 16 );
+ }
+}
+
+__emitinline u8* xGetAlignedCallTarget()
+{
+ xAlignCallTarget();
+ return x86Ptr;
+}
+
__emitinline void xAdvancePtr( uint bytes )
{
if( IsDevBuild )
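For reference, the xAlignPtr line in the context above is the standard power-of-two round-up: add bytes-1, then mask off the low bits. A standalone illustration of that arithmetic, independent of the emitter:

    #include <cassert>
    #include <cstdint>

    // Same expression as xAlignPtr: valid only when 'align' is a power of two,
    // since ~(align - 1) must form a mask of the low bits.
    static std::uintptr_t alignUp( std::uintptr_t p, std::uintptr_t align )
    {
        return ( p + align - 1 ) & ~( align - 1 );
    }

    int main()
    {
        assert( alignUp( 0x1001, 16 ) == 0x1010 );  // bumped to the next boundary
        assert( alignUp( 0x1010, 16 ) == 0x1010 );  // already aligned: unchanged
        return 0;
    }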
diff --git a/pcsx2/Vif1Dma.cpp b/pcsx2/Vif1Dma.cpp
index 8c7df26cb3..873ffd8c33 100644
--- a/pcsx2/Vif1Dma.cpp
+++ b/pcsx2/Vif1Dma.cpp
@@ -58,6 +58,11 @@ __forceinline void vif1FLUSH()
void vif1Init()
{
+#ifdef newVif1
+ extern void initNewVif(int idx);
+ initNewVif(1);
+#endif
+
SetNewMask(g_vif1Masks, g_vif1HasMask3, 0, 0xffffffff);
}
@@ -313,19 +318,13 @@ static int __fastcall Vif1TransDirectHL(u32 *data)
return ret;
}
-#ifdef newVif1
- extern void initNewVif(int idx);
- extern int nVifUnpack(int idx, u32 *data);
- static int testVif = 0;
-#endif
static int __fastcall Vif1TransUnpack(u32 *data)
{
#ifdef newVif1
- if (!testVif) { initNewVif(1); testVif = 1; }
- //int temp = nVifUnpack(1, data);
- //if (temp >= 0) return temp;
+ extern int nVifUnpack(int idx, u32 *data);
return nVifUnpack(1, data);
#endif
+
XMMRegisters::Freeze();
if (vif1.vifpacketsize < vif1.tag.size)
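Functionally, this file's change moves initNewVif(1) out of the unpack hot path: the old code lazily initialized on the first Vif1TransUnpack call via the testVif flag, while the new code initializes once in vif1Init() and leaves the per-packet path branch-free. A reduced sketch of the two patterns; doInit and doUnpack are hypothetical stand-ins for initNewVif(1) and nVifUnpack(1, data):

    #include <cstdio>

    static void doInit()           { std::puts( "build unpack tables" ); }
    static int  doUnpack( int* d ) { return d ? 0 : -1; }

    // Before: every call through the hot path pays for the init check.
    static bool s_initialized = false;
    static int unpackLazy( int* data )
    {
        if( !s_initialized ) { doInit(); s_initialized = true; }
        return doUnpack( data );
    }

    // After: initialization runs once at subsystem startup (as in vif1Init),
    // leaving the hot path branch-free.
    static void subsystemInit()        { doInit(); }
    static int  unpackHot( int* data ) { return doUnpack( data ); }

    int main()
    {
        subsystemInit();
        int d = 0x1234;
        return unpackHot( &d ) | unpackLazy( &d );
    }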
diff --git a/pcsx2/VifDma_internal.h b/pcsx2/VifDma_internal.h
index 8c02fd7576..7ce8556ebf 100644
--- a/pcsx2/VifDma_internal.h
+++ b/pcsx2/VifDma_internal.h
@@ -60,7 +60,7 @@ static __forceinline u32 vif_size(u8 num)
return (num == 0) ? 0x1000 : 0x4000;
}
-#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
-#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined)
+//#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
+//#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined)
//#define newVif0 // Use New Code for Vif0 Unpacks (not implemented)
#endif
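Flipping these defines off makes the experimental path opt-in: with newVif1 undefined, the #ifdef blocks added to Vif1Dma.cpp compile away entirely and Vif1TransUnpack falls through to the old non-SSE code; when defined, the early return makes the old body dead code. A reduced sketch of that early-return gating (NEW_PATH and both transfer functions are hypothetical names, not PCSX2 symbols):

    //#define NEW_PATH   // uncomment to route calls through the experimental code

    static int oldTransfer( int* data ) { return data ? 0 : -1; }
    #ifdef NEW_PATH
    static int newTransfer( int* data ) { return data ? 1 : -1; }
    #endif

    static int transfer( int* data )
    {
    #ifdef NEW_PATH
        return newTransfer( data );  // early return: the code below never runs
    #endif
        return oldTransfer( data );
    }

    int main()
    {
        int d = 42;
        return transfer( &d );
    }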
diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp
index b1441f63f4..8bfc45bbf0 100644
--- a/pcsx2/x86/ix86-32/iR5900-32.cpp
+++ b/pcsx2/x86/ix86-32/iR5900-32.cpp
@@ -371,7 +371,7 @@ static DynGenFunc* _DynGen_JITCompile()
{
pxAssertMsg( DispatcherReg != NULL, "Please compile the DispatcherReg subroutine *before* JITCompile. Thanks." );
- u8* retval = xGetPtr();
+ u8* retval = xGetAlignedCallTarget();
_DynGen_StackFrameCheck();
xMOV( ecx, &cpuRegs.pc );
@@ -388,7 +388,7 @@ static DynGenFunc* _DynGen_JITCompile()
static DynGenFunc* _DynGen_JITCompileInBlock()
{
- u8* retval = xGetPtr();
+ u8* retval = xGetAlignedCallTarget();
xJMP( JITCompile );
return (DynGenFunc*)retval;
}
@@ -396,7 +396,7 @@ static DynGenFunc* _DynGen_JITCompileInBlock()
// called when jumping to variable pc address
static DynGenFunc* _DynGen_DispatcherReg()
{
- u8* retval = xGetPtr();
+ u8* retval = xGetPtr(); // fallthrough target, can't align it!
_DynGen_StackFrameCheck();
xMOV( eax, &cpuRegs.pc );
@@ -410,7 +410,7 @@ static DynGenFunc* _DynGen_DispatcherReg()
static DynGenFunc* _DynGen_EnterRecompiledCode()
{
- u8* retval = xGetPtr();
+ u8* retval = xGetAlignedCallTarget();
// "standard" frame pointer setup for aligned stack: Record the original
// esp into ebp, and then align esp. ebp references the original esp base
@@ -446,6 +446,8 @@ static DynGenFunc* _DynGen_EnterRecompiledCode()
xMOV( &s_store_ebp, ebp );
xJMP( ptr32[&DispatcherReg] );
+
+ xAlignCallTarget();
imm = (uptr)xGetPtr();
ExitRecompiledCode = (DynGenFunc*)xGetPtr();
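The distinction drawn in these hunks is deliberate: JITCompile, JITCompileInBlock, and EnterRecompiledCode are only ever reached by calls or jumps, so their entry points can be aligned, while DispatcherReg is reached by fallthrough (see the comment above), and ExitRecompiledCode is aligned only after an unconditional xJMP. Since xAlignCallTarget() merely advances the cursor via xAlignPtr without emitting NOPs, the skipped bytes keep whatever garbage the buffer held. A small standalone check of that advance-only property, mirroring xAlignPtr's arithmetic:

    #include <cstdint>

    int main()
    {
        unsigned char buf[64];
        for( unsigned char& b : buf ) b = 0xCC;   // simulate uninitialized JIT memory

        unsigned char* cursor = buf + 3;
        unsigned char* before = cursor;
        std::uintptr_t p = (std::uintptr_t)cursor;
        cursor = (unsigned char*)( ( p + 15 ) & ~(std::uintptr_t)15 );  // xAlignPtr(16)

        // No padding was emitted: the gap still holds 0xCC (int3 on x86), which
        // is exactly what a fallthrough into the aligned address would execute.
        for( unsigned char* q = before; q < cursor; ++q )
            if( *q != 0xCC ) return 1;
        return 0;
    }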
@@ -1254,7 +1256,7 @@ void recompileNextInstruction(int delayslot)
// _flushCachedRegs();
// g_cpuHasConstReg = 1;
- if (!delayslot && x86Ptr - recPtr > 0x1000)
+ if (!delayslot && (xGetPtr() - recPtr > 0x1000) )
s_nEndBlock = pc;
}
@@ -1335,9 +1337,8 @@ static void __fastcall recRecompile( const u32 startpc )
recResetEE();
}
- x86SetPtr( recPtr );
- x86Align(16);
- recPtr = x86Ptr;
+ xSetPtr( recPtr );
+ recPtr = xGetAlignedCallTarget();
s_nBlockFF = false;
if (HWADDR(startpc) == 0x81fc0)
@@ -1718,14 +1719,14 @@ StartRecomp:
}
}
- pxAssert( x86Ptr < recMem+REC_CACHEMEM );
+ pxAssert( xGetPtr() < recMem+REC_CACHEMEM );
pxAssert( recConstBufPtr < recConstBuf + RECCONSTBUF_SIZE );
pxAssert( x86FpuState == 0 );
- pxAssert(x86Ptr - recPtr < 0x10000);
- s_pCurBlockEx->x86size = x86Ptr - recPtr;
+ pxAssert(xGetPtr() - recPtr < 0x10000);
+ s_pCurBlockEx->x86size = xGetPtr() - recPtr;
- recPtr = x86Ptr;
+ recPtr = xGetPtr();
pxAssert( (g_cpuHasConstReg&g_cpuFlushedConstReg) == g_cpuHasConstReg );
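These recRecompile hunks complete the migration from the legacy globals (x86SetPtr / x86Align / direct x86Ptr reads) to the newer emitter API, folding the old set-then-align-then-read triple into xSetPtr + xGetAlignedCallTarget and routing every cursor read through xGetPtr(). The surrounding watermark logic is unchanged; a condensed sketch of that cursor discipline (the buffer size and emitSomething are illustrative, not PCSX2 code):

    #include <cassert>
    #include <cstddef>

    typedef unsigned char u8;

    static const std::size_t REC_SIZE = 1 << 20;  // illustrative, not REC_CACHEMEM
    static u8  recMemBuf[REC_SIZE];
    static u8* writePos = recMemBuf;              // stands in for the emitter cursor
    static u8* recPtr   = recMemBuf;              // start of the block being compiled

    static void emitSomething() { *writePos++ = 0x90; }  // placeholder: one byte of code

    int main()
    {
        // Block prologue in the diff: xSetPtr(recPtr); recPtr = xGetAlignedCallTarget();
        while( writePos - recPtr <= 0x1000 )  // recompileNextInstruction's block cutoff
            emitSomething();

        assert( writePos - recPtr < 0x10000 );    // the post-compile sanity bound
        assert( writePos < recMemBuf + REC_SIZE );
        recPtr = writePos;                        // next block starts where this one ended
        return 0;
    }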
diff --git a/pcsx2/x86/microVU_Analyze.inl b/pcsx2/x86/microVU_Analyze.inl
index ddb2133d97..52f76cadd6 100644
--- a/pcsx2/x86/microVU_Analyze.inl
+++ b/pcsx2/x86/microVU_Analyze.inl
@@ -1,441 +1,441 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-//------------------------------------------------------------------
-// Micro VU - Pass 1 Functions
-//------------------------------------------------------------------
-
-//------------------------------------------------------------------
-// Helper Macros
-//------------------------------------------------------------------
-
-#define aReg(x) mVUregs.VF[x]
-#define bReg(x, y) mVUregsTemp.VFreg[y] = x; mVUregsTemp.VF[y]
-#define aMax(x, y) ((x > y) ? x : y)
-#define aMin(x, y) ((x < y) ? x : y)
-
-// Read a VF reg
-#define analyzeReg1(xReg, vfRead) { \
- if (xReg) { \
- if (_X) { mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; } \
- if (_Y) { mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; } \
- if (_Z) { mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; } \
- if (_W) { mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; } \
- } \
-}
-
-// Write to a VF reg
-#define analyzeReg2(xReg, vfWrite, isLowOp) { \
- if (xReg) { \
- if (_X) { bReg(xReg, isLowOp).x = 4; vfWrite.reg = xReg; vfWrite.x = 4; } \
- if (_Y) { bReg(xReg, isLowOp).y = 4; vfWrite.reg = xReg; vfWrite.y = 4; } \
- if (_Z) { bReg(xReg, isLowOp).z = 4; vfWrite.reg = xReg; vfWrite.z = 4; } \
- if (_W) { bReg(xReg, isLowOp).w = 4; vfWrite.reg = xReg; vfWrite.w = 4; } \
- } \
-}
-
-// Read a VF reg (BC opcodes)
-#define analyzeReg3(xReg, vfRead) { \
- if (xReg) { \
- if (_bc_x) { mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; } \
- else if (_bc_y) { mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; } \
- else if (_bc_z) { mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; } \
- else { mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; } \
- } \
-}
-
-// For Clip Opcode
-#define analyzeReg4(xReg, vfRead) { \
- if (xReg) { \
- mVUstall = aMax(mVUstall, aReg(xReg).w); \
- vfRead.reg = xReg; vfRead.w = 1; \
- } \
-}
-
-// Read VF reg (FsF/FtF)
-#define analyzeReg5(xReg, fxf, vfRead) { \
- if (xReg) { \
- switch (fxf) { \
- case 0: mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; break; \
- case 1: mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; break; \
- case 2: mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; break; \
- case 3: mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; break; \
- } \
- } \
-}
-
-// Flips xyzw stalls to yzwx (MR32 Opcode)
-#define analyzeReg6(xReg, vfRead) { \
- if (xReg) { \
- if (_X) { mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; } \
- if (_Y) { mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; } \
- if (_Z) { mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; } \
- if (_W) { mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; } \
- } \
-}
-
-// Reading a VI reg
-#define analyzeVIreg1(xReg, viRead) { \
- if (xReg) { \
- mVUstall = aMax(mVUstall, mVUregs.VI[xReg]); \
- viRead.reg = xReg; viRead.used = 1; \
- } \
-}
-
-// Writing to a VI reg
-#define analyzeVIreg2(xReg, viWrite, aCycles) { \
- if (xReg) { \
- mVUconstReg[xReg].isValid = 0; \
- mVUregsTemp.VIreg = xReg; \
- mVUregsTemp.VI = aCycles; \
- viWrite.reg = xReg; \
- viWrite.used = aCycles; \
- } \
-}
-
-#define analyzeQreg(x) { mVUregsTemp.q = x; mVUstall = aMax(mVUstall, mVUregs.q); }
-#define analyzePreg(x) { mVUregsTemp.p = x; mVUstall = aMax(mVUstall, ((mVUregs.p) ? (mVUregs.p - 1) : 0)); }
-#define analyzeRreg() { mVUregsTemp.r = 1; }
-#define analyzeXGkick1() { mVUstall = aMax(mVUstall, mVUregs.xgkick); }
-#define analyzeXGkick2(x) { mVUregsTemp.xgkick = x; }
-#define setConstReg(x, v) { if (x) { mVUconstReg[x].isValid = 1; mVUconstReg[x].regValue = v; } }
-
-//------------------------------------------------------------------
-// FMAC1 - Normal FMAC Opcodes
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeFMAC1(mV, int Fd, int Fs, int Ft) {
- sFLAG.doFlag = 1;
- analyzeReg1(Fs, mVUup.VF_read[0]);
- analyzeReg1(Ft, mVUup.VF_read[1]);
- analyzeReg2(Fd, mVUup.VF_write, 0);
-}
-
-//------------------------------------------------------------------
-// FMAC2 - ABS/FTOI/ITOF Opcodes
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeFMAC2(mV, int Fs, int Ft) {
- analyzeReg1(Fs, mVUup.VF_read[0]);
- analyzeReg2(Ft, mVUup.VF_write, 0);
-}
-
-//------------------------------------------------------------------
-// FMAC3 - BC(xyzw) FMAC Opcodes
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeFMAC3(mV, int Fd, int Fs, int Ft) {
- sFLAG.doFlag = 1;
- analyzeReg1(Fs, mVUup.VF_read[0]);
- analyzeReg3(Ft, mVUup.VF_read[1]);
- analyzeReg2(Fd, mVUup.VF_write, 0);
-}
-
-//------------------------------------------------------------------
-// FMAC4 - Clip FMAC Opcode
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeFMAC4(mV, int Fs, int Ft) {
- cFLAG.doFlag = 1;
- analyzeReg1(Fs, mVUup.VF_read[0]);
- analyzeReg4(Ft, mVUup.VF_read[1]);
-}
-
-//------------------------------------------------------------------
-// IALU - IALU Opcodes
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeIALU1(mV, int Id, int Is, int It) {
- if (!Id) { mVUlow.isNOP = 1; }
- analyzeVIreg1(Is, mVUlow.VI_read[0]);
- analyzeVIreg1(It, mVUlow.VI_read[1]);
- analyzeVIreg2(Id, mVUlow.VI_write, 1);
-}
-
-microVUt(void) mVUanalyzeIALU2(mV, int Is, int It) {
- if (!It) { mVUlow.isNOP = 1; }
- analyzeVIreg1(Is, mVUlow.VI_read[0]);
- analyzeVIreg2(It, mVUlow.VI_write, 1);
-}
-
-microVUt(void) mVUanalyzeIADDI(mV, int Is, int It, s16 imm) {
- mVUanalyzeIALU2(mVU, Is, It);
- if (!Is) { setConstReg(It, imm); }
-}
-
-//------------------------------------------------------------------
-// MR32 - MR32 Opcode
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeMR32(mV, int Fs, int Ft) {
- if (!Ft) { mVUlow.isNOP = 1; }
- analyzeReg6(Fs, mVUlow.VF_read[0]);
- analyzeReg2(Ft, mVUlow.VF_write, 1);
-}
-
-//------------------------------------------------------------------
-// FDIV - DIV/SQRT/RSQRT Opcodes
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeFDIV(mV, int Fs, int Fsf, int Ft, int Ftf, u8 xCycles) {
- mVUprint("microVU: DIV Opcode");
- analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]);
- analyzeReg5(Ft, Ftf, mVUlow.VF_read[1]);
- analyzeQreg(xCycles);
-}
-
-//------------------------------------------------------------------
-// EFU - EFU Opcodes
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeEFU1(mV, int Fs, int Fsf, u8 xCycles) {
- mVUprint("microVU: EFU Opcode");
- analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]);
- analyzePreg(xCycles);
-}
-
-microVUt(void) mVUanalyzeEFU2(mV, int Fs, u8 xCycles) {
- mVUprint("microVU: EFU Opcode");
- analyzeReg1(Fs, mVUlow.VF_read[0]);
- analyzePreg(xCycles);
-}
-
-//------------------------------------------------------------------
-// MFP - MFP Opcode
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeMFP(mV, int Ft) {
- if (!Ft) { mVUlow.isNOP = 1; }
- analyzeReg2(Ft, mVUlow.VF_write, 1);
-}
-
-//------------------------------------------------------------------
-// MOVE - MOVE Opcode
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeMOVE(mV, int Fs, int Ft) {
- if (!Ft || (Ft == Fs)) { mVUlow.isNOP = 1; }
- analyzeReg1(Fs, mVUlow.VF_read[0]);
- analyzeReg2(Ft, mVUlow.VF_write, 1);
-}
-
-//------------------------------------------------------------------
-// LQx - LQ/LQD/LQI Opcodes
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeLQ(mV, int Ft, int Is, bool writeIs) {
- analyzeVIreg1(Is, mVUlow.VI_read[0]);
- analyzeReg2 (Ft, mVUlow.VF_write, 1);
- if (!Ft) { if (writeIs && Is) { mVUlow.noWriteVF = 1; } else { mVUlow.isNOP = 1; } }
- if (writeIs) { analyzeVIreg2(Is, mVUlow.VI_write, 1); }
-}
-
-//------------------------------------------------------------------
-// SQx - SQ/SQD/SQI Opcodes
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeSQ(mV, int Fs, int It, bool writeIt) {
- analyzeReg1 (Fs, mVUlow.VF_read[0]);
- analyzeVIreg1(It, mVUlow.VI_read[0]);
- if (writeIt) { analyzeVIreg2(It, mVUlow.VI_write, 1); }
-}
-
-//------------------------------------------------------------------
-// R*** - R Reg Opcodes
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeR1(mV, int Fs, int Fsf) {
- analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]);
- analyzeRreg();
-}
-
-microVUt(void) mVUanalyzeR2(mV, int Ft, bool canBeNOP) {
- if (!Ft) { if (canBeNOP) { mVUlow.isNOP = 1; } else { mVUlow.noWriteVF = 1; } }
- analyzeReg2(Ft, mVUlow.VF_write, 1);
- analyzeRreg();
-}
-
-//------------------------------------------------------------------
-// Sflag - Status Flag Opcodes
-//------------------------------------------------------------------
-microVUt(void) flagSet(mV, bool setMacFlag) {
- int curPC = iPC;
- for (int i = mVUcount, j = 0; i > 0; i--, j++) {
- j += mVUstall;
- incPC2(-2);
- if (sFLAG.doFlag && (j >= 3)) {
- if (setMacFlag) { mFLAG.doFlag = 1; }
- else { sFLAG.doNonSticky = 1; }
- break;
- }
- }
- iPC = curPC;
-}
-
-microVUt(void) mVUanalyzeSflag(mV, int It) {
- mVUlow.readFlags = 1;
- analyzeVIreg2(It, mVUlow.VI_write, 1);
- if (!It) { mVUlow.isNOP = 1; }
- else {
- mVUsFlagHack = 0; // Don't Optimize Out Status Flags for this block
- mVUinfo.swapOps = 1;
- flagSet(mVU, 0);
- if (mVUcount < 4) { mVUpBlock->pState.needExactMatch |= 1; }
- }
-}
-
-microVUt(void) mVUanalyzeFSSET(mV) {
- mVUlow.isFSSET = 1;
- mVUlow.readFlags = 1;
-}
-
-//------------------------------------------------------------------
-// Mflag - Mac Flag Opcodes
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeMflag(mV, int Is, int It) {
- mVUlow.readFlags = 1;
- analyzeVIreg1(Is, mVUlow.VI_read[0]);
- analyzeVIreg2(It, mVUlow.VI_write, 1);
- if (!It) { mVUlow.isNOP = 1; }
- else {
- mVUinfo.swapOps = 1;
- flagSet(mVU, 1);
- if (mVUcount < 4) { mVUpBlock->pState.needExactMatch |= 2; }
- }
-}
-
-//------------------------------------------------------------------
-// Cflag - Clip Flag Opcodes
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeCflag(mV, int It) {
- mVUinfo.swapOps = 1;
- mVUlow.readFlags = 1;
- if (mVUcount < 4) { mVUpBlock->pState.needExactMatch |= 4; }
- analyzeVIreg2(It, mVUlow.VI_write, 1);
-}
-
-//------------------------------------------------------------------
-// XGkick
-//------------------------------------------------------------------
-
-microVUt(void) mVUanalyzeXGkick(mV, int Fs, int xCycles) {
- analyzeVIreg1(Fs, mVUlow.VI_read[0]);
- analyzeXGkick1();
- analyzeXGkick2(xCycles);
- // Note: Technically XGKICK should stall on the next instruction, but
- // this code stalls on the same instruction. The only case where this
- // is a problem is if you have very specifically placed FMxxx or FSxxx
- // opcodes checking flags near this instruction AND the XGKICK
- // instruction stalls. No game should be affected by this minor
- // difference.
-}
-
-//------------------------------------------------------------------
-// Branches - Branch Opcodes
-//------------------------------------------------------------------
-
-microVUt(void) analyzeBranchVI(mV, int xReg, bool &infoVar) {
- if (!xReg) return;
- int i;
- int iEnd = aMin(5, (mVUcount+1));
- int bPC = iPC;
- incPC2(-2);
- for (i = 0; i < iEnd; i++) {
- if ((i == mVUcount) && (i < 5)) {
- if (mVUpBlock->pState.viBackUp == xReg) {
- infoVar = 1;
- i++;
- }
- break;
- }
- if ((mVUlow.VI_write.reg == xReg) && mVUlow.VI_write.used) {
- if (mVUlow.readFlags || i == 5) break;
- if (i == 0) { incPC2(-2); continue; }
- if (((mVUlow.VI_read[0].reg == xReg) && (mVUlow.VI_read[0].used))
- || ((mVUlow.VI_read[1].reg == xReg) && (mVUlow.VI_read[1].used)))
- { incPC2(-2); continue; }
- }
- break;
- }
- if (i) {
- if (!infoVar) {
- incPC2(2);
- mVUlow.backupVI = 1;
- infoVar = 1;
- }
- iPC = bPC;
- Console.WriteLn( Color_Green, "microVU%d: Branch VI-Delay (%d) [%04x]", getIndex, i, xPC);
- }
- else iPC = bPC;
-}
-
-// Branch in Branch Delay-Slots
-microVUt(int) mVUbranchCheck(mV) {
- if (!mVUcount) return 0;
- incPC(-2);
- if (mVUlow.branch) {
- mVUlow.badBranch = 1;
- incPC(2);
- mVUlow.evilBranch = 1;
- mVUregs.blockType = 2;
- Console.Warning("microVU%d Warning: Branch in Branch delay slot! [%04x]", mVU->index, xPC);
- return 1;
- }
- incPC(2);
- return 0;
-}
-
-microVUt(void) mVUanalyzeCondBranch1(mV, int Is) {
- analyzeVIreg1(Is, mVUlow.VI_read[0]);
- if (!mVUstall && !mVUbranchCheck(mVU)) {
- analyzeBranchVI(mVU, Is, mVUlow.memReadIs);
- }
-}
-
-microVUt(void) mVUanalyzeCondBranch2(mV, int Is, int It) {
- analyzeVIreg1(Is, mVUlow.VI_read[0]);
- analyzeVIreg1(It, mVUlow.VI_read[1]);
- if (!mVUstall && !mVUbranchCheck(mVU)) {
- analyzeBranchVI(mVU, Is, mVUlow.memReadIs);
- analyzeBranchVI(mVU, It, mVUlow.memReadIt);
- }
-}
-
-microVUt(void) mVUanalyzeNormBranch(mV, int It, bool isBAL) {
- mVUbranchCheck(mVU);
- if (isBAL) {
- analyzeVIreg2(It, mVUlow.VI_write, 1);
- setConstReg(It, bSaveAddr);
- }
-}
-
-microVUt(void) mVUanalyzeJump(mV, int Is, int It, bool isJALR) {
- mVUbranchCheck(mVU);
- mVUlow.branch = (isJALR) ? 10 : 9;
- if (mVUconstReg[Is].isValid && !CHECK_VU_CONSTHACK) {
- mVUlow.constJump.isValid = 1;
- mVUlow.constJump.regValue = mVUconstReg[Is].regValue;
- //DevCon.Status("microVU%d: Constant JR/JALR Address Optimization", mVU->index);
- }
- analyzeVIreg1(Is, mVUlow.VI_read[0]);
- if (isJALR) {
- analyzeVIreg2(It, mVUlow.VI_write, 1);
- setConstReg(It, bSaveAddr);
- }
-}
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+//------------------------------------------------------------------
+// Micro VU - Pass 1 Functions
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// Helper Macros
+//------------------------------------------------------------------
+
+#define aReg(x) mVUregs.VF[x]
+#define bReg(x, y) mVUregsTemp.VFreg[y] = x; mVUregsTemp.VF[y]
+#define aMax(x, y) ((x > y) ? x : y)
+#define aMin(x, y) ((x < y) ? x : y)
+
+// Read a VF reg
+#define analyzeReg1(xReg, vfRead) { \
+ if (xReg) { \
+ if (_X) { mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; } \
+ if (_Y) { mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; } \
+ if (_Z) { mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; } \
+ if (_W) { mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; } \
+ } \
+}
+
+// Write to a VF reg
+#define analyzeReg2(xReg, vfWrite, isLowOp) { \
+ if (xReg) { \
+ if (_X) { bReg(xReg, isLowOp).x = 4; vfWrite.reg = xReg; vfWrite.x = 4; } \
+ if (_Y) { bReg(xReg, isLowOp).y = 4; vfWrite.reg = xReg; vfWrite.y = 4; } \
+ if (_Z) { bReg(xReg, isLowOp).z = 4; vfWrite.reg = xReg; vfWrite.z = 4; } \
+ if (_W) { bReg(xReg, isLowOp).w = 4; vfWrite.reg = xReg; vfWrite.w = 4; } \
+ } \
+}
+
+// Read a VF reg (BC opcodes)
+#define analyzeReg3(xReg, vfRead) { \
+ if (xReg) { \
+ if (_bc_x) { mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; } \
+ else if (_bc_y) { mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; } \
+ else if (_bc_z) { mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; } \
+ else { mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; } \
+ } \
+}
+
+// For Clip Opcode
+#define analyzeReg4(xReg, vfRead) { \
+ if (xReg) { \
+ mVUstall = aMax(mVUstall, aReg(xReg).w); \
+ vfRead.reg = xReg; vfRead.w = 1; \
+ } \
+}
+
+// Read VF reg (FsF/FtF)
+#define analyzeReg5(xReg, fxf, vfRead) { \
+ if (xReg) { \
+ switch (fxf) { \
+ case 0: mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; break; \
+ case 1: mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; break; \
+ case 2: mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; break; \
+ case 3: mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; break; \
+ } \
+ } \
+}
+
+// Flips xyzw stalls to yzwx (MR32 Opcode)
+#define analyzeReg6(xReg, vfRead) { \
+ if (xReg) { \
+ if (_X) { mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; } \
+ if (_Y) { mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; } \
+ if (_Z) { mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; } \
+ if (_W) { mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; } \
+ } \
+}
+
+// Reading a VI reg
+#define analyzeVIreg1(xReg, viRead) { \
+ if (xReg) { \
+ mVUstall = aMax(mVUstall, mVUregs.VI[xReg]); \
+ viRead.reg = xReg; viRead.used = 1; \
+ } \
+}
+
+// Writing to a VI reg
+#define analyzeVIreg2(xReg, viWrite, aCycles) { \
+ if (xReg) { \
+ mVUconstReg[xReg].isValid = 0; \
+ mVUregsTemp.VIreg = xReg; \
+ mVUregsTemp.VI = aCycles; \
+ viWrite.reg = xReg; \
+ viWrite.used = aCycles; \
+ } \
+}
+
+#define analyzeQreg(x) { mVUregsTemp.q = x; mVUstall = aMax(mVUstall, mVUregs.q); }
+#define analyzePreg(x) { mVUregsTemp.p = x; mVUstall = aMax(mVUstall, ((mVUregs.p) ? (mVUregs.p - 1) : 0)); }
+#define analyzeRreg() { mVUregsTemp.r = 1; }
+#define analyzeXGkick1() { mVUstall = aMax(mVUstall, mVUregs.xgkick); }
+#define analyzeXGkick2(x) { mVUregsTemp.xgkick = x; }
+#define setConstReg(x, v) { if (x) { mVUconstReg[x].isValid = 1; mVUconstReg[x].regValue = v; } }
+
+//------------------------------------------------------------------
+// FMAC1 - Normal FMAC Opcodes
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeFMAC1(mV, int Fd, int Fs, int Ft) {
+ sFLAG.doFlag = 1;
+ analyzeReg1(Fs, mVUup.VF_read[0]);
+ analyzeReg1(Ft, mVUup.VF_read[1]);
+ analyzeReg2(Fd, mVUup.VF_write, 0);
+}
+
+//------------------------------------------------------------------
+// FMAC2 - ABS/FTOI/ITOF Opcodes
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeFMAC2(mV, int Fs, int Ft) {
+ analyzeReg1(Fs, mVUup.VF_read[0]);
+ analyzeReg2(Ft, mVUup.VF_write, 0);
+}
+
+//------------------------------------------------------------------
+// FMAC3 - BC(xyzw) FMAC Opcodes
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeFMAC3(mV, int Fd, int Fs, int Ft) {
+ sFLAG.doFlag = 1;
+ analyzeReg1(Fs, mVUup.VF_read[0]);
+ analyzeReg3(Ft, mVUup.VF_read[1]);
+ analyzeReg2(Fd, mVUup.VF_write, 0);
+}
+
+//------------------------------------------------------------------
+// FMAC4 - Clip FMAC Opcode
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeFMAC4(mV, int Fs, int Ft) {
+ cFLAG.doFlag = 1;
+ analyzeReg1(Fs, mVUup.VF_read[0]);
+ analyzeReg4(Ft, mVUup.VF_read[1]);
+}
+
+//------------------------------------------------------------------
+// IALU - IALU Opcodes
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeIALU1(mV, int Id, int Is, int It) {
+ if (!Id) { mVUlow.isNOP = 1; }
+ analyzeVIreg1(Is, mVUlow.VI_read[0]);
+ analyzeVIreg1(It, mVUlow.VI_read[1]);
+ analyzeVIreg2(Id, mVUlow.VI_write, 1);
+}
+
+microVUt(void) mVUanalyzeIALU2(mV, int Is, int It) {
+ if (!It) { mVUlow.isNOP = 1; }
+ analyzeVIreg1(Is, mVUlow.VI_read[0]);
+ analyzeVIreg2(It, mVUlow.VI_write, 1);
+}
+
+microVUt(void) mVUanalyzeIADDI(mV, int Is, int It, s16 imm) {
+ mVUanalyzeIALU2(mVU, Is, It);
+ if (!Is) { setConstReg(It, imm); }
+}
+
+//------------------------------------------------------------------
+// MR32 - MR32 Opcode
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeMR32(mV, int Fs, int Ft) {
+ if (!Ft) { mVUlow.isNOP = 1; }
+ analyzeReg6(Fs, mVUlow.VF_read[0]);
+ analyzeReg2(Ft, mVUlow.VF_write, 1);
+}
+
+//------------------------------------------------------------------
+// FDIV - DIV/SQRT/RSQRT Opcodes
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeFDIV(mV, int Fs, int Fsf, int Ft, int Ftf, u8 xCycles) {
+ mVUprint("microVU: DIV Opcode");
+ analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]);
+ analyzeReg5(Ft, Ftf, mVUlow.VF_read[1]);
+ analyzeQreg(xCycles);
+}
+
+//------------------------------------------------------------------
+// EFU - EFU Opcodes
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeEFU1(mV, int Fs, int Fsf, u8 xCycles) {
+ mVUprint("microVU: EFU Opcode");
+ analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]);
+ analyzePreg(xCycles);
+}
+
+microVUt(void) mVUanalyzeEFU2(mV, int Fs, u8 xCycles) {
+ mVUprint("microVU: EFU Opcode");
+ analyzeReg1(Fs, mVUlow.VF_read[0]);
+ analyzePreg(xCycles);
+}
+
+//------------------------------------------------------------------
+// MFP - MFP Opcode
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeMFP(mV, int Ft) {
+ if (!Ft) { mVUlow.isNOP = 1; }
+ analyzeReg2(Ft, mVUlow.VF_write, 1);
+}
+
+//------------------------------------------------------------------
+// MOVE - MOVE Opcode
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeMOVE(mV, int Fs, int Ft) {
+ if (!Ft || (Ft == Fs)) { mVUlow.isNOP = 1; }
+ analyzeReg1(Fs, mVUlow.VF_read[0]);
+ analyzeReg2(Ft, mVUlow.VF_write, 1);
+}
+
+//------------------------------------------------------------------
+// LQx - LQ/LQD/LQI Opcodes
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeLQ(mV, int Ft, int Is, bool writeIs) {
+ analyzeVIreg1(Is, mVUlow.VI_read[0]);
+ analyzeReg2 (Ft, mVUlow.VF_write, 1);
+ if (!Ft) { if (writeIs && Is) { mVUlow.noWriteVF = 1; } else { mVUlow.isNOP = 1; } }
+ if (writeIs) { analyzeVIreg2(Is, mVUlow.VI_write, 1); }
+}
+
+//------------------------------------------------------------------
+// SQx - SQ/SQD/SQI Opcodes
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeSQ(mV, int Fs, int It, bool writeIt) {
+ analyzeReg1 (Fs, mVUlow.VF_read[0]);
+ analyzeVIreg1(It, mVUlow.VI_read[0]);
+ if (writeIt) { analyzeVIreg2(It, mVUlow.VI_write, 1); }
+}
+
+//------------------------------------------------------------------
+// R*** - R Reg Opcodes
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeR1(mV, int Fs, int Fsf) {
+ analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]);
+ analyzeRreg();
+}
+
+microVUt(void) mVUanalyzeR2(mV, int Ft, bool canBeNOP) {
+ if (!Ft) { if (canBeNOP) { mVUlow.isNOP = 1; } else { mVUlow.noWriteVF = 1; } }
+ analyzeReg2(Ft, mVUlow.VF_write, 1);
+ analyzeRreg();
+}
+
+//------------------------------------------------------------------
+// Sflag - Status Flag Opcodes
+//------------------------------------------------------------------
+microVUt(void) flagSet(mV, bool setMacFlag) {
+ int curPC = iPC;
+ for (int i = mVUcount, j = 0; i > 0; i--, j++) {
+ j += mVUstall;
+ incPC2(-2);
+ if (sFLAG.doFlag && (j >= 3)) {
+ if (setMacFlag) { mFLAG.doFlag = 1; }
+ else { sFLAG.doNonSticky = 1; }
+ break;
+ }
+ }
+ iPC = curPC;
+}
+
+microVUt(void) mVUanalyzeSflag(mV, int It) {
+ mVUlow.readFlags = 1;
+ analyzeVIreg2(It, mVUlow.VI_write, 1);
+ if (!It) { mVUlow.isNOP = 1; }
+ else {
+ mVUsFlagHack = 0; // Don't Optimize Out Status Flags for this block
+ mVUinfo.swapOps = 1;
+ flagSet(mVU, 0);
+ if (mVUcount < 4) { mVUpBlock->pState.needExactMatch |= 1; }
+ }
+}
+
+microVUt(void) mVUanalyzeFSSET(mV) {
+ mVUlow.isFSSET = 1;
+ mVUlow.readFlags = 1;
+}
+
+//------------------------------------------------------------------
+// Mflag - Mac Flag Opcodes
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeMflag(mV, int Is, int It) {
+ mVUlow.readFlags = 1;
+ analyzeVIreg1(Is, mVUlow.VI_read[0]);
+ analyzeVIreg2(It, mVUlow.VI_write, 1);
+ if (!It) { mVUlow.isNOP = 1; }
+ else {
+ mVUinfo.swapOps = 1;
+ flagSet(mVU, 1);
+ if (mVUcount < 4) { mVUpBlock->pState.needExactMatch |= 2; }
+ }
+}
+
+//------------------------------------------------------------------
+// Cflag - Clip Flag Opcodes
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeCflag(mV, int It) {
+ mVUinfo.swapOps = 1;
+ mVUlow.readFlags = 1;
+ if (mVUcount < 4) { mVUpBlock->pState.needExactMatch |= 4; }
+ analyzeVIreg2(It, mVUlow.VI_write, 1);
+}
+
+//------------------------------------------------------------------
+// XGkick
+//------------------------------------------------------------------
+
+microVUt(void) mVUanalyzeXGkick(mV, int Fs, int xCycles) {
+ analyzeVIreg1(Fs, mVUlow.VI_read[0]);
+ analyzeXGkick1();
+ analyzeXGkick2(xCycles);
+ // Note: Technically XGKICK should stall on the next instruction, but
+ // this code stalls on the same instruction. The only case where this
+ // is a problem is if you have very specifically placed FMxxx or FSxxx
+ // opcodes checking flags near this instruction AND the XGKICK
+ // instruction stalls. No game should be affected by this minor
+ // difference.
+}
+
+//------------------------------------------------------------------
+// Branches - Branch Opcodes
+//------------------------------------------------------------------
+
+microVUt(void) analyzeBranchVI(mV, int xReg, bool &infoVar) {
+ if (!xReg) return;
+ int i;
+ int iEnd = aMin(5, (mVUcount+1));
+ int bPC = iPC;
+ incPC2(-2);
+ for (i = 0; i < iEnd; i++) {
+ if ((i == mVUcount) && (i < 5)) {
+ if (mVUpBlock->pState.viBackUp == xReg) {
+ infoVar = 1;
+ i++;
+ }
+ break;
+ }
+ if ((mVUlow.VI_write.reg == xReg) && mVUlow.VI_write.used) {
+ if (mVUlow.readFlags || i == 5) break;
+ if (i == 0) { incPC2(-2); continue; }
+ if (((mVUlow.VI_read[0].reg == xReg) && (mVUlow.VI_read[0].used))
+ || ((mVUlow.VI_read[1].reg == xReg) && (mVUlow.VI_read[1].used)))
+ { incPC2(-2); continue; }
+ }
+ break;
+ }
+ if (i) {
+ if (!infoVar) {
+ incPC2(2);
+ mVUlow.backupVI = 1;
+ infoVar = 1;
+ }
+ iPC = bPC;
+ Console.WriteLn( Color_Green, "microVU%d: Branch VI-Delay (%d) [%04x]", getIndex, i, xPC);
+ }
+ else iPC = bPC;
+}
+
+// Branch in Branch Delay-Slots
+microVUt(int) mVUbranchCheck(mV) {
+ if (!mVUcount) return 0;
+ incPC(-2);
+ if (mVUlow.branch) {
+ mVUlow.badBranch = 1;
+ incPC(2);
+ mVUlow.evilBranch = 1;
+ mVUregs.blockType = 2;
+ Console.Warning("microVU%d Warning: Branch in Branch delay slot! [%04x]", mVU->index, xPC);
+ return 1;
+ }
+ incPC(2);
+ return 0;
+}
+
+microVUt(void) mVUanalyzeCondBranch1(mV, int Is) {
+ analyzeVIreg1(Is, mVUlow.VI_read[0]);
+ if (!mVUstall && !mVUbranchCheck(mVU)) {
+ analyzeBranchVI(mVU, Is, mVUlow.memReadIs);
+ }
+}
+
+microVUt(void) mVUanalyzeCondBranch2(mV, int Is, int It) {
+ analyzeVIreg1(Is, mVUlow.VI_read[0]);
+ analyzeVIreg1(It, mVUlow.VI_read[1]);
+ if (!mVUstall && !mVUbranchCheck(mVU)) {
+ analyzeBranchVI(mVU, Is, mVUlow.memReadIs);
+ analyzeBranchVI(mVU, It, mVUlow.memReadIt);
+ }
+}
+
+microVUt(void) mVUanalyzeNormBranch(mV, int It, bool isBAL) {
+ mVUbranchCheck(mVU);
+ if (isBAL) {
+ analyzeVIreg2(It, mVUlow.VI_write, 1);
+ setConstReg(It, bSaveAddr);
+ }
+}
+
+microVUt(void) mVUanalyzeJump(mV, int Is, int It, bool isJALR) {
+ mVUbranchCheck(mVU);
+ mVUlow.branch = (isJALR) ? 10 : 9;
+ if (mVUconstReg[Is].isValid && !CHECK_VU_CONSTHACK) {
+ mVUlow.constJump.isValid = 1;
+ mVUlow.constJump.regValue = mVUconstReg[Is].regValue;
+ //DevCon.Status("microVU%d: Constant JR/JALR Address Optimization", mVU->index);
+ }
+ analyzeVIreg1(Is, mVUlow.VI_read[0]);
+ if (isJALR) {
+ analyzeVIreg2(It, mVUlow.VI_write, 1);
+ setConstReg(It, bSaveAddr);
+ }
+}
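The analyzeReg* macros at the top of this file all encode one pipeline model: each VF component tracks how many cycles remain until its pending FMAC result lands, and an instruction reading the register stalls for the worst count among only the components its dest mask actually selects (aMax keeps the running maximum in mVUstall). A detached sketch of that model; the struct and function names are illustrative, not the macros above:

    #include <algorithm>
    #include <cassert>

    struct VfLatency { int x, y, z, w; };  // cycles until each component is ready

    // Mirror of analyzeReg1: only fields selected by the dest mask contribute.
    static int stallFor( const VfLatency& r, bool X, bool Y, bool Z, bool W )
    {
        int stall = 0;
        if( X ) stall = std::max( stall, r.x );
        if( Y ) stall = std::max( stall, r.y );
        if( Z ) stall = std::max( stall, r.z );
        if( W ) stall = std::max( stall, r.w );
        return stall;
    }

    int main()
    {
        VfLatency vf3 = { 0, 2, 0, 4 };
        assert( stallFor( vf3, true,  true,  false, false ) == 2 );  // .xy read waits on y
        assert( stallFor( vf3, false, false, false, true  ) == 4 );  // .w read waits on w
        return 0;
    }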
diff --git a/pcsx2/x86/microVU_Clamp.inl b/pcsx2/x86/microVU_Clamp.inl
index e0b2b6fa67..61c5139e4d 100644
--- a/pcsx2/x86/microVU_Clamp.inl
+++ b/pcsx2/x86/microVU_Clamp.inl
@@ -1,106 +1,106 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-//------------------------------------------------------------------
-// Micro VU - Clamp Functions
-//------------------------------------------------------------------
-
-const __aligned16 u32 sse4_minvals[2][4] = {
- { 0xff7fffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000
- { 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //1111
-};
-const __aligned16 u32 sse4_maxvals[2][4] = {
- { 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1000
- { 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //1111
-};
-
-// Used for Result Clamping
-// Note: This function will not preserve NaN values' sign.
-// The theory behind this is that when we compute a result, and we've
-// gotten a NaN value, then something went wrong; and the NaN's sign
-// is not to be trusted. Games usually like positive values better, and
-// it's faster... so just always make NaNs into positive infinity.
-void mVUclamp1(int reg, int regT1, int xyzw, bool bClampE = 0) {
- if ((!clampE && CHECK_VU_OVERFLOW) || (clampE && bClampE)) {
- switch (xyzw) {
- case 1: case 2: case 4: case 8:
- SSE_MINSS_M32_to_XMM(reg, (uptr)mVUglob.maxvals);
- SSE_MAXSS_M32_to_XMM(reg, (uptr)mVUglob.minvals);
- break;
- default:
- SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals);
- SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals);
- break;
- }
- }
-}
-
-// Used for Operand Clamping
-// Note 1: If 'preserve sign' mode is on, it will preserve the sign of NaN values.
-// Note 2: Using regalloc here seems to contaminate some regs in certain games.
-// Must be some specific case I've overlooked (or I used regalloc improperly on an opcode)
-// so we just use a temporary mem location for our backup for now... (non-sse4 version only)
-void mVUclamp2(microVU* mVU, int reg, int regT1, int xyzw, bool bClampE = 0) {
- if ((!clampE && CHECK_VU_SIGN_OVERFLOW) || (clampE && bClampE && CHECK_VU_SIGN_OVERFLOW)) {
- if (x86caps.hasStreamingSIMD4Extensions) {
- int i = (xyzw==1||xyzw==2||xyzw==4||xyzw==8) ? 0: 1;
- SSE4_PMINSD_M128_to_XMM(reg, (uptr)&sse4_maxvals[i][0]);
- SSE4_PMINUD_M128_to_XMM(reg, (uptr)&sse4_minvals[i][0]);
- return;
- }
- int regT1b = 0;
- if (regT1 < 0) {
- regT1b = 1; regT1=(reg+1)%8;
- SSE_MOVAPS_XMM_to_M128((uptr)mVU->xmmCTemp, regT1);
- //regT1 = mVU->regAlloc->allocReg();
- }
- switch (xyzw) {
- case 1: case 2: case 4: case 8:
- SSE_MOVAPS_XMM_to_XMM(regT1, reg);
- SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit);
- SSE_MINSS_M32_to_XMM (reg, (uptr)mVUglob.maxvals);
- SSE_MAXSS_M32_to_XMM (reg, (uptr)mVUglob.minvals);
- SSE_ORPS_XMM_to_XMM (reg, regT1);
- break;
- default:
- SSE_MOVAPS_XMM_to_XMM(regT1, reg);
- SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit);
- SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals);
- SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals);
- SSE_ORPS_XMM_to_XMM (reg, regT1);
- break;
- }
- //if (regT1b) mVU->regAlloc->clearNeeded(regT1);
- if (regT1b) SSE_MOVAPS_M128_to_XMM(regT1, (uptr)mVU->xmmCTemp);
- }
- else mVUclamp1(reg, regT1, xyzw, bClampE);
-}
-
-// Used for operand clamping on every SSE instruction (add/sub/mul/div)
-void mVUclamp3(microVU* mVU, int reg, int regT1, int xyzw) {
- if (clampE) mVUclamp2(mVU, reg, regT1, xyzw, 1);
-}
-
-// Used for result clamping on every SSE instruction (add/sub/mul/div)
-// Note: Disabled in "preserve sign" mode because in some cases it
-// generates too much code, causing jump8 overflows in certain emulated
-// opcodes (and crashes). Since we're clamping the operands with
-// mVUclamp3, we should almost never get a NaN result, so this clamp
-// is kept only as a precaution.
-void mVUclamp4(int reg, int regT1, int xyzw) {
- if (clampE && !CHECK_VU_SIGN_OVERFLOW) mVUclamp1(reg, regT1, xyzw, 1);
-}
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+//------------------------------------------------------------------
+// Micro VU - Clamp Functions
+//------------------------------------------------------------------
+
+const __aligned16 u32 sse4_minvals[2][4] = {
+ { 0xff7fffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000
+ { 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //1111
+};
+const __aligned16 u32 sse4_maxvals[2][4] = {
+ { 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1000
+ { 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //1111
+};
+
+// Used for Result Clamping
+// Note: This function will not preserve NaN values' sign.
+// The theory behind this is that when we compute a result, and we've
+// gotten a NaN value, then something went wrong; and the NaN's sign
+// is not to be trusted. Games usually like positive values better, and
+// it's faster... so just always make NaNs into positive infinity.
+void mVUclamp1(int reg, int regT1, int xyzw, bool bClampE = 0) {
+ if ((!clampE && CHECK_VU_OVERFLOW) || (clampE && bClampE)) {
+ switch (xyzw) {
+ case 1: case 2: case 4: case 8:
+ SSE_MINSS_M32_to_XMM(reg, (uptr)mVUglob.maxvals);
+ SSE_MAXSS_M32_to_XMM(reg, (uptr)mVUglob.minvals);
+ break;
+ default:
+ SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals);
+ SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals);
+ break;
+ }
+ }
+}
+
+// Used for Operand Clamping
+// Note 1: If 'preserve sign' mode is on, it will preserve the sign of NaN values.
+// Note 2: Using regalloc here seems to contaminate some regs in certain games.
+// Must be some specific case I've overlooked (or I used regalloc improperly on an opcode)
+// so we just use a temporary mem location for our backup for now... (non-sse4 version only)
+void mVUclamp2(microVU* mVU, int reg, int regT1, int xyzw, bool bClampE = 0) {
+ if ((!clampE && CHECK_VU_SIGN_OVERFLOW) || (clampE && bClampE && CHECK_VU_SIGN_OVERFLOW)) {
+ if (x86caps.hasStreamingSIMD4Extensions) {
+ int i = (xyzw==1||xyzw==2||xyzw==4||xyzw==8) ? 0: 1;
+ SSE4_PMINSD_M128_to_XMM(reg, (uptr)&sse4_maxvals[i][0]);
+ SSE4_PMINUD_M128_to_XMM(reg, (uptr)&sse4_minvals[i][0]);
+ return;
+ }
+ int regT1b = 0;
+ if (regT1 < 0) {
+ regT1b = 1; regT1=(reg+1)%8;
+ SSE_MOVAPS_XMM_to_M128((uptr)mVU->xmmCTemp, regT1);
+ //regT1 = mVU->regAlloc->allocReg();
+ }
+ switch (xyzw) {
+ case 1: case 2: case 4: case 8:
+ SSE_MOVAPS_XMM_to_XMM(regT1, reg);
+ SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit);
+ SSE_MINSS_M32_to_XMM (reg, (uptr)mVUglob.maxvals);
+ SSE_MAXSS_M32_to_XMM (reg, (uptr)mVUglob.minvals);
+ SSE_ORPS_XMM_to_XMM (reg, regT1);
+ break;
+ default:
+ SSE_MOVAPS_XMM_to_XMM(regT1, reg);
+ SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit);
+ SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals);
+ SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals);
+ SSE_ORPS_XMM_to_XMM (reg, regT1);
+ break;
+ }
+ //if (regT1b) mVU->regAlloc->clearNeeded(regT1);
+ if (regT1b) SSE_MOVAPS_M128_to_XMM(regT1, (uptr)mVU->xmmCTemp);
+ }
+ else mVUclamp1(reg, regT1, xyzw, bClampE);
+}
+
+// Used for operand clamping on every SSE instruction (add/sub/mul/div)
+void mVUclamp3(microVU* mVU, int reg, int regT1, int xyzw) {
+ if (clampE) mVUclamp2(mVU, reg, regT1, xyzw, 1);
+}
+
+// Used for result clamping on every SSE instruction (add/sub/mul/div)
+// Note: Disabled in "preserve sign" mode because in some cases it
+// generates too much code, causing jump8 overflows in certain emulated
+// opcodes (and crashes). Since we're clamping the operands with
+// mVUclamp3, we should almost never get a NaN result, so this clamp
+// is kept only as a precaution.
+void mVUclamp4(int reg, int regT1, int xyzw) {
+ if (clampE && !CHECK_VU_SIGN_OVERFLOW) mVUclamp1(reg, regT1, xyzw, 1);
+}
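The SSE4 path in mVUclamp2 is worth unpacking: it clamps float registers with integer min instructions. PMINSD against 0x7f7fffff pulls any positive bit pattern above +FLT_MAX (which includes +Inf and positive NaNs) down to +FLT_MAX, and PMINUD against 0xff7fffff does the same for negative patterns, because more-negative floats compare as larger unsigned integers. A scalar sketch of the same bit trick (the real code operates on whole XMM registers, and row 0 of the tables limits the clamp to the x lane):

    #include <algorithm>
    #include <cassert>
    #include <cfloat>
    #include <cstdint>
    #include <cstring>

    // Clamp one float to [-FLT_MAX, +FLT_MAX] using the integer comparisons
    // the SSE4 path performs with PMINSD/PMINUD.
    static float clampViaInts( float f )
    {
        uint32_t bits; std::memcpy( &bits, &f, sizeof bits );
        int32_t  s = (int32_t)bits;
        s = std::min( s, (int32_t)0x7f7fffff );        // signed min: tames +NaN/+Inf
        uint32_t u = (uint32_t)s;
        u = std::min( u, (uint32_t)0xff7fffffu );      // unsigned min: tames -NaN/-Inf
        float out; std::memcpy( &out, &u, sizeof out );
        return out;
    }

    int main()
    {
        assert( clampViaInts( 1.0f ) == 1.0f );
        assert( clampViaInts( FLT_MAX ) == FLT_MAX );
        float inf;  uint32_t ib = 0x7f800000u; std::memcpy( &inf,  &ib, sizeof inf );
        float ninf; uint32_t nb = 0xff800000u; std::memcpy( &ninf, &nb, sizeof ninf );
        assert( clampViaInts( inf )  ==  FLT_MAX );   // +Inf -> +FLT_MAX
        assert( clampViaInts( ninf ) == -FLT_MAX );   // -Inf -> -FLT_MAX
        return 0;
    }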
diff --git a/pcsx2/x86/microVU_IR.h b/pcsx2/x86/microVU_IR.h
index 014ff69a9f..7f27848dbe 100644
--- a/pcsx2/x86/microVU_IR.h
+++ b/pcsx2/x86/microVU_IR.h
@@ -1,337 +1,337 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-union regInfo {
- u32 reg;
- struct {
- u8 x;
- u8 y;
- u8 z;
- u8 w;
- };
-};
-
-#ifdef _MSC_VER
-# pragma pack(1)
-# pragma warning(disable:4996) // 'function': was declared deprecated
-#endif
-
-struct __aligned16 microRegInfo { // Ordered for Faster Compares
- u32 vi15; // Constant Prop Info for vi15 (only valid if sign-bit set)
- u8 needExactMatch; // If set, block needs an exact match of pipeline state
- u8 q;
- u8 p;
- u8 r;
- u8 xgkick;
- u8 viBackUp;
- u8 VI[16];
- regInfo VF[32];
- u8 flags; // clip x2 :: status x2
- u8 blockType; // 0 = Normal; 1,2 = Compile one instruction (E-bit/Branch Ending)
- u8 padding[5]; // 160 bytes
-} __packed;
-
-struct __aligned16 microBlock {
- microRegInfo pState; // Detailed State of Pipeline
- microRegInfo pStateEnd; // Detailed State of Pipeline at End of Block (needed by JR/JALR opcodes)
- u8* x86ptrStart; // Start of code
-} __packed;
-
-#ifdef _MSC_VER
-# pragma pack()
-#endif
-
-struct microTempRegInfo {
- regInfo VF[2]; // Holds cycle info for Fd, VF[0] = Upper Instruction, VF[1] = Lower Instruction
- u8 VFreg[2]; // Index of the VF reg
- u8 VI; // Holds cycle info for Id
- u8 VIreg; // Index of the VI reg
- u8 q; // Holds cycle info for Q reg
- u8 p; // Holds cycle info for P reg
- u8 r; // Holds cycle info for R reg (Will never cause stalls, but useful to know if R is modified)
- u8 xgkick; // Holds the cycle info for XGkick
-};
-
-struct microVFreg {
- u8 reg; // Reg Index
- u8 x; // X vector read/written to?
- u8 y; // Y vector read/written to?
- u8 z; // Z vector read/written to?
- u8 w; // W vector read/written to?
-};
-
-struct microVIreg {
- u8 reg; // Reg Index
- u8 used; // Reg is Used? (Read/Written)
-};
-
-struct microConstInfo {
- u8 isValid; // Is the constant in regValue valid?
- u32 regValue; // Constant Value
-};
-
-struct microUpperOp {
- bool eBit; // Has E-bit set
- bool iBit; // Has I-bit set
- bool mBit; // Has M-bit set
- microVFreg VF_write; // VF Vectors written to by this instruction
- microVFreg VF_read[2]; // VF Vectors read by this instruction
-};
-
-struct microLowerOp {
- microVFreg VF_write; // VF Vectors written to by this instruction
- microVFreg VF_read[2]; // VF Vectors read by this instruction
- microVIreg VI_write; // VI reg written to by this instruction
- microVIreg VI_read[2]; // VI regs read by this instruction
- microConstInfo constJump; // Constant Reg Info for JR/JALR instructions
- u32 branch; // Branch Type (0 = Not a Branch, 1 = B, 2 = BAL, 3~8 = Conditional Branches, 9 = JALR, 10 = JR)
- bool badBranch; // This instruction is a Branch who has another branch in its Delay Slot
- bool evilBranch;// This instruction is a Branch in a Branch Delay Slot (Instruction after badBranch)
- bool isNOP; // This instruction is a NOP
- bool isFSSET; // This instruction is a FSSET
- bool noWriteVF; // Don't write back the result of a lower op to VF reg if upper op writes to same reg (or if VF = 0)
- bool backupVI; // Backup VI reg to memory if modified before branch (branch uses old VI value unless opcode is ILW or ILWR)
- bool memReadIs; // Read Is (VI reg) from memory (used by branches)
- bool memReadIt; // Read It (VI reg) from memory (used by branches)
- bool readFlags; // Current Instruction reads Status, Mac, or Clip flags
-};
-
-struct microFlagInst {
- bool doFlag; // Update Flag on this Instruction
- bool doNonSticky; // Update O,U,S,Z (non-sticky) bits on this Instruction (status flag only)
- u8 write; // Points to the instance that should be written to (s-stage write)
- u8 lastWrite; // Points to the instance that was last written to (most up-to-date flag)
- u8 read; // Points to the instance that should be read by a lower instruction (t-stage read)
-};
-
-struct microFlagCycles {
- int xStatus[4];
- int xMac[4];
- int xClip[4];
- int cycles;
-};
-
-struct microOp {
- u8 stall; // Info on how much current instruction stalled
- bool isEOB; // Cur Instruction is last instruction in block (End of Block)
- bool isBdelay; // Cur Instruction in Branch Delay slot
- bool swapOps; // Run Lower Instruction before Upper Instruction
- bool backupVF; // Backup mVUlow.VF_write.reg, and restore it before the Upper Instruction is called
- bool doXGKICK; // Do XGKICK transfer on this instruction
- bool doDivFlag; // Transfer Div flag to Status Flag on this instruction
- int readQ; // Q instance for reading
- int writeQ; // Q instance for writing
- int readP; // P instance for reading
- int writeP; // P instance for writing
- microFlagInst sFlag; // Status Flag Instance Info
- microFlagInst mFlag; // Mac Flag Instance Info
- microFlagInst cFlag; // Clip Flag Instance Info
- microUpperOp uOp; // Upper Op Info
- microLowerOp lOp; // Lower Op Info
-};
-
-template<u32 pSize>
-struct microIR {
- microBlock block; // Block/Pipeline info
- microBlock* pBlock; // Pointer to a block in mVUblocks
- microTempRegInfo regsTemp; // Temp Pipeline info (used so that new pipeline info isn't conflicting between upper and lower instructions in the same cycle)
- microOp info[pSize/2]; // Info for Instructions in current block
- microConstInfo constReg[16]; // Simple Const Propagation Info for VI regs within blocks
- u8 branch;
- u32 cycles; // Cycles for current block
- u32 count; // Number of VU 64bit instructions ran (starts at 0 for each block)
- u32 curPC; // Current PC
- u32 startPC; // Start PC for Cur Block
- u32 sFlagHack; // Optimize out all Status flag updates if microProgram doesn't use Status flags
-};
-
-//------------------------------------------------------------------
-// Reg Alloc
-//------------------------------------------------------------------
-
-void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW);
-void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW);
-void mVUloadReg(int reg, uptr offset, int xyzw);
-void mVUloadIreg(int reg, int xyzw, VURegs* vuRegs);
-
-struct microXMM {
- int reg; // VF Reg Number Stored (-1 = Temp; 0 = vf0 and will not be written back; 32 = ACC; 33 = I reg)
- int xyzw; // xyzw to write back (0 = Don't write back anything AND cached vfReg has all vectors valid)
- int count; // Count of when last used
- bool isNeeded; // Is needed for current instruction
-};
-
-#define xmmTotal 7 // Don't allocate PQ?
-class microRegAlloc {
-private:
- microXMM xmmReg[xmmTotal];
- VURegs* vuRegs;
- int counter;
- int findFreeRegRec(int startIdx) {
- for (int i = startIdx; i < xmmTotal; i++) {
- if (!xmmReg[i].isNeeded) {
- int x = findFreeRegRec(i+1);
- if (x == -1) return i;
- return ((xmmReg[i].count < xmmReg[x].count) ? i : x);
- }
- }
- return -1;
- }
- int findFreeReg() {
- for (int i = 0; i < xmmTotal; i++) {
- if (!xmmReg[i].isNeeded && (xmmReg[i].reg < 0)) {
- return i; // Reg is not needed and was a temp reg
- }
- }
- int x = findFreeRegRec(0);
- if (x < 0) { DevCon.Error("microVU Allocation Error!"); return 0; }
- return x;
- }
-
-public:
- microRegAlloc(VURegs* vuRegsPtr) {
- vuRegs = vuRegsPtr;
- reset();
- }
- void reset() {
- for (int i = 0; i < xmmTotal; i++) {
- clearReg(i);
- }
- counter = 0;
- }
- void flushAll(bool clearState = 1) {
- for (int i = 0; i < xmmTotal; i++) {
- writeBackReg(i);
- if (clearState) clearReg(i);
- }
- }
- void clearReg(int reg) {
- xmmReg[reg].reg = -1;
- xmmReg[reg].count = 0;
- xmmReg[reg].xyzw = 0;
- xmmReg[reg].isNeeded = 0;
- }
- void clearRegVF(int VFreg) {
- for (int i = 0; i < xmmTotal; i++) {
- if (xmmReg[i].reg == VFreg) clearReg(i);
- }
- }
- void writeBackReg(int reg, bool invalidateRegs = 1) {
- if ((xmmReg[reg].reg > 0) && xmmReg[reg].xyzw) { // Reg was modified and not Temp or vf0
- if (xmmReg[reg].reg == 33) SSE_MOVSS_XMM_to_M32((uptr)&vuRegs->VI[REG_I].UL, reg);
- else if (xmmReg[reg].reg == 32) mVUsaveReg(reg, (uptr)&vuRegs->ACC.UL[0], xmmReg[reg].xyzw, 1);
- else mVUsaveReg(reg, (uptr)&vuRegs->VF[xmmReg[reg].reg].UL[0], xmmReg[reg].xyzw, 1);
- if (invalidateRegs) {
- for (int i = 0; i < xmmTotal; i++) {
- if ((i == reg) || xmmReg[i].isNeeded) continue;
- if (xmmReg[i].reg == xmmReg[reg].reg) {
- if (xmmReg[i].xyzw && xmmReg[i].xyzw < 0xf) DevCon.Error("microVU Error: writeBackReg() [%d]", xmmReg[i].reg);
- clearReg(i); // Invalidate any Cached Regs of same vf Reg
- }
- }
- }
- if (xmmReg[reg].xyzw == 0xf) { // Make Cached Reg if All Vectors were Modified
- xmmReg[reg].count = counter;
- xmmReg[reg].xyzw = 0;
- xmmReg[reg].isNeeded = 0;
- return;
- }
- }
- clearReg(reg); // Clear Reg
- }
- void clearNeeded(int reg) {
- if ((reg < 0) || (reg >= xmmTotal)) return;
- xmmReg[reg].isNeeded = 0;
- if (xmmReg[reg].xyzw) { // Reg was modified
- if (xmmReg[reg].reg > 0) {
- int mergeRegs = 0;
- if (xmmReg[reg].xyzw < 0xf) { mergeRegs = 1; } // Try to merge partial writes
- for (int i = 0; i < xmmTotal; i++) { // Invalidate any other read-only regs of same vfReg
- if (i == reg) continue;
- if (xmmReg[i].reg == xmmReg[reg].reg) {
- if (xmmReg[i].xyzw && xmmReg[i].xyzw < 0xf) DevCon.Error("microVU Error: clearNeeded() [%d]", xmmReg[i].reg);
- if (mergeRegs == 1) {
- mVUmergeRegs(i, reg, xmmReg[reg].xyzw, 1);
- xmmReg[i].xyzw = 0xf;
- xmmReg[i].count = counter;
- mergeRegs = 2;
- }
- else clearReg(i);
- }
- }
- if (mergeRegs == 2) clearReg(reg); // Clear Current Reg if Merged
- else if (mergeRegs) writeBackReg(reg); // Write Back Partial Writes if couldn't merge
- }
- else clearReg(reg); // If Reg was temp or vf0, then invalidate itself
- }
- }
- int allocReg(int vfLoadReg = -1, int vfWriteReg = -1, int xyzw = 0, bool cloneWrite = 1) {
- counter++;
- if (vfLoadReg >= 0) { // Search For Cached Regs
- for (int i = 0; i < xmmTotal; i++) {
- if ((xmmReg[i].reg == vfLoadReg) && (!xmmReg[i].xyzw // Reg Was Not Modified
- || (xmmReg[i].reg && (xmmReg[i].xyzw==0xf)))) { // Reg Had All Vectors Modified and != VF0
- int z = i;
- if (vfWriteReg >= 0) { // Reg will be modified
- if (cloneWrite) { // Clone Reg so as not to use the same Cached Reg
- z = findFreeReg();
- writeBackReg(z);
- if (z!=i && xyzw==8) SSE_MOVAPS_XMM_to_XMM (z, i);
- else if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1);
- else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2);
- else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3);
- else if (z != i) SSE_MOVAPS_XMM_to_XMM (z, i);
- xmmReg[i].count = counter; // Reg i was used, so update counter
- }
- else { // Don't clone reg, but shuffle to adjust for SS ops
- if ((vfLoadReg != vfWriteReg) || (xyzw != 0xf)) { writeBackReg(z); }
- if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1);
- else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2);
- else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3);
- }
- xmmReg[z].reg = vfWriteReg;
- xmmReg[z].xyzw = xyzw;
- }
- xmmReg[z].count = counter;
- xmmReg[z].isNeeded = 1;
- return z;
- }
- }
- }
- int x = findFreeReg();
- writeBackReg(x);
-
- if (vfWriteReg >= 0) { // Reg Will Be Modified (allow partial reg loading)
- if ((vfLoadReg == 0) && !(xyzw & 1)) { SSE2_PXOR_XMM_to_XMM(x, x); }
- else if (vfLoadReg == 33) mVUloadIreg(x, xyzw, vuRegs);
- else if (vfLoadReg == 32) mVUloadReg (x, (uptr)&vuRegs->ACC.UL[0], xyzw);
- else if (vfLoadReg >= 0) mVUloadReg (x, (uptr)&vuRegs->VF[vfLoadReg].UL[0], xyzw);
- xmmReg[x].reg = vfWriteReg;
- xmmReg[x].xyzw = xyzw;
- }
- else { // Reg Will Not Be Modified (always load full reg for caching)
- if (vfLoadReg == 33) mVUloadIreg(x, 0xf, vuRegs);
- else if (vfLoadReg == 32) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->ACC.UL[0]);
- else if (vfLoadReg >= 0) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->VF[vfLoadReg].UL[0]);
- xmmReg[x].reg = vfLoadReg;
- xmmReg[x].xyzw = 0;
- }
- xmmReg[x].count = counter;
- xmmReg[x].isNeeded = 1;
- return x;
- }
-};
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+union regInfo {
+ u32 reg;
+ struct {
+ u8 x;
+ u8 y;
+ u8 z;
+ u8 w;
+ };
+};
+
+#ifdef _MSC_VER
+# pragma pack(1)
+# pragma warning(disable:4996) // 'function': was declared deprecated
+#endif
+
+struct __aligned16 microRegInfo { // Ordered for Faster Compares
+ u32 vi15; // Constant Prop Info for vi15 (only valid if sign-bit set)
+ u8 needExactMatch; // If set, block needs an exact match of pipeline state
+ u8 q;
+ u8 p;
+ u8 r;
+ u8 xgkick;
+ u8 viBackUp;
+ u8 VI[16];
+ regInfo VF[32];
+ u8 flags; // clip x2 :: status x2
+ u8 blockType; // 0 = Normal; 1,2 = Compile one instruction (E-bit/Branch Ending)
+ u8 padding[5]; // 160 bytes
+} __packed;
+
+struct __aligned16 microBlock {
+ microRegInfo pState; // Detailed State of Pipeline
+ microRegInfo pStateEnd; // Detailed State of Pipeline at End of Block (needed by JR/JALR opcodes)
+ u8* x86ptrStart; // Start of code
+} __packed;
+
+#ifdef _MSC_VER
+# pragma pack()
+#endif
+
+struct microTempRegInfo {
+ regInfo VF[2]; // Holds cycle info for Fd, VF[0] = Upper Instruction, VF[1] = Lower Instruction
+ u8 VFreg[2]; // Index of the VF reg
+ u8 VI; // Holds cycle info for Id
+ u8 VIreg; // Index of the VI reg
+ u8 q; // Holds cycle info for Q reg
+ u8 p; // Holds cycle info for P reg
+ u8 r; // Holds cycle info for R reg (Will never cause stalls, but useful to know if R is modified)
+ u8 xgkick; // Holds the cycle info for XGkick
+};
+
+struct microVFreg {
+ u8 reg; // Reg Index
+ u8 x; // X vector read/written to?
+ u8 y; // Y vector read/written to?
+ u8 z; // Z vector read/written to?
+ u8 w; // W vector read/written to?
+};
+
+struct microVIreg {
+ u8 reg; // Reg Index
+ u8 used; // Reg is Used? (Read/Written)
+};
+
+struct microConstInfo {
+ u8 isValid; // Is the constant in regValue valid?
+ u32 regValue; // Constant Value
+};
+
+struct microUpperOp {
+ bool eBit; // Has E-bit set
+ bool iBit; // Has I-bit set
+ bool mBit; // Has M-bit set
+ microVFreg VF_write; // VF Vectors written to by this instruction
+ microVFreg VF_read[2]; // VF Vectors read by this instruction
+};
+
+struct microLowerOp {
+ microVFreg VF_write; // VF Vectors written to by this instruction
+ microVFreg VF_read[2]; // VF Vectors read by this instruction
+ microVIreg VI_write; // VI reg written to by this instruction
+ microVIreg VI_read[2]; // VI regs read by this instruction
+ microConstInfo constJump; // Constant Reg Info for JR/JARL instructions
+ u32 branch; // Branch Type (0 = Not a Branch, 1 = B, 2 = BAL, 3~8 = Conditional Branches, 9 = JALR, 10 = JR)
+ bool badBranch; // This instruction is a Branch who has another branch in its Delay Slot
+ bool evilBranch;// This instruction is a Branch in a Branch Delay Slot (Instruction after badBranch)
+ bool isNOP; // This instruction is a NOP
+ bool isFSSET; // This instruction is a FSSET
+ bool noWriteVF; // Don't write back the result of a lower op to VF reg if upper op writes to same reg (or if VF = 0)
+ bool backupVI; // Backup VI reg to memory if modified before branch (branch uses old VI value unless opcode is ILW or ILWR)
+ bool memReadIs; // Read Is (VI reg) from memory (used by branches)
+ bool memReadIt; // Read It (VI reg) from memory (used by branches)
+ bool readFlags; // Current Instruction reads Status, Mac, or Clip flags
+};
+
+struct microFlagInst {
+ bool doFlag; // Update Flag on this Instruction
+ bool doNonSticky; // Update O,U,S,Z (non-sticky) bits on this Instruction (status flag only)
+ u8 write; // Points to the instance that should be written to (s-stage write)
+ u8 lastWrite; // Points to the instance that was last written to (most up-to-date flag)
+ u8 read; // Points to the instance that should be read by a lower instruction (t-stage read)
+};
+
+struct microFlagCycles {
+ int xStatus[4];
+ int xMac[4];
+ int xClip[4];
+ int cycles;
+};
+
+struct microOp {
+ u8 stall; // Info on how much current instruction stalled
+ bool isEOB; // Cur Instruction is last instruction in block (End of Block)
+ bool isBdelay; // Cur Instruction in Branch Delay slot
+ bool swapOps; // Run Lower Instruction before Upper Instruction
+ bool backupVF; // Backup mVUlow.VF_write.reg, and restore it before the Upper Instruction is called
+ bool doXGKICK; // Do XGKICK transfer on this instruction
+ bool doDivFlag; // Transfer Div flag to Status Flag on this instruction
+ int readQ; // Q instance for reading
+ int writeQ; // Q instance for writing
+ int readP; // P instance for reading
+ int writeP; // P instance for writing
+ microFlagInst sFlag; // Status Flag Instance Info
+ microFlagInst mFlag; // Mac Flag Instance Info
+ microFlagInst cFlag; // Clip Flag Instance Info
+ microUpperOp uOp; // Upper Op Info
+ microLowerOp lOp; // Lower Op Info
+};
+
+template<u32 pSize>
+struct microIR {
+ microBlock block; // Block/Pipeline info
+ microBlock* pBlock; // Pointer to a block in mVUblocks
+ microTempRegInfo regsTemp; // Temp Pipeline info (used so that new pipeline info isn't conflicting between upper and lower instructions in the same cycle)
+ microOp info[pSize/2]; // Info for Instructions in current block
+ microConstInfo constReg[16]; // Simple Const Propagation Info for VI regs within blocks
+ u8 branch;
+ u32 cycles; // Cycles for current block
+ u32 count; // Number of VU 64bit instructions ran (starts at 0 for each block)
+ u32 curPC; // Current PC
+ u32 startPC; // Start PC for Cur Block
+ u32 sFlagHack; // Optimize out all Status flag updates if microProgram doesn't use Status flags
+};
+
+//------------------------------------------------------------------
+// Reg Alloc
+//------------------------------------------------------------------
+
+void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW);
+void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW);
+void mVUloadReg(int reg, uptr offset, int xyzw);
+void mVUloadIreg(int reg, int xyzw, VURegs* vuRegs);
+
+struct microXMM {
+ int reg; // VF Reg Number Stored (-1 = Temp; 0 = vf0 and will not be written back; 32 = ACC; 33 = I reg)
+ int xyzw; // xyzw to write back (0 = Don't write back anything AND cached vfReg has all vectors valid)
+ int count; // Count of when last used
+ bool isNeeded; // Is needed for current instruction
+};
+
+#define xmmTotal 7 // Don't allocate PQ?
+class microRegAlloc {
+private:
+ microXMM xmmReg[xmmTotal];
+ VURegs* vuRegs;
+ int counter;
+ int findFreeRegRec(int startIdx) {
+ for (int i = startIdx; i < xmmTotal; i++) {
+ if (!xmmReg[i].isNeeded) {
+ int x = findFreeRegRec(i+1);
+ if (x == -1) return i;
+ return ((xmmReg[i].count < xmmReg[x].count) ? i : x);
+ }
+ }
+ return -1;
+ }
+ int findFreeReg() {
+ for (int i = 0; i < xmmTotal; i++) {
+ if (!xmmReg[i].isNeeded && (xmmReg[i].reg < 0)) {
+ return i; // Reg is not needed and was a temp reg
+ }
+ }
+ int x = findFreeRegRec(0);
+ if (x < 0) { DevCon.Error("microVU Allocation Error!"); return 0; }
+ return x;
+ }
+
+public:
+ microRegAlloc(VURegs* vuRegsPtr) {
+ vuRegs = vuRegsPtr;
+ reset();
+ }
+ void reset() {
+ for (int i = 0; i < xmmTotal; i++) {
+ clearReg(i);
+ }
+ counter = 0;
+ }
+ void flushAll(bool clearState = 1) {
+ for (int i = 0; i < xmmTotal; i++) {
+ writeBackReg(i);
+ if (clearState) clearReg(i);
+ }
+ }
+ void clearReg(int reg) {
+ xmmReg[reg].reg = -1;
+ xmmReg[reg].count = 0;
+ xmmReg[reg].xyzw = 0;
+ xmmReg[reg].isNeeded = 0;
+ }
+ void clearRegVF(int VFreg) {
+ for (int i = 0; i < xmmTotal; i++) {
+ if (xmmReg[i].reg == VFreg) clearReg(i);
+ }
+ }
+ void writeBackReg(int reg, bool invalidateRegs = 1) {
+ if ((xmmReg[reg].reg > 0) && xmmReg[reg].xyzw) { // Reg was modified and not Temp or vf0
+ if (xmmReg[reg].reg == 33) SSE_MOVSS_XMM_to_M32((uptr)&vuRegs->VI[REG_I].UL, reg);
+ else if (xmmReg[reg].reg == 32) mVUsaveReg(reg, (uptr)&vuRegs->ACC.UL[0], xmmReg[reg].xyzw, 1);
+ else mVUsaveReg(reg, (uptr)&vuRegs->VF[xmmReg[reg].reg].UL[0], xmmReg[reg].xyzw, 1);
+ if (invalidateRegs) {
+ for (int i = 0; i < xmmTotal; i++) {
+ if ((i == reg) || xmmReg[i].isNeeded) continue;
+ if (xmmReg[i].reg == xmmReg[reg].reg) {
+ if (xmmReg[i].xyzw && xmmReg[i].xyzw < 0xf) DevCon.Error("microVU Error: writeBackReg() [%d]", xmmReg[i].reg);
+ clearReg(i); // Invalidate any Cached Regs of same vf Reg
+ }
+ }
+ }
+ if (xmmReg[reg].xyzw == 0xf) { // Make Cached Reg if All Vectors were Modified
+ xmmReg[reg].count = counter;
+ xmmReg[reg].xyzw = 0;
+ xmmReg[reg].isNeeded = 0;
+ return;
+ }
+ }
+ clearReg(reg); // Clear Reg
+ }
+ void clearNeeded(int reg) {
+ if ((reg < 0) || (reg >= xmmTotal)) return;
+ xmmReg[reg].isNeeded = 0;
+ if (xmmReg[reg].xyzw) { // Reg was modified
+ if (xmmReg[reg].reg > 0) {
+ int mergeRegs = 0;
+ if (xmmReg[reg].xyzw < 0xf) { mergeRegs = 1; } // Try to merge partial writes
+ for (int i = 0; i < xmmTotal; i++) { // Invalidate any other read-only regs of same vfReg
+ if (i == reg) continue;
+ if (xmmReg[i].reg == xmmReg[reg].reg) {
+ if (xmmReg[i].xyzw && xmmReg[i].xyzw < 0xf) DevCon.Error("microVU Error: clearNeeded() [%d]", xmmReg[i].reg);
+ if (mergeRegs == 1) {
+ mVUmergeRegs(i, reg, xmmReg[reg].xyzw, 1);
+ xmmReg[i].xyzw = 0xf;
+ xmmReg[i].count = counter;
+ mergeRegs = 2;
+ }
+ else clearReg(i);
+ }
+ }
+ if (mergeRegs == 2) clearReg(reg); // Clear Current Reg if Merged
+ else if (mergeRegs) writeBackReg(reg); // Write Back Partial Writes if couldn't merge
+ }
+ else clearReg(reg); // If Reg was temp or vf0, then invalidate itself
+ }
+ }
+ int allocReg(int vfLoadReg = -1, int vfWriteReg = -1, int xyzw = 0, bool cloneWrite = 1) {
+ counter++;
+ if (vfLoadReg >= 0) { // Search For Cached Regs
+ for (int i = 0; i < xmmTotal; i++) {
+ if ((xmmReg[i].reg == vfLoadReg) && (!xmmReg[i].xyzw // Reg Was Not Modified
+ || (xmmReg[i].reg && (xmmReg[i].xyzw==0xf)))) { // Reg Had All Vectors Modified and != VF0
+ int z = i;
+ if (vfWriteReg >= 0) { // Reg will be modified
+ if (cloneWrite) { // Clone Reg so as not to use the same Cached Reg
+ z = findFreeReg();
+ writeBackReg(z);
+ if (z!=i && xyzw==8) SSE_MOVAPS_XMM_to_XMM (z, i);
+ else if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1);
+ else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2);
+ else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3);
+ else if (z != i) SSE_MOVAPS_XMM_to_XMM (z, i);
+ xmmReg[i].count = counter; // Reg i was used, so update counter
+ }
+ else { // Don't clone reg, but shuffle to adjust for SS ops
+ if ((vfLoadReg != vfWriteReg) || (xyzw != 0xf)) { writeBackReg(z); }
+ if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1);
+ else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2);
+ else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3);
+ }
+ xmmReg[z].reg = vfWriteReg;
+ xmmReg[z].xyzw = xyzw;
+ }
+ xmmReg[z].count = counter;
+ xmmReg[z].isNeeded = 1;
+ return z;
+ }
+ }
+ }
+ int x = findFreeReg();
+ writeBackReg(x);
+
+ if (vfWriteReg >= 0) { // Reg Will Be Modified (allow partial reg loading)
+ if ((vfLoadReg == 0) && !(xyzw & 1)) { SSE2_PXOR_XMM_to_XMM(x, x); }
+ else if (vfLoadReg == 33) mVUloadIreg(x, xyzw, vuRegs);
+ else if (vfLoadReg == 32) mVUloadReg (x, (uptr)&vuRegs->ACC.UL[0], xyzw);
+ else if (vfLoadReg >= 0) mVUloadReg (x, (uptr)&vuRegs->VF[vfLoadReg].UL[0], xyzw);
+ xmmReg[x].reg = vfWriteReg;
+ xmmReg[x].xyzw = xyzw;
+ }
+ else { // Reg Will Not Be Modified (always load full reg for caching)
+ if (vfLoadReg == 33) mVUloadIreg(x, 0xf, vuRegs);
+ else if (vfLoadReg == 32) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->ACC.UL[0]);
+ else if (vfLoadReg >= 0) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->VF[vfLoadReg].UL[0]);
+ xmmReg[x].reg = vfLoadReg;
+ xmmReg[x].xyzw = 0;
+ }
+ xmmReg[x].count = counter;
+ xmmReg[x].isNeeded = 1;
+ return x;
+ }
+};
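
The allocator above frees a slot in two steps: findFreeReg() first takes any slot that is neither pinned (isNeeded) nor bound to a VF register, then falls back to findFreeRegRec(), which scans the remaining unpinned slots and evicts the one with the oldest count stamp. A minimal standalone sketch of that fallback policy, with illustrative names that are not part of the patch:

    #include <climits>

    struct Slot { bool isNeeded; int count; };

    // Pick the least-recently-used slot that is not pinned by the
    // current instruction; returns -1 if every slot is pinned.
    int pickVictim(const Slot* slots, int total) {
        int best = -1, bestCount = INT_MAX;
        for (int i = 0; i < total; i++) {
            if (slots[i].isNeeded) continue;    // pinned, cannot evict
            if (slots[i].count < bestCount) {   // older stamp = better victim
                bestCount = slots[i].count;
                best = i;
            }
        }
        return best;
    }

The recursive form in the patch performs the same min-search (breaking ties toward the later slot, which does not affect correctness).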
diff --git a/pcsx2/x86/newVif.h b/pcsx2/x86/newVif.h
index 9db218179e..6d369a8d00 100644
--- a/pcsx2/x86/newVif.h
+++ b/pcsx2/x86/newVif.h
@@ -1,67 +1,67 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#ifdef newVif
-#include "x86emitter/x86emitter.h"
-using namespace x86Emitter;
-extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0);
-extern void _nVifUnpack(int idx, u8 *data, u32 size);
-
-typedef u32 (__fastcall *nVifCall)(void*, void*);
-
-static __pagealigned u8 nVifUpkExec[__pagesize*16];
-static __aligned16 nVifCall nVifUpk[(2*2*16)*4*4]; // ([USN][Masking][Unpack Type]) [curCycle][CyclesToWrite-1]
-static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
-
-#define _v0 0
-#define _v1 0x55
-#define _v2 0xaa
-#define _v3 0xff
-#define aMax(x, y) std::max(x,y)
-#define aMin(x, y) std::min(x,y)
-#define _f __forceinline
-
-#define xShiftR(regX, n) { \
- if (usn) { xPSRL.D(regX, n); } \
- else { xPSRA.D(regX, n); } \
-}
-
-static const u32 nVifT[16] = {
- 4, // S-32
- 2, // S-16
- 1, // S-8
- 0, // ----
- 8, // V2-32
- 4, // V2-16
- 2, // V2-8
- 0, // ----
- 12,// V3-32
- 6, // V3-16
- 3, // V3-8
- 0, // ----
- 16,// V4-32
- 8, // V4-16
- 4, // V4-8
- 2, // V4-5
-};
-
-#include "newVif_BlockBuffer.h"
-#include "newVif_OldUnpack.inl"
-#include "newVif_UnpackGen.inl"
-#include "newVif_Unpack.inl"
-
-#endif
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#ifdef newVif
+#include "x86emitter/x86emitter.h"
+using namespace x86Emitter;
+extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0);
+extern void _nVifUnpack(int idx, u8 *data, u32 size);
+
+typedef u32 (__fastcall *nVifCall)(void*, void*);
+
+static __pagealigned u8 nVifUpkExec[__pagesize*16];
+static __aligned16 nVifCall nVifUpk[(2*2*16)*4*4]; // ([USN][Masking][Unpack Type]) [curCycle][CyclesToWrite-1]
+static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
+
+#define _v0 0
+#define _v1 0x55
+#define _v2 0xaa
+#define _v3 0xff
+#define aMax(x, y) std::max(x,y)
+#define aMin(x, y) std::min(x,y)
+#define _f __forceinline
+
+#define xShiftR(regX, n) { \
+ if (usn) { xPSRL.D(regX, n); } \
+ else { xPSRA.D(regX, n); } \
+}
+
+static const u32 nVifT[16] = {
+ 4, // S-32
+ 2, // S-16
+ 1, // S-8
+ 0, // ----
+ 8, // V2-32
+ 4, // V2-16
+ 2, // V2-8
+ 0, // ----
+ 12,// V3-32
+ 6, // V3-16
+ 3, // V3-8
+ 0, // ----
+ 16,// V4-32
+ 8, // V4-16
+ 4, // V4-8
+ 2, // V4-5
+};
+
+#include "newVif_BlockBuffer.h"
+#include "newVif_OldUnpack.inl"
+#include "newVif_UnpackGen.inl"
+#include "newVif_Unpack.inl"
+
+#endif
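
nVifUpk[] above is a flat table of (2*2*16)*(4*4) = 1024 generated entry points, keyed by (usn, mask, unpack type) on the outside and (curCycle, cyclesToWrite) on the inside; VifUnpackIndexer::GetCall() in newVif_UnpackGen.inl computes exactly this index. A sketch of the layout, using an illustrative helper name:

    // Outer key: usn (0..1), mask (0..1), packType (0..15) -> 0..63
    // Inner key: curCycle (0..3), cyclesToWrite (0..3)     -> 0..15
    inline int nVifUpkIndex(int usn, int mask, int packType,
                            int curCycle, int cyclesToWrite) {
        int outer = (usn * 2 * 16) + (mask * 16) + packType;
        int inner = (curCycle * 4) + cyclesToWrite;
        return outer * (4 * 4) + inner; // 0..1023
    }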
diff --git a/pcsx2/x86/newVif_BlockBuffer.h b/pcsx2/x86/newVif_BlockBuffer.h
index 806565cff4..cb378c74e5 100644
--- a/pcsx2/x86/newVif_BlockBuffer.h
+++ b/pcsx2/x86/newVif_BlockBuffer.h
@@ -1,40 +1,40 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-class BlockBuffer {
-private:
- u32 mSize; // Cur Size
- u32 mSizeT; // Total Size
- u8* mData; // Data Ptr
- void grow(u32 newSize) {
- u8* temp = new u8[newSize];
- memcpy(temp, mData, mSizeT);
- safe_delete( mData );
- mData = temp;
- }
-public:
- BlockBuffer(u32 tSize) { mSizeT = tSize; mSize = 0; mData = new u8[mSizeT]; }
- virtual ~BlockBuffer() { safe_delete(mData); }
- void append(void *addr, u32 size) {
- if (mSize + size > mSizeT) grow(mSize*2 + size);
- memcpy(&mData[mSize], addr, size);
- mSize += size;
- }
- void clear() { mSize = 0; }
- u32 getSize() { return mSize; }
- u8* getBlock() { return mData; }
-};
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+class BlockBuffer {
+private:
+ u32 mSize; // Cur Size
+ u32 mSizeT; // Total Size
+ u8* mData; // Data Ptr
+ void grow(u32 newSize) {
+ u8* temp = new u8[newSize];
+ memcpy(temp, mData, mSize); // copy only the bytes in use
+ safe_delete( mData );
+ mData = temp;
+ mSizeT = newSize; // track the new capacity; otherwise append() re-grows every call and grow() under-copies
+ }
+public:
+ BlockBuffer(u32 tSize) { mSizeT = tSize; mSize = 0; mData = new u8[mSizeT]; }
+ virtual ~BlockBuffer() { safe_delete(mData); }
+ void append(void *addr, u32 size) {
+ if (mSize + size > mSizeT) grow(mSize*2 + size);
+ memcpy(&mData[mSize], addr, size);
+ mSize += size;
+ }
+ void clear() { mSize = 0; }
+ u32 getSize() { return mSize; }
+ u8* getBlock() { return mData; }
+};
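
BlockBuffer is a plain append-only byte buffer: append() copies into the backing store, and grow() reallocates to mSize*2 + size on overflow, so repeated appends amortize to O(1) per byte. A usage sketch, assuming PCSX2's u8/u32 typedefs and the capacity-tracking fix to grow() above:

    void blockBufferExample() {
        BlockBuffer buf(16);               // 16-byte initial backing store
        u8 chunk[24] = {0};
        buf.append(chunk, sizeof(chunk));  // overflow: grows to 0*2 + 24 = 24
        buf.append(chunk, sizeof(chunk));  // overflow: grows to 24*2 + 24 = 72
        u8* data = buf.getBlock();         // contiguous view of all 48 bytes
        u32 used = buf.getSize();          // 48
        buf.clear();                       // resets the size, keeps the storage
        (void)data; (void)used;
    }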
diff --git a/pcsx2/x86/newVif_OldUnpack.inl b/pcsx2/x86/newVif_OldUnpack.inl
index 3f19e93a43..6dcaebf2b1 100644
--- a/pcsx2/x86/newVif_OldUnpack.inl
+++ b/pcsx2/x86/newVif_OldUnpack.inl
@@ -1,167 +1,167 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-// Old Vif Unpack Code
-// Only here for testing/reference
-// If newVif is defined and newVif1 isn't, vif1 will use this code
-// same goes for vif0...
-template void VIFunpack<0>(u32 *data, vifCode *v, u32 size);
-template void VIFunpack<1>(u32 *data, vifCode *v, u32 size);
-template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size) {
- //if (!VIFdmanum) DevCon.WriteLn("vif#%d, size = %d [%x]", VIFdmanum, size, data);
- UNPACKFUNCTYPE func;
- const VIFUnpackFuncTable *ft;
- VURegs * VU;
- u8 *cdata = (u8*)data;
- u32 tempsize = 0;
- const u32 memlimit = vif_size(VIFdmanum);
-
- if (VIFdmanum == 0) {
- VU = &VU0;
- vifRegs = vif0Regs;
- vifMaskRegs = g_vif0Masks;
- vif = &vif0;
- vifRow = g_vifmask.Row0;
- }
- else {
- VU = &VU1;
- vifRegs = vif1Regs;
- vifMaskRegs = g_vif1Masks;
- vif = &vif1;
- vifRow = g_vifmask.Row1;
- }
-
- u32 *dest = (u32*)(VU->Mem + v->addr);
- u32 unpackType = v->cmd & 0xf;
-
- ft = &VIFfuncTable[ unpackType ];
- func = vif->usn ? ft->funcU : ft->funcS;
- size <<= 2;
-
- if (vifRegs->cycle.cl >= vifRegs->cycle.wl) { // skipping write
- if (v->addr >= memlimit) {
- DevCon.Warning("Overflown at the start");
- v->addr &= (memlimit - 1);
- dest = (u32*)(VU->Mem + v->addr);
- }
-
- size = min(size, (int)vifRegs->num * ft->gsize); //size will always be the same or smaller
-
- tempsize = v->addr + ((((vifRegs->num-1) / vifRegs->cycle.wl) *
- (vifRegs->cycle.cl - vifRegs->cycle.wl)) * 16) + (vifRegs->num * 16);
-
- //Sanity Check (memory overflow)
- if (tempsize > memlimit) {
- if (((vifRegs->cycle.cl != vifRegs->cycle.wl) &&
- ((memlimit + (vifRegs->cycle.cl - vifRegs->cycle.wl) * 16) == tempsize))) {
- //It's a red herring, so ignore it! SSE unpacks will be much quicker.
- DevCon.WriteLn("what!!!!!!!!!");
- //tempsize = 0;
- tempsize = size;
- size = 0;
- }
- else {
- DevCon.Warning("VIF%x Unpack ending %x > %x", VIFdmanum, tempsize, VIFdmanum ? 0x4000 : 0x1000);
- tempsize = size;
- size = 0;
- }
- }
- else {
- tempsize = size;
- size = 0;
- }
- if (tempsize) {
- int incdest = ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + 4;
- size = 0;
- int addrstart = v->addr;
- //if((tempsize >> 2) != v->size) DevCon.Warning("split when size != tagsize");
-
- VIFUNPACK_LOG("sorting tempsize :p, size %d, vifnum %d, addr %x", tempsize, vifRegs->num, v->addr);
-
- while ((tempsize >= ft->gsize) && (vifRegs->num > 0)) {
- if(v->addr >= memlimit) {
- DevCon.Warning("Mem limit overflow");
- v->addr &= (memlimit - 1);
- dest = (u32*)(VU->Mem + v->addr);
- }
-
- func(dest, (u32*)cdata, ft->qsize);
- cdata += ft->gsize;
- tempsize -= ft->gsize;
-
- vifRegs->num--;
- vif->cl++;
-
- if (vif->cl == vifRegs->cycle.wl) {
- dest += incdest;
- v->addr +=(incdest * 4);
- vif->cl = 0;
- }
- else {
- dest += 4;
- v->addr += 16;
- }
- }
- if (v->addr >= memlimit) {
- v->addr &=(memlimit - 1);
- dest = (u32*)(VU->Mem + v->addr);
- }
- v->addr = addrstart;
- if(tempsize > 0) size = tempsize;
- }
-
- if (size >= ft->dsize && vifRegs->num > 0) { //Else write what we do have
- DevCon.Warning("huh!!!!!!!!!!!!!!!!!!!!!!");
- VIF_LOG("warning, end with size = %d", size);
- // unpack one qword
- //v->addr += (size / ft->dsize) * 4;
- func(dest, (u32*)cdata, size / ft->dsize);
- size = 0;
- VIFUNPACK_LOG("leftover done, size %d, vifnum %d, addr %x", size, vifRegs->num, v->addr);
- }
- }
- else { // filling write
- if(vifRegs->cycle.cl > 0) // Quicker and avoids zero division :P
- if((u32)(((size / ft->gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num)
- DevCon.Warning("Filling write warning! %x < %x and CL = %x WL = %x", (size / ft->gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl);
-
- DevCon.Warning("filling write %d cl %d, wl %d mask %x mode %x unpacktype %x addr %x", vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mask, vifRegs->mode, unpackType, vif->tag.addr);
- while (vifRegs->num > 0) {
- if (vif->cl == vifRegs->cycle.wl) {
- vif->cl = 0;
- }
- // unpack one qword
- if (vif->cl < vifRegs->cycle.cl) {
- if(size < ft->gsize) { DevCon.WriteLn("Out of Filling write data!"); break; }
- func(dest, (u32*)cdata, ft->qsize);
- cdata += ft->gsize;
- size -= ft->gsize;
- vif->cl++;
- vifRegs->num--;
- if (vif->cl == vifRegs->cycle.wl) {
- vif->cl = 0;
- }
- }
- else {
- func(dest, (u32*)cdata, ft->qsize);
- v->addr += 16;
- vifRegs->num--;
- vif->cl++;
- }
- dest += 4;
- if (vifRegs->num == 0) break;
- }
- }
-}
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// Old Vif Unpack Code
+// Only here for testing/reference.
+// If newVif is defined and newVif1 isn't, vif1 will use this code;
+// the same goes for vif0.
+template void VIFunpack<0>(u32 *data, vifCode *v, u32 size);
+template void VIFunpack<1>(u32 *data, vifCode *v, u32 size);
+template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size) {
+ //if (!VIFdmanum) DevCon.WriteLn("vif#%d, size = %d [%x]", VIFdmanum, size, data);
+ UNPACKFUNCTYPE func;
+ const VIFUnpackFuncTable *ft;
+ VURegs * VU;
+ u8 *cdata = (u8*)data;
+ u32 tempsize = 0;
+ const u32 memlimit = vif_size(VIFdmanum);
+
+ if (VIFdmanum == 0) {
+ VU = &VU0;
+ vifRegs = vif0Regs;
+ vifMaskRegs = g_vif0Masks;
+ vif = &vif0;
+ vifRow = g_vifmask.Row0;
+ }
+ else {
+ VU = &VU1;
+ vifRegs = vif1Regs;
+ vifMaskRegs = g_vif1Masks;
+ vif = &vif1;
+ vifRow = g_vifmask.Row1;
+ }
+
+ u32 *dest = (u32*)(VU->Mem + v->addr);
+ u32 unpackType = v->cmd & 0xf;
+
+ ft = &VIFfuncTable[ unpackType ];
+ func = vif->usn ? ft->funcU : ft->funcS;
+ size <<= 2;
+
+ if (vifRegs->cycle.cl >= vifRegs->cycle.wl) { // skipping write
+ if (v->addr >= memlimit) {
+ DevCon.Warning("Overflown at the start");
+ v->addr &= (memlimit - 1);
+ dest = (u32*)(VU->Mem + v->addr);
+ }
+
+ size = min(size, (int)vifRegs->num * ft->gsize); //size will always be the same or smaller
+
+ tempsize = v->addr + ((((vifRegs->num-1) / vifRegs->cycle.wl) *
+ (vifRegs->cycle.cl - vifRegs->cycle.wl)) * 16) + (vifRegs->num * 16);
+
+ //Sanity Check (memory overflow)
+ if (tempsize > memlimit) {
+ if (((vifRegs->cycle.cl != vifRegs->cycle.wl) &&
+ ((memlimit + (vifRegs->cycle.cl - vifRegs->cycle.wl) * 16) == tempsize))) {
+ //It's a red herring, so ignore it! SSE unpacks will be much quicker.
+ DevCon.WriteLn("what!!!!!!!!!");
+ //tempsize = 0;
+ tempsize = size;
+ size = 0;
+ }
+ else {
+ DevCon.Warning("VIF%x Unpack ending %x > %x", VIFdmanum, tempsize, VIFdmanum ? 0x4000 : 0x1000);
+ tempsize = size;
+ size = 0;
+ }
+ }
+ else {
+ tempsize = size;
+ size = 0;
+ }
+ if (tempsize) {
+ int incdest = ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + 4;
+ size = 0;
+ int addrstart = v->addr;
+ //if((tempsize >> 2) != v->size) DevCon.Warning("split when size != tagsize");
+
+ VIFUNPACK_LOG("sorting tempsize :p, size %d, vifnum %d, addr %x", tempsize, vifRegs->num, v->addr);
+
+ while ((tempsize >= ft->gsize) && (vifRegs->num > 0)) {
+ if(v->addr >= memlimit) {
+ DevCon.Warning("Mem limit overflow");
+ v->addr &= (memlimit - 1);
+ dest = (u32*)(VU->Mem + v->addr);
+ }
+
+ func(dest, (u32*)cdata, ft->qsize);
+ cdata += ft->gsize;
+ tempsize -= ft->gsize;
+
+ vifRegs->num--;
+ vif->cl++;
+
+ if (vif->cl == vifRegs->cycle.wl) {
+ dest += incdest;
+ v->addr +=(incdest * 4);
+ vif->cl = 0;
+ }
+ else {
+ dest += 4;
+ v->addr += 16;
+ }
+ }
+ if (v->addr >= memlimit) {
+ v->addr &=(memlimit - 1);
+ dest = (u32*)(VU->Mem + v->addr);
+ }
+ v->addr = addrstart;
+ if(tempsize > 0) size = tempsize;
+ }
+
+ if (size >= ft->dsize && vifRegs->num > 0) { //Else write what we do have
+ DevCon.Warning("huh!!!!!!!!!!!!!!!!!!!!!!");
+ VIF_LOG("warning, end with size = %d", size);
+ // unpack one qword
+ //v->addr += (size / ft->dsize) * 4;
+ func(dest, (u32*)cdata, size / ft->dsize);
+ size = 0;
+ VIFUNPACK_LOG("leftover done, size %d, vifnum %d, addr %x", size, vifRegs->num, v->addr);
+ }
+ }
+ else { // filling write
+ if(vifRegs->cycle.cl > 0) // Quicker and avoids zero division :P
+ if((u32)(((size / ft->gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num)
+ DevCon.Warning("Filling write warning! %x < %x and CL = %x WL = %x", (size / ft->gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl);
+
+ DevCon.Warning("filling write %d cl %d, wl %d mask %x mode %x unpacktype %x addr %x", vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mask, vifRegs->mode, unpackType, vif->tag.addr);
+ while (vifRegs->num > 0) {
+ if (vif->cl == vifRegs->cycle.wl) {
+ vif->cl = 0;
+ }
+ // unpack one qword
+ if (vif->cl < vifRegs->cycle.cl) {
+ if(size < ft->gsize) { DevCon.WriteLn("Out of Filling write data!"); break; }
+ func(dest, (u32*)cdata, ft->qsize);
+ cdata += ft->gsize;
+ size -= ft->gsize;
+ vif->cl++;
+ vifRegs->num--;
+ if (vif->cl == vifRegs->cycle.wl) {
+ vif->cl = 0;
+ }
+ }
+ else {
+ func(dest, (u32*)cdata, ft->qsize);
+ v->addr += 16;
+ vifRegs->num--;
+ vif->cl++;
+ }
+ dest += 4;
+ if (vifRegs->num == 0) break;
+ }
+ }
+}
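
The CL/WL cycle registers select between the two branches above: CL >= WL is a skipping write (unpack WL qwords, then skip CL-WL qwords of VU memory before the next block), while CL < WL is a filling write (the gap is filled instead of skipped). The tempsize bound computed for the skipping case follows directly from that layout; a sketch of the same address math, using uint32_t in place of u32 and an illustrative helper name:

    #include <cstdint>

    // End address of a skipping-write unpack: each completed block of wl
    // written qwords is followed by a skipped gap of (cl - wl) qwords,
    // and a qword is 16 bytes.
    uint32_t skipWriteEndAddr(uint32_t startAddr, uint32_t num,
                              uint32_t cl, uint32_t wl) {
        uint32_t gaps = (num - 1) / wl;             // gaps fully crossed
        return startAddr + gaps * (cl - wl) * 16    // skipped bytes
                         + num * 16;                // written bytes
    }

When this bound exceeds the VU memory limit, the code above splits the unpack and wraps the address.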
diff --git a/pcsx2/x86/newVif_Unpack.inl b/pcsx2/x86/newVif_Unpack.inl
index 544560221d..6d1296f8c0 100644
--- a/pcsx2/x86/newVif_Unpack.inl
+++ b/pcsx2/x86/newVif_Unpack.inl
@@ -1,261 +1,279 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-// newVif! - author: cottonvibes(@gmail.com)
-
-#pragma once
-
-struct nVifStruct {
- u32 idx; // VIF0 or VIF1
- vifStruct* vif; // Vif Struct ptr
- VIFregisters* vifRegs; // Vif Regs ptr
- VURegs* VU; // VU Regs ptr
- u8* vuMemEnd; // End of VU Memory
- u32 vuMemLimit; // Use for fast AND
- BlockBuffer* vifBlock; // Block Buffer
-};
-nVifStruct nVif[2];
-
-void initNewVif(int idx) {
- nVif[idx].idx = idx;
- nVif[idx].VU = idx ? &VU1 : &VU0;
- nVif[idx].vif = idx ? &vif1 : &vif0;
- nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
- nVif[idx].vifBlock = new BlockBuffer(0x2000); // 8kb Block Buffer
- nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
- nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
-
- HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false);
- memset8<0xcc>( nVifUpkExec );
-
- xSetPtr( nVifUpkExec );
-
- for (int a = 0; a < 2; a++) {
- for (int b = 0; b < 2; b++) {
- for (int c = 0; c < 4; c++) {
- for (int d = 0; d < 3; d++) {
- nVifGen(a, b, c, d);
- }}}}
-
- HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
-}
-
-int nVifUnpack(int idx, u32 *data) {
- XMMRegisters::Freeze();
- //BlockBuffer* vB = nVif[idx].vifBlock;
- int ret = aMin(vif1.vifpacketsize, vif1.tag.size);
- vif1.tag.size -= ret;
- _nVifUnpack(idx, (u8*)data, ret<<2);
- if (vif1.tag.size <= 0) vif1.tag.size = 0;
- if (vif1.tag.size <= 0) vif1.cmd = 0;
- XMMRegisters::Thaw();
- return ret;
-}
-
-_f u8* setVUptr(int idx, int offset) {
- return (u8*)(nVif[idx].VU->Mem + (offset & nVif[idx].vuMemLimit));
-}
-
-_f void incVUptr(int idx, u8* &ptr, int amount) {
- ptr += amount;
- int diff = ptr - nVif[idx].vuMemEnd;
- if (diff >= 0) {
- ptr = nVif[idx].VU->Mem + diff;
- }
- if ((uptr)ptr & 0xf) DevCon.WriteLn("unaligned wtf :(");
-}
-
-static void setMasks(const VIFregisters& v) {
- for (int i = 0; i < 16; i++) {
- int m = (v.mask >> (i*2)) & 3;
- switch (m) {
- case 0: // Data
- nVifMask[0][i/4][i%4] = 0xffffffff;
- nVifMask[1][i/4][i%4] = 0;
- nVifMask[2][i/4][i%4] = 0;
- break;
- case 1: // Row
- nVifMask[0][i/4][i%4] = 0;
- nVifMask[1][i/4][i%4] = 0;
- nVifMask[2][i/4][i%4] = ((u32*)&v.r0)[(i%4)*4];
- break;
- case 2: // Col
- nVifMask[0][i/4][i%4] = 0;
- nVifMask[1][i/4][i%4] = 0;
- nVifMask[2][i/4][i%4] = ((u32*)&v.c0)[(i/4)*4];
- break;
- case 3: // Write Protect
- nVifMask[0][i/4][i%4] = 0;
- nVifMask[1][i/4][i%4] = 0xffffffff;
- nVifMask[2][i/4][i%4] = 0;
- break;
- }
- }
-}
-
-// ----------------------------------------------------------------------------
-// Unpacking Optimization notes:
-// ----------------------------------------------------------------------------
-// Some games send a LOT of small packets. This is a problem because the new VIF unpacker
-// has a lot of setup code to establish which unpack function to call. The best way to
-// optimize this is to cache the unpack function's base (see fnbase below) and update it
-// when the variables it depends on are modified: writes to vif->tag.cmd and vif->usn.
-//
-// A secondary optimization would be adding special handlers for packets where vifRegs->num==1.
-// (which would remove the loop, simplify the incVUptr code, etc). But checking for it has
-// to be simple enough that it doesn't offset the benefits (which I'm not sure is possible).
-// -- air
-
-
-template< int idx, bool doMode, bool isFill >
-__releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
-{
- // Eh... template attempt, tho not sure it helped much. There's too much setup code (see
- // optimization note above) -- air
-
- const int usn = !!(vif->usn);
- const int doMask = !!(vif->tag.cmd & 0x10);
- const int upkNum = vif->tag.cmd & 0xf;
- const u32& vift = nVifT[upkNum];
-
- u8* dest = setVUptr(idx, vif->tag.addr);
- const VIFUnpackFuncTable& ft = VIFfuncTable[vif->tag.cmd & 0xf];
- UNPACKFUNCTYPE func = vif->usn ? ft.funcU : ft.funcS;
-
- const nVifCall* fnbase = &nVifUpk[
- ((usn*2*16) + (doMask*16) + (upkNum)) * (4*4)
- ];
-
- const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
- const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
-
- if (doMask)
- setMasks(*vifRegs);
-
- if (vif->cl >= blockSize) {
- vif->cl = 0;
- }
-
- while (vifRegs->num > 0) {
- if (vif->cl < cycleSize) {
- //if (size <= 0) { DbgCon.WriteLn("_nVifUnpack: Out of Data!"); break; }
- if (doMode /*|| doMask*/) {
- //if (doMask)
- //DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum);
- func((u32*)dest, (u32*)data, ft.qsize);
- data += ft.gsize;
- size -= ft.gsize;
- vifRegs->num--;
- }
- else if (1) {
- //DevCon.WriteLn("SSE Unpack!");
- fnbase[aMin(vif->cl, 4) * 4](dest, data);
- data += vift;
- size -= vift;
- vifRegs->num--;
- }
- else {
-
- //DevCon.WriteLn("SSE Unpack!");
- int c = aMin((cycleSize - vif->cl), 3);
- size -= vift * c;
- //if (c>1) { DevCon.WriteLn("C > 1!"); }
- if (c<0||c>3) { DbgCon.WriteLn("C wtf!"); }
- if (size < 0) { DbgCon.WriteLn("Size Shit"); size+=vift*c;c=1;size-=vift*c;}
- fnbase[(aMin(vif->cl, 4) * 4) + c-1](dest, data);
- data += vift * c;
- vifRegs->num -= c;
- }
- }
- else if (isFill) {
- func((u32*)dest, (u32*)data, ft.qsize);
- vifRegs->num--;
- }
- incVUptr(idx, dest, 16);
-
- // Removing this modulo was a huge speedup for God of War. (62->73 fps)
- // (GoW uses a lot of blockSize==1 packets, resulting in tons of loops -- so the biggest
- // factor in performance ends up being the top-level conditionals of the loop, and
- // also the loop prep code.) --air
-
- //vif->cl = (vif->cl+1) % blockSize;
- if( ++vif->cl == blockSize ) vif->cl = 0;
- }
-}
-
-void _nVifUnpack(int idx, u8 *data, u32 size) {
- /*if (nVif[idx].vifRegs->cycle.cl >= nVif[idx].vifRegs->cycle.wl) { // skipping write
- if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2);
- else VIFunpack<1>((u32*)data, &vif1.tag, size>>2);
- return;
- }
- else*/ { // filling write
- vif = nVif[idx].vif;
- vifRegs = nVif[idx].vifRegs;
-
- const bool doMode = !!vifRegs->mode;
- const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
-
- //UnpackLoopTable[idx][doMode][isFill]( data, size );
-
- if( idx )
- {
- if( doMode )
- {
- if( isFill )
- _nVifUnpackLoop<1,true,true>( data, size );
- else
- _nVifUnpackLoop<1,true,false>( data, size );
- }
- else
- {
- if( isFill )
- _nVifUnpackLoop<1,false,true>( data, size );
- else
- _nVifUnpackLoop<1,false,false>( data, size );
- }
- }
- else
- {
- pxFailDev( "No VIF0 support yet, sorry!" );
- }
-
- //if (isFill)
- //DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok"));
- //DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num);
-
- }
-}
-
-//int nVifUnpack(int idx, u32 *data) {
-// XMMRegisters::Freeze();
-// BlockBuffer* vB = nVif[idx].vifBlock;
-// int ret = aMin(vif1.vifpacketsize, vif1.tag.size);
-// //vB->append(data, ret<<2);
-// vif1.tag.size -= ret;
-// //DevCon.WriteLn("2 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2);
-// //if (vif1.tag.size <= 0) {
-// //DevCon.WriteLn("3 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2);
-// //VIFunpack<1>(vB->getBlock(), &vif1.tag, vB->getSize()>>2);
-// //_nVifUnpack(idx, vB->getBlock(), vB->getSize());
-// _nVifUnpack(idx, (u8*)data, ret<<2);
-// if (vif1.tag.size <= 0) vif1.tag.size = 0;
-// if (vif1.tag.size <= 0) vif1.cmd = 0;
-// //vB->clear();
-// //}
-// //else { vif1.tag.size+=ret; ret = -1; vB->clear(); }
-// XMMRegisters::Thaw();
-// return ret;
-//}
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// newVif! - author: cottonvibes(@gmail.com)
+
+#pragma once
+
+struct nVifStruct {
+ u32 idx; // VIF0 or VIF1
+ vifStruct* vif; // Vif Struct ptr
+ VIFregisters* vifRegs; // Vif Regs ptr
+ VURegs* VU; // VU Regs ptr
+ u8* vuMemEnd; // End of VU Memory
+ u32 vuMemLimit; // Use for fast AND
+ BlockBuffer* vifBlock; // Block Buffer
+};
+
+static __aligned16 nVifStruct nVif[2];
+
+void initNewVif(int idx) {
+ nVif[idx].idx = idx;
+ nVif[idx].VU = idx ? &VU1 : &VU0;
+ nVif[idx].vif = idx ? &vif1 : &vif0;
+ nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
+ nVif[idx].vifBlock = new BlockBuffer(0x2000); // 8KB Block Buffer
+ nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
+ nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
+
+ HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false);
+ memset8<0xcc>( nVifUpkExec );
+
+ xSetPtr( nVifUpkExec );
+
+ for (int a = 0; a < 2; a++) {
+ for (int b = 0; b < 2; b++) {
+ for (int c = 0; c < 4; c++) {
+ for (int d = 0; d < 3; d++) {
+ nVifGen(a, b, c, d);
+ }}}}
+
+ HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
+}
+
+int nVifUnpack(int idx, u32 *data) {
+ XMMRegisters::Freeze();
+ //BlockBuffer* vB = nVif[idx].vifBlock;
+ int ret = aMin(vif1.vifpacketsize, vif1.tag.size);
+ vif1.tag.size -= ret;
+ _nVifUnpack(idx, (u8*)data, ret<<2);
+ if (vif1.tag.size <= 0) vif1.tag.size = 0;
+ if (vif1.tag.size <= 0) vif1.cmd = 0;
+ XMMRegisters::Thaw();
+ return ret;
+}
+
+_f u8* setVUptr(int idx, int offset) {
+ return (u8*)(nVif[idx].VU->Mem + (offset & nVif[idx].vuMemLimit));
+}
+
+_f void incVUptr(int idx, u8* &ptr, int amount) {
+ ptr += amount;
+ int diff = ptr - nVif[idx].vuMemEnd;
+ if (diff >= 0) {
+ ptr = nVif[idx].VU->Mem + diff;
+ }
+ if ((uptr)ptr & 0xf) DevCon.WriteLn("unaligned wtf :(");
+}
+
+static void setMasks(const VIFregisters& v) {
+ for (int i = 0; i < 16; i++) {
+ int m = (v.mask >> (i*2)) & 3;
+ switch (m) {
+ case 0: // Data
+ nVifMask[0][i/4][i%4] = 0xffffffff;
+ nVifMask[1][i/4][i%4] = 0;
+ nVifMask[2][i/4][i%4] = 0;
+ break;
+ case 1: // Row
+ nVifMask[0][i/4][i%4] = 0;
+ nVifMask[1][i/4][i%4] = 0;
+ nVifMask[2][i/4][i%4] = ((u32*)&v.r0)[(i%4)*4];
+ break;
+ case 2: // Col
+ nVifMask[0][i/4][i%4] = 0;
+ nVifMask[1][i/4][i%4] = 0;
+ nVifMask[2][i/4][i%4] = ((u32*)&v.c0)[(i/4)*4];
+ break;
+ case 3: // Write Protect
+ nVifMask[0][i/4][i%4] = 0;
+ nVifMask[1][i/4][i%4] = 0xffffffff;
+ nVifMask[2][i/4][i%4] = 0;
+ break;
+ }
+ }
+}
+
+// ----------------------------------------------------------------------------
+// Unpacking Optimization notes:
+// ----------------------------------------------------------------------------
+// Some games send a LOT of small packets. This is a problem because the new VIF unpacker
+// has a lot of setup code to establish which unpack function to call. The best way to
+// optimize this is to cache the unpack function's base (see fnbase below) and update it
+// when the variables it depends on are modified: writes to vif->tag.cmd and vif->usn.
+// Problem: vif->tag.cmd is modified a lot. Like, constantly. So won't work.
+//
+// A secondary optimization would be adding special handlers for packets where vifRegs->num==1.
+// (which would remove the loop, simplify the incVUptr code, etc). But checking for it has
+// to be simple enough that it doesn't offset the benefits (which I'm not sure is possible).
+// -- air
+
+
+//template< int idx, bool doMode, bool isFill >
+//__releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
+__releaseinline void __fastcall _nVifUnpackLoop( int idx, u8 *data, u32 size )
+{
+ // comment out the following 2 lines to test templated version...
+ const bool doMode = !!vifRegs->mode;
+ const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
+
+ const int usn = !!(vif->usn);
+ const int doMask = !!(vif->tag.cmd & 0x10);
+ const int upkNum = vif->tag.cmd & 0xf;
+ const u32& vift = nVifT[upkNum];
+
+ u8* dest = setVUptr(idx, vif->tag.addr);
+ const VIFUnpackFuncTable& ft = VIFfuncTable[upkNum];
+ UNPACKFUNCTYPE func = usn ? ft.funcU : ft.funcS;
+
+ // Did a bunch of work to make it so I could optimize this index lookup to outside
+ // the main loop but it was for naught -- too often the loop is only 1-2 iterations,
+ // so this setup code ends up being slower (1 iter) or same speed (2 iters).
+ const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + (doMask*16) + (upkNum)) * (4*4) ];
+
+ const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
+ const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
+
+ if (doMask)
+ setMasks(*vifRegs);
+
+ if (vif->cl >= blockSize) {
+
+ // This condition doesn't appear to ever occur, and really it never should.
+ // Normally it wouldn't matter, but even simple setup code matters here (see
+ // optimization notes above) >_<
+
+ vif->cl = 0;
+ }
+
+ while (vifRegs->num > 0) {
+ if (vif->cl < cycleSize) {
+ //if (size <= 0) { DbgCon.WriteLn("_nVifUnpack: Out of Data!"); break; }
+ if (doMode /*|| doMask*/) {
+ //if (doMask)
+ //DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum);
+ func((u32*)dest, (u32*)data, ft.qsize);
+ data += ft.gsize;
+ size -= ft.gsize;
+ vifRegs->num--;
+ }
+ else if (1) {
+ //DevCon.WriteLn("SSE Unpack!");
+ fnbase[aMin(vif->cl, 4) * 4](dest, data);
+ data += vift;
+ size -= vift;
+ vifRegs->num--;
+ }
+ else {
+ //DevCon.WriteLn("SSE Unpack!");
+ int c = aMin((cycleSize - vif->cl), 3);
+ size -= vift * c;
+ //if (c>1) { DevCon.WriteLn("C > 1!"); }
+ if (c<0||c>3) { DbgCon.WriteLn("C wtf!"); }
+ if (size < 0) { DbgCon.WriteLn("Size Shit"); size+=vift*c;c=1;size-=vift*c;}
+ fnbase[(aMin(vif->cl, 4) * 4) + c-1](dest, data);
+ data += vift * c;
+ vifRegs->num -= c;
+ }
+ }
+ else if (isFill) {
+ func((u32*)dest, (u32*)data, ft.qsize);
+ vifRegs->num--;
+ }
+ incVUptr(idx, dest, 16);
+
+ // Removing this modulo was a huge speedup for God of War start menu. (62->73 fps)
+ // (GoW and tri-ace games both use a lot of blockSize==1 packets, resulting in tons
+ // of loops -- so the biggest factor in performance ends up being the top-level
+ // conditionals of the loop, and also the loop prep code.) --air
+
+ //vif->cl = (vif->cl+1) % blockSize;
+ if( ++vif->cl == blockSize ) vif->cl = 0;
+ }
+}
+
+void _nVifUnpack(int idx, u8 *data, u32 size) {
+ /*if (nVif[idx].vifRegs->cycle.cl >= nVif[idx].vifRegs->cycle.wl) { // skipping write
+ if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2);
+ else VIFunpack<1>((u32*)data, &vif1.tag, size>>2);
+ return;
+ }
+ else*/ { // filling write
+
+ vif = nVif[idx].vif;
+ vifRegs = nVif[idx].vifRegs;
+
+#if 1
+ _nVifUnpackLoop( idx, data, size );
+#else
+ // Eh... template attempt, though it didn't help much. There's too much setup code,
+ // and the template only optimizes code inside the loop, which often seems to
+ // run only once or twice anyway. Better to use recompilation than templating
+ // anyway, but I'll leave it in for now for reference. -- air
+
+ const bool doMode = !!vifRegs->mode;
+ const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
+
+ //UnpackLoopTable[idx][doMode][isFill]( data, size );
+
+ if( idx )
+ {
+ if( doMode )
+ {
+ if( isFill )
+ _nVifUnpackLoop<1,true,true>( data, size );
+ else
+ _nVifUnpackLoop<1,true,false>( data, size );
+ }
+ else
+ {
+ if( isFill )
+ _nVifUnpackLoop<1,false,true>( data, size );
+ else
+ _nVifUnpackLoop<1,false,false>( data, size );
+ }
+ }
+ else
+ {
+ pxFailDev( "No VIF0 support yet, sorry!" );
+ }
+#endif
+ //if (isFill)
+ //DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok"));
+ //DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num);
+
+ }
+}
+
+//int nVifUnpack(int idx, u32 *data) {
+// XMMRegisters::Freeze();
+// BlockBuffer* vB = nVif[idx].vifBlock;
+// int ret = aMin(vif1.vifpacketsize, vif1.tag.size);
+// //vB->append(data, ret<<2);
+// vif1.tag.size -= ret;
+// //DevCon.WriteLn("2 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2);
+// //if (vif1.tag.size <= 0) {
+// //DevCon.WriteLn("3 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2);
+// //VIFunpack<1>(vB->getBlock(), &vif1.tag, vB->getSize()>>2);
+// //_nVifUnpack(idx, vB->getBlock(), vB->getSize());
+// _nVifUnpack(idx, (u8*)data, ret<<2);
+// if (vif1.tag.size <= 0) vif1.tag.size = 0;
+// if (vif1.tag.size <= 0) vif1.cmd = 0;
+// //vB->clear();
+// //}
+// //else { vif1.tag.size+=ret; ret = -1; vB->clear(); }
+// XMMRegisters::Thaw();
+// return ret;
+//}
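
setMasks() above expands the VIF mask register, sixteen 2-bit fields (vector i%4 of write-cycle i/4), into three 128-bit rows per cycle: an AND mask for the unpacked data, an AND mask for the existing destination, and an OR constant carrying the row/column value. A sketch of the per-field decode, using uint32_t in place of u32:

    #include <cstdint>

    // Mode of one mask field: 0 = take unpacked data, 1 = row register,
    // 2 = column register, 3 = write-protect (keep the destination).
    int maskMode(uint32_t mask, int cycle, int vec) {
        int field = cycle * 4 + vec;            // 0..15
        return (int)((mask >> (field * 2)) & 3);
    }

The generated xMaskWrite code then applies these rows with two PANDs and two PORs per quadword.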
diff --git a/pcsx2/x86/newVif_UnpackGen.inl b/pcsx2/x86/newVif_UnpackGen.inl
index 8a6be76fc9..e735704e62 100644
--- a/pcsx2/x86/newVif_UnpackGen.inl
+++ b/pcsx2/x86/newVif_UnpackGen.inl
@@ -1,256 +1,255 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#define xMaskWrite(regX, x) { \
- if (x==0) xMOVAPS(xmm7, ptr32[ecx]); \
- if (x==1) xMOVAPS(xmm7, ptr32[ecx+0x10]); \
- if (x==2) xMOVAPS(xmm7, ptr32[ecx+0x20]); \
- int offX = aMin(curCycle+x, 4); \
- xPAND(regX, ptr32[nVifMask[0][offX]]); \
- xPAND(xmm7, ptr32[nVifMask[1][offX]]); \
- xPOR (regX, ptr32[nVifMask[2][offX]]); \
- xPOR (regX, xmm7); \
- if (x==0) xMOVAPS(ptr32[ecx], regX); \
- if (x==1) xMOVAPS(ptr32[ecx+0x10], regX); \
- if (x==2) xMOVAPS(ptr32[ecx+0x20], regX); \
-}
-
-#define xMovDest(reg0, reg1, reg2) { \
- if (mask==0) { \
- if (cycles>=0) { xMOVAPS (ptr32[ecx], reg0); } \
- if (cycles>=1) { xMOVAPS (ptr32[ecx+0x10], reg1); } \
- if (cycles>=2) { xMOVAPS (ptr32[ecx+0x20], reg2); } \
- } \
- else { \
- if (cycles>=0) { xMaskWrite(reg0, 0); } \
- if (cycles>=1) { xMaskWrite(reg1, 1); } \
- if (cycles>=2) { xMaskWrite(reg2, 2); } \
- } \
-}
-
-// xmm2 gets result
-void convertRGB() {
- xPSLL.D (xmm1, 3); // ABG|R5.000
- xMOVAPS (xmm2, xmm1);// R5.000 (garbage upper bits)
- xPSRL.D (xmm1, 8); // ABG
- xPSLL.D (xmm1, 3); // AB|G5.000
- xMOVAPS (xmm3, xmm1);// G5.000 (garbage upper bits)
- xPSRL.D (xmm1, 8); // AB
- xPSLL.D (xmm1, 3); // A|B5.000
- xMOVAPS (xmm4, xmm1);// B5.000 (garbage upper bits)
- xPSRL.D (xmm1, 8); // A
- xPSLL.D (xmm1, 7); // A.0000000
-
- xPSHUF.D (xmm1, xmm1, _v0); // A|A|A|A
- xPSHUF.D (xmm3, xmm3, _v0); // G|G|G|G
- xPSHUF.D (xmm4, xmm4, _v0); // B|B|B|B
- mVUmergeRegs(XMM2, XMM1, 0x3); // A|x|x|R
- mVUmergeRegs(XMM2, XMM3, 0x4); // A|x|G|R
- mVUmergeRegs(XMM2, XMM4, 0x2); // A|B|G|R
-
- xPSLL.D (xmm2, 24); // can optimize to
- xPSRL.D (xmm2, 24); // single AND...
-}
-
-struct VifUnpackIndexer
-{
- int usn, mask;
- int curCycle, cyclesToWrite;
-
- nVifCall& GetCall( int packType ) const
- {
- int usnpart = usn*2*16;
- int maskpart = mask*16;
- int packpart = packType;
-
- int curpart = curCycle*4;
- int cycpespart = cyclesToWrite;
-
- return nVifUpk[((usnpart+maskpart+packpart)*(4*4)) + (curpart+cycpespart)];
- }
-
- void xSetCall( int packType ) const
- {
- xAlignPtr(16);
- GetCall( packType ) = (nVifCall)xGetPtr();
- }
-
- void xSetNullCall( int packType ) const
- {
- GetCall( packType ) = NULL;
- }
-};
-
-// ecx = dest, edx = src
-void nVifGen(int usn, int mask, int curCycle, int cycles) {
- const VifUnpackIndexer indexer = { usn, mask, curCycle, cycles };
-
- indexer.xSetCall(0x0); // S-32
- if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
- if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0);
- if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1);
- if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2);
- if (cycles>=0) xMovDest (xmm1, xmm2, xmm3);
- xRET();
-
- indexer.xSetCall(0x1); // S-16
- if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
- if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
- if (cycles>=0) xShiftR (xmm0, 16);
- if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0);
- if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1);
- if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2);
- if (cycles>=0) xMovDest (xmm1, xmm2, xmm3);
- xRET();
-
- indexer.xSetCall(0x2); // S-8
- if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
- if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
- if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
- if (cycles>=0) xShiftR (xmm0, 24);
- if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0);
- if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1);
- if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2);
- if (cycles>=0) xMovDest (xmm1, xmm2, xmm3);
- xRET();
-
- indexer.xSetNullCall(0x3); // ----
-
- indexer.xSetCall(0x4); // V2-32
- if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
- if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x10]);
- if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe);
- if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
- xRET();
-
- indexer.xSetCall(0x5); // V2-16
- if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
- if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2);
- if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
- if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
- if (cycles>=0) xShiftR (xmm0, 16);
- if (cycles>=2) xShiftR (xmm2, 16);
- if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe);
- if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
- xRET();
-
- indexer.xSetCall(0x6); // V2-8
- if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
- if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
- if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2);
- if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
- if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
- if (cycles>=0) xShiftR (xmm0, 24);
- if (cycles>=2) xShiftR (xmm2, 24);
- if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe);
- if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
- xRET();
-
- indexer.xSetNullCall(0x7); // ----
-
- indexer.xSetCall(0x8); // V3-32
- if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
- if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+12]);
- if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+24]);
- if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
- xRET();
-
- indexer.xSetCall(0x9); // V3-16
- if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
- if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+6]);
- if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+12]);
- if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
- if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1);
- if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
- if (cycles>=0) xShiftR (xmm0, 16);
- if (cycles>=1) xShiftR (xmm1, 16);
- if (cycles>=2) xShiftR (xmm2, 16);
- if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
- xRET();
-
- indexer.xSetCall(0xa); // V3-8
- if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
- if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+3]);
- if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+6]);
- if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
- if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1);
- if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2);
- if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
- if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1);
- if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
- if (cycles>=0) xShiftR (xmm0, 24);
- if (cycles>=1) xShiftR (xmm1, 24);
- if (cycles>=2) xShiftR (xmm2, 24);
- if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
- xRET();
-
- indexer.xSetNullCall(0xb); // ----
-
- indexer.xSetCall(0xc); // V4-32
- if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
- if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]);
- if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]);
- if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
- xRET();
-
- indexer.xSetCall(0xd); // V4-16
- if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
- if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]);
- if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]);
- if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
- if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1);
- if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
- if (cycles>=0) xShiftR (xmm0, 16);
- if (cycles>=1) xShiftR (xmm1, 16);
- if (cycles>=2) xShiftR (xmm2, 16);
- if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
- xRET();
-
- indexer.xSetCall(0xe); // V4-8
- if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
- if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+4]);
- if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+8]);
- if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
- if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1);
- if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2);
- if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
- if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1);
- if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
- if (cycles>=0) xShiftR (xmm0, 24);
- if (cycles>=1) xShiftR (xmm1, 24);
- if (cycles>=2) xShiftR (xmm2, 24);
- if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
- xRET();
-
- // A | B5 | G5 | R5
- // ..0.. A 0000000 | ..0.. B 000 | ..0.. G 000 | ..0.. R 000
- indexer.xSetCall(0xf); // V4-5
- if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
- if (cycles>=0) xMOVAPS (xmm1, xmm0);
- if (cycles>=0) convertRGB();
- if (cycles>=0) xMOVAPS (ptr32[ecx], xmm2);
- if (cycles>=1) xMOVAPS (xmm1, xmm0);
- if (cycles>=1) xPSRL.D (xmm1, 16);
- if (cycles>=1) convertRGB();
- if (cycles>=1) xMOVAPS (ptr32[ecx+0x10], xmm2);
- if (cycles>=2) xPSHUF.D (xmm1, xmm0, _v1);
- if (cycles>=2) convertRGB();
- if (cycles>=2) xMOVAPS (ptr32[ecx+0x20], xmm2);
- xRET();
-
- pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );
-}
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#define xMaskWrite(regX, x) { \
+ if (x==0) xMOVAPS(xmm7, ptr32[ecx]); \
+ if (x==1) xMOVAPS(xmm7, ptr32[ecx+0x10]); \
+ if (x==2) xMOVAPS(xmm7, ptr32[ecx+0x20]); \
+ int offX = aMin(curCycle+x, 4); \
+ xPAND(regX, ptr32[nVifMask[0][offX]]); \
+ xPAND(xmm7, ptr32[nVifMask[1][offX]]); \
+ xPOR (regX, ptr32[nVifMask[2][offX]]); \
+ xPOR (regX, xmm7); \
+ if (x==0) xMOVAPS(ptr32[ecx], regX); \
+ if (x==1) xMOVAPS(ptr32[ecx+0x10], regX); \
+ if (x==2) xMOVAPS(ptr32[ecx+0x20], regX); \
+}
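+
+// In scalar terms, each masked qword write above computes
+//   dest = (src & nVifMask[0][offX]) | (dest & nVifMask[1][offX]) | nVifMask[2][offX];
+// i.e. mask plane 0 keeps unpacked data, plane 1 write-protects existing dest
+// data, and plane 2 ORs in constant (row/col) data. The exact mask layout is
+// an assumption here; the masks themselves are built by the newVif init code.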
+
+#define xMovDest(reg0, reg1, reg2) { \
+ if (mask==0) { \
+ if (cycles>=0) { xMOVAPS (ptr32[ecx], reg0); } \
+ if (cycles>=1) { xMOVAPS (ptr32[ecx+0x10], reg1); } \
+ if (cycles>=2) { xMOVAPS (ptr32[ecx+0x20], reg2); } \
+ } \
+ else { \
+ if (cycles>=0) { xMaskWrite(reg0, 0); } \
+ if (cycles>=1) { xMaskWrite(reg1, 1); } \
+ if (cycles>=2) { xMaskWrite(reg2, 2); } \
+ } \
+}
+
+// Converts the V4-5 (A|B5|G5|R5) values in xmm1; the result lands in xmm2.
+// xmm3 and xmm4 are clobbered as temporaries.
+void convertRGB() {
+ xPSLL.D (xmm1, 3); // ABG|R5.000
+ xMOVAPS (xmm2, xmm1);// R5.000 (garbage upper bits)
+ xPSRL.D (xmm1, 8); // ABG
+ xPSLL.D (xmm1, 3); // AB|G5.000
+ xMOVAPS (xmm3, xmm1);// G5.000 (garbage upper bits)
+ xPSRL.D (xmm1, 8); // AB
+ xPSLL.D (xmm1, 3); // A|B5.000
+ xMOVAPS (xmm4, xmm1);// B5.000 (garbage upper bits)
+ xPSRL.D (xmm1, 8); // A
+ xPSLL.D (xmm1, 7); // A.0000000
+
+ xPSHUF.D (xmm1, xmm1, _v0); // A|A|A|A
+ xPSHUF.D (xmm3, xmm3, _v0); // G|G|G|G
+ xPSHUF.D (xmm4, xmm4, _v0); // B|B|B|B
+ mVUmergeRegs(XMM2, XMM1, 0x3); // A|x|x|R
+ mVUmergeRegs(XMM2, XMM3, 0x4); // A|x|G|R
+ mVUmergeRegs(XMM2, XMM4, 0x2); // A|B|G|R
+
+ xPSLL.D (xmm2, 24); // can optimize to
+ xPSRL.D (xmm2, 24); // single AND...
+}
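+
+// Worked example for one 16-bit V4-5 value 0x801F (A=1, B5=0, G5=0, R5=31):
+// the shift ladder above produces R = 31<<3 = 0xF8, G = 0, B = 0, and
+// A = 1<<7 = 0x80, which the merges pack into one A|B|G|R doubleword per lane.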
+
+struct VifUnpackIndexer
+{
+ int usn, mask;
+ int curCycle, cyclesToWrite;
+
+ nVifCall& GetCall( int packType ) const
+ {
+ int usnpart = usn*2*16;
+ int maskpart = mask*16;
+ int packpart = packType;
+
+ int curpart = curCycle*4;
+ int cyclespart = cyclesToWrite;
+
+ return nVifUpk[((usnpart+maskpart+packpart)*(4*4)) + (curpart+cyclespart)];
+ }
+
+ void xSetCall( int packType ) const
+ {
+ GetCall( packType ) = (nVifCall)xGetAlignedCallTarget();
+ }
+
+ void xSetNullCall( int packType ) const
+ {
+ GetCall( packType ) = NULL;
+ }
+};
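+
+// Example of the indexing math: usn=1, mask=0, packType=0xc (V4-32),
+// curCycle=0, cyclesToWrite=2 yields ((32 + 0 + 12) * 16) + (0 + 2) = 706,
+// so each (usn, mask, packType) triple owns a contiguous 4x4 block of
+// nVifUpk slots, one per (curCycle, cyclesToWrite) pair.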
+
+// ecx = dest, edx = src
+void nVifGen(int usn, int mask, int curCycle, int cycles) {
+ const VifUnpackIndexer indexer = { usn, mask, curCycle, cycles };
+
+ indexer.xSetCall(0x0); // S-32
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0);
+ if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1);
+ if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2);
+ if (cycles>=0) xMovDest (xmm1, xmm2, xmm3);
+ xRET();
+
+ indexer.xSetCall(0x1); // S-16
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
+ if (cycles>=0) xShiftR (xmm0, 16);
+ if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0);
+ if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1);
+ if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2);
+ if (cycles>=0) xMovDest (xmm1, xmm2, xmm3);
+ xRET();
+
+ indexer.xSetCall(0x2); // S-8
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
+ if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
+ if (cycles>=0) xShiftR (xmm0, 24);
+ if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0);
+ if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1);
+ if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2);
+ if (cycles>=0) xMovDest (xmm1, xmm2, xmm3);
+ xRET();
+
+ indexer.xSetNullCall(0x3); // ----
+
+ indexer.xSetCall(0x4); // V2-32
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x10]);
+ if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe);
+ if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
+ xRET();
+
+ indexer.xSetCall(0x5); // V2-16
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2);
+ if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
+ if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
+ if (cycles>=0) xShiftR (xmm0, 16);
+ if (cycles>=2) xShiftR (xmm2, 16);
+ if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe);
+ if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
+ xRET();
+
+ indexer.xSetCall(0x6); // V2-8
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
+ if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2);
+ if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
+ if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
+ if (cycles>=0) xShiftR (xmm0, 24);
+ if (cycles>=2) xShiftR (xmm2, 24);
+ if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe);
+ if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
+ xRET();
+
+ indexer.xSetNullCall(0x7); // ----
+
+ indexer.xSetCall(0x8); // V3-32
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+12]);
+ if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+24]);
+ if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
+ xRET();
+
+ indexer.xSetCall(0x9); // V3-16
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+6]);
+ if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+12]);
+ if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
+ if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1);
+ if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
+ if (cycles>=0) xShiftR (xmm0, 16);
+ if (cycles>=1) xShiftR (xmm1, 16);
+ if (cycles>=2) xShiftR (xmm2, 16);
+ if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
+ xRET();
+
+ indexer.xSetCall(0xa); // V3-8
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+3]);
+ if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+6]);
+ if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
+ if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1);
+ if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2);
+ if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
+ if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1);
+ if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
+ if (cycles>=0) xShiftR (xmm0, 24);
+ if (cycles>=1) xShiftR (xmm1, 24);
+ if (cycles>=2) xShiftR (xmm2, 24);
+ if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
+ xRET();
+
+ indexer.xSetNullCall(0xb); // ----
+
+ indexer.xSetCall(0xc); // V4-32
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]);
+ if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]);
+ if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
+ xRET();
+
+ indexer.xSetCall(0xd); // V4-16
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]);
+ if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]);
+ if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
+ if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1);
+ if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
+ if (cycles>=0) xShiftR (xmm0, 16);
+ if (cycles>=1) xShiftR (xmm1, 16);
+ if (cycles>=2) xShiftR (xmm2, 16);
+ if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
+ xRET();
+
+ indexer.xSetCall(0xe); // V4-8
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+4]);
+ if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+8]);
+ if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
+ if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1);
+ if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2);
+ if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
+ if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1);
+ if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
+ if (cycles>=0) xShiftR (xmm0, 24);
+ if (cycles>=1) xShiftR (xmm1, 24);
+ if (cycles>=2) xShiftR (xmm2, 24);
+ if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
+ xRET();
+
+ // A | B5 | G5 | R5
+ // ..0.. A 0000000 | ..0.. B 000 | ..0.. G 000 | ..0.. R 000
+ indexer.xSetCall(0xf); // V4-5
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=0) xMOVAPS (xmm1, xmm0);
+ if (cycles>=0) convertRGB();
+ if (cycles>=0) xMOVAPS (ptr32[ecx], xmm2);
+ if (cycles>=1) xMOVAPS (xmm1, xmm0);
+ if (cycles>=1) xPSRL.D (xmm1, 16);
+ if (cycles>=1) convertRGB();
+ if (cycles>=1) xMOVAPS (ptr32[ecx+0x10], xmm2);
+ if (cycles>=2) xPSHUF.D (xmm1, xmm0, _v1);
+ if (cycles>=2) convertRGB();
+ if (cycles>=2) xMOVAPS (ptr32[ecx+0x20], xmm2);
+ xRET();
+
+ pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );
+}
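+
+// nVifGen is expected to run once per (usn, mask, curCycle, cycles) combo at
+// init time so that every nVifUpk slot gets filled. A sketch of the driving
+// loop (an assumption; the real loop lives in the newVif init code):
+//
+// for (int a = 0; a < 2; a++) // usn
+// for (int b = 0; b < 2; b++) // mask
+// for (int c = 0; c < 4; c++) // curCycle
+// for (int d = 0; d < 4; d++) // cyclesToWrite
+// nVifGen(a, b, c, d);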
diff --git a/pcsx2/x86/sVU_Micro.cpp b/pcsx2/x86/sVU_Micro.cpp
index b5e28abc37..c5c01f6228 100644
--- a/pcsx2/x86/sVU_Micro.cpp
+++ b/pcsx2/x86/sVU_Micro.cpp
@@ -1,1739 +1,1739 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "PrecompiledHeader.h"
-
-#include "Common.h"
-#include "GS.h"
-#include "R5900OpcodeTables.h"
-#include "iR5900.h"
-#include "iMMI.h"
-#include "iFPU.h"
-#include "iCOP0.h"
-#include "VUmicro.h"
-#include "VUflags.h"
-#include "sVU_Micro.h"
-#include "sVU_Debug.h"
-#include "sVU_zerorec.h"
-
-#ifdef _WIN32
-#pragma warning(disable:4244)
-#pragma warning(disable:4761)
-#endif
-//------------------------------------------------------------------
-
-// fixme - VUmicro should really use its own static vars for pc and branch.
-// Sharing with the EE's copies of pc and branch is not cool! (air)
-
-//------------------------------------------------------------------
-// Helper Macros
-//------------------------------------------------------------------
-#define _Ft_ (( VU->code >> 16) & 0x1F) // The rt part of the instruction register
-#define _Fs_ (( VU->code >> 11) & 0x1F) // The rd part of the instruction register
-#define _Fd_ (( VU->code >> 6) & 0x1F) // The sa part of the instruction register
-#define _It_ (_Ft_ & 15)
-#define _Is_ (_Fs_ & 15)
-#define _Id_ (_Fd_ & 15)
-
-#define _X (( VU->code>>24) & 0x1)
-#define _Y (( VU->code>>23) & 0x1)
-#define _Z (( VU->code>>22) & 0x1)
-#define _W (( VU->code>>21) & 0x1)
-
-#define _XYZW_SS (_X+_Y+_Z+_W==1)
-
-#define _Fsf_ (( VU->code >> 21) & 0x03)
-#define _Ftf_ (( VU->code >> 23) & 0x03)
-
-#define _Imm11_ (s32)(VU->code & 0x400 ? 0xfffffc00 | (VU->code & 0x3ff) : VU->code & 0x3ff)
-#define _UImm11_ (s32)(VU->code & 0x7ff)
-
-#define VU_VFx_ADDR(x) (uptr)&VU->VF[x].UL[0]
-#define VU_VFy_ADDR(x) (uptr)&VU->VF[x].UL[1]
-#define VU_VFz_ADDR(x) (uptr)&VU->VF[x].UL[2]
-#define VU_VFw_ADDR(x) (uptr)&VU->VF[x].UL[3]
-
-#define VU_REGR_ADDR (uptr)&VU->VI[REG_R]
-#define VU_REGQ_ADDR (uptr)&VU->VI[REG_Q]
-#define VU_REGMAC_ADDR (uptr)&VU->VI[REG_MAC_FLAG]
-
-#define VU_VI_ADDR(x, read) GetVIAddr(VU, x, read, info)
-
-#define VU_ACCx_ADDR (uptr)&VU->ACC.UL[0]
-#define VU_ACCy_ADDR (uptr)&VU->ACC.UL[1]
-#define VU_ACCz_ADDR (uptr)&VU->ACC.UL[2]
-#define VU_ACCw_ADDR (uptr)&VU->ACC.UL[3]
-
-#define _X_Y_Z_W ((( VU->code >> 21 ) & 0xF ) )
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// Global Variables
-//------------------------------------------------------------------
-int vucycle;
-
-const __aligned16 float s_fones[8] = {1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f};
-const __aligned16 u32 s_mask[4] = {0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff};
-const __aligned16 u32 s_expmask[4] = {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
-const __aligned16 u32 g_minvals[4] = {0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff};
-const __aligned16 u32 g_maxvals[4] = {0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff};
-const __aligned16 u32 const_clip[8] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
- 0x80000000, 0x80000000, 0x80000000, 0x80000000};
-
-const __aligned(64) u32 g_ones[4] = {0x00000001, 0x00000001, 0x00000001, 0x00000001};
-
-const __aligned16 u32 g_minvals_XYZW[16][4] =
-{
- { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //0000
- { 0xffffffff, 0xffffffff, 0xffffffff, 0xff7fffff }, //0001
- { 0xffffffff, 0xffffffff, 0xff7fffff, 0xffffffff }, //0010
- { 0xffffffff, 0xffffffff, 0xff7fffff, 0xff7fffff }, //0011
- { 0xffffffff, 0xff7fffff, 0xffffffff, 0xffffffff }, //0100
- { 0xffffffff, 0xff7fffff, 0xffffffff, 0xff7fffff }, //0101
- { 0xffffffff, 0xff7fffff, 0xff7fffff, 0xffffffff }, //0110
- { 0xffffffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //0111
- { 0xff7fffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000
- { 0xff7fffff, 0xffffffff, 0xffffffff, 0xff7fffff }, //1001
- { 0xff7fffff, 0xffffffff, 0xff7fffff, 0xffffffff }, //1010
- { 0xff7fffff, 0xffffffff, 0xff7fffff, 0xff7fffff }, //1011
- { 0xff7fffff, 0xff7fffff, 0xffffffff, 0xffffffff }, //1100
- { 0xff7fffff, 0xff7fffff, 0xffffffff, 0xff7fffff }, //1101
- { 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xffffffff }, //1110
- { 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //1111
-};
-const __aligned16 u32 g_maxvals_XYZW[16][4] =
-{
- { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //0000
- { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7f7fffff }, //0001
- { 0x7fffffff, 0x7fffffff, 0x7f7fffff, 0x7fffffff }, //0010
- { 0x7fffffff, 0x7fffffff, 0x7f7fffff, 0x7f7fffff }, //0011
- { 0x7fffffff, 0x7f7fffff, 0x7fffffff, 0x7fffffff }, //0100
- { 0x7fffffff, 0x7f7fffff, 0x7fffffff, 0x7f7fffff }, //0101
- { 0x7fffffff, 0x7f7fffff, 0x7f7fffff, 0x7fffffff }, //0110
- { 0x7fffffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //0111
- { 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1000
- { 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7f7fffff }, //1001
- { 0x7f7fffff, 0x7fffffff, 0x7f7fffff, 0x7fffffff }, //1010
- { 0x7f7fffff, 0x7fffffff, 0x7f7fffff, 0x7f7fffff }, //1011
- { 0x7f7fffff, 0x7f7fffff, 0x7fffffff, 0x7fffffff }, //1100
- { 0x7f7fffff, 0x7f7fffff, 0x7fffffff, 0x7f7fffff }, //1101
- { 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7fffffff }, //1110
- { 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //1111
-};
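-
-// Both tables above are indexed by the 4-bit xyzw write mask: lanes whose bit
-// is set hold +/-FLT_MAX (0x7f7fffff / 0xff7fffff) and get clamped, while
-// clear lanes hold 0x7fffffff / 0xffffffff, which are identity values for the
-// integer PMINSD/PMINUD clamps used below, leaving those lanes untouched.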
-//------------------------------------------------------------------
-
-//------------------------------------------------------------------
-// VU Pipeline/Test Stalls/Analyzing Functions
-//------------------------------------------------------------------
-void _recvuFMACflush(VURegs * VU, bool intermediate) {
- int i;
-
- for (i=0; i<8; i++) {
- if (VU->fmac[i].enable == 0) continue;
-
- if( intermediate ) {
- if ((vucycle - VU->fmac[i].sCycle) > VU->fmac[i].Cycle) {
-// VUM_LOG("flushing FMAC pipe[%d]", i);
- VU->fmac[i].enable = 0;
- }
- }
- else {
- if ((vucycle - VU->fmac[i].sCycle) >= VU->fmac[i].Cycle) {
-// VUM_LOG("flushing FMAC pipe[%d]", i);
- VU->fmac[i].enable = 0;
- }
- }
- }
-}
-
-void _recvuFDIVflush(VURegs * VU, bool intermediate) {
- if (VU->fdiv.enable == 0) return;
-
- if( intermediate ) {
- if ((vucycle - VU->fdiv.sCycle) > VU->fdiv.Cycle) {
-// Console.WriteLn("flushing FDIV pipe");
- VU->fdiv.enable = 0;
- }
- }
- else {
- if ((vucycle - VU->fdiv.sCycle) >= VU->fdiv.Cycle) {
-// Console.WriteLn("flushing FDIV pipe");
- VU->fdiv.enable = 0;
- }
- }
-}
-
-void _recvuEFUflush(VURegs * VU, bool intermediate) {
- if (VU->efu.enable == 0) return;
-
- if( intermediate ) {
- if ((vucycle - VU->efu.sCycle) > VU->efu.Cycle) {
-// Console.WriteLn("flushing FDIV pipe");
- VU->efu.enable = 0;
- }
- }
- else {
- if ((vucycle - VU->efu.sCycle) >= VU->efu.Cycle) {
-// Console.WriteLn("flushing FDIV pipe");
- VU->efu.enable = 0;
- }
- }
-}
-
-void _recvuIALUflush(VURegs * VU, bool intermediate) {
- int i;
-
- for (i=0; i<8; i++) {
- if (VU->ialu[i].enable == 0) continue;
-
- if( intermediate ) {
- if ((vucycle - VU->ialu[i].sCycle) > VU->ialu[i].Cycle) {
-// VUM_LOG("flushing IALU pipe[%d]", i);
- VU->ialu[i].enable = 0;
- }
- }
- else {
- if ((vucycle - VU->ialu[i].sCycle) >= VU->ialu[i].Cycle) {
-// VUM_LOG("flushing IALU pipe[%d]", i);
- VU->ialu[i].enable = 0;
- }
- }
- }
-}
-
-void _recvuTestPipes(VURegs * VU, bool intermediate) { // intermediate = true if called by upper FMAC stall detection
- _recvuFMACflush(VU, intermediate);
- _recvuFDIVflush(VU, intermediate);
- _recvuEFUflush(VU, intermediate);
- _recvuIALUflush(VU, intermediate);
-}
-
-void _recvuFMACTestStall(VURegs * VU, int reg, int xyzw) {
- int cycle;
- int i;
- u32 mask = 0;
-
- for (i=0; i<8; i++) {
- if (VU->fmac[i].enable == 0) continue;
- if (VU->fmac[i].reg == reg && (VU->fmac[i].xyzw & xyzw)) break;
- }
-
- if (i == 8) return;
-
- // do a per-channel delay
- // old code
-// cycle = VU->fmac[i].Cycle - (vucycle - VU->fmac[i].sCycle);
-
- // new code
- mask = 4; // w
-// if( VU->fmac[i].xyzw & 1 ) mask = 4; // w
-// else if( VU->fmac[i].xyzw & 2 ) mask = 3; // z
-// else if( VU->fmac[i].xyzw & 4 ) mask = 2; // y
-// else {
-// assert(VU->fmac[i].xyzw & 8 );
-// mask = 1; // x
-// }
-
-// mask = 0;
-// if( VU->fmac[i].xyzw & 1 ) mask++; // w
-// else if( VU->fmac[i].xyzw & 2 ) mask++; // z
-// else if( VU->fmac[i].xyzw & 4 ) mask++; // y
-// else if( VU->fmac[i].xyzw & 8 ) mask++; // x
-
- assert( (int)VU->fmac[i].sCycle < (int)vucycle );
- cycle = 0;
- if( vucycle - VU->fmac[i].sCycle < mask )
- cycle = mask - (vucycle - VU->fmac[i].sCycle);
-
- VU->fmac[i].enable = 0;
- vucycle+= cycle;
- _recvuTestPipes(VU, true); // for lower instructions
-}
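-
-// Net effect of the above: a flat 4-cycle FMAC result latency (mask = 4) is
-// charged from the issuing cycle, instead of the per-component completion
-// tracking that the commented-out variants attempted.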
-
-void _recvuIALUTestStall(VURegs * VU, int reg) {
- int cycle;
- int i;
- u32 latency;
-
- for (i=0; i<8; i++) {
- if (VU->ialu[i].enable == 0) continue;
- if (VU->ialu[i].reg == reg) break;
- }
-
- if (i == 8) return;
-
- latency = VU->ialu[i].Cycle + 1;
- cycle = 0;
- if( vucycle - VU->ialu[i].sCycle < latency )
- cycle = latency - (vucycle - VU->ialu[i].sCycle);
-
- VU->ialu[i].enable = 0;
- vucycle+= cycle;
- _recvuTestPipes(VU, true);
-}
-
-void _recvuFMACAdd(VURegs * VU, int reg, int xyzw) {
- int i;
-
- /* find a free fmac pipe */
- for (i=0; i<8; i++) {
- if (VU->fmac[i].enable == 1) continue;
- break;
- }
-
- if (i==8) Console.Error("*PCSX2*: error, out of fmacs");
-// VUM_LOG("adding FMAC pipe[%d]; reg %d", i, reg);
-
- VU->fmac[i].enable = 1;
- VU->fmac[i].sCycle = vucycle;
- VU->fmac[i].Cycle = 3;
- VU->fmac[i].xyzw = xyzw;
- VU->fmac[i].reg = reg;
-}
-
-void _recvuFDIVAdd(VURegs * VU, int cycles) {
-// Console.WriteLn("adding FDIV pipe");
- VU->fdiv.enable = 1;
- VU->fdiv.sCycle = vucycle;
- VU->fdiv.Cycle = cycles;
-}
-
-void _recvuEFUAdd(VURegs * VU, int cycles) {
-// Console.WriteLn("adding EFU pipe");
- VU->efu.enable = 1;
- VU->efu.sCycle = vucycle;
- VU->efu.Cycle = cycles;
-}
-
-void _recvuIALUAdd(VURegs * VU, int reg, int cycles) {
- int i;
-
- /* find a free ialu pipe */
- for (i=0; i<8; i++) {
- if (VU->ialu[i].enable == 1) continue;
- break;
- }
-
- if (i==8) Console.Error("*PCSX2*: error, out of ialus");
-
- VU->ialu[i].enable = 1;
- VU->ialu[i].sCycle = vucycle;
- VU->ialu[i].Cycle = cycles;
- VU->ialu[i].reg = reg;
-}
-
-void _recvuTestIALUStalls(VURegs * VU, _VURegsNum *VUregsn) {
-
- int VIread0 = 0, VIread1 = 0; // max 2 integer registers are read simultaneously
- int i;
-
- for(i=0;i<16;i++) { // find used integer(vi00-vi15) registers
- if( (VUregsn->VIread >> i) & 1 ) {
- if( VIread0 ) VIread1 = i;
- else VIread0 = i;
- }
- }
-
- if( VIread0 ) _recvuIALUTestStall(VU, VIread0);
- if( VIread1 ) _recvuIALUTestStall(VU, VIread1);
-}
-
-void _recvuAddIALUStalls(VURegs * VU, _VURegsNum *VUregsn) {
- if (VUregsn->VIwrite && VUregsn->cycles) {
- int VIWrite0 = 0;
- int i;
-
- for(i=0;i<16;i++) { // find used(vi00-vi15) registers
- if( (VUregsn->VIwrite >> i) & 1 ) {
- VIWrite0 = i;
- }
- }
- if( VIWrite0 ) _recvuIALUAdd(VU, VIWrite0, VUregsn->cycles);
- }
-}
-
-void _recvuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn, bool upper) {
-
- if( VUregsn->VFread0 && (VUregsn->VFread0 == VUregsn->VFread1) ) {
- _recvuFMACTestStall(VU, VUregsn->VFread0, VUregsn->VFr0xyzw|VUregsn->VFr1xyzw);
- }
- else {
- if (VUregsn->VFread0) _recvuFMACTestStall(VU, VUregsn->VFread0, VUregsn->VFr0xyzw);
- if (VUregsn->VFread1) _recvuFMACTestStall(VU, VUregsn->VFread1, VUregsn->VFr1xyzw);
- }
-
- if( !upper && VUregsn->VIread ) _recvuTestIALUStalls(VU, VUregsn); // for lower instructions which read integer reg
-}
-
-void _recvuAddFMACStalls(VURegs * VU, _VURegsNum *VUregsn) {
-
- if (VUregsn->VFwrite) _recvuFMACAdd(VU, VUregsn->VFwrite, VUregsn->VFwxyzw);
- else if (VUregsn->VIwrite & (1 << REG_CLIP_FLAG)) _recvuFMACAdd(VU, -REG_CLIP_FLAG, 0); // REG_CLIP_FLAG pipe
- else _recvuFMACAdd(VU, 0, 0); // cause no data dependency with fp registers
-}
-
-void _recvuFlushFDIV(VURegs * VU) {
- int cycle;
-
- if (VU->fdiv.enable == 0) return;
-
- cycle = VU->fdiv.Cycle + 1 - (vucycle - VU->fdiv.sCycle); //VU->fdiv.Cycle contains the latency minus 1 (6 or 12)
-// Console.WriteLn("waiting FDIV pipe %d", cycle);
- VU->fdiv.enable = 0;
- vucycle+= cycle;
-}
-
-void _recvuFlushEFU(VURegs * VU) {
- int cycle;
-
- if (VU->efu.enable == 0) return;
-
- cycle = VU->efu.Cycle - (vucycle - VU->efu.sCycle);
-// Console.WriteLn("waiting FDIV pipe %d", cycle);
- VU->efu.enable = 0;
- vucycle+= cycle;
-}
-
-void _recvuTestFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) {
- _recvuTestFMACStalls(VU,VUregsn, false);
- _recvuFlushFDIV(VU);
-}
-
-void _recvuTestEFUStalls(VURegs * VU, _VURegsNum *VUregsn) {
- _recvuTestFMACStalls(VU,VUregsn, false);
- _recvuFlushEFU(VU);
-}
-
-void _recvuAddFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) {
-// _vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn);
- if (VUregsn->VIwrite & (1 << REG_Q)) {
- _recvuFDIVAdd(VU, VUregsn->cycles);
- }
-}
-
-void _recvuAddEFUStalls(VURegs * VU, _VURegsNum *VUregsn) {
-// _vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn);
- if (VUregsn->VIwrite & (1 << REG_P)) {
- _recvuEFUAdd(VU, VUregsn->cycles);
- }
-}
-
-void _recvuTestUpperStalls(VURegs * VU, _VURegsNum *VUregsn) {
- switch (VUregsn->pipe) {
- case VUPIPE_FMAC: _recvuTestFMACStalls(VU, VUregsn, true); break;
- }
-}
-
-void _recvuTestLowerStalls(VURegs * VU, _VURegsNum *VUregsn) {
- switch (VUregsn->pipe) {
- case VUPIPE_FMAC: _recvuTestFMACStalls(VU, VUregsn, false); break;
- case VUPIPE_FDIV: _recvuTestFDIVStalls(VU, VUregsn); break;
- case VUPIPE_EFU: _recvuTestEFUStalls(VU, VUregsn); break;
- case VUPIPE_IALU: _recvuTestIALUStalls(VU, VUregsn); break;
- case VUPIPE_BRANCH: _recvuTestIALUStalls(VU, VUregsn); break;
- }
-}
-
-void _recvuAddUpperStalls(VURegs * VU, _VURegsNum *VUregsn) {
- switch (VUregsn->pipe) {
- case VUPIPE_FMAC: _recvuAddFMACStalls(VU, VUregsn); break;
- }
-}
-
-void _recvuAddLowerStalls(VURegs * VU, _VURegsNum *VUregsn) {
- switch (VUregsn->pipe) {
- case VUPIPE_FMAC: _recvuAddFMACStalls(VU, VUregsn); break;
- case VUPIPE_FDIV: _recvuAddFDIVStalls(VU, VUregsn); break;
- case VUPIPE_EFU: _recvuAddEFUStalls(VU, VUregsn); break;
- case VUPIPE_IALU: _recvuAddIALUStalls(VU, VUregsn); break; // note: only ILW and ILWR cause stall in IALU pipe
- }
-}
-
-void SuperVUAnalyzeOp(VURegs *VU, _vuopinfo *info, _VURegsNum* pCodeRegs)
-{
- _VURegsNum* lregs;
- _VURegsNum* uregs;
- int *ptr;
-
- lregs = pCodeRegs;
- uregs = pCodeRegs+1;
-
- ptr = (int*)&VU->Micro[pc];
- pc += 8;
-
- if (ptr[1] & 0x40000000) { // EOP
- branch |= 8;
- }
-
- VU->code = ptr[1];
- if (VU == &VU1) VU1regs_UPPER_OPCODE[VU->code & 0x3f](uregs);
- else VU0regs_UPPER_OPCODE[VU->code & 0x3f](uregs);
-
- _recvuTestUpperStalls(VU, uregs);
- switch(VU->code & 0x3f) {
- case 0x10: case 0x11: case 0x12: case 0x13:
- case 0x14: case 0x15: case 0x16: case 0x17:
- case 0x1d: case 0x1f:
- case 0x2b: case 0x2f:
- break;
-
- case 0x3c:
- switch ((VU->code >> 6) & 0x1f) {
- case 0x4: case 0x5:
- break;
- default:
- info->statusflag = 4;
- info->macflag = 4;
- break;
- }
- break;
- case 0x3d:
- switch ((VU->code >> 6) & 0x1f) {
- case 0x4: case 0x5: case 0x7:
- break;
- default:
- info->statusflag = 4;
- info->macflag = 4;
- break;
- }
- break;
- case 0x3e:
- switch ((VU->code >> 6) & 0x1f) {
- case 0x4: case 0x5:
- break;
- default:
- info->statusflag = 4;
- info->macflag = 4;
- break;
- }
- break;
- case 0x3f:
- switch ((VU->code >> 6) & 0x1f) {
- case 0x4: case 0x5: case 0x7: case 0xb:
- break;
- default:
- info->statusflag = 4;
- info->macflag = 4;
- break;
- }
- break;
-
- default:
- info->statusflag = 4;
- info->macflag = 4;
- break;
- }
-
- if (uregs->VIread & (1 << REG_Q)) { info->q |= 2; }
- if (uregs->VIread & (1 << REG_P)) { info->p |= 2; assert( VU == &VU1 ); }
-
- // check upper flags
- if (ptr[1] & 0x80000000) { // I flag
- info->cycle = vucycle;
- memzero(*lregs);
- }
- else {
-
- VU->code = ptr[0];
- if (VU == &VU1) VU1regs_LOWER_OPCODE[VU->code >> 25](lregs);
- else VU0regs_LOWER_OPCODE[VU->code >> 25](lregs);
-
- _recvuTestLowerStalls(VU, lregs);
- info->cycle = vucycle;
-
- if (lregs->pipe == VUPIPE_BRANCH) {
- branch |= 1;
- }
-
- if (lregs->VIwrite & (1 << REG_Q)) {
- info->q |= 4;
- info->cycles = lregs->cycles;
- info->pqinst = (VU->code&2)>>1; // rsqrt is 2
- }
- else if (lregs->pipe == VUPIPE_FDIV) {
- info->q |= 8|1;
- info->pqinst = 0;
- }
-
- if (lregs->VIwrite & (1 << REG_P)) {
- assert( VU == &VU1 );
- info->p |= 4;
- info->cycles = lregs->cycles;
-
- switch( VU->code & 0xff ) {
- case 0xfd: info->pqinst = 0; break; //eatan
- case 0x7c: info->pqinst = 0; break; //eatanxy
- case 0x7d: info->pqinst = 0; break; //eatanzy
- case 0xfe: info->pqinst = 1; break; //eexp
- case 0xfc: info->pqinst = 2; break; //esin
- case 0x3f: info->pqinst = 3; break; //erleng
- case 0x3e: info->pqinst = 4; break; //eleng
- case 0x3d: info->pqinst = 4; break; //ersadd
- case 0xbd: info->pqinst = 4; break; //ersqrt
- case 0xbe: info->pqinst = 5; break; //ercpr
- case 0xbc: info->pqinst = 5; break; //esqrt
- case 0x7e: info->pqinst = 5; break; //esum
- case 0x3c: info->pqinst = 6; break; //esadd
- default: assert(0);
- }
- }
- else if (lregs->pipe == VUPIPE_EFU) {
- info->p |= 8|1;
- }
-
- if (lregs->VIread & (1 << REG_STATUS_FLAG)) info->statusflag|= VUOP_READ;
- if (lregs->VIread & (1 << REG_MAC_FLAG)) info->macflag|= VUOP_READ;
-
- if (lregs->VIwrite & (1 << REG_STATUS_FLAG)) info->statusflag|= VUOP_WRITE;
- if (lregs->VIwrite & (1 << REG_MAC_FLAG)) info->macflag|= VUOP_WRITE;
-
- if (lregs->VIread & (1 << REG_Q)) { info->q |= 2; }
- if (lregs->VIread & (1 << REG_P)) { info->p |= 2; assert( VU == &VU1 ); }
-
- _recvuAddLowerStalls(VU, lregs);
- }
-
- _recvuAddUpperStalls(VU, uregs);
- _recvuTestPipes(VU, false);
-
- vucycle++;
-}
-
-int eeVURecompileCode(VURegs *VU, _VURegsNum* regs)
-{
- int info = 0;
- int vfread0=-1, vfread1 = -1, vfwrite = -1, vfacc = -1, vftemp=-1;
-
- assert( regs != NULL );
-
- if( regs->VFread0 ) _addNeededVFtoXMMreg(regs->VFread0);
- if( regs->VFread1 ) _addNeededVFtoXMMreg(regs->VFread1);
- if( regs->VFwrite ) _addNeededVFtoXMMreg(regs->VFwrite);
- if( regs->VIread & (1<<REG_ACC_FLAG) ) _addNeededACCtoXMMreg();
- if( regs->VIread & (1<<REG_VF0_FLAG) ) _addNeededVFtoXMMreg(0);
-
- if( regs->VFread0 ) vfread0 = _allocVFtoXMMreg(VU, -1, regs->VFread0, MODE_READ);
- else if( regs->VIread & (1<<REG_VF0_FLAG) ) vfread0 = _allocVFtoXMMreg(VU, -1, 0, MODE_READ);
- if( regs->VFread1 ) vfread1 = _allocVFtoXMMreg(VU, -1, regs->VFread1, MODE_READ);
- else if( (regs->VIread & (1<<REG_VF0_FLAG)) && regs->VFr1xyzw != 0xff) vfread1 = _allocVFtoXMMreg(VU, -1, 0, MODE_READ);
-
- if( regs->VIread & (1<<REG_ACC_FLAG) ) {
- vfacc = _allocACCtoXMMreg(VU, -1, ((regs->VIwrite&(1<<REG_ACC_FLAG))?MODE_WRITE:0)|MODE_READ);
- }
- else if( regs->VIwrite & (1<<REG_ACC_FLAG) ) {
- vfacc = _allocACCtoXMMreg(VU, -1, MODE_WRITE|(regs->VFwxyzw != 0xf?MODE_READ:0));
- }
-
- if( regs->VFwrite ) {
- assert( !(regs->VIwrite&(1<<REG_ACC_FLAG)) );
- vfwrite = _allocVFtoXMMreg(VU, -1, regs->VFwrite, MODE_WRITE|(regs->VFwxyzw != 0xf?MODE_READ:0));
- }
-
- if( vfacc>= 0 ) info |= PROCESS_EE_SET_ACC(vfacc);
- if( vfwrite >= 0 ) {
- if( regs->VFwrite == _Ft_ && vfread1 < 0 ) {
- info |= PROCESS_EE_SET_T(vfwrite);
- }
- else {
- assert( regs->VFwrite == _Fd_ );
- info |= PROCESS_EE_SET_D(vfwrite);
- }
- }
-
- if( vfread0 >= 0 ) info |= PROCESS_EE_SET_S(vfread0);
- if( vfread1 >= 0 ) info |= PROCESS_EE_SET_T(vfread1);
-
- vftemp = _allocTempXMMreg(XMMT_FPS, -1);
- info |= PROCESS_VU_SET_TEMP(vftemp);
-
- if( regs->VIwrite & (1 << REG_CLIP_FLAG) ) {
- // CLIP inst, needs two extra temp registers; put them in EEREC_D and EEREC_ACC
- int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
- int t2reg = _allocTempXMMreg(XMMT_FPS, -1);
-
- info |= PROCESS_EE_SET_D(t1reg);
- info |= PROCESS_EE_SET_ACC(t2reg);
-
- _freeXMMreg(t1reg); // don't need
- _freeXMMreg(t2reg); // don't need
- }
- else if( regs->VIwrite & (1<<REG_P) ) {
- int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
- info |= PROCESS_EE_SET_D(t1reg);
- _freeXMMreg(t1reg); // don't need
- }
-
- return info;
-}
-
-// Returns the correct address for the given VI register.
-uptr GetVIAddr(VURegs * VU, int reg, int read, int info)
-{
- if( info & PROCESS_VU_SUPER ) return SuperVUGetVIAddr(reg, read);
- if( info & PROCESS_VU_COP2 ) return (uptr)&VU->VI[reg].UL;
-
- if( read != 1 ) {
- if( reg == REG_MAC_FLAG ) return (uptr)&VU->macflag;
- if( reg == REG_CLIP_FLAG ) return (uptr)&VU->clipflag;
- if( reg == REG_STATUS_FLAG ) return (uptr)&VU->statusflag;
- if( reg == REG_Q ) return (uptr)&VU->q;
- if( reg == REG_P ) return (uptr)&VU->p;
- }
-
- return (uptr)&VU->VI[reg].UL;
-}
-
-// gets a temp reg that is not EEREC_TEMP
-int _vuGetTempXMMreg(int info)
-{
- int t1reg = -1;
-
- if( _hasFreeXMMreg() ) {
- t1reg = _allocTempXMMreg(XMMT_FPS, -1);
-
- if( t1reg == EEREC_TEMP ) {
- if( _hasFreeXMMreg() ) {
- int t = _allocTempXMMreg(XMMT_FPS, -1);
- _freeXMMreg(t1reg);
- t1reg = t;
- }
- else {
- _freeXMMreg(t1reg);
- t1reg = -1;
- }
- }
- }
-
- return t1reg;
-}
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// Misc VU Reg Flipping/Merging Functions
-//------------------------------------------------------------------
-void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw)
-{
- switch (xyzw) {
- case 0: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x00); break;
- case 1: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x55); break;
- case 2: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xaa); break;
- case 3: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xff); break;
- }
-}
-
-void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw)
-{
- switch (xyzw) {
- case 0: SSE_MOVSS_XMM_to_XMM(dstreg, srcreg); break;
- case 1: if ( x86caps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(1, 0, 0));
- else SSE2_PSHUFLW_XMM_to_XMM(dstreg, srcreg, 0xee);
- break;
- case 2: SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); break;
- case 3: if ( x86caps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(3, 0, 0));
- else { SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); SSE2_PSHUFLW_XMM_to_XMM(dstreg, dstreg, 0xee); }
- break;
- }
-}
-
-void _vuFlipRegSS(VURegs * VU, int reg)
-{
- assert( _XYZW_SS );
- if( _Y ) SSE2_PSHUFLW_XMM_to_XMM(reg, reg, 0x4e);
- else if( _Z ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6);
- else if( _W ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27);
-}
-
-void _vuFlipRegSS_xyzw(int reg, int xyzw)
-{
- switch ( xyzw ) {
- case 1: SSE2_PSHUFLW_XMM_to_XMM(reg, reg, 0x4e); break;
- case 2: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6); break;
- case 3: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27); break;
- }
-}
-
-void _vuMoveSS(VURegs * VU, int dstreg, int srcreg)
-{
- assert( _XYZW_SS );
- if( _Y ) _unpackVFSS_xyzw(dstreg, srcreg, 1);
- else if( _Z ) _unpackVFSS_xyzw(dstreg, srcreg, 2);
- else if( _W ) _unpackVFSS_xyzw(dstreg, srcreg, 3);
- else _unpackVFSS_xyzw(dstreg, srcreg, 0);
-}
-
-// 1 - src, 0 - dest wzyx
-void VU_MERGE0(int dest, int src) { // 0000s
-}
-void VU_MERGE1(int dest, int src) { // 1000
- SSE_MOVHLPS_XMM_to_XMM(src, dest);
- SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4);
-}
-void VU_MERGE1b(int dest, int src) { // 1000s
- SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
-}
-void VU_MERGE2(int dest, int src) { // 0100
- SSE_MOVHLPS_XMM_to_XMM(src, dest);
- SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64);
-}
-void VU_MERGE2b(int dest, int src) { // 0100s
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
-}
-void VU_MERGE3(int dest, int src) { // 1100s
- SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
-}
-void VU_MERGE4(int dest, int src) { // 0010
- SSE_MOVSS_XMM_to_XMM(src, dest);
- SSE2_MOVSD_XMM_to_XMM(dest, src);
-}
-void VU_MERGE4b(int dest, int src) { // 0010s
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
-}
-void VU_MERGE5(int dest, int src) { // 1010
- SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd8);
-}
-void VU_MERGE5b(int dest, int src) { // 1010s
- SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
-}
-void VU_MERGE6(int dest, int src) { // 0110
- SSE_SHUFPS_XMM_to_XMM(dest, src, 0x9c);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x78);
-}
-void VU_MERGE6b(int dest, int src) { // 0110s
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
-}
-void VU_MERGE7(int dest, int src) { // 1110
- SSE_MOVSS_XMM_to_XMM(src, dest);
- SSE_MOVAPS_XMM_to_XMM(dest, src);
-}
-void VU_MERGE7b(int dest, int src) { // 1110s
- SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
-}
-void VU_MERGE8(int dest, int src) { // 0001s
- SSE_MOVSS_XMM_to_XMM(dest, src);
-}
-void VU_MERGE9(int dest, int src) { // 1001
- SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc9);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd2);
-}
-void VU_MERGE9b(int dest, int src) { // 1001s
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
-}
-void VU_MERGE10(int dest, int src) { // 0101
- SSE_SHUFPS_XMM_to_XMM(dest, src, 0x8d);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x72);
-}
-void VU_MERGE10b(int dest, int src) { // 0101s
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
-}
-void VU_MERGE11(int dest, int src) { // 1101s
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
-}
-void VU_MERGE12(int dest, int src) { // 0011
- SSE2_MOVSD_XMM_to_XMM(dest, src);
-}
-void VU_MERGE13(int dest, int src) { // 1011
- SSE_MOVHLPS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, dest, 0x64);
- SSE_MOVAPS_XMM_to_XMM(dest, src);
-}
-void VU_MERGE13b(int dest, int src) { // 1011s
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
-}
-void VU_MERGE14(int dest, int src) { // 0111
- SSE_MOVHLPS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, dest, 0xc4);
- SSE_MOVAPS_XMM_to_XMM(dest, src);
-}
-void VU_MERGE14b(int dest, int src) { // 0111s
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
- SSE_MOVSS_XMM_to_XMM(dest, src);
- SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
- SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
-}
-void VU_MERGE15(int dest, int src) { // 1111s
- SSE_MOVAPS_XMM_to_XMM(dest, src);
-}
-
-typedef void (*VUMERGEFN)(int dest, int src);
-
-static VUMERGEFN s_VuMerge[16] = {
- VU_MERGE0, VU_MERGE1, VU_MERGE2, VU_MERGE3,
- VU_MERGE4, VU_MERGE5, VU_MERGE6, VU_MERGE7,
- VU_MERGE8, VU_MERGE9, VU_MERGE10, VU_MERGE11,
- VU_MERGE12, VU_MERGE13, VU_MERGE14, VU_MERGE15 };
-
-static VUMERGEFN s_VuMerge2[16] = {
- VU_MERGE0, VU_MERGE1b, VU_MERGE2b, VU_MERGE3,
- VU_MERGE4b, VU_MERGE5b, VU_MERGE6b, VU_MERGE7b,
- VU_MERGE8, VU_MERGE9b, VU_MERGE10b, VU_MERGE11,
- VU_MERGE12, VU_MERGE13b, VU_MERGE14b, VU_MERGE15 };
-
-// Modifies the Source Reg!
-void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw) {
- xyzw &= 0xf;
- if ( (dest != src) && (xyzw != 0) ) {
- if ( x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) {
- xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
- SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw);
- }
- else s_VuMerge[xyzw](dest, src);
- }
-}
-// Doesn't Modify the Source Reg! (ToDo: s_VuMerge2() has room for optimization)
-void VU_MERGE_REGS_SAFE(int dest, int src, int xyzw) {
- xyzw &= 0xf;
- if ( (dest != src) && (xyzw != 0) ) {
- if ( x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) {
- xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
- SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw);
- }
- else s_VuMerge2[xyzw](dest, src);
- }
-}
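-
-// The bit twiddling above reverses the mask because the VU xyzw mask keeps x
-// in bit 3 and w in bit 0, while the BLENDPS immediate expects lane 0 (x) in
-// bit 0. Example: xyzw = 0xc (write x and y) becomes
-// ((0xc&1)<<3)|((0xc&2)<<1)|((0xc&4)>>1)|((0xc&8)>>3) = 0x3, which blends
-// lanes 0 and 1 from src.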
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// Misc VU Reg Clamping/Overflow Functions
-//------------------------------------------------------------------
-#define CLAMP_NORMAL_SSE4(n) \
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);\
- SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[n][0]);\
- SSE2_PSUBD_XMM_to_XMM(regTemp, regd);\
- SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_ones[0]);\
- SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[n][0]);\
- SSE2_PSLLD_I8_to_XMM(regTemp, 31);\
- SSE_XORPS_XMM_to_XMM(regd, regTemp);
-
-#define CLAMP_SIGN_SSE4(n) \
- SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[n][0]);\
- SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[n][0]);
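-
-// CLAMP_SIGN_SSE4 exploits the ordering of raw IEEE-754 bit patterns: as
-// signed ints, positive floats above FLT_MAX (+Inf, NaNs) compare greater
-// than 0x7f7fffff, so PMINSD pins them to FLT_MAX; as unsigned ints, negative
-// floats below -FLT_MAX compare greater than 0xff7fffff, so PMINUD pins them
-// to -FLT_MAX. Per-lane limits come from the g_*vals_XYZW tables above.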
-
-void vFloat0(int regd, int regTemp) { } //0000
-void vFloat1(int regd, int regTemp) { //1000
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
-}
-void vFloat1c(int regd, int regTemp) { //1000
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(1);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat2(int regd, int regTemp) { //0100
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
-}
-void vFloat2c(int regd, int regTemp) { //0100
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(2);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat3(int regd, int regTemp) { //1100
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
-}
-void vFloat3b(int regd, int regTemp) { //1100 //regTemp is Modified
- SSE2_MOVSD_XMM_to_XMM(regTemp, regd);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
-}
-void vFloat3c(int regd, int regTemp) { //1100
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(3);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat4(int regd, int regTemp) { //0010
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
-}
-void vFloat4c(int regd, int regTemp) { //0010
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(4);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat5(int regd, int regTemp) { //1010
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
-}
-void vFloat5b(int regd, int regTemp) { //1010 //regTemp is Modified
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_NORMAL_SSE4(5);
- }
- else {
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
- }
-}
-void vFloat5c(int regd, int regTemp) { //1010
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(5);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat6(int regd, int regTemp) { //0110
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
-}
-void vFloat6b(int regd, int regTemp) { //0110 //regTemp is Modified
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_NORMAL_SSE4(6);
- }
- else {
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
- }
-}
-void vFloat6c(int regd, int regTemp) { //0110
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(6);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat7(int regd, int regTemp) { //1110
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
-}
-void vFloat7_useEAX(int regd, int regTemp) { //1110 //EAX is Modified
- SSE2_MOVD_XMM_to_R(EAX, regd);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- if ( x86caps.hasStreamingSIMD4Extensions )
- SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
- else {
- SSE_PINSRW_R32_to_XMM(regd, EAX, 0);
- SHR32ItoR(EAX, 16);
- SSE_PINSRW_R32_to_XMM(regd, EAX, 1);
- }
-}
-void vFloat7b(int regd, int regTemp) { //1110 //regTemp is Modified
- SSE_MOVSS_XMM_to_XMM(regTemp, regd);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- SSE_MOVSS_XMM_to_XMM(regd, regTemp);
-}
-void vFloat7c(int regd, int regTemp) { //1110
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(7);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat7c_useEAX(int regd, int regTemp) { //1110 //EAX is Modified
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(7);
- }
- else {
- SSE2_MOVD_XMM_to_R(EAX, regd);
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- SSE2_MOVD_R_to_XMM(regTemp, EAX);
- SSE_MOVSS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat8(int regd, int regTemp) { //0001
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
-}
-void vFloat8c(int regd, int regTemp) { //0001
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(8);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat9(int regd, int regTemp) { //1001
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
-}
-void vFloat9b(int regd, int regTemp) { //1001 //regTemp is Modified
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_NORMAL_SSE4(9);
- }
- else {
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- }
-}
-void vFloat9c(int regd, int regTemp) { //1001
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(9);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat10(int regd, int regTemp) { //0101
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
-}
-void vFloat10b(int regd, int regTemp) { //0101 //regTemp is Modified
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_NORMAL_SSE4(10);
- }
- else {
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- }
-}
-void vFloat10c(int regd, int regTemp) { //0101
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(10);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat11(int regd, int regTemp) { //1101
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
-}
-void vFloat11_useEAX(int regd, int regTemp) { //1101 //EAX is Modified
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE2_MOVD_XMM_to_R(EAX, regd);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- if ( x86caps.hasStreamingSIMD4Extensions )
- SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
- else {
- SSE_PINSRW_R32_to_XMM(regd, EAX, 0);
- SHR32ItoR(EAX, 16);
- SSE_PINSRW_R32_to_XMM(regd, EAX, 1);
- }
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
-}
-void vFloat11b(int regd, int regTemp) { //1101 //regTemp is Modified
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- SSE_MOVSS_XMM_to_XMM(regTemp, regd);
- SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
-}
-void vFloat11c(int regd, int regTemp) { //1101
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(11);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat11c_useEAX(int regd, int regTemp) { //1101 // EAX is modified
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(11);
- }
- else {
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE2_MOVD_XMM_to_R(EAX, regd);
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- SSE2_MOVD_R_to_XMM(regTemp, EAX);
- SSE_MOVSS_XMM_to_XMM(regd, regTemp);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- }
-}
-void vFloat12(int regd, int regTemp) { //0011
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
-}
-void vFloat12b(int regd, int regTemp) { //0011 //regTemp is Modified
- SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- SSE2_PUNPCKLQDQ_XMM_to_XMM(regd, regTemp);
-}
-void vFloat12c(int regd, int regTemp) { //0011
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(12);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat13(int regd, int regTemp) { //1011
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
-}
-void vFloat13_useEAX(int regd, int regTemp) { //1011 // EAX is modified
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE2_MOVD_XMM_to_R(EAX, regd);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- if ( x86caps.hasStreamingSIMD4Extensions )
- SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
- else {
- SSE_PINSRW_R32_to_XMM(regd, EAX, 0);
- SHR32ItoR(EAX, 16);
- SSE_PINSRW_R32_to_XMM(regd, EAX, 1);
- }
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
-}
-void vFloat13b(int regd, int regTemp) { //1011 //regTemp is Modified
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
- SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x64);
-}
-void vFloat13c(int regd, int regTemp) { //1011
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(13);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat13c_useEAX(int regd, int regTemp) { //1011 // EAX is modified
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(13);
- }
- else {
- SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0xc6);
- SSE2_MOVD_XMM_to_R(EAX, regd);
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- SSE2_MOVD_R_to_XMM(regTemp, EAX);
- SSE_MOVSS_XMM_to_XMM(regd, regTemp);
- SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0xc6);
- }
-}
-void vFloat14(int regd, int regTemp) { //0111
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
-}
-void vFloat14_useEAX(int regd, int regTemp) { //0111 // EAX is modified
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
- SSE2_MOVD_XMM_to_R(EAX, regd);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- if ( x86caps.hasStreamingSIMD4Extensions )
- SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
- else {
- SSE_PINSRW_R32_to_XMM(regd, EAX, 0);
- SHR32ItoR(EAX, 16);
- SSE_PINSRW_R32_to_XMM(regd, EAX, 1);
- }
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
-}
-void vFloat14b(int regd, int regTemp) { //0111 //regTemp is Modified
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
- SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xc4);
-}
-void vFloat14c(int regd, int regTemp) { //0111
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(14);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
- SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-void vFloat14c_useEAX(int regd, int regTemp) { //0111 // EAX is modified
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(14);
- }
- else {
- SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0x27);
- SSE2_MOVD_XMM_to_R(EAX, regd);
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- SSE2_MOVD_R_to_XMM(regTemp, EAX);
- SSE_MOVSS_XMM_to_XMM(regd, regTemp);
- SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0x27);
- }
-}
-void vFloat15(int regd, int regTemp) { //1111
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
-}
-void vFloat15c(int regd, int regTemp) { //1111
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(15);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINPS_M128_to_XMM(regd, (uptr)&g_maxvals[0]);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)&g_minvals[0]);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-
-vFloat vFloats1[16] = { //regTemp is not modified
- vFloat0, vFloat1, vFloat2, vFloat3,
- vFloat4, vFloat5, vFloat6, vFloat7,
- vFloat8, vFloat9, vFloat10, vFloat11,
- vFloat12, vFloat13, vFloat14, vFloat15 };
-
-vFloat vFloats1_useEAX[16] = { //regTemp is not modified but EAX is used
- vFloat0, vFloat1, vFloat2, vFloat3,
- vFloat4, vFloat5, vFloat6, vFloat7_useEAX,
- vFloat8, vFloat9, vFloat10, vFloat11_useEAX,
- vFloat12, vFloat13_useEAX, vFloat14_useEAX, vFloat15 };
-
-vFloat vFloats2[16] = { //regTemp is modified
- vFloat0, vFloat1, vFloat2, vFloat3b,
- vFloat4, vFloat5b, vFloat6b, vFloat7b,
- vFloat8, vFloat9b, vFloat10b, vFloat11b,
- vFloat12b, vFloat13b, vFloat14b, vFloat15 };
-
-vFloat vFloats4[16] = { //regTemp is modified
- vFloat0, vFloat1c, vFloat2c, vFloat3c,
- vFloat4c, vFloat5c, vFloat6c, vFloat7c,
- vFloat8c, vFloat9c, vFloat10c, vFloat11c,
- vFloat12c, vFloat13c, vFloat14c, vFloat15c };
-
-vFloat vFloats4_useEAX[16] = { //regTemp is modified and EAX is used
- vFloat0, vFloat1c, vFloat2c, vFloat3c,
- vFloat4c, vFloat5c, vFloat6c, vFloat7c_useEAX,
- vFloat8c, vFloat9c, vFloat10c, vFloat11c_useEAX,
- vFloat12c, vFloat13c_useEAX, vFloat14c_useEAX, vFloat15c };
-
-//------------------------------------------------------------------
-// Clamping Functions (wrapper for vFloat* functions)
-// vuFloat : "normal" clamping
-// vuFloat_useEAX : "normal" clamping (faster but EAX is modified)
-// vuFloat2 : "normal" clamping (fastest but regTemp is modified)
-// vuFloat3 : "preserve sign" clamping for pointer
-// vuFloat4 : "preserve sign" clamping (regTemp is modified; *FASTEST* on SSE4 CPUs)
-// vuFloat4_useEAX : "preserve sign" clamping (faster but regTemp and EAX are modified)
-// vuFloat5 : wrapper function for vuFloat2 and vuFloat4
-// vuFloat5_useEAX : wrapper function for vuFloat2 and vuFloat4_useEAX
-// vuFloatExtra : for debugging
-//
-// Notice 1: vuFloat*_useEAX may be slower on AMD CPUs, which have independent execution pipeline for
-// vector and scalar instructions (need checks)
-// Notice 2: recVUMI_MUL_xyzw_toD and recVUMI_MADD_xyzw_toD use vFloats directly!
-//------------------------------------------------------------------
-
-// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (doesn't use any temp regs)
-void vuFloat( int info, int regd, int XYZW) {
- if( CHECK_VU_OVERFLOW ) {
- /*if ( (XYZW != 0) && (XYZW != 8) && (XYZW != 0xF) ) {
- int t1reg = _vuGetTempXMMreg(info);
- if (t1reg >= 0) {
- vuFloat2( regd, t1reg, XYZW );
- _freeXMMreg( t1reg );
- return;
- }
- }*/
- //vuFloatExtra(regd, XYZW);
- vFloats1[XYZW](regd, regd);
- }
-}
-
-// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (uses EAX as a temp register; faster but **destroys EAX**)
-void vuFloat_useEAX( int info, int regd, int XYZW) {
- if( CHECK_VU_OVERFLOW ) {
- vFloats1_useEAX[XYZW](regd, regd);
- }
-}
-
-// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (uses a temp reg)
-void vuFloat2(int regd, int regTemp, int XYZW) {
- if( CHECK_VU_OVERFLOW ) {
- //vuFloatExtra(regd, XYZW);
- vFloats2[XYZW](regd, regTemp);
- }
-}
-
-// Clamps +/-NaN and +/-Inf to +/-fMax (uses a temp reg)
-void vuFloat4(int regd, int regTemp, int XYZW) {
- if( CHECK_VU_OVERFLOW ) {
- vFloats4[XYZW](regd, regTemp);
- }
-}
-
-// Clamps +/-NaN and +/-Inf to +/-fMax (uses a temp reg, and uses EAX as a temp register; faster but **destroys EAX**)
-void vuFloat4_useEAX(int regd, int regTemp, int XYZW) {
- if( CHECK_VU_OVERFLOW ) {
- vFloats4_useEAX[XYZW](regd, regTemp);
- }
-}
-
-// Uses vuFloat4 or vuFloat2 depending on the CHECK_VU_SIGN_OVERFLOW setting
-void vuFloat5(int regd, int regTemp, int XYZW) {
- if (CHECK_VU_SIGN_OVERFLOW) {
- vuFloat4(regd, regTemp, XYZW);
- }
- else vuFloat2(regd, regTemp, XYZW);
-}
-
-// Uses vuFloat4_useEAX or vuFloat2 depending on the CHECK_VU_SIGN_OVERFLOW setting (uses EAX as a temp register; faster but **destoroyes EAX**)
-void vuFloat5_useEAX(int regd, int regTemp, int XYZW) {
- if (CHECK_VU_SIGN_OVERFLOW) {
- vuFloat4_useEAX(regd, regTemp, XYZW);
- }
- else vuFloat2(regd, regTemp, XYZW);
-}
-
-// Clamps +/-infs to +/-fMax, and +/-NaNs to +/-fMax
-void vuFloat3(uptr x86ptr) {
- u8* pjmp;
-
- if( CHECK_VU_OVERFLOW ) {
- CMP32ItoM(x86ptr, 0x7f800000 );
- pjmp = JL8(0); // Signed Comparison
- MOV32ItoM(x86ptr, 0x7f7fffff );
- x86SetJ8(pjmp);
-
- CMP32ItoM(x86ptr, 0xff800000 );
- pjmp = JB8(0); // Unsigned Comparison
- MOV32ItoM(x86ptr, 0xff7fffff );
- x86SetJ8(pjmp);
- }
-}
-
-__aligned16 u64 vuFloatData[4];
-
-// Makes NaN == 0, Infinities stay the same; Very Slow - Use only for debugging
-void vuFloatExtra( int regd, int XYZW) {
- int t1reg = (regd == 0) ? (regd + 1) : (regd - 1);
- int t2reg = (regd <= 1) ? (regd + 2) : (regd - 2);
- SSE_MOVAPS_XMM_to_M128( (uptr)&vuFloatData[0], t1reg );
- SSE_MOVAPS_XMM_to_M128( (uptr)&vuFloatData[2], t2reg );
-
- SSE_XORPS_XMM_to_XMM(t1reg, t1reg);
- SSE_CMPORDPS_XMM_to_XMM(t1reg, regd);
- SSE_MOVAPS_XMM_to_XMM(t2reg, regd);
- SSE_ANDPS_XMM_to_XMM(t2reg, t1reg);
- VU_MERGE_REGS_CUSTOM(regd, t2reg, XYZW);
-
- SSE_MOVAPS_M128_to_XMM( t1reg, (uptr)&vuFloatData[0] );
- SSE_MOVAPS_M128_to_XMM( t2reg, (uptr)&vuFloatData[2] );
-}
-
-static __aligned16 u32 tempRegX[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
-
-// Called by testWhenOverflow() function
-void testPrintOverflow() {
- tempRegX[0] &= 0xff800000;
- tempRegX[1] &= 0xff800000;
- tempRegX[2] &= 0xff800000;
- tempRegX[3] &= 0xff800000;
- if ( (tempRegX[0] == 0x7f800000) || (tempRegX[1] == 0x7f800000) || (tempRegX[2] == 0x7f800000) || (tempRegX[3] == 0x7f800000) )
- Console.Warning( "VU OVERFLOW!: Changing to +Fmax!!!!!!!!!!!!" );
- if ( (tempRegX[0] == 0xff800000) || (tempRegX[1] == 0xff800000) || (tempRegX[2] == 0xff800000) || (tempRegX[3] == 0xff800000) )
- Console.Warning( "VU OVERFLOW!: Changing to -Fmax!!!!!!!!!!!!" );
-}
-
-// Outputs to the console when overflow has occured.
-void testWhenOverflow(int info, int regd, int t0reg) {
- SSE_MOVAPS_XMM_to_M128((uptr)tempRegX, regd);
- CALLFunc((uptr)testPrintOverflow);
-}
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "PrecompiledHeader.h"
+
+#include "Common.h"
+#include "GS.h"
+#include "R5900OpcodeTables.h"
+#include "iR5900.h"
+#include "iMMI.h"
+#include "iFPU.h"
+#include "iCOP0.h"
+#include "VUmicro.h"
+#include "VUflags.h"
+#include "sVU_Micro.h"
+#include "sVU_Debug.h"
+#include "sVU_zerorec.h"
+
+#ifdef _WIN32
+#pragma warning(disable:4244)
+#pragma warning(disable:4761)
+#endif
+//------------------------------------------------------------------
+
+// fixme - VUmicro should really use its own static vars for pc and branch.
+// Sharing with the EE's copies of pc and branch is not cool! (air)
+
+//------------------------------------------------------------------
+// Helper Macros
+//------------------------------------------------------------------
+#define _Ft_ (( VU->code >> 16) & 0x1F) // The rt part of the instruction register
+#define _Fs_ (( VU->code >> 11) & 0x1F) // The rd part of the instruction register
+#define _Fd_ (( VU->code >> 6) & 0x1F) // The sa part of the instruction register
+#define _It_ (_Ft_ & 15)
+#define _Is_ (_Fs_ & 15)
+#define _Id_ (_Fd_ & 15)
+
+#define _X (( VU->code>>24) & 0x1)
+#define _Y (( VU->code>>23) & 0x1)
+#define _Z (( VU->code>>22) & 0x1)
+#define _W (( VU->code>>21) & 0x1)
+
+#define _XYZW_SS (_X+_Y+_Z+_W==1)
+
+#define _Fsf_ (( VU->code >> 21) & 0x03)
+#define _Ftf_ (( VU->code >> 23) & 0x03)
+
+#define _Imm11_ (s32)(VU->code & 0x400 ? 0xfffffc00 | (VU->code & 0x3ff) : VU->code & 0x3ff)
+#define _UImm11_ (s32)(VU->code & 0x7ff)
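+// e.g. a raw 11-bit field of 0x400 sign-extends via _Imm11_ to 0xfffffc00
+// (-1024), while 0x3ff stays +1023.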
+
+#define VU_VFx_ADDR(x) (uptr)&VU->VF[x].UL[0]
+#define VU_VFy_ADDR(x) (uptr)&VU->VF[x].UL[1]
+#define VU_VFz_ADDR(x) (uptr)&VU->VF[x].UL[2]
+#define VU_VFw_ADDR(x) (uptr)&VU->VF[x].UL[3]
+
+#define VU_REGR_ADDR (uptr)&VU->VI[REG_R]
+#define VU_REGQ_ADDR (uptr)&VU->VI[REG_Q]
+#define VU_REGMAC_ADDR (uptr)&VU->VI[REG_MAC_FLAG]
+
+#define VU_VI_ADDR(x, read) GetVIAddr(VU, x, read, info)
+
+#define VU_ACCx_ADDR (uptr)&VU->ACC.UL[0]
+#define VU_ACCy_ADDR (uptr)&VU->ACC.UL[1]
+#define VU_ACCz_ADDR (uptr)&VU->ACC.UL[2]
+#define VU_ACCw_ADDR (uptr)&VU->ACC.UL[3]
+
+#define _X_Y_Z_W ((( VU->code >> 21 ) & 0xF ) )
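+// Note: x is the high bit of the dest mask, so a .xw instruction gives
+// _X_Y_Z_W == 9 (binary 1001); the //wzyx comments on the vFloat and VU_MERGE
+// variants below list the same bits in reverse order (w first).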
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// Global Variables
+//------------------------------------------------------------------
+int vucycle;
+
+const __aligned16 float s_fones[8] = {1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f};
+const __aligned16 u32 s_mask[4] = {0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff};
+const __aligned16 u32 s_expmask[4] = {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
+const __aligned16 u32 g_minvals[4] = {0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff};
+const __aligned16 u32 g_maxvals[4] = {0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff};
+const __aligned16 u32 const_clip[8] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
+ 0x80000000, 0x80000000, 0x80000000, 0x80000000};
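+// Note: const_clip is two masks back to back -- &const_clip[0] is an abs()
+// mask (clears sign bits, used elsewhere), &const_clip[4] is the sign mask.
+// The c-variant clamps below AND with const_clip[4] to save the sign bits,
+// then OR them back after min/max-ing against +/-fMax.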
+
+const __aligned(64) u32 g_ones[4] = {0x00000001, 0x00000001, 0x00000001, 0x00000001};
+
+const __aligned16 u32 g_minvals_XYZW[16][4] =
+{
+ { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //0000
+ { 0xffffffff, 0xffffffff, 0xffffffff, 0xff7fffff }, //0001
+ { 0xffffffff, 0xffffffff, 0xff7fffff, 0xffffffff }, //0010
+ { 0xffffffff, 0xffffffff, 0xff7fffff, 0xff7fffff }, //0011
+ { 0xffffffff, 0xff7fffff, 0xffffffff, 0xffffffff }, //0100
+ { 0xffffffff, 0xff7fffff, 0xffffffff, 0xff7fffff }, //0101
+ { 0xffffffff, 0xff7fffff, 0xff7fffff, 0xffffffff }, //0110
+ { 0xffffffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //0111
+ { 0xff7fffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000
+ { 0xff7fffff, 0xffffffff, 0xffffffff, 0xff7fffff }, //1001
+ { 0xff7fffff, 0xffffffff, 0xff7fffff, 0xffffffff }, //1010
+ { 0xff7fffff, 0xffffffff, 0xff7fffff, 0xff7fffff }, //1011
+ { 0xff7fffff, 0xff7fffff, 0xffffffff, 0xffffffff }, //1100
+ { 0xff7fffff, 0xff7fffff, 0xffffffff, 0xff7fffff }, //1101
+ { 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xffffffff }, //1110
+ { 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //1111
+};
+const __aligned16 u32 g_maxvals_XYZW[16][4] =
+{
+ { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //0000
+ { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7f7fffff }, //0001
+ { 0x7fffffff, 0x7fffffff, 0x7f7fffff, 0x7fffffff }, //0010
+ { 0x7fffffff, 0x7fffffff, 0x7f7fffff, 0x7f7fffff }, //0011
+ { 0x7fffffff, 0x7f7fffff, 0x7fffffff, 0x7fffffff }, //0100
+ { 0x7fffffff, 0x7f7fffff, 0x7fffffff, 0x7f7fffff }, //0101
+ { 0x7fffffff, 0x7f7fffff, 0x7f7fffff, 0x7fffffff }, //0110
+ { 0x7fffffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //0111
+ { 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1000
+ { 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7f7fffff }, //1001
+ { 0x7f7fffff, 0x7fffffff, 0x7f7fffff, 0x7fffffff }, //1010
+ { 0x7f7fffff, 0x7fffffff, 0x7f7fffff, 0x7f7fffff }, //1011
+ { 0x7f7fffff, 0x7f7fffff, 0x7fffffff, 0x7fffffff }, //1100
+ { 0x7f7fffff, 0x7f7fffff, 0x7fffffff, 0x7f7fffff }, //1101
+ { 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7fffffff }, //1110
+ { 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //1111
+};
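+// Note: both tables are indexed by the xyzw dest mask (x = bit 3). Selected
+// lanes hold the +/-fMax bit patterns; unselected lanes hold 0xffffffff /
+// 0x7fffffff, which are no-ops for the unsigned/signed integer mins in the
+// SSE4 clamp macros further down.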
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// VU Pipeline/Test Stalls/Analyzing Functions
+//------------------------------------------------------------------
+void _recvuFMACflush(VURegs * VU, bool intermediate) {
+ int i;
+
+ for (i=0; i<8; i++) {
+ if (VU->fmac[i].enable == 0) continue;
+
+ if( intermediate ) {
+ if ((vucycle - VU->fmac[i].sCycle) > VU->fmac[i].Cycle) {
+// VUM_LOG("flushing FMAC pipe[%d]", i);
+ VU->fmac[i].enable = 0;
+ }
+ }
+ else {
+ if ((vucycle - VU->fmac[i].sCycle) >= VU->fmac[i].Cycle) {
+// VUM_LOG("flushing FMAC pipe[%d]", i);
+ VU->fmac[i].enable = 0;
+ }
+ }
+ }
+}
+
+void _recvuFDIVflush(VURegs * VU, bool intermediate) {
+ if (VU->fdiv.enable == 0) return;
+
+ if( intermediate ) {
+ if ((vucycle - VU->fdiv.sCycle) > VU->fdiv.Cycle) {
+// Console.WriteLn("flushing FDIV pipe");
+ VU->fdiv.enable = 0;
+ }
+ }
+ else {
+ if ((vucycle - VU->fdiv.sCycle) >= VU->fdiv.Cycle) {
+// Console.WriteLn("flushing FDIV pipe");
+ VU->fdiv.enable = 0;
+ }
+ }
+}
+
+void _recvuEFUflush(VURegs * VU, bool intermediate) {
+ if (VU->efu.enable == 0) return;
+
+ if( intermediate ) {
+ if ((vucycle - VU->efu.sCycle) > VU->efu.Cycle) {
+// Console.WriteLn("flushing FDIV pipe");
+ VU->efu.enable = 0;
+ }
+ }
+ else {
+ if ((vucycle - VU->efu.sCycle) >= VU->efu.Cycle) {
+// Console.WriteLn("flushing FDIV pipe");
+ VU->efu.enable = 0;
+ }
+ }
+}
+
+void _recvuIALUflush(VURegs * VU, bool intermediate) {
+ int i;
+
+ for (i=0; i<8; i++) {
+ if (VU->ialu[i].enable == 0) continue;
+
+ if( intermediate ) {
+ if ((vucycle - VU->ialu[i].sCycle) > VU->ialu[i].Cycle) {
+// VUM_LOG("flushing IALU pipe[%d]", i);
+ VU->ialu[i].enable = 0;
+ }
+ }
+ else {
+ if ((vucycle - VU->ialu[i].sCycle) >= VU->ialu[i].Cycle) {
+// VUM_LOG("flushing IALU pipe[%d]", i);
+ VU->ialu[i].enable = 0;
+ }
+ }
+ }
+}
+
+void _recvuTestPipes(VURegs * VU, bool intermediate) { // intermediate = true if called by upper FMAC stall detection
+ _recvuFMACflush(VU, intermediate);
+ _recvuFDIVflush(VU, intermediate);
+ _recvuEFUflush(VU, intermediate);
+ _recvuIALUflush(VU, intermediate);
+}
+
+void _recvuFMACTestStall(VURegs * VU, int reg, int xyzw) {
+ int cycle;
+ int i;
+ u32 mask = 0;
+
+ for (i=0; i<8; i++) {
+ if (VU->fmac[i].enable == 0) continue;
+ if (VU->fmac[i].reg == reg && (VU->fmac[i].xyzw & xyzw)) break;
+ }
+
+ if (i == 8) return;
+
+ // do a per-channel delay
+ // old code
+// cycle = VU->fmac[i].Cycle - (vucycle - VU->fmac[i].sCycle);
+
+ // new code
+ mask = 4; // w
+// if( VU->fmac[i].xyzw & 1 ) mask = 4; // w
+// else if( VU->fmac[i].xyzw & 2 ) mask = 3; // z
+// else if( VU->fmac[i].xyzw & 4 ) mask = 2; // y
+// else {
+// assert(VU->fmac[i].xyzw & 8 );
+// mask = 1; // x
+// }
+
+// mask = 0;
+// if( VU->fmac[i].xyzw & 1 ) mask++; // w
+// else if( VU->fmac[i].xyzw & 2 ) mask++; // z
+// else if( VU->fmac[i].xyzw & 4 ) mask++; // y
+// else if( VU->fmac[i].xyzw & 8 ) mask++; // x
+
+ assert( (int)VU->fmac[i].sCycle < (int)vucycle );
+ cycle = 0;
+ if( vucycle - VU->fmac[i].sCycle < mask )
+ cycle = mask - (vucycle - VU->fmac[i].sCycle);
+
+ VU->fmac[i].enable = 0;
+ vucycle+= cycle;
+ _recvuTestPipes(VU, true); // for lower instructions
+}
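+// e.g. with mask == 4: an op that reads the FMAC result 2 cycles after it
+// issued (vucycle - sCycle == 2) stalls for the remaining 4 - 2 = 2 cycles.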
+
+void _recvuIALUTestStall(VURegs * VU, int reg) {
+ int cycle;
+ int i;
+ u32 latency;
+
+ for (i=0; i<8; i++) {
+ if (VU->ialu[i].enable == 0) continue;
+ if (VU->ialu[i].reg == reg) break;
+ }
+
+ if (i == 8) return;
+
+ latency = VU->ialu[i].Cycle + 1;
+ cycle = 0;
+ if( vucycle - VU->ialu[i].sCycle < latency )
+ cycle = latency - (vucycle - VU->ialu[i].sCycle);
+
+ VU->ialu[i].enable = 0;
+ vucycle+= cycle;
+ _recvuTestPipes(VU, true);
+}
+
+void _recvuFMACAdd(VURegs * VU, int reg, int xyzw) {
+ int i;
+
+ /* find a free fmac pipe */
+ for (i=0; i<8; i++) {
+ if (VU->fmac[i].enable == 1) continue;
+ break;
+ }
+
+ if (i==8) Console.Error("*PCSX2*: error, out of fmacs");
+// VUM_LOG("adding FMAC pipe[%d]; reg %d", i, reg);
+
+ VU->fmac[i].enable = 1;
+ VU->fmac[i].sCycle = vucycle;
+ VU->fmac[i].Cycle = 3;
+ VU->fmac[i].xyzw = xyzw;
+ VU->fmac[i].reg = reg;
+}
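+// Note: up to 8 FMAC results may be in flight at once; each slot stays busy
+// for Cycle (3) more cycles after its issue cycle (sCycle), after which the
+// flush logic above releases it.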
+
+void _recvuFDIVAdd(VURegs * VU, int cycles) {
+// Console.WriteLn("adding FDIV pipe");
+ VU->fdiv.enable = 1;
+ VU->fdiv.sCycle = vucycle;
+ VU->fdiv.Cycle = cycles;
+}
+
+void _recvuEFUAdd(VURegs * VU, int cycles) {
+// Console.WriteLn("adding EFU pipe");
+ VU->efu.enable = 1;
+ VU->efu.sCycle = vucycle;
+ VU->efu.Cycle = cycles;
+}
+
+void _recvuIALUAdd(VURegs * VU, int reg, int cycles) {
+ int i;
+
+ /* find a free ialu pipe */
+ for (i=0; i<8; i++) {
+ if (VU->ialu[i].enable == 1) continue;
+ break;
+ }
+
+ if (i==8) Console.Error("*PCSX2*: error, out of ialus");
+
+ VU->ialu[i].enable = 1;
+ VU->ialu[i].sCycle = vucycle;
+ VU->ialu[i].Cycle = cycles;
+ VU->ialu[i].reg = reg;
+}
+
+void _recvuTestIALUStalls(VURegs * VU, _VURegsNum *VUregsn) {
+
+ int VIread0 = 0, VIread1 = 0; // max 2 integer registers are read simultaneously
+ int i;
+
+ for(i=0;i<16;i++) { // find used integer (vi00-vi15) registers
+ if( (VUregsn->VIread >> i) & 1 ) {
+ if( VIread0 ) VIread1 = i;
+ else VIread0 = i;
+ }
+ }
+
+ if( VIread0 ) _recvuIALUTestStall(VU, VIread0);
+ if( VIread1 ) _recvuIALUTestStall(VU, VIread1);
+}
+
+void _recvuAddIALUStalls(VURegs * VU, _VURegsNum *VUregsn) {
+ if (VUregsn->VIwrite && VUregsn->cycles) {
+ int VIWrite0 = 0;
+ int i;
+
+ for(i=0;i<16;i++) { // find used (vi00-vi15) registers
+ if( (VUregsn->VIwrite >> i) & 1 ) {
+ VIWrite0 = i;
+ }
+ }
+ if( VIWrite0 ) _recvuIALUAdd(VU, VIWrite0, VUregsn->cycles);
+ }
+}
+
+void _recvuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn, bool upper) {
+
+ if( VUregsn->VFread0 && (VUregsn->VFread0 == VUregsn->VFread1) ) {
+ _recvuFMACTestStall(VU, VUregsn->VFread0, VUregsn->VFr0xyzw|VUregsn->VFr1xyzw);
+ }
+ else {
+ if (VUregsn->VFread0) _recvuFMACTestStall(VU, VUregsn->VFread0, VUregsn->VFr0xyzw);
+ if (VUregsn->VFread1) _recvuFMACTestStall(VU, VUregsn->VFread1, VUregsn->VFr1xyzw);
+ }
+
+ if( !upper && VUregsn->VIread ) _recvuTestIALUStalls(VU, VUregsn); // for lower instructions which read integer reg
+}
+
+void _recvuAddFMACStalls(VURegs * VU, _VURegsNum *VUregsn) {
+
+ if (VUregsn->VFwrite) _recvuFMACAdd(VU, VUregsn->VFwrite, VUregsn->VFwxyzw);
+ else if (VUregsn->VIwrite & (1 << REG_CLIP_FLAG)) _recvuFMACAdd(VU, -REG_CLIP_FLAG, 0); // REG_CLIP_FLAG pipe
+ else _recvuFMACAdd(VU, 0, 0); // cause no data dependency with fp registers
+}
+
+void _recvuFlushFDIV(VURegs * VU) {
+ int cycle;
+
+ if (VU->fdiv.enable == 0) return;
+
+ cycle = VU->fdiv.Cycle + 1 - (vucycle - VU->fdiv.sCycle); //VU->fdiv.Cycle contains the latency minus 1 (6 or 12)
+// Console.WriteLn("waiting FDIV pipe %d", cycle);
+ VU->fdiv.enable = 0;
+ vucycle+= cycle;
+}
+
+void _recvuFlushEFU(VURegs * VU) {
+ int cycle;
+
+ if (VU->efu.enable == 0) return;
+
+ cycle = VU->efu.Cycle - (vucycle - VU->efu.sCycle);
+// Console.WriteLn("waiting FDIV pipe %d", cycle);
+ VU->efu.enable = 0;
+ vucycle+= cycle;
+}
+
+void _recvuTestFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) {
+ _recvuTestFMACStalls(VU,VUregsn, false);
+ _recvuFlushFDIV(VU);
+}
+
+void _recvuTestEFUStalls(VURegs * VU, _VURegsNum *VUregsn) {
+ _recvuTestFMACStalls(VU,VUregsn, false);
+ _recvuFlushEFU(VU);
+}
+
+void _recvuAddFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) {
+// _vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn);
+ if (VUregsn->VIwrite & (1 << REG_Q)) {
+ _recvuFDIVAdd(VU, VUregsn->cycles);
+ }
+}
+
+void _recvuAddEFUStalls(VURegs * VU, _VURegsNum *VUregsn) {
+// _vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn);
+ if (VUregsn->VIwrite & (1 << REG_P)) {
+ _recvuEFUAdd(VU, VUregsn->cycles);
+ }
+}
+
+void _recvuTestUpperStalls(VURegs * VU, _VURegsNum *VUregsn) {
+ switch (VUregsn->pipe) {
+ case VUPIPE_FMAC: _recvuTestFMACStalls(VU, VUregsn, true); break;
+ }
+}
+
+void _recvuTestLowerStalls(VURegs * VU, _VURegsNum *VUregsn) {
+ switch (VUregsn->pipe) {
+ case VUPIPE_FMAC: _recvuTestFMACStalls(VU, VUregsn, false); break;
+ case VUPIPE_FDIV: _recvuTestFDIVStalls(VU, VUregsn); break;
+ case VUPIPE_EFU: _recvuTestEFUStalls(VU, VUregsn); break;
+ case VUPIPE_IALU: _recvuTestIALUStalls(VU, VUregsn); break;
+ case VUPIPE_BRANCH: _recvuTestIALUStalls(VU, VUregsn); break;
+ }
+}
+
+void _recvuAddUpperStalls(VURegs * VU, _VURegsNum *VUregsn) {
+ switch (VUregsn->pipe) {
+ case VUPIPE_FMAC: _recvuAddFMACStalls(VU, VUregsn); break;
+ }
+}
+
+void _recvuAddLowerStalls(VURegs * VU, _VURegsNum *VUregsn) {
+ switch (VUregsn->pipe) {
+ case VUPIPE_FMAC: _recvuAddFMACStalls(VU, VUregsn); break;
+ case VUPIPE_FDIV: _recvuAddFDIVStalls(VU, VUregsn); break;
+ case VUPIPE_EFU: _recvuAddEFUStalls(VU, VUregsn); break;
+ case VUPIPE_IALU: _recvuAddIALUStalls(VU, VUregsn); break; // note: only ILW and ILWR cause stall in IALU pipe
+ }
+}
+
+void SuperVUAnalyzeOp(VURegs *VU, _vuopinfo *info, _VURegsNum* pCodeRegs)
+{
+ _VURegsNum* lregs;
+ _VURegsNum* uregs;
+ int *ptr;
+
+ lregs = pCodeRegs;
+ uregs = pCodeRegs+1;
+
+ ptr = (int*)&VU->Micro[pc];
+ pc += 8;
+
+ if (ptr[1] & 0x40000000) { // EOP
+ branch |= 8;
+ }
+
+ VU->code = ptr[1];
+ if (VU == &VU1) VU1regs_UPPER_OPCODE[VU->code & 0x3f](uregs);
+ else VU0regs_UPPER_OPCODE[VU->code & 0x3f](uregs);
+
+ _recvuTestUpperStalls(VU, uregs);
+ switch(VU->code & 0x3f) {
+ case 0x10: case 0x11: case 0x12: case 0x13:
+ case 0x14: case 0x15: case 0x16: case 0x17:
+ case 0x1d: case 0x1f:
+ case 0x2b: case 0x2f:
+ break;
+
+ case 0x3c:
+ switch ((VU->code >> 6) & 0x1f) {
+ case 0x4: case 0x5:
+ break;
+ default:
+ info->statusflag = 4;
+ info->macflag = 4;
+ break;
+ }
+ break;
+ case 0x3d:
+ switch ((VU->code >> 6) & 0x1f) {
+ case 0x4: case 0x5: case 0x7:
+ break;
+ default:
+ info->statusflag = 4;
+ info->macflag = 4;
+ break;
+ }
+ break;
+ case 0x3e:
+ switch ((VU->code >> 6) & 0x1f) {
+ case 0x4: case 0x5:
+ break;
+ default:
+ info->statusflag = 4;
+ info->macflag = 4;
+ break;
+ }
+ break;
+ case 0x3f:
+ switch ((VU->code >> 6) & 0x1f) {
+ case 0x4: case 0x5: case 0x7: case 0xb:
+ break;
+ default:
+ info->statusflag = 4;
+ info->macflag = 4;
+ break;
+ }
+ break;
+
+ default:
+ info->statusflag = 4;
+ info->macflag = 4;
+ break;
+ }
+
+ if (uregs->VIread & (1 << REG_Q)) { info->q |= 2; }
+ if (uregs->VIread & (1 << REG_P)) { info->p |= 2; assert( VU == &VU1 ); }
+
+ // check upper flags
+ if (ptr[1] & 0x80000000) { // I flag
+ info->cycle = vucycle;
+ memzero(*lregs);
+ }
+ else {
+
+ VU->code = ptr[0];
+ if (VU == &VU1) VU1regs_LOWER_OPCODE[VU->code >> 25](lregs);
+ else VU0regs_LOWER_OPCODE[VU->code >> 25](lregs);
+
+ _recvuTestLowerStalls(VU, lregs);
+ info->cycle = vucycle;
+
+ if (lregs->pipe == VUPIPE_BRANCH) {
+ branch |= 1;
+ }
+
+ if (lregs->VIwrite & (1 << REG_Q)) {
+ info->q |= 4;
+ info->cycles = lregs->cycles;
+ info->pqinst = (VU->code&2)>>1; // rsqrt is 2
+ }
+ else if (lregs->pipe == VUPIPE_FDIV) {
+ info->q |= 8|1;
+ info->pqinst = 0;
+ }
+
+ if (lregs->VIwrite & (1 << REG_P)) {
+ assert( VU == &VU1 );
+ info->p |= 4;
+ info->cycles = lregs->cycles;
+
+ switch( VU->code & 0xff ) {
+ case 0xfd: info->pqinst = 0; break; //eatan
+ case 0x7c: info->pqinst = 0; break; //eatanxy
+ case 0x7d: info->pqinst = 0; break; //eatanzy
+ case 0xfe: info->pqinst = 1; break; //eexp
+ case 0xfc: info->pqinst = 2; break; //esin
+ case 0x3f: info->pqinst = 3; break; //erleng
+ case 0x3e: info->pqinst = 4; break; //eleng
+ case 0x3d: info->pqinst = 4; break; //ersadd
+ case 0xbd: info->pqinst = 4; break; //ersqrt
+ case 0xbe: info->pqinst = 5; break; //ercpr
+ case 0xbc: info->pqinst = 5; break; //esqrt
+ case 0x7e: info->pqinst = 5; break; //esum
+ case 0x3c: info->pqinst = 6; break; //esadd
+ default: assert(0);
+ }
+ }
+ else if (lregs->pipe == VUPIPE_EFU) {
+ info->p |= 8|1;
+ }
+
+ if (lregs->VIread & (1 << REG_STATUS_FLAG)) info->statusflag|= VUOP_READ;
+ if (lregs->VIread & (1 << REG_MAC_FLAG)) info->macflag|= VUOP_READ;
+
+ if (lregs->VIwrite & (1 << REG_STATUS_FLAG)) info->statusflag|= VUOP_WRITE;
+ if (lregs->VIwrite & (1 << REG_MAC_FLAG)) info->macflag|= VUOP_WRITE;
+
+ if (lregs->VIread & (1 << REG_Q)) { info->q |= 2; }
+ if (lregs->VIread & (1 << REG_P)) { info->p |= 2; assert( VU == &VU1 ); }
+
+ _recvuAddLowerStalls(VU, lregs);
+ }
+
+ _recvuAddUpperStalls(VU, uregs);
+ _recvuTestPipes(VU, false);
+
+ vucycle++;
+}
+
+int eeVURecompileCode(VURegs *VU, _VURegsNum* regs)
+{
+ int info = 0;
+ int vfread0=-1, vfread1 = -1, vfwrite = -1, vfacc = -1, vftemp=-1;
+
+ assert( regs != NULL );
+
+ if( regs->VFread0 ) _addNeededVFtoXMMreg(regs->VFread0);
+ if( regs->VFread1 ) _addNeededVFtoXMMreg(regs->VFread1);
+ if( regs->VFwrite ) _addNeededVFtoXMMreg(regs->VFwrite);
+ if( regs->VIread & (1<<REG_ACC_FLAG) ) _addNeededACCtoXMMreg();
+ if( regs->VIread & (1<<REG_VF0_FLAG) ) _addNeededVFtoXMMreg(0);
+
+ // alloc
+ if( regs->VFread0 ) vfread0 = _allocVFtoXMMreg(VU, -1, regs->VFread0, MODE_READ);
+ else if( regs->VIread & (1<<REG_VF0_FLAG) ) vfread0 = _allocVFtoXMMreg(VU, -1, 0, MODE_READ);
+ if( regs->VFread1 ) vfread1 = _allocVFtoXMMreg(VU, -1, regs->VFread1, MODE_READ);
+ else if( (regs->VIread & (1<<REG_VF0_FLAG)) && regs->VFr1xyzw != 0xff ) vfread1 = _allocVFtoXMMreg(VU, -1, 0, MODE_READ);
+
+ if( regs->VIread & (1<<REG_ACC_FLAG) ) {
+ vfacc = _allocACCtoXMMreg(VU, -1, MODE_READ|((regs->VIwrite&(1<<REG_ACC_FLAG))?MODE_WRITE:0));
+ }
+ else if( regs->VIwrite & (1<<REG_ACC_FLAG) ) {
+ vfacc = _allocACCtoXMMreg(VU, -1, MODE_WRITE|(regs->VFwxyzw != 0xf?MODE_READ:0));
+ }
+
+ if( regs->VFwrite ) {
+ assert( !(regs->VIwrite&(1<<REG_ACC_FLAG)) );
+ vfwrite = _allocVFtoXMMreg(VU, -1, regs->VFwrite, MODE_WRITE|(regs->VFwxyzw != 0xf?MODE_READ:0));
+ }
+
+ if( vfacc>= 0 ) info |= PROCESS_EE_SET_ACC(vfacc);
+ if( vfwrite >= 0 ) {
+ if( regs->VFwrite == _Ft_ && vfread1 < 0 ) {
+ info |= PROCESS_EE_SET_T(vfwrite);
+ }
+ else {
+ assert( regs->VFwrite == _Fd_ );
+ info |= PROCESS_EE_SET_D(vfwrite);
+ }
+ }
+
+ if( vfread0 >= 0 ) info |= PROCESS_EE_SET_S(vfread0);
+ if( vfread1 >= 0 ) info |= PROCESS_EE_SET_T(vfread1);
+
+ vftemp = _allocTempXMMreg(XMMT_FPS, -1);
+ info |= PROCESS_VU_SET_TEMP(vftemp);
+
+ if( regs->VIwrite & (1 << REG_CLIP_FLAG) ) {
+ // CLIP inst, needs two extra temp registers; put them in EEREC_D and EEREC_ACC
+ int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
+ int t2reg = _allocTempXMMreg(XMMT_FPS, -1);
+
+ info |= PROCESS_EE_SET_D(t1reg);
+ info |= PROCESS_EE_SET_ACC(t2reg);
+
+ _freeXMMreg(t1reg); // don't need
+ _freeXMMreg(t2reg); // don't need
+ }
+ else if( regs->VIwrite & (1<<REG_P) ) {
+ // EFU inst, needs an extra temp register; put it in EEREC_D
+ int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
+
+ info |= PROCESS_EE_SET_D(t1reg);
+ _freeXMMreg(t1reg); // don't need
+ }
+
+ return info;
+}
+
+// Gets the (recompiler-side) address of VI register 'reg'; writes to the flag
+// regs and Q/P are redirected to the recompiler's own copies.
+u32 GetVIAddr(VURegs * VU, int reg, int read, int info)
+{
+ if( info & PROCESS_VU_SUPER ) return SuperVUGetVIAddr(reg, read);
+ if( info & PROCESS_VU_COP2 ) return (uptr)&VU->VI[reg].UL;
+
+ if( read != 1 ) {
+ if( reg == REG_MAC_FLAG ) return (uptr)&VU->macflag;
+ if( reg == REG_CLIP_FLAG ) return (uptr)&VU->clipflag;
+ if( reg == REG_STATUS_FLAG ) return (uptr)&VU->statusflag;
+ if( reg == REG_Q ) return (uptr)&VU->q;
+ if( reg == REG_P ) return (uptr)&VU->p;
+ }
+
+ return (uptr)&VU->VI[reg].UL;
+}
+
+// gets a temp reg that is not EEREC_TEMP
+int _vuGetTempXMMreg(int info)
+{
+ int t1reg = -1;
+
+ if( _hasFreeXMMreg() ) {
+ t1reg = _allocTempXMMreg(XMMT_FPS, -1);
+
+ if( t1reg == EEREC_TEMP ) {
+ if( _hasFreeXMMreg() ) {
+ int t = _allocTempXMMreg(XMMT_FPS, -1);
+ _freeXMMreg(t1reg);
+ t1reg = t;
+ }
+ else {
+ _freeXMMreg(t1reg);
+ t1reg = -1;
+ }
+ }
+ }
+
+ return t1reg;
+}
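+// Typical use at a clamp site (sketch): take a scratch reg when one is free,
+// otherwise fall back to the in-place single-reg path:
+//   int t1reg = _vuGetTempXMMreg(info);
+//   if( t1reg >= 0 ) { vuFloat2(regd, t1reg, xyzw); _freeXMMreg(t1reg); }
+//   else vuFloat(info, regd, xyzw);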
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// Misc VU Reg Flipping/Merging Functions
+//------------------------------------------------------------------
+void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw)
+{
+ switch (xyzw) {
+ case 0: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x00); break;
+ case 1: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x55); break;
+ case 2: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xaa); break;
+ case 3: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xff); break;
+ }
+}
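+// Note: a PSHUFD immediate is four 2-bit lane selectors (low bits pick dest
+// lane 0); 0x00/0x55/0xaa/0xff replicate source lane 0/1/2/3 into all four
+// lanes, e.g. 0x55 == binary 01 01 01 01 broadcasts y.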
+
+void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw)
+{
+ switch (xyzw) {
+ case 0: SSE_MOVSS_XMM_to_XMM(dstreg, srcreg); break;
+ case 1: if ( x86caps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(1, 0, 0));
+ else SSE2_PSHUFLW_XMM_to_XMM(dstreg, srcreg, 0xee);
+ break;
+ case 2: SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); break;
+ case 3: if ( x86caps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(3, 0, 0));
+ else { SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); SSE2_PSHUFLW_XMM_to_XMM(dstreg, dstreg, 0xee); }
+ break;
+ }
+}
+
+void _vuFlipRegSS(VURegs * VU, int reg)
+{
+ assert( _XYZW_SS );
+ if( _Y ) SSE2_PSHUFLW_XMM_to_XMM(reg, reg, 0x4e);
+ else if( _Z ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6);
+ else if( _W ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27);
+}
+
+void _vuFlipRegSS_xyzw(int reg, int xyzw)
+{
+ switch ( xyzw ) {
+ case 1: SSE2_PSHUFLW_XMM_to_XMM(reg, reg, 0x4e); break;
+ case 2: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6); break;
+ case 3: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27); break;
+ }
+}
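+// Note: each case swaps the wanted lane with x so scalar (SS) ops can reach
+// it, and applying the same shuffle again undoes the swap: PSHUFLW 0x4e swaps
+// the two low dwords (x <-> y), SHUFPS 0xc6 swaps x <-> z, SHUFPS 0x27 swaps
+// x <-> w.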
+
+void _vuMoveSS(VURegs * VU, int dstreg, int srcreg)
+{
+ assert( _XYZW_SS );
+ if( _Y ) _unpackVFSS_xyzw(dstreg, srcreg, 1);
+ else if( _Z ) _unpackVFSS_xyzw(dstreg, srcreg, 2);
+ else if( _W ) _unpackVFSS_xyzw(dstreg, srcreg, 3);
+ else _unpackVFSS_xyzw(dstreg, srcreg, 0);
+}
+
+// Merge masks: 1 = take lane from src, 0 = keep dest; bits listed in wzyx
+// order ('s' marks variants that leave src unmodified)
+void VU_MERGE0(int dest, int src) { // 0000s
+}
+void VU_MERGE1(int dest, int src) { // 1000
+ SSE_MOVHLPS_XMM_to_XMM(src, dest);
+ SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4);
+}
+void VU_MERGE1b(int dest, int src) { // 1000s
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
+}
+void VU_MERGE2(int dest, int src) { // 0100
+ SSE_MOVHLPS_XMM_to_XMM(src, dest);
+ SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64);
+}
+void VU_MERGE2b(int dest, int src) { // 0100s
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
+}
+void VU_MERGE3(int dest, int src) { // 1100s
+ SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
+}
+void VU_MERGE4(int dest, int src) { // 0010
+ SSE_MOVSS_XMM_to_XMM(src, dest);
+ SSE2_MOVSD_XMM_to_XMM(dest, src);
+}
+void VU_MERGE4b(int dest, int src) { // 0010s
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
+}
+void VU_MERGE5(int dest, int src) { // 1010
+ SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd8);
+}
+void VU_MERGE5b(int dest, int src) { // 1010s
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
+}
+void VU_MERGE6(int dest, int src) { // 0110
+ SSE_SHUFPS_XMM_to_XMM(dest, src, 0x9c);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x78);
+}
+void VU_MERGE6b(int dest, int src) { // 0110s
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
+}
+void VU_MERGE7(int dest, int src) { // 1110
+ SSE_MOVSS_XMM_to_XMM(src, dest);
+ SSE_MOVAPS_XMM_to_XMM(dest, src);
+}
+void VU_MERGE7b(int dest, int src) { // 1110s
+ SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
+}
+void VU_MERGE8(int dest, int src) { // 0001s
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+}
+void VU_MERGE9(int dest, int src) { // 1001
+ SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc9);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd2);
+}
+void VU_MERGE9b(int dest, int src) { // 1001s
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
+}
+void VU_MERGE10(int dest, int src) { // 0101
+ SSE_SHUFPS_XMM_to_XMM(dest, src, 0x8d);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x72);
+}
+void VU_MERGE10b(int dest, int src) { // 0101s
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
+}
+void VU_MERGE11(int dest, int src) { // 1101s
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
+}
+void VU_MERGE12(int dest, int src) { // 0011
+ SSE2_MOVSD_XMM_to_XMM(dest, src);
+}
+void VU_MERGE13(int dest, int src) { // 1011
+ SSE_MOVHLPS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, dest, 0x64);
+ SSE_MOVAPS_XMM_to_XMM(dest, src);
+}
+void VU_MERGE13b(int dest, int src) { // 1011s
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
+}
+void VU_MERGE14(int dest, int src) { // 0111
+ SSE_MOVHLPS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, dest, 0xc4);
+ SSE_MOVAPS_XMM_to_XMM(dest, src);
+}
+void VU_MERGE14b(int dest, int src) { // 0111s
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
+}
+void VU_MERGE15(int dest, int src) { // 1111s
+ SSE_MOVAPS_XMM_to_XMM(dest, src);
+}
+
+typedef void (*VUMERGEFN)(int dest, int src);
+
+static VUMERGEFN s_VuMerge[16] = {
+ VU_MERGE0, VU_MERGE1, VU_MERGE2, VU_MERGE3,
+ VU_MERGE4, VU_MERGE5, VU_MERGE6, VU_MERGE7,
+ VU_MERGE8, VU_MERGE9, VU_MERGE10, VU_MERGE11,
+ VU_MERGE12, VU_MERGE13, VU_MERGE14, VU_MERGE15 };
+
+static VUMERGEFN s_VuMerge2[16] = {
+ VU_MERGE0, VU_MERGE1b, VU_MERGE2b, VU_MERGE3,
+ VU_MERGE4b, VU_MERGE5b, VU_MERGE6b, VU_MERGE7b,
+ VU_MERGE8, VU_MERGE9b, VU_MERGE10b, VU_MERGE11,
+ VU_MERGE12, VU_MERGE13b, VU_MERGE14b, VU_MERGE15 };
+
+// Modifies the Source Reg!
+void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw) {
+ xyzw &= 0xf;
+ if ( (dest != src) && (xyzw != 0) ) {
+ if ( x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) {
+ xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
+ SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw);
+ }
+ else s_VuMerge[xyzw](dest, src);
+ }
+}
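+// Note: the mask is bit-reversed above because BLENDPS treats bit i as lane i
+// (x == bit 0) while the dest mask keeps x in bit 3; e.g. xyzw == 12 (xy)
+// becomes a BLENDPS immediate of 3. The 0x8 and 0xf cases skip SSE4 since
+// MOVSS/MOVAPS already handle them in a single op.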
+// Doesn't Modify the Source Reg! (ToDo: s_VuMerge2() has room for optimization)
+void VU_MERGE_REGS_SAFE(int dest, int src, int xyzw) {
+ xyzw &= 0xf;
+ if ( (dest != src) && (xyzw != 0) ) {
+ if ( x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) {
+ xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
+ SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw);
+ }
+ else s_VuMerge2[xyzw](dest, src);
+ }
+}
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// Misc VU Reg Clamping/Overflow Functions
+//------------------------------------------------------------------
+#define CLAMP_NORMAL_SSE4(n) \
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);\
+ SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[n][0]);\
+ SSE2_PSUBD_XMM_to_XMM(regTemp, regd);\
+ SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_ones[0]);\
+ SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[n][0]);\
+ SSE2_PSLLD_I8_to_XMM(regTemp, 31);\
+ SSE_XORPS_XMM_to_XMM(regd, regTemp);
+
+#define CLAMP_SIGN_SSE4(n) \
+ SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[n][0]);\
+ SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[n][0]);
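+
+// Note: both macros lean on IEEE bit patterns ordering like integers: PMINSD
+// vs +fMax (0x7f7fffff) pulls +NaN/+Inf down to +fMax, and PMINUD vs -fMax
+// (0xff7fffff) pulls -NaN/-Inf down to -fMax (unselected lanes compare against
+// all-ones/INT_MAX and pass through). CLAMP_NORMAL's extra PSUBD/PCMPGTD/
+// PSLLD/XORPS step flips results that moved by more than 1 (-NaN) to +fMax,
+// while -Inf (which moves exactly 1) keeps its sign.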
+
+void vFloat0(int regd, int regTemp) { } //0000
+void vFloat1(int regd, int regTemp) { //1000
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+}
+void vFloat1c(int regd, int regTemp) { //1000
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(1);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat2(int regd, int regTemp) { //0100
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+}
+void vFloat2c(int regd, int regTemp) { //0100
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(2);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat3(int regd, int regTemp) { //1100
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
+}
+void vFloat3b(int regd, int regTemp) { //1100 //regTemp is Modified
+ SSE2_MOVSD_XMM_to_XMM(regTemp, regd);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
+}
+void vFloat3c(int regd, int regTemp) { //1100
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(3);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat4(int regd, int regTemp) { //0010
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+}
+void vFloat4c(int regd, int regTemp) { //0010
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(4);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat5(int regd, int regTemp) { //1010
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
+}
+void vFloat5b(int regd, int regTemp) { //1010 //regTemp is Modified
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_NORMAL_SSE4(5);
+ }
+ else {
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
+ }
+}
+void vFloat5c(int regd, int regTemp) { //1010
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(5);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat6(int regd, int regTemp) { //0110
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
+}
+void vFloat6b(int regd, int regTemp) { //0110 //regTemp is Modified
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_NORMAL_SSE4(6);
+ }
+ else {
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
+ }
+}
+void vFloat6c(int regd, int regTemp) { //0110
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(6);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat7(int regd, int regTemp) { //1110
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
+}
+void vFloat7_useEAX(int regd, int regTemp) { //1110 //EAX is Modified
+ SSE2_MOVD_XMM_to_R(EAX, regd);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ if ( x86caps.hasStreamingSIMD4Extensions )
+ SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
+ else {
+ SSE_PINSRW_R32_to_XMM(regd, EAX, 0);
+ SHR32ItoR(EAX, 16);
+ SSE_PINSRW_R32_to_XMM(regd, EAX, 1);
+ }
+}
+void vFloat7b(int regd, int regTemp) { //1110 //regTemp is Modified
+ SSE_MOVSS_XMM_to_XMM(regTemp, regd);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ SSE_MOVSS_XMM_to_XMM(regd, regTemp);
+}
+void vFloat7c(int regd, int regTemp) { //1110
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(7);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat7c_useEAX(int regd, int regTemp) { //1110 //EAX is Modified
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(7);
+ }
+ else {
+ SSE2_MOVD_XMM_to_R(EAX, regd);
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ SSE2_MOVD_R_to_XMM(regTemp, EAX);
+ SSE_MOVSS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat8(int regd, int regTemp) { //0001
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+}
+void vFloat8c(int regd, int regTemp) { //0001
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(8);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat9(int regd, int regTemp) { //1001
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+}
+void vFloat9b(int regd, int regTemp) { //1001 //regTemp is Modified
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_NORMAL_SSE4(9);
+ }
+ else {
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ }
+}
+void vFloat9c(int regd, int regTemp) { //1001
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(9);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat10(int regd, int regTemp) { //0101
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+}
+void vFloat10b(int regd, int regTemp) { //0101 //regTemp is Modified
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_NORMAL_SSE4(10);
+ }
+ else {
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ }
+}
+void vFloat10c(int regd, int regTemp) { //0101
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(10);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat11(int regd, int regTemp) { //1101
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
+}
+void vFloat11_useEAX(int regd, int regTemp) { //1101 //EAX is Modified
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE2_MOVD_XMM_to_R(EAX, regd);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ if ( x86caps.hasStreamingSIMD4Extensions )
+ SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
+ else {
+ SSE_PINSRW_R32_to_XMM(regd, EAX, 0);
+ SHR32ItoR(EAX, 16);
+ SSE_PINSRW_R32_to_XMM(regd, EAX, 1);
+ }
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+}
+void vFloat11b(int regd, int regTemp) { //1101 //regTemp is Modified
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ SSE_MOVSS_XMM_to_XMM(regTemp, regd);
+ SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
+}
+void vFloat11c(int regd, int regTemp) { //1101
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(11);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat11c_useEAX(int regd, int regTemp) { //1101 // EAX is modified
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(11);
+ }
+ else {
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE2_MOVD_XMM_to_R(EAX, regd);
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ SSE2_MOVD_R_to_XMM(regTemp, EAX);
+ SSE_MOVSS_XMM_to_XMM(regd, regTemp);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ }
+}
+void vFloat12(int regd, int regTemp) { //0011
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+}
+void vFloat12b(int regd, int regTemp) { //0011 //regTemp is Modified
+ SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ SSE2_PUNPCKLQDQ_XMM_to_XMM(regd, regTemp);
+}
+void vFloat12c(int regd, int regTemp) { //0011
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(12);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat13(int regd, int regTemp) { //1011
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
+}
+void vFloat13_useEAX(int regd, int regTemp) { //1011 // EAX is modified
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE2_MOVD_XMM_to_R(EAX, regd);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ if ( x86caps.hasStreamingSIMD4Extensions )
+ SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
+ else {
+ SSE_PINSRW_R32_to_XMM(regd, EAX, 0);
+ SHR32ItoR(EAX, 16);
+ SSE_PINSRW_R32_to_XMM(regd, EAX, 1);
+ }
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+}
+void vFloat13b(int regd, int regTemp) { //1011 //regTemp is Modified
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
+ SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x64);
+}
+void vFloat13c(int regd, int regTemp) { //1011
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(13);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat13c_useEAX(int regd, int regTemp) { //1011 // EAX is modified
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(13);
+ }
+ else {
+ SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0xc6);
+ SSE2_MOVD_XMM_to_R(EAX, regd);
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ SSE2_MOVD_R_to_XMM(regTemp, EAX);
+ SSE_MOVSS_XMM_to_XMM(regd, regTemp);
+ SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0xc6);
+ }
+}
+void vFloat14(int regd, int regTemp) { //0111
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
+}
+void vFloat14_useEAX(int regd, int regTemp) { //0111 // EAX is modified
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+ SSE2_MOVD_XMM_to_R(EAX, regd);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ if ( x86caps.hasStreamingSIMD4Extensions )
+ SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
+ else {
+ SSE_PINSRW_R32_to_XMM(regd, EAX, 0);
+ SHR32ItoR(EAX, 16);
+ SSE_PINSRW_R32_to_XMM(regd, EAX, 1);
+ }
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
+}
+void vFloat14b(int regd, int regTemp) { //0111 //regTemp is Modified
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
+ SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xc4);
+}
+void vFloat14c(int regd, int regTemp) { //0111
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(14);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
+ SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+void vFloat14c_useEAX(int regd, int regTemp) { //0111 // EAX is modified
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(14);
+ }
+ else {
+ SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0x27);
+ SSE2_MOVD_XMM_to_R(EAX, regd);
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ SSE2_MOVD_R_to_XMM(regTemp, EAX);
+ SSE_MOVSS_XMM_to_XMM(regd, regTemp);
+ SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0x27);
+ }
+}
+void vFloat15(int regd, int regTemp) { //1111
+ SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
+}
+void vFloat15c(int regd, int regTemp) { //1111
+ if ( x86caps.hasStreamingSIMD4Extensions ) {
+ CLAMP_SIGN_SSE4(15);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
+ SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
+ SSE_MINPS_M128_to_XMM(regd, (uptr)&g_maxvals[0]);
+ SSE_MAXPS_M128_to_XMM(regd, (uptr)&g_minvals[0]);
+ SSE_ORPS_XMM_to_XMM(regd, regTemp);
+ }
+}
+
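Each vFloatN variant above clamps only the lanes selected by its 4-bit XYZW mask (x = bit 3 ... w = bit 0), shuffling each selected lane into the scalar slot for a MINSS/MAXSS pair. A minimal scalar sketch of the net effect, assuming g_maxvals holds 0x7f7fffff; the helper name is hypothetical, and the trailing-'c' variants additionally restore the original sign bits via const_clip:

    // Scalar picture of a plain vFloatN; unselected lanes pass through untouched.
    static void vFloatScalar(float vec[4], int xyzw) // bit 3 = x ... bit 0 = w
    {
        const float fMax = 3.4028235e38f; // FLT_MAX, bit pattern 0x7f7fffff
        for (int i = 0; i < 4; ++i) {
            if (!(xyzw & (8 >> i))) continue;
            float f = vec[i];
            f = (f <= fMax)  ? f : fMax;   // MINSS: +Inf and NaN become +fMax
            f = (f >= -fMax) ? f : -fMax;  // MAXSS: -Inf becomes -fMax
            vec[i] = f;
        }
    }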
+vFloat vFloats1[16] = { //regTemp is not modified
+ vFloat0, vFloat1, vFloat2, vFloat3,
+ vFloat4, vFloat5, vFloat6, vFloat7,
+ vFloat8, vFloat9, vFloat10, vFloat11,
+ vFloat12, vFloat13, vFloat14, vFloat15 };
+
+vFloat vFloats1_useEAX[16] = { //regTemp is not modified but EAX is used
+ vFloat0, vFloat1, vFloat2, vFloat3,
+ vFloat4, vFloat5, vFloat6, vFloat7_useEAX,
+ vFloat8, vFloat9, vFloat10, vFloat11_useEAX,
+ vFloat12, vFloat13_useEAX, vFloat14_useEAX, vFloat15 };
+
+vFloat vFloats2[16] = { //regTemp is modified
+ vFloat0, vFloat1, vFloat2, vFloat3b,
+ vFloat4, vFloat5b, vFloat6b, vFloat7b,
+ vFloat8, vFloat9b, vFloat10b, vFloat11b,
+ vFloat12b, vFloat13b, vFloat14b, vFloat15 };
+
+vFloat vFloats4[16] = { //regTemp is modified
+ vFloat0, vFloat1c, vFloat2c, vFloat3c,
+ vFloat4c, vFloat5c, vFloat6c, vFloat7c,
+ vFloat8c, vFloat9c, vFloat10c, vFloat11c,
+ vFloat12c, vFloat13c, vFloat14c, vFloat15c };
+
+vFloat vFloats4_useEAX[16] = { //regTemp is modified and EAX is used
+ vFloat0, vFloat1c, vFloat2c, vFloat3c,
+ vFloat4c, vFloat5c, vFloat6c, vFloat7c_useEAX,
+ vFloat8c, vFloat9c, vFloat10c, vFloat11c_useEAX,
+ vFloat12c, vFloat13c_useEAX, vFloat14c_useEAX, vFloat15c };
+
+//------------------------------------------------------------------
+// Clamping Functions (wrapper for vFloat* functions)
+// vuFloat : "normal" clamping
+// vuFloat_useEAX : "normal" clamping (faster but EAX is modified)
+// vuFloat2 : "normal" clamping (fastest but regTemp is modified)
+// vuFloat3						: "preserve sign" clamping on a memory operand (via pointer)
+// vuFloat4 : "preserve sign" clamping (regTemp is modified; *FASTEST* on SSE4 CPUs)
+// vuFloat4_useEAX : "preserve sign" clamping (faster but regTemp and EAX are modified)
+// vuFloat5 : wrapper function for vuFloat2 and vuFloat4
+// vuFloat5_useEAX : wrapper function for vuFloat2 and vuFloat4_useEAX
+// vuFloatExtra : for debugging
+//
+// Notice 1: vuFloat*_useEAX may be slower on AMD CPUs, which have independent execution
+//			  pipelines for vector and scalar instructions (needs verification)
+// Notice 2: recVUMI_MUL_xyzw_toD and recVUMI_MADD_xyzw_toD use vFloats directly!
+//------------------------------------------------------------------
+
+// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (doesn't use any temp regs)
+void vuFloat( int info, int regd, int XYZW) {
+ if( CHECK_VU_OVERFLOW ) {
+ /*if ( (XYZW != 0) && (XYZW != 8) && (XYZW != 0xF) ) {
+ int t1reg = _vuGetTempXMMreg(info);
+ if (t1reg >= 0) {
+ vuFloat2( regd, t1reg, XYZW );
+ _freeXMMreg( t1reg );
+ return;
+ }
+ }*/
+ //vuFloatExtra(regd, XYZW);
+ vFloats1[XYZW](regd, regd);
+ }
+}
+
+// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (uses EAX as a temp register; faster but **destroys EAX**)
+void vuFloat_useEAX( int info, int regd, int XYZW) {
+ if( CHECK_VU_OVERFLOW ) {
+ vFloats1_useEAX[XYZW](regd, regd);
+ }
+}
+
+// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (uses a temp reg)
+void vuFloat2(int regd, int regTemp, int XYZW) {
+ if( CHECK_VU_OVERFLOW ) {
+ //vuFloatExtra(regd, XYZW);
+ vFloats2[XYZW](regd, regTemp);
+ }
+}
+
+// Clamps +/-NaN and +/-Inf to +/-fMax (uses a temp reg)
+void vuFloat4(int regd, int regTemp, int XYZW) {
+ if( CHECK_VU_OVERFLOW ) {
+ vFloats4[XYZW](regd, regTemp);
+ }
+}
+
+// Clamps +/-NaN and +/-Inf to +/-fMax (uses both regTemp and EAX as temps; faster but **destroys EAX**)
+void vuFloat4_useEAX(int regd, int regTemp, int XYZW) {
+ if( CHECK_VU_OVERFLOW ) {
+ vFloats4_useEAX[XYZW](regd, regTemp);
+ }
+}
+
+// Uses vuFloat4 or vuFloat2 depending on the CHECK_VU_SIGN_OVERFLOW setting
+void vuFloat5(int regd, int regTemp, int XYZW) {
+ if (CHECK_VU_SIGN_OVERFLOW) {
+ vuFloat4(regd, regTemp, XYZW);
+ }
+ else vuFloat2(regd, regTemp, XYZW);
+}
+
+// Uses vuFloat4_useEAX or vuFloat2 depending on the CHECK_VU_SIGN_OVERFLOW setting (uses EAX as a temp register; faster but **destroys EAX**)
+void vuFloat5_useEAX(int regd, int regTemp, int XYZW) {
+ if (CHECK_VU_SIGN_OVERFLOW) {
+ vuFloat4_useEAX(regd, regTemp, XYZW);
+ }
+ else vuFloat2(regd, regTemp, XYZW);
+}
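These wrappers are what the upper-instruction recompilers call to clamp their sources before arithmetic; a hedged sketch of a typical call site, mirroring the pattern used by recVUMI_ADD later in this patch:

    if (CHECK_VU_EXTRA_OVERFLOW) {
        // vuFloat5* picks sign-preserving clamping (vuFloat4*) when
        // CHECK_VU_SIGN_OVERFLOW is set, plain clamping (vuFloat2) otherwise.
        if (_Fs_) vuFloat5_useEAX(EEREC_S, EEREC_TEMP, _X_Y_Z_W);
        if (_Ft_) vuFloat5_useEAX(EEREC_T, EEREC_TEMP, _X_Y_Z_W);
    }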
+
+// Clamps +/-infs to +/-fMax, and +/-NaNs to +/-fMax
+void vuFloat3(uptr x86ptr) {
+ u8* pjmp;
+
+ if( CHECK_VU_OVERFLOW ) {
+ CMP32ItoM(x86ptr, 0x7f800000 );
+ pjmp = JL8(0); // Signed Comparison
+ MOV32ItoM(x86ptr, 0x7f7fffff );
+ x86SetJ8(pjmp);
+
+ CMP32ItoM(x86ptr, 0xff800000 );
+ pjmp = JB8(0); // Unsigned Comparison
+ MOV32ItoM(x86ptr, 0xff7fffff );
+ x86SetJ8(pjmp);
+ }
+}
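Because same-signed IEEE-754 floats order the same way as their raw bit patterns, the two compare/branch pairs emitted above amount to this scalar check (helper name hypothetical):

    static void clampInMemory(u32* addr)
    {
        u32 v = *addr;
        if ((s32)v >= 0x7f800000) v = 0x7f7fffff; // signed cmp: +Inf/+NaN -> +fMax
        if (v >= 0xff800000u)     v = 0xff7fffff; // unsigned cmp: -Inf/-NaN -> -fMax
        *addr = v;
    }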
+
+__aligned16 u64 vuFloatData[4];
+
+// Makes NaN == 0, Infinities stay the same; Very Slow - Use only for debugging
+void vuFloatExtra( int regd, int XYZW) {
+ int t1reg = (regd == 0) ? (regd + 1) : (regd - 1);
+ int t2reg = (regd <= 1) ? (regd + 2) : (regd - 2);
+ SSE_MOVAPS_XMM_to_M128( (uptr)&vuFloatData[0], t1reg );
+ SSE_MOVAPS_XMM_to_M128( (uptr)&vuFloatData[2], t2reg );
+
+ SSE_XORPS_XMM_to_XMM(t1reg, t1reg);
+ SSE_CMPORDPS_XMM_to_XMM(t1reg, regd);
+ SSE_MOVAPS_XMM_to_XMM(t2reg, regd);
+ SSE_ANDPS_XMM_to_XMM(t2reg, t1reg);
+ VU_MERGE_REGS_CUSTOM(regd, t2reg, XYZW);
+
+ SSE_MOVAPS_M128_to_XMM( t1reg, (uptr)&vuFloatData[0] );
+ SSE_MOVAPS_M128_to_XMM( t2reg, (uptr)&vuFloatData[2] );
+}
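The XORPS/CMPORDPS pair builds an all-ones mask in the non-NaN lanes (a compare against zero is "ordered" exactly when the other operand is not NaN), so ANDing it back zeroes only the NaN lanes. Scalar equivalent (hypothetical helper):

    static float zeroIfNaN(float f)
    {
        return (f == f) ? f : 0.0f; // f != f holds exactly when f is NaN
    }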
+
+static __aligned16 u32 tempRegX[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
+
+// Called by testWhenOverflow() function
+void testPrintOverflow() {
+ tempRegX[0] &= 0xff800000;
+ tempRegX[1] &= 0xff800000;
+ tempRegX[2] &= 0xff800000;
+ tempRegX[3] &= 0xff800000;
+ if ( (tempRegX[0] == 0x7f800000) || (tempRegX[1] == 0x7f800000) || (tempRegX[2] == 0x7f800000) || (tempRegX[3] == 0x7f800000) )
+ Console.Warning( "VU OVERFLOW!: Changing to +Fmax!!!!!!!!!!!!" );
+ if ( (tempRegX[0] == 0xff800000) || (tempRegX[1] == 0xff800000) || (tempRegX[2] == 0xff800000) || (tempRegX[3] == 0xff800000) )
+ Console.Warning( "VU OVERFLOW!: Changing to -Fmax!!!!!!!!!!!!" );
+}
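Masking with 0xff800000 keeps only the sign and exponent bits, so the equality tests above fire for infinities; note that NaNs share the all-ones exponent and would trigger the same warning. Equivalent predicate (name hypothetical):

    static bool hasInfExponent(u32 bits)
    {
        return (bits & 0x7f800000u) == 0x7f800000u; // exponent all ones: Inf or NaN
    }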
+
+// Outputs to the console when an overflow has occurred.
+void testWhenOverflow(int info, int regd, int t0reg) {
+ SSE_MOVAPS_XMM_to_M128((uptr)tempRegX, regd);
+ CALLFunc((uptr)testPrintOverflow);
+}
diff --git a/pcsx2/x86/sVU_Micro.h b/pcsx2/x86/sVU_Micro.h
index 9d7415f380..4aec2425ed 100644
--- a/pcsx2/x86/sVU_Micro.h
+++ b/pcsx2/x86/sVU_Micro.h
@@ -1,283 +1,283 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "VUmicro.h"
-
-extern u32 vudump;
-
-#define VU0_MEMSIZE 0x1000
-#define VU1_MEMSIZE 0x4000
-
-void recResetVU0();
-void recExecuteVU0Block();
-void recClearVU0( u32 Addr, u32 Size );
-
-void recVU1Init();
-void recVU1Shutdown();
-void recResetVU1();
-void recExecuteVU1Block();
-void recClearVU1( u32 Addr, u32 Size );
-
-
-u32 GetVIAddr(VURegs * VU, int reg, int read, int info); // returns the correct VI addr
-void recUpdateFlags(VURegs * VU, int reg, int info);
-
-void _recvuTestPipes(VURegs * VU);
-void _recvuFlushFDIV(VURegs * VU);
-void _recvuTestUpperStalls(VURegs * VU, _VURegsNum *VUregsn);
-void _recvuTestLowerStalls(VURegs * VU, _VURegsNum *VUregsn);
-void _recvuAddUpperStalls(VURegs * VU, _VURegsNum *VUregsn);
-void _recvuAddLowerStalls(VURegs * VU, _VURegsNum *VUregsn);
-
-#define VUOP_READ 2
-#define VUOP_WRITE 4
-
-// save on mem
-struct _vuopinfo {
- int cycle;
- int cycles;
- u8 statusflag;
- u8 macflag;
- u8 clipflag;
- u8 dummy;
- u8 q;
- u8 p;
- u16 pqinst; // bit of instruction specifying index (srec only)
-};
-
-void SuperVUAnalyzeOp(VURegs *VU, _vuopinfo *info, _VURegsNum* pCodeRegs);
-int eeVURecompileCode(VURegs *VU, _VURegsNum* regs); // allocates all the necessary regs and returns the indices
-void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr); // used for MTGS in XGKICK
-
-extern int vucycle;
-typedef void (*vFloat)(int regd, int regTemp);
-extern vFloat vFloats1[16];
-extern vFloat vFloats1_useEAX[16];
-extern vFloat vFloats2[16];
-extern vFloat vFloats4[16];
-extern vFloat vFloats4_useEAX[16];
-extern const __aligned16 float s_fones[8];
-extern const __aligned16 u32 s_mask[4];
-extern const __aligned16 u32 s_expmask[4];
-extern const __aligned16 u32 g_minvals[4];
-extern const __aligned16 u32 g_maxvals[4];
-extern const __aligned16 u32 const_clip[8];
-
-u32 GetVIAddr(VURegs * VU, int reg, int read, int info);
-int _vuGetTempXMMreg(int info);
-void vuFloat(int info, int regd, int XYZW);
-void vuFloat_useEAX(int regd, int regTemp, int XYZW);
-void vuFloat2(int regd, int regTemp, int XYZW);
-void vuFloat3(uptr x86ptr);
-void vuFloat4(int regd, int regTemp, int XYZW);
-void vuFloat4_useEAX(int regd, int regTemp, int XYZW);
-void vuFloat5(int regd, int regTemp, int XYZW);
-void vuFloat5_useEAX(int regd, int regTemp, int XYZW);
-void _vuFlipRegSS(VURegs * VU, int reg);
-void _vuFlipRegSS_xyzw(int reg, int xyzw);
-void _vuMoveSS(VURegs * VU, int dstreg, int srcreg);
-void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw);
-void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw);
-void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw);
-void VU_MERGE_REGS_SAFE(int dest, int src, int xyzw);
-#define VU_MERGE_REGS(dest, src) { \
- VU_MERGE_REGS_CUSTOM(dest, src, _X_Y_Z_W); \
-}
-
-// use for allocating vi regs
-#define ALLOCTEMPX86(mode) _allocX86reg(-1, X86TYPE_TEMP, 0, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode)
-#define ALLOCVI(vi, mode) _allocX86reg(-1, X86TYPE_VI|((VU==&VU1)?X86TYPE_VU1:0), vi, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode)
-#define ADD_VI_NEEDED(vi) _addNeededX86reg(X86TYPE_VI|(VU==&VU1?X86TYPE_VU1:0), vi);
-
-#define SWAP(x, y) *(u32*)&y ^= *(u32*)&x ^= *(u32*)&y ^= *(u32*)&x;
-
-/*****************************************
- VU Micromode Upper instructions
-*****************************************/
-
-void recVUMI_ABS(VURegs *vuRegs, int info);
-void recVUMI_ADD(VURegs *vuRegs, int info);
-void recVUMI_ADDi(VURegs *vuRegs, int info);
-void recVUMI_ADDq(VURegs *vuRegs, int info);
-void recVUMI_ADDx(VURegs *vuRegs, int info);
-void recVUMI_ADDy(VURegs *vuRegs, int info);
-void recVUMI_ADDz(VURegs *vuRegs, int info);
-void recVUMI_ADDw(VURegs *vuRegs, int info);
-void recVUMI_ADDA(VURegs *vuRegs, int info);
-void recVUMI_ADDAi(VURegs *vuRegs, int info);
-void recVUMI_ADDAq(VURegs *vuRegs, int info);
-void recVUMI_ADDAx(VURegs *vuRegs, int info);
-void recVUMI_ADDAy(VURegs *vuRegs, int info);
-void recVUMI_ADDAz(VURegs *vuRegs, int info);
-void recVUMI_ADDAw(VURegs *vuRegs, int info);
-void recVUMI_SUB(VURegs *vuRegs, int info);
-void recVUMI_SUBi(VURegs *vuRegs, int info);
-void recVUMI_SUBq(VURegs *vuRegs, int info);
-void recVUMI_SUBx(VURegs *vuRegs, int info);
-void recVUMI_SUBy(VURegs *vuRegs, int info);
-void recVUMI_SUBz(VURegs *vuRegs, int info);
-void recVUMI_SUBw(VURegs *vuRegs, int info);
-void recVUMI_SUBA(VURegs *vuRegs, int info);
-void recVUMI_SUBAi(VURegs *vuRegs, int info);
-void recVUMI_SUBAq(VURegs *vuRegs, int info);
-void recVUMI_SUBAx(VURegs *vuRegs, int info);
-void recVUMI_SUBAy(VURegs *vuRegs, int info);
-void recVUMI_SUBAz(VURegs *vuRegs, int info);
-void recVUMI_SUBAw(VURegs *vuRegs, int info);
-void recVUMI_MUL(VURegs *vuRegs, int info);
-void recVUMI_MULi(VURegs *vuRegs, int info);
-void recVUMI_MULq(VURegs *vuRegs, int info);
-void recVUMI_MULx(VURegs *vuRegs, int info);
-void recVUMI_MULy(VURegs *vuRegs, int info);
-void recVUMI_MULz(VURegs *vuRegs, int info);
-void recVUMI_MULw(VURegs *vuRegs, int info);
-void recVUMI_MULA(VURegs *vuRegs, int info);
-void recVUMI_MULAi(VURegs *vuRegs, int info);
-void recVUMI_MULAq(VURegs *vuRegs, int info);
-void recVUMI_MULAx(VURegs *vuRegs, int info);
-void recVUMI_MULAy(VURegs *vuRegs, int info);
-void recVUMI_MULAz(VURegs *vuRegs, int info);
-void recVUMI_MULAw(VURegs *vuRegs, int info);
-void recVUMI_MADD(VURegs *vuRegs, int info);
-void recVUMI_MADDi(VURegs *vuRegs, int info);
-void recVUMI_MADDq(VURegs *vuRegs, int info);
-void recVUMI_MADDx(VURegs *vuRegs, int info);
-void recVUMI_MADDy(VURegs *vuRegs, int info);
-void recVUMI_MADDz(VURegs *vuRegs, int info);
-void recVUMI_MADDw(VURegs *vuRegs, int info);
-void recVUMI_MADDA(VURegs *vuRegs, int info);
-void recVUMI_MADDAi(VURegs *vuRegs, int info);
-void recVUMI_MADDAq(VURegs *vuRegs, int info);
-void recVUMI_MADDAx(VURegs *vuRegs, int info);
-void recVUMI_MADDAy(VURegs *vuRegs, int info);
-void recVUMI_MADDAz(VURegs *vuRegs, int info);
-void recVUMI_MADDAw(VURegs *vuRegs, int info);
-void recVUMI_MSUB(VURegs *vuRegs, int info);
-void recVUMI_MSUBi(VURegs *vuRegs, int info);
-void recVUMI_MSUBq(VURegs *vuRegs, int info);
-void recVUMI_MSUBx(VURegs *vuRegs, int info);
-void recVUMI_MSUBy(VURegs *vuRegs, int info);
-void recVUMI_MSUBz(VURegs *vuRegs, int info);
-void recVUMI_MSUBw(VURegs *vuRegs, int info);
-void recVUMI_MSUBA(VURegs *vuRegs, int info);
-void recVUMI_MSUBAi(VURegs *vuRegs, int info);
-void recVUMI_MSUBAq(VURegs *vuRegs, int info);
-void recVUMI_MSUBAx(VURegs *vuRegs, int info);
-void recVUMI_MSUBAy(VURegs *vuRegs, int info);
-void recVUMI_MSUBAz(VURegs *vuRegs, int info);
-void recVUMI_MSUBAw(VURegs *vuRegs, int info);
-void recVUMI_MAX(VURegs *vuRegs, int info);
-void recVUMI_MAXi(VURegs *vuRegs, int info);
-void recVUMI_MAXx(VURegs *vuRegs, int info);
-void recVUMI_MAXy(VURegs *vuRegs, int info);
-void recVUMI_MAXz(VURegs *vuRegs, int info);
-void recVUMI_MAXw(VURegs *vuRegs, int info);
-void recVUMI_MINI(VURegs *vuRegs, int info);
-void recVUMI_MINIi(VURegs *vuRegs, int info);
-void recVUMI_MINIx(VURegs *vuRegs, int info);
-void recVUMI_MINIy(VURegs *vuRegs, int info);
-void recVUMI_MINIz(VURegs *vuRegs, int info);
-void recVUMI_MINIw(VURegs *vuRegs, int info);
-void recVUMI_OPMULA(VURegs *vuRegs, int info);
-void recVUMI_OPMSUB(VURegs *vuRegs, int info);
-void recVUMI_NOP(VURegs *vuRegs, int info);
-void recVUMI_FTOI0(VURegs *vuRegs, int info);
-void recVUMI_FTOI4(VURegs *vuRegs, int info);
-void recVUMI_FTOI12(VURegs *vuRegs, int info);
-void recVUMI_FTOI15(VURegs *vuRegs, int info);
-void recVUMI_ITOF0(VURegs *vuRegs, int info);
-void recVUMI_ITOF4(VURegs *vuRegs, int info);
-void recVUMI_ITOF12(VURegs *vuRegs, int info);
-void recVUMI_ITOF15(VURegs *vuRegs, int info);
-void recVUMI_CLIP(VURegs *vuRegs, int info);
-
-/*****************************************
- VU Micromode Lower instructions
-*****************************************/
-
-void recVUMI_DIV(VURegs *vuRegs, int info);
-void recVUMI_SQRT(VURegs *vuRegs, int info);
-void recVUMI_RSQRT(VURegs *vuRegs, int info);
-void recVUMI_IADD(VURegs *vuRegs, int info);
-void recVUMI_IADDI(VURegs *vuRegs, int info);
-void recVUMI_IADDIU(VURegs *vuRegs, int info);
-void recVUMI_IAND(VURegs *vuRegs, int info);
-void recVUMI_IOR(VURegs *vuRegs, int info);
-void recVUMI_ISUB(VURegs *vuRegs, int info);
-void recVUMI_ISUBIU(VURegs *vuRegs, int info);
-void recVUMI_MOVE(VURegs *vuRegs, int info);
-void recVUMI_MFIR(VURegs *vuRegs, int info);
-void recVUMI_MTIR(VURegs *vuRegs, int info);
-void recVUMI_MR32(VURegs *vuRegs, int info);
-void recVUMI_LQ(VURegs *vuRegs, int info);
-void recVUMI_LQD(VURegs *vuRegs, int info);
-void recVUMI_LQI(VURegs *vuRegs, int info);
-void recVUMI_SQ(VURegs *vuRegs, int info);
-void recVUMI_SQD(VURegs *vuRegs, int info);
-void recVUMI_SQI(VURegs *vuRegs, int info);
-void recVUMI_ILW(VURegs *vuRegs, int info);
-void recVUMI_ISW(VURegs *vuRegs, int info);
-void recVUMI_ILWR(VURegs *vuRegs, int info);
-void recVUMI_ISWR(VURegs *vuRegs, int info);
-void recVUMI_LOI(VURegs *vuRegs, int info);
-void recVUMI_RINIT(VURegs *vuRegs, int info);
-void recVUMI_RGET(VURegs *vuRegs, int info);
-void recVUMI_RNEXT(VURegs *vuRegs, int info);
-void recVUMI_RXOR(VURegs *vuRegs, int info);
-void recVUMI_WAITQ(VURegs *vuRegs, int info);
-void recVUMI_FSAND(VURegs *vuRegs, int info);
-void recVUMI_FSEQ(VURegs *vuRegs, int info);
-void recVUMI_FSOR(VURegs *vuRegs, int info);
-void recVUMI_FSSET(VURegs *vuRegs, int info);
-void recVUMI_FMAND(VURegs *vuRegs, int info);
-void recVUMI_FMEQ(VURegs *vuRegs, int info);
-void recVUMI_FMOR(VURegs *vuRegs, int info);
-void recVUMI_FCAND(VURegs *vuRegs, int info);
-void recVUMI_FCEQ(VURegs *vuRegs, int info);
-void recVUMI_FCOR(VURegs *vuRegs, int info);
-void recVUMI_FCSET(VURegs *vuRegs, int info);
-void recVUMI_FCGET(VURegs *vuRegs, int info);
-void recVUMI_IBEQ(VURegs *vuRegs, int info);
-void recVUMI_IBGEZ(VURegs *vuRegs, int info);
-void recVUMI_IBGTZ(VURegs *vuRegs, int info);
-void recVUMI_IBLTZ(VURegs *vuRegs, int info);
-void recVUMI_IBLEZ(VURegs *vuRegs, int info);
-void recVUMI_IBNE(VURegs *vuRegs, int info);
-void recVUMI_B(VURegs *vuRegs, int info);
-void recVUMI_BAL(VURegs *vuRegs, int info);
-void recVUMI_JR(VURegs *vuRegs, int info);
-void recVUMI_JALR(VURegs *vuRegs, int info);
-void recVUMI_MFP(VURegs *vuRegs, int info);
-void recVUMI_WAITP(VURegs *vuRegs, int info);
-void recVUMI_ESADD(VURegs *vuRegs, int info);
-void recVUMI_ERSADD(VURegs *vuRegs, int info);
-void recVUMI_ELENG(VURegs *vuRegs, int info);
-void recVUMI_ERLENG(VURegs *vuRegs, int info);
-void recVUMI_EATANxy(VURegs *vuRegs, int info);
-void recVUMI_EATANxz(VURegs *vuRegs, int info);
-void recVUMI_ESUM(VURegs *vuRegs, int info);
-void recVUMI_ERCPR(VURegs *vuRegs, int info);
-void recVUMI_ESQRT(VURegs *vuRegs, int info);
-void recVUMI_ERSQRT(VURegs *vuRegs, int info);
-void recVUMI_ESIN(VURegs *vuRegs, int info);
-void recVUMI_EATAN(VURegs *vuRegs, int info);
-void recVUMI_EEXP(VURegs *vuRegs, int info);
-void recVUMI_XGKICK(VURegs *vuRegs, int info);
-void recVUMI_XTOP(VURegs *vuRegs, int info);
-void recVUMI_XITOP(VURegs *vuRegs, int info);
-void recVUMI_XTOP( VURegs *VU , int info);
-
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "VUmicro.h"
+
+extern u32 vudump;
+
+#define VU0_MEMSIZE 0x1000
+#define VU1_MEMSIZE 0x4000
+
+void recResetVU0();
+void recExecuteVU0Block();
+void recClearVU0( u32 Addr, u32 Size );
+
+void recVU1Init();
+void recVU1Shutdown();
+void recResetVU1();
+void recExecuteVU1Block();
+void recClearVU1( u32 Addr, u32 Size );
+
+
+u32 GetVIAddr(VURegs * VU, int reg, int read, int info); // returns the correct VI addr
+void recUpdateFlags(VURegs * VU, int reg, int info);
+
+void _recvuTestPipes(VURegs * VU);
+void _recvuFlushFDIV(VURegs * VU);
+void _recvuTestUpperStalls(VURegs * VU, _VURegsNum *VUregsn);
+void _recvuTestLowerStalls(VURegs * VU, _VURegsNum *VUregsn);
+void _recvuAddUpperStalls(VURegs * VU, _VURegsNum *VUregsn);
+void _recvuAddLowerStalls(VURegs * VU, _VURegsNum *VUregsn);
+
+#define VUOP_READ 2
+#define VUOP_WRITE 4
+
+// save on mem
+struct _vuopinfo {
+ int cycle;
+ int cycles;
+ u8 statusflag;
+ u8 macflag;
+ u8 clipflag;
+ u8 dummy;
+ u8 q;
+ u8 p;
+ u16 pqinst; // bit of instruction specifying index (srec only)
+};
+
+void SuperVUAnalyzeOp(VURegs *VU, _vuopinfo *info, _VURegsNum* pCodeRegs);
+int eeVURecompileCode(VURegs *VU, _VURegsNum* regs); // allocates all the necessary regs and returns the indices
+void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr); // used for MTGS in XGKICK
+
+extern int vucycle;
+typedef void (*vFloat)(int regd, int regTemp);
+extern vFloat vFloats1[16];
+extern vFloat vFloats1_useEAX[16];
+extern vFloat vFloats2[16];
+extern vFloat vFloats4[16];
+extern vFloat vFloats4_useEAX[16];
+extern const __aligned16 float s_fones[8];
+extern const __aligned16 u32 s_mask[4];
+extern const __aligned16 u32 s_expmask[4];
+extern const __aligned16 u32 g_minvals[4];
+extern const __aligned16 u32 g_maxvals[4];
+extern const __aligned16 u32 const_clip[8];
+
+u32 GetVIAddr(VURegs * VU, int reg, int read, int info);
+int _vuGetTempXMMreg(int info);
+void vuFloat(int info, int regd, int XYZW);
+void vuFloat_useEAX(int regd, int regTemp, int XYZW);
+void vuFloat2(int regd, int regTemp, int XYZW);
+void vuFloat3(uptr x86ptr);
+void vuFloat4(int regd, int regTemp, int XYZW);
+void vuFloat4_useEAX(int regd, int regTemp, int XYZW);
+void vuFloat5(int regd, int regTemp, int XYZW);
+void vuFloat5_useEAX(int regd, int regTemp, int XYZW);
+void _vuFlipRegSS(VURegs * VU, int reg);
+void _vuFlipRegSS_xyzw(int reg, int xyzw);
+void _vuMoveSS(VURegs * VU, int dstreg, int srcreg);
+void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw);
+void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw);
+void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw);
+void VU_MERGE_REGS_SAFE(int dest, int src, int xyzw);
+#define VU_MERGE_REGS(dest, src) { \
+ VU_MERGE_REGS_CUSTOM(dest, src, _X_Y_Z_W); \
+}
+
+// use for allocating vi regs
+#define ALLOCTEMPX86(mode) _allocX86reg(-1, X86TYPE_TEMP, 0, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode)
+#define ALLOCVI(vi, mode) _allocX86reg(-1, X86TYPE_VI|((VU==&VU1)?X86TYPE_VU1:0), vi, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode)
+#define ADD_VI_NEEDED(vi) _addNeededX86reg(X86TYPE_VI|(VU==&VU1?X86TYPE_VU1:0), vi);
+
+#define SWAP(x, y) *(u32*)&y ^= *(u32*)&x ^= *(u32*)&y ^= *(u32*)&x;
+
+/*****************************************
+ VU Micromode Upper instructions
+*****************************************/
+
+void recVUMI_ABS(VURegs *vuRegs, int info);
+void recVUMI_ADD(VURegs *vuRegs, int info);
+void recVUMI_ADDi(VURegs *vuRegs, int info);
+void recVUMI_ADDq(VURegs *vuRegs, int info);
+void recVUMI_ADDx(VURegs *vuRegs, int info);
+void recVUMI_ADDy(VURegs *vuRegs, int info);
+void recVUMI_ADDz(VURegs *vuRegs, int info);
+void recVUMI_ADDw(VURegs *vuRegs, int info);
+void recVUMI_ADDA(VURegs *vuRegs, int info);
+void recVUMI_ADDAi(VURegs *vuRegs, int info);
+void recVUMI_ADDAq(VURegs *vuRegs, int info);
+void recVUMI_ADDAx(VURegs *vuRegs, int info);
+void recVUMI_ADDAy(VURegs *vuRegs, int info);
+void recVUMI_ADDAz(VURegs *vuRegs, int info);
+void recVUMI_ADDAw(VURegs *vuRegs, int info);
+void recVUMI_SUB(VURegs *vuRegs, int info);
+void recVUMI_SUBi(VURegs *vuRegs, int info);
+void recVUMI_SUBq(VURegs *vuRegs, int info);
+void recVUMI_SUBx(VURegs *vuRegs, int info);
+void recVUMI_SUBy(VURegs *vuRegs, int info);
+void recVUMI_SUBz(VURegs *vuRegs, int info);
+void recVUMI_SUBw(VURegs *vuRegs, int info);
+void recVUMI_SUBA(VURegs *vuRegs, int info);
+void recVUMI_SUBAi(VURegs *vuRegs, int info);
+void recVUMI_SUBAq(VURegs *vuRegs, int info);
+void recVUMI_SUBAx(VURegs *vuRegs, int info);
+void recVUMI_SUBAy(VURegs *vuRegs, int info);
+void recVUMI_SUBAz(VURegs *vuRegs, int info);
+void recVUMI_SUBAw(VURegs *vuRegs, int info);
+void recVUMI_MUL(VURegs *vuRegs, int info);
+void recVUMI_MULi(VURegs *vuRegs, int info);
+void recVUMI_MULq(VURegs *vuRegs, int info);
+void recVUMI_MULx(VURegs *vuRegs, int info);
+void recVUMI_MULy(VURegs *vuRegs, int info);
+void recVUMI_MULz(VURegs *vuRegs, int info);
+void recVUMI_MULw(VURegs *vuRegs, int info);
+void recVUMI_MULA(VURegs *vuRegs, int info);
+void recVUMI_MULAi(VURegs *vuRegs, int info);
+void recVUMI_MULAq(VURegs *vuRegs, int info);
+void recVUMI_MULAx(VURegs *vuRegs, int info);
+void recVUMI_MULAy(VURegs *vuRegs, int info);
+void recVUMI_MULAz(VURegs *vuRegs, int info);
+void recVUMI_MULAw(VURegs *vuRegs, int info);
+void recVUMI_MADD(VURegs *vuRegs, int info);
+void recVUMI_MADDi(VURegs *vuRegs, int info);
+void recVUMI_MADDq(VURegs *vuRegs, int info);
+void recVUMI_MADDx(VURegs *vuRegs, int info);
+void recVUMI_MADDy(VURegs *vuRegs, int info);
+void recVUMI_MADDz(VURegs *vuRegs, int info);
+void recVUMI_MADDw(VURegs *vuRegs, int info);
+void recVUMI_MADDA(VURegs *vuRegs, int info);
+void recVUMI_MADDAi(VURegs *vuRegs, int info);
+void recVUMI_MADDAq(VURegs *vuRegs, int info);
+void recVUMI_MADDAx(VURegs *vuRegs, int info);
+void recVUMI_MADDAy(VURegs *vuRegs, int info);
+void recVUMI_MADDAz(VURegs *vuRegs, int info);
+void recVUMI_MADDAw(VURegs *vuRegs, int info);
+void recVUMI_MSUB(VURegs *vuRegs, int info);
+void recVUMI_MSUBi(VURegs *vuRegs, int info);
+void recVUMI_MSUBq(VURegs *vuRegs, int info);
+void recVUMI_MSUBx(VURegs *vuRegs, int info);
+void recVUMI_MSUBy(VURegs *vuRegs, int info);
+void recVUMI_MSUBz(VURegs *vuRegs, int info);
+void recVUMI_MSUBw(VURegs *vuRegs, int info);
+void recVUMI_MSUBA(VURegs *vuRegs, int info);
+void recVUMI_MSUBAi(VURegs *vuRegs, int info);
+void recVUMI_MSUBAq(VURegs *vuRegs, int info);
+void recVUMI_MSUBAx(VURegs *vuRegs, int info);
+void recVUMI_MSUBAy(VURegs *vuRegs, int info);
+void recVUMI_MSUBAz(VURegs *vuRegs, int info);
+void recVUMI_MSUBAw(VURegs *vuRegs, int info);
+void recVUMI_MAX(VURegs *vuRegs, int info);
+void recVUMI_MAXi(VURegs *vuRegs, int info);
+void recVUMI_MAXx(VURegs *vuRegs, int info);
+void recVUMI_MAXy(VURegs *vuRegs, int info);
+void recVUMI_MAXz(VURegs *vuRegs, int info);
+void recVUMI_MAXw(VURegs *vuRegs, int info);
+void recVUMI_MINI(VURegs *vuRegs, int info);
+void recVUMI_MINIi(VURegs *vuRegs, int info);
+void recVUMI_MINIx(VURegs *vuRegs, int info);
+void recVUMI_MINIy(VURegs *vuRegs, int info);
+void recVUMI_MINIz(VURegs *vuRegs, int info);
+void recVUMI_MINIw(VURegs *vuRegs, int info);
+void recVUMI_OPMULA(VURegs *vuRegs, int info);
+void recVUMI_OPMSUB(VURegs *vuRegs, int info);
+void recVUMI_NOP(VURegs *vuRegs, int info);
+void recVUMI_FTOI0(VURegs *vuRegs, int info);
+void recVUMI_FTOI4(VURegs *vuRegs, int info);
+void recVUMI_FTOI12(VURegs *vuRegs, int info);
+void recVUMI_FTOI15(VURegs *vuRegs, int info);
+void recVUMI_ITOF0(VURegs *vuRegs, int info);
+void recVUMI_ITOF4(VURegs *vuRegs, int info);
+void recVUMI_ITOF12(VURegs *vuRegs, int info);
+void recVUMI_ITOF15(VURegs *vuRegs, int info);
+void recVUMI_CLIP(VURegs *vuRegs, int info);
+
+/*****************************************
+ VU Micromode Lower instructions
+*****************************************/
+
+void recVUMI_DIV(VURegs *vuRegs, int info);
+void recVUMI_SQRT(VURegs *vuRegs, int info);
+void recVUMI_RSQRT(VURegs *vuRegs, int info);
+void recVUMI_IADD(VURegs *vuRegs, int info);
+void recVUMI_IADDI(VURegs *vuRegs, int info);
+void recVUMI_IADDIU(VURegs *vuRegs, int info);
+void recVUMI_IAND(VURegs *vuRegs, int info);
+void recVUMI_IOR(VURegs *vuRegs, int info);
+void recVUMI_ISUB(VURegs *vuRegs, int info);
+void recVUMI_ISUBIU(VURegs *vuRegs, int info);
+void recVUMI_MOVE(VURegs *vuRegs, int info);
+void recVUMI_MFIR(VURegs *vuRegs, int info);
+void recVUMI_MTIR(VURegs *vuRegs, int info);
+void recVUMI_MR32(VURegs *vuRegs, int info);
+void recVUMI_LQ(VURegs *vuRegs, int info);
+void recVUMI_LQD(VURegs *vuRegs, int info);
+void recVUMI_LQI(VURegs *vuRegs, int info);
+void recVUMI_SQ(VURegs *vuRegs, int info);
+void recVUMI_SQD(VURegs *vuRegs, int info);
+void recVUMI_SQI(VURegs *vuRegs, int info);
+void recVUMI_ILW(VURegs *vuRegs, int info);
+void recVUMI_ISW(VURegs *vuRegs, int info);
+void recVUMI_ILWR(VURegs *vuRegs, int info);
+void recVUMI_ISWR(VURegs *vuRegs, int info);
+void recVUMI_LOI(VURegs *vuRegs, int info);
+void recVUMI_RINIT(VURegs *vuRegs, int info);
+void recVUMI_RGET(VURegs *vuRegs, int info);
+void recVUMI_RNEXT(VURegs *vuRegs, int info);
+void recVUMI_RXOR(VURegs *vuRegs, int info);
+void recVUMI_WAITQ(VURegs *vuRegs, int info);
+void recVUMI_FSAND(VURegs *vuRegs, int info);
+void recVUMI_FSEQ(VURegs *vuRegs, int info);
+void recVUMI_FSOR(VURegs *vuRegs, int info);
+void recVUMI_FSSET(VURegs *vuRegs, int info);
+void recVUMI_FMAND(VURegs *vuRegs, int info);
+void recVUMI_FMEQ(VURegs *vuRegs, int info);
+void recVUMI_FMOR(VURegs *vuRegs, int info);
+void recVUMI_FCAND(VURegs *vuRegs, int info);
+void recVUMI_FCEQ(VURegs *vuRegs, int info);
+void recVUMI_FCOR(VURegs *vuRegs, int info);
+void recVUMI_FCSET(VURegs *vuRegs, int info);
+void recVUMI_FCGET(VURegs *vuRegs, int info);
+void recVUMI_IBEQ(VURegs *vuRegs, int info);
+void recVUMI_IBGEZ(VURegs *vuRegs, int info);
+void recVUMI_IBGTZ(VURegs *vuRegs, int info);
+void recVUMI_IBLTZ(VURegs *vuRegs, int info);
+void recVUMI_IBLEZ(VURegs *vuRegs, int info);
+void recVUMI_IBNE(VURegs *vuRegs, int info);
+void recVUMI_B(VURegs *vuRegs, int info);
+void recVUMI_BAL(VURegs *vuRegs, int info);
+void recVUMI_JR(VURegs *vuRegs, int info);
+void recVUMI_JALR(VURegs *vuRegs, int info);
+void recVUMI_MFP(VURegs *vuRegs, int info);
+void recVUMI_WAITP(VURegs *vuRegs, int info);
+void recVUMI_ESADD(VURegs *vuRegs, int info);
+void recVUMI_ERSADD(VURegs *vuRegs, int info);
+void recVUMI_ELENG(VURegs *vuRegs, int info);
+void recVUMI_ERLENG(VURegs *vuRegs, int info);
+void recVUMI_EATANxy(VURegs *vuRegs, int info);
+void recVUMI_EATANxz(VURegs *vuRegs, int info);
+void recVUMI_ESUM(VURegs *vuRegs, int info);
+void recVUMI_ERCPR(VURegs *vuRegs, int info);
+void recVUMI_ESQRT(VURegs *vuRegs, int info);
+void recVUMI_ERSQRT(VURegs *vuRegs, int info);
+void recVUMI_ESIN(VURegs *vuRegs, int info);
+void recVUMI_EATAN(VURegs *vuRegs, int info);
+void recVUMI_EEXP(VURegs *vuRegs, int info);
+void recVUMI_XGKICK(VURegs *vuRegs, int info);
+void recVUMI_XTOP(VURegs *vuRegs, int info);
+void recVUMI_XITOP(VURegs *vuRegs, int info);
+void recVUMI_XTOP( VURegs *VU , int info);
+
diff --git a/pcsx2/x86/sVU_Upper.cpp b/pcsx2/x86/sVU_Upper.cpp
index 525d770596..09aba70625 100644
--- a/pcsx2/x86/sVU_Upper.cpp
+++ b/pcsx2/x86/sVU_Upper.cpp
@@ -1,3069 +1,3069 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "PrecompiledHeader.h"
-
-#include "Common.h"
-#include "GS.h"
-#include "R5900OpcodeTables.h"
-#include "iR5900.h"
-#include "iMMI.h"
-#include "iFPU.h"
-#include "iCOP0.h"
-#include "VUmicro.h"
-#include "VUflags.h"
-#include "sVU_Micro.h"
-#include "sVU_Debug.h"
-#include "sVU_zerorec.h"
-//------------------------------------------------------------------
-#define MINMAXFIX 1
-//------------------------------------------------------------------
-// Helper Macros
-//------------------------------------------------------------------
-#define _Ft_ (( VU->code >> 16) & 0x1F) // The rt part of the instruction register
-#define _Fs_ (( VU->code >> 11) & 0x1F) // The rd part of the instruction register
-#define _Fd_ (( VU->code >> 6) & 0x1F) // The sa part of the instruction register
-
-#define _X (( VU->code>>24) & 0x1)
-#define _Y (( VU->code>>23) & 0x1)
-#define _Z (( VU->code>>22) & 0x1)
-#define _W (( VU->code>>21) & 0x1)
-
-#define _XYZW_SS (_X+_Y+_Z+_W==1)
-
-#define _Fsf_ (( VU->code >> 21) & 0x03)
-#define _Ftf_ (( VU->code >> 23) & 0x03)
-
-#define _Imm11_ (s32)(VU->code & 0x400 ? 0xfffffc00 | (VU->code & 0x3ff) : VU->code & 0x3ff)
-#define _UImm11_ (s32)(VU->code & 0x7ff)
-
-#define VU_VFx_ADDR(x) (uptr)&VU->VF[x].UL[0]
-#define VU_VFy_ADDR(x) (uptr)&VU->VF[x].UL[1]
-#define VU_VFz_ADDR(x) (uptr)&VU->VF[x].UL[2]
-#define VU_VFw_ADDR(x) (uptr)&VU->VF[x].UL[3]
-
-#define VU_REGR_ADDR (uptr)&VU->VI[REG_R]
-#define VU_REGQ_ADDR (uptr)&VU->VI[REG_Q]
-#define VU_REGMAC_ADDR (uptr)&VU->VI[REG_MAC_FLAG]
-
-#define VU_VI_ADDR(x, read) GetVIAddr(VU, x, read, info)
-
-#define VU_ACCx_ADDR (uptr)&VU->ACC.UL[0]
-#define VU_ACCy_ADDR (uptr)&VU->ACC.UL[1]
-#define VU_ACCz_ADDR (uptr)&VU->ACC.UL[2]
-#define VU_ACCw_ADDR (uptr)&VU->ACC.UL[3]
-
-#define _X_Y_Z_W ((( VU->code >> 21 ) & 0xF ) )
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// Global Variables
-//------------------------------------------------------------------
-static const __aligned16 int SSEmovMask[ 16 ][ 4 ] =
-{
- { 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
- { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
- { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 },
- { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
- { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 },
- { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
- { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
- { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
- { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 },
- { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF },
- { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 },
- { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
- { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 },
- { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
- { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
- { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
-};
-
-static const __aligned16 u32 const_abs_table[16][4] =
-{
- { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //0000
- { 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff }, //0001
- { 0xffffffff, 0xffffffff, 0x7fffffff, 0xffffffff }, //0010
- { 0xffffffff, 0xffffffff, 0x7fffffff, 0x7fffffff }, //0011
- { 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff }, //0100
- { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }, //0101
- { 0xffffffff, 0x7fffffff, 0x7fffffff, 0xffffffff }, //0110
- { 0xffffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //0111
- { 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000
- { 0x7fffffff, 0xffffffff, 0xffffffff, 0x7fffffff }, //1001
- { 0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff }, //1010
- { 0x7fffffff, 0xffffffff, 0x7fffffff, 0x7fffffff }, //1011
- { 0x7fffffff, 0x7fffffff, 0xffffffff, 0xffffffff }, //1100
- { 0x7fffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }, //1101
- { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0xffffffff }, //1110
- { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1111
-};
-
-static const __aligned16 float recMult_float_to_int4[4] = { 16.0, 16.0, 16.0, 16.0 };
-static const __aligned16 float recMult_float_to_int12[4] = { 4096.0, 4096.0, 4096.0, 4096.0 };
-static const __aligned16 float recMult_float_to_int15[4] = { 32768.0, 32768.0, 32768.0, 32768.0 };
-
-static const __aligned16 float recMult_int_to_float4[4] = { 0.0625f, 0.0625f, 0.0625f, 0.0625f };
-static const __aligned16 float recMult_int_to_float12[4] = { 0.000244140625, 0.000244140625, 0.000244140625, 0.000244140625 };
-static const __aligned16 float recMult_int_to_float15[4] = { 0.000030517578125, 0.000030517578125, 0.000030517578125, 0.000030517578125 };
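These six tables are just the 2^4, 2^12, and 2^15 scale factors (and their reciprocals) used by the fixed-point conversion ops FTOI4/12/15 and ITOF4/12/15. For instance, the 4-bit-fraction case, sketched with hypothetical helper names:

    s32   ftoi4(float f) { return (s32)(f * 16.0f);   } // scale by 2^4
    float itof4(s32 i)   { return (float)i * 0.0625f; } // scale by 2^-4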
-
-static const __aligned16 u32 VU_Underflow_Mask1[4] = {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
-static const __aligned16 u32 VU_Underflow_Mask2[4] = {0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff};
-static const __aligned16 u32 VU_Zero_Mask[4] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
-static const __aligned16 u32 VU_Zero_Helper_Mask[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-static const __aligned16 u32 VU_Signed_Zero_Mask[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-static const __aligned16 u32 VU_Pos_Infinity[4] = {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
-static const __aligned16 u32 VU_Neg_Infinity[4] = {0xff800000, 0xff800000, 0xff800000, 0xff800000};
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// recUpdateFlags() - Computes the flags for the Upper Opcodes
-//
-// Note: Computes under/overflow flags if CHECK_VU_EXTRA_FLAGS is 1
-//------------------------------------------------------------------
-static __aligned16 u64 TEMPXMMData[2];
-void recUpdateFlags(VURegs * VU, int reg, int info)
-{
- static u8 *pjmp, *pjmp2;
- static u32 *pjmp32;
- static u32 macaddr, stataddr, prevstataddr;
- static int x86macflag, x86statflag, x86temp;
- static int t1reg, t1regBoolean;
- static const int flipMask[16] = {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15};
-
- if( !(info & PROCESS_VU_UPDATEFLAGS) ) {
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (reg != EEREC_TEMP) vuFloat2(reg, EEREC_TEMP, _X_Y_Z_W);
- else vuFloat_useEAX(info, reg, _X_Y_Z_W);
- }
- return;
- }
-
- //Console.WriteLn ("recUpdateFlags");
-
- macaddr = VU_VI_ADDR(REG_MAC_FLAG, 0);
- stataddr = VU_VI_ADDR(REG_STATUS_FLAG, 0); // write address
- prevstataddr = VU_VI_ADDR(REG_STATUS_FLAG, 2); // previous address
-
- if( stataddr == 0 ) stataddr = prevstataddr;
- if( macaddr == 0 ) {
- Console.WriteLn( "VU ALLOCATION WARNING: Using Mac Flag Previous Address!" );
- macaddr = VU_VI_ADDR(REG_MAC_FLAG, 2);
- }
-
- x86macflag = ALLOCTEMPX86(0);
- x86statflag = ALLOCTEMPX86(0);
-
- if (reg == EEREC_TEMP) {
- t1reg = _vuGetTempXMMreg(info);
- if (t1reg < 0) {
- //Console.WriteLn( "VU ALLOCATION ERROR: Temp reg can't be allocated!!!!" );
- t1reg = (reg == 0) ? 1 : 0; // Make t1reg != reg
- SSE_MOVAPS_XMM_to_M128( (uptr)TEMPXMMData, t1reg ); // Backup data to temp address
- t1regBoolean = 1;
- }
- else t1regBoolean = 0;
- }
- else {
- t1reg = EEREC_TEMP;
- t1regBoolean = 2;
- }
-
- SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip wzyx to xyzw
- MOV32MtoR(x86statflag, prevstataddr); // Load the previous status in to x86statflag
- AND16ItoR(x86statflag, 0xff0); // Keep Sticky and D/I flags
-
-
- if (CHECK_VU_EXTRA_FLAGS) { // Checks all flags
-
- x86temp = ALLOCTEMPX86(0);
-
- //-------------------------Check for Overflow flags------------------------------
-
- //SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
- //SSE_CMPUNORDPS_XMM_to_XMM(t1reg, reg); // If reg == NaN then set Vector to 0xFFFFFFFF
-
- //SSE_MOVAPS_XMM_to_XMM(t1reg, reg);
- //SSE_MINPS_M128_to_XMM(t1reg, (uptr)g_maxvals);
- //SSE_MAXPS_M128_to_XMM(t1reg, (uptr)g_minvals);
- //SSE_CMPNEPS_XMM_to_XMM(t1reg, reg); // If they're not equal, then overflow has occurred
-
- SSE_MOVAPS_XMM_to_XMM(t1reg, reg);
- SSE_ANDPS_M128_to_XMM(t1reg, (uptr)VU_Zero_Helper_Mask);
- SSE_CMPEQPS_M128_to_XMM(t1reg, (uptr)VU_Pos_Infinity); // If infinity, then overflow has occurred (NaN's don't report as overflow)
-
- SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the previous calculation
-
- AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being modified)
- pjmp = JZ8(0); // Skip if none are
- OR16ItoR(x86statflag, 0x208); // OS, O flags
- SHL16ItoR(x86macflag, 12);
- if (_XYZW_SS) pjmp32 = JMP32(0); // Skip Underflow Check
- x86SetJ8(pjmp);
-
- //-------------------------Check for Underflow flags------------------------------
-
- SSE_MOVAPS_XMM_to_XMM(t1reg, reg); // t1reg <- reg
-
- SSE_ANDPS_M128_to_XMM(t1reg, (uptr)&VU_Underflow_Mask1[ 0 ]);
- SSE_CMPEQPS_M128_to_XMM(t1reg, (uptr)&VU_Zero_Mask[ 0 ]); // If (t1reg == zero exponent) then set Vector to 0xFFFFFFFF
-
- SSE_ANDPS_XMM_to_XMM(t1reg, reg);
- SSE_ANDPS_M128_to_XMM(t1reg, (uptr)&VU_Underflow_Mask2[ 0 ]);
- SSE_CMPNEPS_M128_to_XMM(t1reg, (uptr)&VU_Zero_Mask[ 0 ]); // If (t1reg != zero mantissa) then set Vector to 0xFFFFFFFF
-
- SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation
-
- AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Has Underflowed" bits from the previous calculation
- pjmp = JZ8(0); // Skip if none are
- OR16ItoR(x86statflag, 0x104); // US, U flags
- SHL16ItoR(EAX, 8);
- OR32RtoR(x86macflag, EAX);
- x86SetJ8(pjmp);
-
- //-------------------------Optional Code: Denormals Are Zero------------------------------
- if (CHECK_VU_UNDERFLOW) { // Sets underflow/denormals to zero
- SSE_ANDNPS_XMM_to_XMM(t1reg, reg); // t1reg = !t1reg & reg (t1reg = denormals are positive zero)
- VU_MERGE_REGS_SAFE(t1reg, reg, (15 - flipMask[_X_Y_Z_W])); // Send t1reg the vectors that shouldn't be modified (since reg was flipped, we need a mask to get the unmodified vectors)
- // Now we have Denormals are Positive Zero in t1reg; the next two lines take Signed Zero into account
- SSE_ANDPS_M128_to_XMM(reg, (uptr)&VU_Signed_Zero_Mask[ 0 ]); // Only keep the sign bit for each vector
- SSE_ORPS_XMM_to_XMM(reg, t1reg); // Denormals are Signed Zero, and unmodified vectors stay the same!
- }
-
- if (_XYZW_SS) x86SetJ32(pjmp32); // If we skipped the Underflow Flag Checking (when we had an Overflow), return here
-
- vuFloat2(reg, t1reg, flipMask[_X_Y_Z_W]); // Clamp overflowed vectors that were modified (remember reg's vectors have been flipped, so have to use a flipmask)
-
- //-------------------------Check for Signed flags------------------------------
-
- // The following code makes sure the Signed Bit isn't set with Negative Zero
- SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
- SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
- SSE_MOVMSKPS_XMM_to_R32(x86temp, t1reg); // Used for Zero Flag Calculation
- SSE_ANDNPS_XMM_to_XMM(t1reg, reg);
-
- SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg
-
- AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
- pjmp = JZ8(0); // Skip if none are
- OR16ItoR(x86statflag, 0x82); // SS, S flags
- SHL16ItoR(EAX, 4);
- OR32RtoR(x86macflag, EAX);
- if (_XYZW_SS) pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking
- x86SetJ8(pjmp);
-
- //-------------------------Check for Zero flags------------------------------
-
- AND16ItoR(x86temp, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
- pjmp = JZ8(0); // Skip if none are
- OR16ItoR(x86statflag, 0x41); // ZS, Z flags
- OR32RtoR(x86macflag, x86temp);
- x86SetJ8(pjmp);
-
- _freeX86reg(x86temp);
- }
- else { // Only Checks for Sign and Zero Flags
-
- vuFloat2(reg, t1reg, flipMask[_X_Y_Z_W]); // Clamp overflowed vectors that were modified (remember reg's vectors have been flipped, so have to use a flipmask)
-
- //-------------------------Check for Signed flags------------------------------
-
- // The following code makes sure the Signed Bit isn't set with Negative Zero
- SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
- SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
- SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Used for Zero Flag Calculation
- SSE_ANDNPS_XMM_to_XMM(t1reg, reg);
-
- SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the t1reg
-
- AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
- pjmp = JZ8(0); // Skip if none are
- OR16ItoR(x86statflag, 0x82); // SS, S flags
- SHL16ItoR(x86macflag, 4);
- if (_XYZW_SS) pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking
- x86SetJ8(pjmp);
-
- //-------------------------Check for Zero flags------------------------------
-
- AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
- pjmp = JZ8(0); // Skip if none are
- OR16ItoR(x86statflag, 0x41); // ZS, Z flags
- OR32RtoR(x86macflag, EAX);
- x86SetJ8(pjmp);
- }
- //-------------------------Finally: Send the Flags to the Mac Flag Address------------------------------
-
- if (_XYZW_SS) x86SetJ8(pjmp2); // If we skipped the Zero Flag Checking, return here
-
- if (t1regBoolean == 2) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip back reg to wzyx (have to do this because reg != EEREC_TEMP)
- else if (t1regBoolean == 1) SSE_MOVAPS_M128_to_XMM( t1reg, (uptr)TEMPXMMData ); // Restore data from temp address
- else _freeXMMreg(t1reg); // Free temp reg
-
- MOV16RtoM(macaddr, x86macflag);
- MOV16RtoM(stataddr, x86statflag);
-
- _freeX86reg(x86macflag);
- _freeX86reg(x86statflag);
-}
-//------------------------------------------------------------------
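As a reading aid for the magic constants in recUpdateFlags (0x208, 0x104, 0x82, 0x41, and the 0xff0 sticky mask), this is the usual VU status-flag bit layout, sketched as an enum with illustrative names:

    enum {
        VUSF_Z  = 0x001, VUSF_S  = 0x002, VUSF_U  = 0x004, VUSF_O  = 0x008,
        VUSF_I  = 0x010, VUSF_D  = 0x020,                   // kept via the 0xff0 mask
        VUSF_ZS = 0x040, VUSF_SS = 0x080, VUSF_US = 0x100, VUSF_OS = 0x200,
        VUSF_IS = 0x400, VUSF_DS = 0x800,                   // sticky bits, also kept
    };
    // e.g. 0x208 == VUSF_OS | VUSF_O, and 0x41 == VUSF_ZS | VUSF_Z.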
-
-
-//------------------------------------------------------------------
-// Custom VU ADD/SUB routines by Nneeve
-//
-// Note: See FPU_ADD_SUB() for more info on what this is doing.
-//------------------------------------------------------------------
-static __aligned16 u32 VU_addsuband[2][4];
-static __aligned16 u32 VU_addsub_reg[2][4];
-
-static u32 tempECX;
-
-void VU_ADD_SUB(u32 regd, u32 regt, int is_sub, int info)
-{
- u8 *localptr[4][8];
-
- MOV32RtoM((uptr)&tempECX, ECX);
-
- int temp1 = ECX; //receives regd
- int temp2 = ALLOCTEMPX86(0);
-
- if (temp2 == ECX)
- {
- temp2 = ALLOCTEMPX86(0);
- _freeX86reg(ECX);
- }
-
- SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
- SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
-
- SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
- SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[0][0], regd);
- SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[1][0], regd);
- SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]);
-
- SSE2_PSLLD_I8_to_XMM(regd, 1);
- SSE2_PSLLD_I8_to_XMM(regt, 1);
-
- SSE2_PSRLD_I8_to_XMM(regd, 24);
- SSE2_PSRLD_I8_to_XMM(regt, 24);
-
- SSE2_PSUBD_XMM_to_XMM(regd, regt);
-
-#define PERFORM(i) \
- \
- SSE_PEXTRW_XMM_to_R32(temp1, regd, i*2); \
- MOVSX32R16toR(temp1, temp1); \
- CMP32ItoR(temp1, 25);\
- localptr[i][0] = JGE8(0);\
- CMP32ItoR(temp1, 0);\
- localptr[i][1] = JG8(0);\
- localptr[i][2] = JE8(0);\
- CMP32ItoR(temp1, -25);\
- localptr[i][3] = JLE8(0);\
- \
- NEG32R(temp1); \
- DEC32R(temp1);\
- MOV32ItoR(temp2, 0xffffffff); \
- SHL32CLtoR(temp2); \
- MOV32RtoM((uptr)&VU_addsuband[0][i], temp2);\
- localptr[i][4] = JMP8(0);\
- \
- x86SetJ8(localptr[i][0]);\
- MOV32ItoM((uptr)&VU_addsuband[1][i], 0x80000000);\
- localptr[i][5] = JMP8(0);\
- \
- x86SetJ8(localptr[i][1]);\
- DEC32R(temp1);\
- MOV32ItoR(temp2, 0xffffffff);\
- SHL32CLtoR(temp2); \
- MOV32RtoM((uptr)&VU_addsuband[1][i], temp2);\
- localptr[i][6] = JMP8(0);\
- \
- x86SetJ8(localptr[i][3]);\
- MOV32ItoM((uptr)&VU_addsuband[0][i], 0x80000000);\
- localptr[i][7] = JMP8(0);\
- \
- x86SetJ8(localptr[i][2]);\
- \
- x86SetJ8(localptr[i][4]);\
- x86SetJ8(localptr[i][5]);\
- x86SetJ8(localptr[i][6]);\
- x86SetJ8(localptr[i][7]);
-
- PERFORM(0);
- PERFORM(1);
- PERFORM(2);
- PERFORM(3);
-#undef PERFORM
-
- SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]);
- SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]);
-
- SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsuband[0][0]);
- SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsuband[1][0]);
-
- if (is_sub) SSE_SUBPS_XMM_to_XMM(regd, regt);
- else SSE_ADDPS_XMM_to_XMM(regd, regt);
-
- SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]);
-
- _freeX86reg(temp2);
-
- MOV32MtoR(ECX, (uptr)&tempECX);
-}
-
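The PERFORM macro above computes, per lane, the exponent difference d and then masks the mantissa of the operand with the smaller exponent before the real add/sub, imitating the VU's native adder dropping bits that fall outside its precision. The masking rule it implements, in isolation (hypothetical helper; the symmetric case applies to the other operand):

    static u32 smallOperandMask(int absDiff) // absDiff = |expA - expB|, > 0
    {
        if (absDiff >= 25) return 0x80000000u; // operand reduced to its sign bit
        return 0xffffffffu << (absDiff - 1);   // clear low mantissa bits
    }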
-void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
-{
- u8 *localptr[8];
- u32 addrt = regt; // in the is_mem case, regt actually holds a memory address
-
- MOV32RtoM((uptr)&tempECX, ECX);
-
- int temp1 = ECX; //receives regd
- int temp2 = ALLOCTEMPX86(0);
-
- if (temp2 == ECX)
- {
- temp2 = ALLOCTEMPX86(0);
- _freeX86reg(ECX);
- }
-
- SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
- if (!is_mem) SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
-
- SSE2_MOVD_XMM_to_R(temp1, regd);
- SHR32ItoR(temp1, 23);
-
- if (is_mem) {
- MOV32MtoR(temp2, addrt);
- MOV32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
- SHR32ItoR(temp2, 23);
- }
- else {
- SSE2_MOVD_XMM_to_R(temp2, regt);
- SHR32ItoR(temp2, 23);
- }
-
- AND32ItoR(temp1, 0xff);
- AND32ItoR(temp2, 0xff);
-
- SUB32RtoR(temp1, temp2); //temp1 = exponent difference
-
- CMP32ItoR(temp1, 25);
- localptr[0] = JGE8(0);
- CMP32ItoR(temp1, 0);
- localptr[1] = JG8(0);
- localptr[2] = JE8(0);
- CMP32ItoR(temp1, -25);
- localptr[3] = JLE8(0);
-
- NEG32R(temp1);
- DEC32R(temp1);
- MOV32ItoR(temp2, 0xffffffff);
- SHL32CLtoR(temp2);
- SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
- if (is_mem) {
- SSE_PINSRW_R32_to_XMM(regd, temp2, 0);
- SHR32ItoR(temp2, 16);
- SSE_PINSRW_R32_to_XMM(regd, temp2, 1);
- }
- else {
- SSE2_MOVD_R_to_XMM(regt, temp2);
- SSE_MOVSS_XMM_to_XMM(regd, regt);
- SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
- }
- localptr[4] = JMP8(0);
-
- x86SetJ8(localptr[0]);
- MOV32ItoR(temp2, 0x80000000);
- if (is_mem)
- AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
- else {
- SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
- SSE2_MOVD_R_to_XMM(regd, temp2);
- SSE_MOVSS_XMM_to_XMM(regt, regd);
- }
- SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
- localptr[5] = JMP8(0);
-
- x86SetJ8(localptr[1]);
- DEC32R(temp1);
- MOV32ItoR(temp2, 0xffffffff);
- SHL32CLtoR(temp2);
- if (is_mem)
- AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
- else {
- SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
- SSE2_MOVD_R_to_XMM(regd, temp2);
- SSE_MOVSS_XMM_to_XMM(regt, regd);
- }
- SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
- localptr[6] = JMP8(0);
-
- x86SetJ8(localptr[3]);
- MOV32ItoR(temp2, 0x80000000);
- SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
- if (is_mem) {
- SSE_PINSRW_R32_to_XMM(regd, temp2, 0);
- SHR32ItoR(temp2, 16);
- SSE_PINSRW_R32_to_XMM(regd, temp2, 1);
- }
- else {
- SSE2_MOVD_R_to_XMM(regt, temp2);
- SSE_MOVSS_XMM_to_XMM(regd, regt);
- SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
- }
- localptr[7] = JMP8(0);
-
- x86SetJ8(localptr[2]);
- x86SetJ8(localptr[4]);
- x86SetJ8(localptr[5]);
- x86SetJ8(localptr[6]);
- x86SetJ8(localptr[7]);
-
- if (is_mem)
- {
- SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
-
- if (is_sub) SSE_SUBSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
- else SSE_ADDSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
- }
- else
- {
- SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
- SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); //regt contains mask
-
- if (is_sub) SSE_SUBSS_XMM_to_XMM(regd, regt);
- else SSE_ADDSS_XMM_to_XMM(regd, regt);
-
- SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]);
- }
-
- _freeX86reg(temp2);
-
- MOV32MtoR(ECX, (uptr)&tempECX);
-}
-
-void SSE_ADDPS_XMM_to_XMM_custom(int info, int regd, int regt) {
- if (CHECK_VUADDSUBHACK) {
- VU_ADD_SUB(regd, regt, 0, info);
- }
- else SSE_ADDPS_XMM_to_XMM(regd, regt);
-}
-void SSE_SUBPS_XMM_to_XMM_custom(int info, int regd, int regt) {
- if (CHECK_VUADDSUBHACK) {
- VU_ADD_SUB(regd, regt, 1, info);
- }
- else SSE_SUBPS_XMM_to_XMM(regd, regt);
-}
-void SSE_ADDSS_XMM_to_XMM_custom(int info, int regd, int regt) {
- if (CHECK_VUADDSUBHACK) {
- VU_ADD_SUB_SS(regd, regt, 0, 0, info);
- }
- else SSE_ADDSS_XMM_to_XMM(regd, regt);
-}
-void SSE_SUBSS_XMM_to_XMM_custom(int info, int regd, int regt) {
- if (CHECK_VUADDSUBHACK) {
- VU_ADD_SUB_SS(regd, regt, 1, 0, info);
- }
- else SSE_SUBSS_XMM_to_XMM(regd, regt);
-}
-void SSE_ADDSS_M32_to_XMM_custom(int info, int regd, int regt) {
- if (CHECK_VUADDSUBHACK) {
- VU_ADD_SUB_SS(regd, regt, 0, 1, info);
- }
- else SSE_ADDSS_M32_to_XMM(regd, regt);
-}
-void SSE_SUBSS_M32_to_XMM_custom(int info, int regd, int regt) {
- if (CHECK_VUADDSUBHACK) {
- VU_ADD_SUB_SS(regd, regt, 1, 1, info);
- }
- else SSE_SUBSS_M32_to_XMM(regd, regt);
-}
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// *VU Upper Instructions!*
-//
-// Note: * = Checked for errors by cottonvibes
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// ABS*
-//------------------------------------------------------------------
-void recVUMI_ABS(VURegs *VU, int info)
-{
- //Console.WriteLn("recVUMI_ABS()");
- if ( (_Ft_ == 0) || (_X_Y_Z_W == 0) ) return;
-
- if ((_X_Y_Z_W == 0x8) || (_X_Y_Z_W == 0xf)) {
- VU_MERGE_REGS(EEREC_T, EEREC_S);
- SSE_ANDPS_M128_to_XMM(EEREC_T, (uptr)&const_abs_table[ _X_Y_Z_W ][ 0 ] );
- }
- else { // Use a temp reg because VU_MERGE_REGS() modifies source reg!
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_abs_table[ _X_Y_Z_W ][ 0 ] );
- VU_MERGE_REGS(EEREC_T, EEREC_TEMP);
- }
-}
-//------------------------------------------------------------------
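const_abs_table[mask] holds 0x7fffffff in the lanes selected by the write mask and 0xffffffff elsewhere, so the single ANDPS above clears the sign bit of exactly the selected lanes. Scalar picture (helper hypothetical):

    static void absLanes(u32 vec[4], int xyzw) // bit 3 = x ... bit 0 = w
    {
        for (int i = 0; i < 4; ++i)
            if (xyzw & (8 >> i))
                vec[i] &= 0x7fffffff; // clear sign; exponent/mantissa untouched
    }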
-
-
-//------------------------------------------------------------------
-// ADD*, ADD_iq*, ADD_xyzw*
-//------------------------------------------------------------------
-static const __aligned16 float s_two[4] = {0,0,0,2};
-void recVUMI_ADD(VURegs *VU, int info)
-{
- //Console.WriteLn("recVUMI_ADD()");
- if ( _X_Y_Z_W == 0 ) goto flagUpdate; // Don't do anything and just clear flags
- if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP);
-
- if ( _Fs_ == 0 && _Ft_ == 0 ) { // adding VF00 to VF00: VF00 is hardwired to (0,0,0,1), so the result is always 0,0,0,2
- if ( _X_Y_Z_W != 0xf ) {
- SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)s_two);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)s_two);
- }
- else {
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
- }
- if( _X_Y_Z_W == 8 ) { // If only adding x, then we can do a Scalar Add
- if (EEREC_D == EEREC_S) SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T);
- else if (EEREC_D == EEREC_T) SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_S);
- else {
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T);
- }
- }
- else if (_X_Y_Z_W != 0xf) { // If xyzw != 1111, then we have to use a temp reg
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else { // All xyzw being modified (xyzw == 1111)
- if (EEREC_D == EEREC_S) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_T);
- else if (EEREC_D == EEREC_T) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S);
- else {
- SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_T);
- }
- }
- }
-flagUpdate:
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_ADD_iq(VURegs *VU, uptr addr, int info)
-{
- //Console.WriteLn("recVUMI_ADD_iq()");
- if ( _X_Y_Z_W == 0 ) goto flagUpdate;
- if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP);
- if (CHECK_VU_EXTRA_OVERFLOW) {
- vuFloat3(addr);
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- }
-
- if ( _XYZW_SS ) {
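- // Single-component write: _vuFlipRegSS() swaps the selected component into the x slot so the scalar (SS) ops below can reach it, and a second call swaps it back.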
- if ( EEREC_D == EEREC_TEMP ) {
- _vuFlipRegSS(VU, EEREC_S);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_ADDSS_M32_to_XMM(EEREC_D, addr);
- _vuFlipRegSS(VU, EEREC_S);
- _vuFlipRegSS(VU, EEREC_D); // have to flip over EEREC_D for computing flags!
- }
- else if ( EEREC_D == EEREC_S ) {
- _vuFlipRegSS(VU, EEREC_D);
- SSE_ADDSS_M32_to_XMM(EEREC_D, addr);
- _vuFlipRegSS(VU, EEREC_D);
- }
- else {
- if ( _X ) {
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_ADDSS_M32_to_XMM_custom(info, EEREC_D, addr);
- }
- else {
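- // Load the I/Q scalar and splat it across all four lanes (SHUFPS with immediate 0x00 replicates element x).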
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
- SSE_ADDPS_XMM_to_XMM_custom(info, EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- }
- }
- else {
- if ( (_X_Y_Z_W != 0xf) || (EEREC_D == EEREC_S) || (EEREC_D == EEREC_TEMP) ) {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
- }
-
- if (_X_Y_Z_W != 0xf) {
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else {
- if ( EEREC_D == EEREC_TEMP ) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S);
- else if ( EEREC_D == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- else {
- SSE_MOVSS_M32_to_XMM(EEREC_D, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_D, EEREC_D, 0x00);
- SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S);
- }
- }
- }
-flagUpdate:
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_ADD_xyzw(VURegs *VU, int xyzw, int info)
-{
- //Console.WriteLn("recVUMI_ADD_xyzw()");
- if ( _X_Y_Z_W == 0 ) goto flagUpdate;
- if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP);
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
- }
-
- if ( _Ft_ == 0 && xyzw < 3 ) { // just move since adding zero
- if ( _X_Y_Z_W == 0x8 ) { VU_MERGE_REGS(EEREC_D, EEREC_S); }
- else if ( _X_Y_Z_W != 0xf ) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
- }
- else if ( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP) ) {
- if ( xyzw == 0 ) {
- if ( EEREC_D == EEREC_T ) SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_S);
- else {
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T);
- }
- }
- else {
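- // Pull the selected component of EEREC_T into the x slot of EEREC_TEMP so it can be added as a scalar.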
- _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- }
- else if( _Fs_ == 0 && !_W ) { // just move
- _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else {
- if ( _X_Y_Z_W != 0xf ) {
- _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else {
- if( EEREC_D == EEREC_TEMP ) { _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); }
- else if( EEREC_D == EEREC_S ) { _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); }
- else { _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); }
- }
- }
-flagUpdate:
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_ADDi(VURegs *VU, int info) { recVUMI_ADD_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
-void recVUMI_ADDq(VURegs *VU, int info) { recVUMI_ADD_iq(VU, VU_REGQ_ADDR, info); }
-void recVUMI_ADDx(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 0, info); }
-void recVUMI_ADDy(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 1, info); }
-void recVUMI_ADDz(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 2, info); }
-void recVUMI_ADDw(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 3, info); }
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// ADDA*, ADDA_iq*, ADDA_xyzw*
-//------------------------------------------------------------------
-void recVUMI_ADDA(VURegs *VU, int info)
-{
- //Console.WriteLn("recVUMI_ADDA()");
- if ( _X_Y_Z_W == 0 ) goto flagUpdate;
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
- }
-
- if( _X_Y_Z_W == 8 ) {
- if (EEREC_ACC == EEREC_S) SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_T); // Can this case happen? (cottonvibes)
- else if (EEREC_ACC == EEREC_T) SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_S); // Can this case happen?
- else {
- SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_T);
- }
- }
- else if (_X_Y_Z_W != 0xf) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
-
- VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
- }
- else {
- if( EEREC_ACC == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_T); // Can this case happen?
- else if( EEREC_ACC == EEREC_T ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_S); // Can this case happen?
- else {
- SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_T);
- }
- }
-flagUpdate:
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_ADDA_iq(VURegs *VU, uptr addr, int info)
-{
- //Console.WriteLn("recVUMI_ADDA_iq()");
- if ( _X_Y_Z_W == 0 ) goto flagUpdate;
- if (CHECK_VU_EXTRA_OVERFLOW) {
- vuFloat3(addr);
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- }
-
- if( _XYZW_SS ) {
- assert( EEREC_ACC != EEREC_TEMP );
- if( EEREC_ACC == EEREC_S ) {
- _vuFlipRegSS(VU, EEREC_ACC);
- SSE_ADDSS_M32_to_XMM(EEREC_ACC, addr);
- _vuFlipRegSS(VU, EEREC_ACC);
- }
- else {
- if( _X ) {
- SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- SSE_ADDSS_M32_to_XMM(EEREC_ACC, addr);
- }
- else {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
- }
- }
- }
- else {
- if( _X_Y_Z_W != 0xf || EEREC_ACC == EEREC_S ) {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
- }
-
- if (_X_Y_Z_W != 0xf) {
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
- }
- else {
- if( EEREC_ACC == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
- else {
- SSE_MOVSS_M32_to_XMM(EEREC_ACC, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_ACC, EEREC_ACC, 0x00);
- SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- }
- }
- }
-flagUpdate:
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_ADDA_xyzw(VURegs *VU, int xyzw, int info)
-{
- //Console.WriteLn("recVUMI_ADDA_xyzw()");
- if ( _X_Y_Z_W == 0 ) goto flagUpdate;
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
- }
-
- if( _X_Y_Z_W == 8 ) {
- assert( EEREC_ACC != EEREC_T );
- if( xyzw == 0 ) {
- SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_T);
- }
- else {
- _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- if( _Fs_ == 0 ) {
- SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
- }
- else {
- SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
- }
- }
- }
- else {
- if( _X_Y_Z_W != 0xf || EEREC_ACC == EEREC_S )
- _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
-
- if (_X_Y_Z_W != 0xf) {
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
- }
- else {
- if( EEREC_ACC == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
- else {
- _unpackVF_xyzw(EEREC_ACC, EEREC_T, xyzw);
- SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- }
- }
- }
-flagUpdate:
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_ADDAi(VURegs *VU, int info) { recVUMI_ADDA_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
-void recVUMI_ADDAq(VURegs *VU, int info) { recVUMI_ADDA_iq(VU, VU_REGQ_ADDR, info); }
-void recVUMI_ADDAx(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 0, info); }
-void recVUMI_ADDAy(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 1, info); }
-void recVUMI_ADDAz(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 2, info); }
-void recVUMI_ADDAw(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 3, info); }
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// SUB*, SUB_iq*, SUB_xyzw*
-//------------------------------------------------------------------
-void recVUMI_SUB(VURegs *VU, int info)
-{
- //Console.WriteLn("recVUMI_SUB()");
- if ( _X_Y_Z_W == 0 ) goto flagUpdate;
- if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP);
-
- if( EEREC_S == EEREC_T ) {
- if (_X_Y_Z_W != 0xf) SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)&SSEmovMask[15-_X_Y_Z_W][0]);
- else SSE_XORPS_XMM_to_XMM(EEREC_D, EEREC_D);
- }
- else if( _X_Y_Z_W == 8 ) {
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
- }
- if (EEREC_D == EEREC_S) {
- if (_Ft_) SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_T);
- }
- else if (EEREC_D == EEREC_T) {
- if (_Ft_) {
- SSE_MOVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_SUBSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- else SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- }
- else {
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- if (_Ft_) SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_T);
- }
- }
- else {
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
- }
- if (_X_Y_Z_W != 0xf) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- if( ( _Ft_ > 0 ) || _W ) SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
-
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else {
- if (EEREC_D == EEREC_S) SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_T);
- else if (EEREC_D == EEREC_T) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
- SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_T);
- }
- }
- }
-flagUpdate:
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_SUB_iq(VURegs *VU, uptr addr, int info)
-{
- //Console.WriteLn("recVUMI_SUB_iq()");
- if ( _X_Y_Z_W == 0 ) goto flagUpdate;
- if (CHECK_VU_EXTRA_OVERFLOW) {
- vuFloat3(addr);
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- }
- if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP);
-
- if( _XYZW_SS ) {
- if( EEREC_D == EEREC_TEMP ) {
- _vuFlipRegSS(VU, EEREC_S);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_SUBSS_M32_to_XMM(EEREC_D, addr);
- _vuFlipRegSS(VU, EEREC_S);
- _vuFlipRegSS(VU, EEREC_D);
- }
- else if( EEREC_D == EEREC_S ) {
- _vuFlipRegSS(VU, EEREC_D);
- SSE_SUBSS_M32_to_XMM(EEREC_D, addr);
- _vuFlipRegSS(VU, EEREC_D);
- }
- else {
- if( _X ) {
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_SUBSS_M32_to_XMM(EEREC_D, addr);
- }
- else {
- _vuMoveSS(VU, EEREC_TEMP, EEREC_S);
- _vuFlipRegSS(VU, EEREC_D);
- SSE_SUBSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- _vuFlipRegSS(VU, EEREC_D);
- }
- }
- }
- else {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
-
- if (_X_Y_Z_W != 0xf) {
- int t1reg = _vuGetTempXMMreg(info);
-
- if( t1reg >= 0 ) {
- SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S);
- SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP);
-
- VU_MERGE_REGS(EEREC_D, t1reg);
- _freeXMMreg(t1reg);
- }
- else {
- // No free register: compute EEREC_S - EEREC_TEMP as EEREC_S + (-EEREC_TEMP) by flipping TEMP's sign bits.
- SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- }
- else {
- if( EEREC_D == EEREC_TEMP ) {
- SSE_XORPS_M128_to_XMM(EEREC_D, (uptr)&const_clip[4]);
- SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- }
- }
-flagUpdate:
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_SUB_xyzw(VURegs *VU, int xyzw, int info)
-{
- //Console.WriteLn("recVUMI_SUB_xyzw()");
- if ( _X_Y_Z_W == 0 ) goto flagUpdate;
- if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP);
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
- }
-
- if ( _X_Y_Z_W == 8 ) {
- if ( (xyzw == 0) && (_Ft_ == _Fs_) ) {
- SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)&SSEmovMask[7][0]);
- }
- else if ( EEREC_D == EEREC_TEMP ) {
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- if ( (_Ft_ > 0) || (xyzw == 3) ) {
- _vuFlipRegSS_xyzw(EEREC_T, xyzw);
- SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_T);
- _vuFlipRegSS_xyzw(EEREC_T, xyzw);
- }
- }
- else {
- if ( (_Ft_ > 0) || (xyzw == 3) ) {
- _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- else SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- }
- }
- else {
- _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
-
- if (_X_Y_Z_W != 0xf) {
- int t1reg = _vuGetTempXMMreg(info);
-
- if( t1reg >= 0 ) {
- SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S);
- SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP);
-
- VU_MERGE_REGS(EEREC_D, t1reg);
- _freeXMMreg(t1reg);
- }
- else {
- // negate
- SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- }
- else {
- if( EEREC_D == EEREC_TEMP ) {
- SSE_XORPS_M128_to_XMM(EEREC_D, (uptr)&const_clip[4]);
- SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- }
- }
-flagUpdate:
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_SUBi(VURegs *VU, int info) { recVUMI_SUB_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
-void recVUMI_SUBq(VURegs *VU, int info) { recVUMI_SUB_iq(VU, VU_REGQ_ADDR, info); }
-void recVUMI_SUBx(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 0, info); }
-void recVUMI_SUBy(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 1, info); }
-void recVUMI_SUBz(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 2, info); }
-void recVUMI_SUBw(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 3, info); }
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// SUBA*, SUBA_iq, SUBA_xyzw
-//------------------------------------------------------------------
-void recVUMI_SUBA(VURegs *VU, int info)
-{
- //Console.WriteLn("recVUMI_SUBA()");
- if ( _X_Y_Z_W == 0 ) goto flagUpdate;
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
- }
-
- if( EEREC_S == EEREC_T ) {
- if (_X_Y_Z_W != 0xf) SSE_ANDPS_M128_to_XMM(EEREC_ACC, (uptr)&SSEmovMask[15-_X_Y_Z_W][0]);
- else SSE_XORPS_XMM_to_XMM(EEREC_ACC, EEREC_ACC);
- }
- else if( _X_Y_Z_W == 8 ) {
- if (EEREC_ACC == EEREC_S) SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_T);
- else if (EEREC_ACC == EEREC_T) {
- SSE_MOVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_SUBSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
- SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
- }
- else {
- SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_T);
- }
- }
- else if (_X_Y_Z_W != 0xf) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
-
- VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
- }
- else {
- if( EEREC_ACC == EEREC_S ) SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_T);
- else if( EEREC_ACC == EEREC_T ) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
- SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_T);
- }
- }
-flagUpdate:
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_SUBA_iq(VURegs *VU, uptr addr, int info)
-{
- //Console.WriteLn ("recVUMI_SUBA_iq");
- if (CHECK_VU_EXTRA_OVERFLOW) {
- vuFloat3(addr);
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- }
-
- if( _XYZW_SS ) {
- if( EEREC_ACC == EEREC_S ) {
- _vuFlipRegSS(VU, EEREC_ACC);
- SSE_SUBSS_M32_to_XMM(EEREC_ACC, addr);
- _vuFlipRegSS(VU, EEREC_ACC);
- }
- else {
- if( _X ) {
- SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- SSE_SUBSS_M32_to_XMM(EEREC_ACC, addr);
- }
- else {
- _vuMoveSS(VU, EEREC_TEMP, EEREC_S);
- _vuFlipRegSS(VU, EEREC_ACC);
- SSE_SUBSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
- _vuFlipRegSS(VU, EEREC_ACC);
- }
- }
- }
- else {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
-
- if (_X_Y_Z_W != 0xf) {
- int t1reg = _vuGetTempXMMreg(info);
-
- if( t1reg >= 0 ) {
- SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S);
- SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP);
-
- VU_MERGE_REGS(EEREC_ACC, t1reg);
- _freeXMMreg(t1reg);
- }
- else {
- // negate
- SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
- }
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
- }
- }
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_SUBA_xyzw(VURegs *VU, int xyzw, int info)
-{
- //Console.WriteLn ("recVUMI_SUBA_xyzw");
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
- }
-
- if( _X_Y_Z_W == 8 ) {
- if( xyzw == 0 ) {
- SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_T);
- }
- else {
- _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
- }
- }
- else {
- _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
-
- if (_X_Y_Z_W != 0xf) {
- int t1reg = _vuGetTempXMMreg(info);
-
- if( t1reg >= 0 ) {
- SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S);
- SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP);
-
- VU_MERGE_REGS(EEREC_ACC, t1reg);
- _freeXMMreg(t1reg);
- }
- else {
- // negate
- SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
- }
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S);
- SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
- }
- }
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_SUBAi(VURegs *VU, int info) { recVUMI_SUBA_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
-void recVUMI_SUBAq(VURegs *VU, int info) { recVUMI_SUBA_iq(VU, VU_REGQ_ADDR, info); }
-void recVUMI_SUBAx(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 0, info); }
-void recVUMI_SUBAy(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 1, info); }
-void recVUMI_SUBAz(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 2, info); }
-void recVUMI_SUBAw(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 3, info); }
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// MUL
-//------------------------------------------------------------------
-void recVUMI_MUL_toD(VURegs *VU, int regd, int info)
-{
- //Console.WriteLn ("recVUMI_MUL_toD");
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
- }
-
- if (_X_Y_Z_W == 1 && (_Ft_ == 0 || _Fs_==0) ) { // W
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, _Ft_ ? EEREC_T : EEREC_S);
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- else if( _Fd_ == _Fs_ && _Fs_ == _Ft_ && _XYZW_SS ) {
- _vuFlipRegSS(VU, EEREC_D);
- SSE_MULSS_XMM_to_XMM(EEREC_D, EEREC_D);
- _vuFlipRegSS(VU, EEREC_D);
- }
- else if( _X_Y_Z_W == 8 ) {
- if (regd == EEREC_S) SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
- else if (regd == EEREC_T) SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
- else {
- SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
- SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
- }
- }
- else if (_X_Y_Z_W != 0xf) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
-
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- else {
- if (regd == EEREC_S) SSE_MULPS_XMM_to_XMM(regd, EEREC_T);
- else if (regd == EEREC_T) SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
- else {
- SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S);
- SSE_MULPS_XMM_to_XMM(regd, EEREC_T);
- }
- }
-}
-
-void recVUMI_MUL_iq_toD(VURegs *VU, uptr addr, int regd, int info)
-{
- //Console.WriteLn ("recVUMI_MUL_iq_toD");
- if (CHECK_VU_EXTRA_OVERFLOW) {
- vuFloat3(addr);
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- }
-
- if( _XYZW_SS ) {
- if( regd == EEREC_TEMP ) {
- _vuFlipRegSS(VU, EEREC_S);
- SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
- SSE_MULSS_M32_to_XMM(regd, addr);
- _vuFlipRegSS(VU, EEREC_S);
- _vuFlipRegSS(VU, regd);
- }
- else if( regd == EEREC_S ) {
- _vuFlipRegSS(VU, regd);
- SSE_MULSS_M32_to_XMM(regd, addr);
- _vuFlipRegSS(VU, regd);
- }
- else {
- if( _X ) {
- SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
- SSE_MULSS_M32_to_XMM(regd, addr);
- }
- else {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- }
- }
- else {
- if( _X_Y_Z_W != 0xf || regd == EEREC_TEMP || regd == EEREC_S ) {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
- }
-
- if (_X_Y_Z_W != 0xf) {
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- else {
- if( regd == EEREC_TEMP ) SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
- else if (regd == EEREC_S) SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP);
- else {
- SSE_MOVSS_M32_to_XMM(regd, addr);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x00);
- SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
- }
- }
- }
-}
-
-void recVUMI_MUL_xyzw_toD(VURegs *VU, int xyzw, int regd, int info)
-{
- //Console.WriteLn ("recVUMI_MUL_xyzw_toD");
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
- }
- if (_Fs_) { // This is needed for a lot of games, so always clamp this operand
- if (CHECK_VU_SIGN_OVERFLOW) vFloats4_useEAX[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless of whether CHECK_VU_OVERFLOW is set
- else vFloats2[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless of whether CHECK_VU_OVERFLOW is set
- }
- if( _Ft_ == 0 ) {
- if( xyzw < 3 ) {
- if (_X_Y_Z_W != 0xf) {
- SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- else SSE_XORPS_XMM_to_XMM(regd, regd);
- }
- else {
- assert(xyzw==3);
- if (_X_Y_Z_W != 0xf) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- else SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S);
- }
- }
- else if( _X_Y_Z_W == 8 ) {
- if( regd == EEREC_TEMP ) {
- _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
- }
- else {
- if( xyzw == 0 ) {
- if( regd == EEREC_T ) {
- SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
- }
- else {
- SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
- SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
- }
- }
- else {
- _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
- SSE_MULSS_XMM_to_XMM(regd, EEREC_TEMP);
- }
- }
- }
- else {
- if( _X_Y_Z_W != 0xf || regd == EEREC_TEMP || regd == EEREC_S )
- _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
-
- if (_X_Y_Z_W != 0xf) {
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- else {
- if( regd == EEREC_TEMP ) SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
- else if (regd == EEREC_S) SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP);
- else {
- _unpackVF_xyzw(regd, EEREC_T, xyzw);
- SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
- }
- }
- }
-}
-
-void recVUMI_MUL(VURegs *VU, int info)
-{
- //Console.WriteLn ("recVUMI_MUL");
- if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
- recVUMI_MUL_toD(VU, EEREC_D, info);
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_MUL_iq(VURegs *VU, int addr, int info)
-{
- //Console.WriteLn ("recVUMI_MUL_iq");
- if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
- recVUMI_MUL_iq_toD(VU, addr, EEREC_D, info);
- recUpdateFlags(VU, EEREC_D, info);
- // spacefisherman needs overflow checking on MULi.z
-}
-
-void recVUMI_MUL_xyzw(VURegs *VU, int xyzw, int info)
-{
- //Console.WriteLn ("recVUMI_MUL_xyzw");
- if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
- recVUMI_MUL_xyzw_toD(VU, xyzw, EEREC_D, info);
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_MULi(VURegs *VU, int info) { recVUMI_MUL_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
-void recVUMI_MULq(VURegs *VU, int info) { recVUMI_MUL_iq(VU, VU_REGQ_ADDR, info); }
-void recVUMI_MULx(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 0, info); }
-void recVUMI_MULy(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 1, info); }
-void recVUMI_MULz(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 2, info); }
-void recVUMI_MULw(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 3, info); }
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// MULA
-//------------------------------------------------------------------
-void recVUMI_MULA( VURegs *VU, int info )
-{
- //Console.WriteLn ("recVUMI_MULA");
- recVUMI_MUL_toD(VU, EEREC_ACC, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MULA_iq(VURegs *VU, int addr, int info)
-{
- //Console.WriteLn ("recVUMI_MULA_iq");
- recVUMI_MUL_iq_toD(VU, addr, EEREC_ACC, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MULA_xyzw(VURegs *VU, int xyzw, int info)
-{
- //Console.WriteLn ("recVUMI_MULA_xyzw");
- recVUMI_MUL_xyzw_toD(VU, xyzw, EEREC_ACC, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MULAi(VURegs *VU, int info) { recVUMI_MULA_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
-void recVUMI_MULAq(VURegs *VU, int info) { recVUMI_MULA_iq(VU, VU_REGQ_ADDR, info); }
-void recVUMI_MULAx(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 0, info); }
-void recVUMI_MULAy(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 1, info); }
-void recVUMI_MULAz(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 2, info); }
-void recVUMI_MULAw(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 3, info); }
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// MADD
-//------------------------------------------------------------------
-void recVUMI_MADD_toD(VURegs *VU, int regd, int info)
-{
- //Console.WriteLn ("recVUMI_MADD_toD");
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
- vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
- }
-
-
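- // With extra overflow checking on, the intermediate product is also clamped before the accumulate, mirroring the VU's clamped (no inf/NaN) float range.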
- if( _X_Y_Z_W == 8 ) {
- if( regd == EEREC_ACC ) {
- SSE_MOVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); }
- SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP);
- }
- else if (regd == EEREC_T) {
- SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); }
- SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
- }
- else if (regd == EEREC_S) {
- SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); }
- SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
- }
- else {
- SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
- SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); }
- SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
- }
- }
- else if (_X_Y_Z_W != 0xf) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
-
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- else {
- if( regd == EEREC_ACC ) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP);
- }
- else if (regd == EEREC_T) {
- SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
- }
- else if (regd == EEREC_S) {
- SSE_MULPS_XMM_to_XMM(regd, EEREC_T);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S);
- SSE_MULPS_XMM_to_XMM(regd, EEREC_T);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
- }
- }
-}
-
-void recVUMI_MADD_iq_toD(VURegs *VU, uptr addr, int regd, int info)
-{
- //Console.WriteLn ("recVUMI_MADD_iq_toD");
- if (CHECK_VU_EXTRA_OVERFLOW) {
- vuFloat3(addr);
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
- }
-
- if( _X_Y_Z_W == 8 ) {
- if( _Fs_ == 0 ) {
- // do nothing if regd == ACC (ACCx <= ACCx + 0.0 * *addr)
- if( regd != EEREC_ACC ) {
- SSE_MOVSS_XMM_to_XMM(regd, EEREC_ACC);
- }
- return;
- }
-
- if( regd == EEREC_ACC ) {
- assert( EEREC_TEMP < iREGCNT_XMM );
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); }
- SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP);
- }
- else if( regd == EEREC_S ) {
- SSE_MULSS_M32_to_XMM(regd, addr);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
- }
- else {
- SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
- SSE_MULSS_M32_to_XMM(regd, addr);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
- }
- }
- else {
- if( _Fs_ == 0 ) {
- if( regd == EEREC_ACC ) { // ACCxyz is unchanged, ACCw <= ACCw + *addr
- if( _W ) { // if _W is zero, do nothing
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); // { *addr, 0, 0, 0 }
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x27); // { 0, 0, 0, *addr }
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP); // { ACCx, ACCy, ACCz, ACCw + *addr }
- }
- }
- else { // DESTxyz <= ACCxyz, DESTw <= ACCw + *addr
- if( _W ) {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); // { *addr, 0, 0, 0 }
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x27); // { 0, 0, 0, *addr }
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); // { ACCx, ACCy, ACCz, ACCw + *addr }
- }
- else SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
-
- return;
- }
-
- if( _X_Y_Z_W != 0xf || regd == EEREC_ACC || regd == EEREC_TEMP || regd == EEREC_S ) {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
- }
-
- if (_X_Y_Z_W != 0xf) {
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
-
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- else {
- if( regd == EEREC_ACC ) {
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP);
- }
- else if( regd == EEREC_S ) {
- SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
- }
- else if( regd == EEREC_TEMP ) {
- SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
- }
- else {
- SSE_MOVSS_M32_to_XMM(regd, addr);
- SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x00);
- SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
- }
- }
- }
-}
-
-void recVUMI_MADD_xyzw_toD(VURegs *VU, int xyzw, int regd, int info)
-{
- //Console.WriteLn ("recVUMI_MADD_xyzw_toD");
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
- vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
- }
- if (_Fs_) { // This is needed for a lot of games, so always clamp this operand
- if (CHECK_VU_SIGN_OVERFLOW) vFloats4_useEAX[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless of whether CHECK_VU_OVERFLOW is set
- else vFloats2[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless of whether CHECK_VU_OVERFLOW is set
- }
- if( _Ft_ == 0 ) {
-
- if( xyzw == 3 ) {
- // just add
- if( _X_Y_Z_W == 8 ) {
- if( regd == EEREC_S ) SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
- else {
- SSE_MOVSS_XMM_to_XMM(regd, EEREC_ACC);
- SSE_ADDSS_XMM_to_XMM(regd, EEREC_S);
- }
- }
- else {
- if( _X_Y_Z_W != 0xf ) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
-
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- else {
- if( regd == EEREC_S ) SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
- else {
- SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC);
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_S);
- }
- }
- }
- }
- else {
- // just move acc to regd
- if( _X_Y_Z_W != 0xf ) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- else SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC);
- }
-
- return;
- }
-
- if( _X_Y_Z_W == 8 ) {
- _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
-
- if( regd == EEREC_ACC ) {
- SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); }
- SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP);
- }
- else if( regd == EEREC_S ) {
- SSE_MULSS_XMM_to_XMM(regd, EEREC_TEMP);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); }
- SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
- }
- else if( regd == EEREC_TEMP ) {
- SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); }
- SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
- }
- else {
- SSE_MOVSS_XMM_to_XMM(regd, EEREC_ACC);
- SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); }
- SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP);
- }
- }
- else {
- if( _X_Y_Z_W != 0xf || regd == EEREC_ACC || regd == EEREC_TEMP || regd == EEREC_S ) {
- _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- }
-
- if (_X_Y_Z_W != 0xf) {
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
-
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- else {
- if( regd == EEREC_ACC ) {
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP);
- }
- else if( regd == EEREC_S ) {
- SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
- }
- else if( regd == EEREC_TEMP ) {
- SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
- }
- else {
- _unpackVF_xyzw(regd, EEREC_T, xyzw);
- SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
- }
- }
- }
-}
-
-void recVUMI_MADD(VURegs *VU, int info)
-{
- //Console.WriteLn ("recVUMI_MADD");
- if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
- recVUMI_MADD_toD(VU, EEREC_D, info);
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_MADD_iq(VURegs *VU, int addr, int info)
-{
- //Console.WriteLn ("recVUMI_MADD_iq");
- if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
- recVUMI_MADD_iq_toD(VU, addr, EEREC_D, info);
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_MADD_xyzw(VURegs *VU, int xyzw, int info)
-{
- //Console.WriteLn ("recVUMI_MADD_xyzw");
- if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
- recVUMI_MADD_xyzw_toD(VU, xyzw, EEREC_D, info);
- recUpdateFlags(VU, EEREC_D, info);
- // Super Bust-A-Move's arrows need overflow clamping
-}
-
-void recVUMI_MADDi(VURegs *VU, int info) { recVUMI_MADD_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
-void recVUMI_MADDq(VURegs *VU, int info) { recVUMI_MADD_iq(VU, VU_REGQ_ADDR, info); }
-void recVUMI_MADDx(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 0, info); }
-void recVUMI_MADDy(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 1, info); }
-void recVUMI_MADDz(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 2, info); }
-void recVUMI_MADDw(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 3, info); }
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// MADDA
-//------------------------------------------------------------------
-void recVUMI_MADDA( VURegs *VU, int info )
-{
- //Console.WriteLn ("recVUMI_MADDA");
- recVUMI_MADD_toD(VU, EEREC_ACC, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MADDAi( VURegs *VU , int info)
-{
- //Console.WriteLn ("recVUMI_MADDAi");
- recVUMI_MADD_iq_toD( VU, VU_VI_ADDR(REG_I, 1), EEREC_ACC, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MADDAq( VURegs *VU , int info)
-{
- //Console.WriteLn ("recVUMI_MADDAq ");
- recVUMI_MADD_iq_toD( VU, VU_REGQ_ADDR, EEREC_ACC, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MADDAx( VURegs *VU , int info)
-{
- //Console.WriteLn ("recVUMI_MADDAx");
- recVUMI_MADD_xyzw_toD(VU, 0, EEREC_ACC, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MADDAy( VURegs *VU , int info)
-{
- //Console.WriteLn ("recVUMI_MADDAy");
- recVUMI_MADD_xyzw_toD(VU, 1, EEREC_ACC, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MADDAz( VURegs *VU , int info)
-{
- //Console.WriteLn ("recVUMI_MADDAz");
- recVUMI_MADD_xyzw_toD(VU, 2, EEREC_ACC, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MADDAw( VURegs *VU , int info)
-{
- //Console.WriteLn ("recVUMI_MADDAw");
- recVUMI_MADD_xyzw_toD(VU, 3, EEREC_ACC, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// MSUB
-//------------------------------------------------------------------
-void recVUMI_MSUB_toD(VURegs *VU, int regd, int info)
-{
- //Console.WriteLn ("recVUMI_MSUB_toD");
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
- vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
- }
-
- if (_X_Y_Z_W != 0xf) {
- int t1reg = _vuGetTempXMMreg(info);
-
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
-
- if( t1reg >= 0 ) {
- SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_ACC);
- SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP);
-
- VU_MERGE_REGS(regd, t1reg);
- _freeXMMreg(t1reg);
- }
- else {
- SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- }
- else {
- if( regd == EEREC_S ) {
- assert( regd != EEREC_ACC );
- SSE_MULPS_XMM_to_XMM(regd, EEREC_T);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC);
- SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]);
- }
- else if( regd == EEREC_T ) {
- assert( regd != EEREC_ACC );
- SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC);
- SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]);
- }
- else if( regd == EEREC_TEMP ) {
- SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S);
- SSE_MULPS_XMM_to_XMM(regd, EEREC_T);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC);
- SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC);
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
- SSE_SUBPS_XMM_to_XMM(regd, EEREC_TEMP);
- }
- }
-}
-
-void recVUMI_MSUB_temp_toD(VURegs *VU, int regd, int info)
-{
- //Console.WriteLn ("recVUMI_MSUB_temp_toD");
-
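- // Note: callers are expected to broadcast the second operand into EEREC_TEMP beforehand (see the _iq and _xyzw wrappers below).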
- if (_X_Y_Z_W != 0xf) {
- int t1reg = _vuGetTempXMMreg(info);
-
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
-
- if( t1reg >= 0 ) {
- SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_ACC);
- SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP);
-
- if ( regd != EEREC_TEMP ) { VU_MERGE_REGS(regd, t1reg); }
- else SSE_MOVAPS_XMM_to_XMM(regd, t1reg);
-
- _freeXMMreg(t1reg);
- }
- else {
- SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
- VU_MERGE_REGS(regd, EEREC_TEMP);
- }
- }
- else {
- if( regd == EEREC_ACC ) {
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
- SSE_SUBPS_XMM_to_XMM(regd, EEREC_TEMP);
- }
- else if( regd == EEREC_S ) {
- SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC);
- SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]);
- }
- else if( regd == EEREC_TEMP ) {
- SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
- SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC);
- SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC);
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
- SSE_SUBPS_XMM_to_XMM(regd, EEREC_TEMP);
- }
- }
-}
-
-void recVUMI_MSUB_iq_toD(VURegs *VU, int regd, int addr, int info)
-{
- //Console.WriteLn ("recVUMI_MSUB_iq_toD");
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
- vuFloat3(addr);
- }
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
- recVUMI_MSUB_temp_toD(VU, regd, info);
-}
-
-void recVUMI_MSUB_xyzw_toD(VURegs *VU, int regd, int xyzw, int info)
-{
- //Console.WriteLn ("recVUMI_MSUB_xyzw_toD");
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, 1 << (3 - xyzw));
- vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
- }
- _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- recVUMI_MSUB_temp_toD(VU, regd, info);
-}
-
-void recVUMI_MSUB(VURegs *VU, int info)
-{
- //Console.WriteLn ("recVUMI_MSUB");
- if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
- recVUMI_MSUB_toD(VU, EEREC_D, info);
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_MSUB_iq(VURegs *VU, int addr, int info)
-{
- //Console.WriteLn ("recVUMI_MSUB_iq");
- if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
- recVUMI_MSUB_iq_toD(VU, EEREC_D, addr, info);
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_MSUBi(VURegs *VU, int info) { recVUMI_MSUB_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
-void recVUMI_MSUBq(VURegs *VU, int info) { recVUMI_MSUB_iq(VU, VU_REGQ_ADDR, info); }
-void recVUMI_MSUBx(VURegs *VU, int info)
-{
- //Console.WriteLn ("recVUMI_MSUBx");
- if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
- recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 0, info);
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_MSUBy(VURegs *VU, int info)
-{
- //Console.WriteLn ("recVUMI_MSUBy");
- if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
- recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 1, info);
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_MSUBz(VURegs *VU, int info)
-{
- //Console.WriteLn ("recVUMI_MSUBz");
- if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
- recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 2, info);
- recUpdateFlags(VU, EEREC_D, info);
-}
-
-void recVUMI_MSUBw(VURegs *VU, int info)
-{
- //Console.WriteLn ("recVUMI_MSUBw");
- if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
- recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 3, info);
- recUpdateFlags(VU, EEREC_D, info);
-}
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// MSUBA
-//------------------------------------------------------------------
-void recVUMI_MSUBA( VURegs *VU, int info )
-{
- //Console.WriteLn ("recVUMI_MSUBA");
- recVUMI_MSUB_toD(VU, EEREC_ACC, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MSUBAi( VURegs *VU, int info )
-{
- //Console.WriteLn ("recVUMI_MSUBAi ");
- recVUMI_MSUB_iq_toD( VU, EEREC_ACC, VU_VI_ADDR(REG_I, 1), info );
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MSUBAq( VURegs *VU, int info )
-{
- //Console.WriteLn ("recVUMI_MSUBAq");
- recVUMI_MSUB_iq_toD( VU, EEREC_ACC, VU_REGQ_ADDR, info );
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MSUBAx( VURegs *VU, int info )
-{
- //Console.WriteLn ("recVUMI_MSUBAx");
- recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 0, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MSUBAy( VURegs *VU, int info )
-{
- //Console.WriteLn ("recVUMI_MSUBAy");
- recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 1, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MSUBAz( VURegs *VU, int info )
-{
- //Console.WriteLn ("recVUMI_MSUBAz ");
- recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 2, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-
-void recVUMI_MSUBAw( VURegs *VU, int info )
-{
- //Console.WriteLn ("recVUMI_MSUBAw");
- recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 3, info);
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-//------------------------------------------------------------------
-
-
-static const __aligned16 u32 special_mask[4] = {0xffffffff, 0x80000000, 0xffffffff, 0x80000000};
-static const __aligned16 u32 special_mask2[4] = {0, 0x40000000, 0, 0x40000000};
-
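-// Scratch storage for MINMAXlogical: in-use XMM registers pressed into service as temporaries are spilled here and restored afterwards.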
-__aligned16 u32 temp_loc[4];
-__aligned16 u32 temp_loc2[4];
-
-//MAX/MINI are non-arithmetic operations: they compare raw bit patterns, so operands whose EXP
-//field is 0 ("denormals") are not flushed to zero.
-//
-//Because of this, games sometimes use them as integer moves and as (positive!) integer max/min,
-//relying on integers that happen to encode denormals surviving intact.
-//
-//This implementation therefore performs the comparison non-arithmetically as well, preserving
-//"denormals" and "infs/nans". There might be an easier way, but here MAX/MIN is done with
-//PMAXPD/PMINPD on fake double-precision numbers: each lane copies the original float's sign into
-//bit 63, sets bit 62 (so the double's exponent field is never zero and the value stays
-//"normalized"), clears the rest of the upper 32 bits, and keeps the original 32-bit pattern in
-//the lower half.
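-//
-// A hypothetical scalar reference (not used by the emitted code) of the per-lane construction;
-// comparing two of these as doubles orders them the same way PMAXPD/PMINPD order the lanes
-// that MINMAXlogical builds below:
-static u64 MINMAX_FakeDouble( u32 f )
-{
- u64 d = (u64)f; // lower 32 bits = the float's original bit pattern
- d |= (u64)( f & 0x80000000 ) << 32; // bit 63 = the float's sign
- d |= 0x4000000000000000ULL; // bit 62 set, so the double's exponent field is never zero
- return d; // reinterpret as a double and compare
-}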
-
-void MINMAXlogical(VURegs *VU, int info, int min, int mode, uptr addr = 0, int xyzw = 0)
-//mode 0 = normal, mode 1 = iq, mode 2 = xyzw
-{
- int t1regbool = 0;
- int t1reg = _vuGetTempXMMreg(info);
- if (t1reg < 0)
- {
- t1regbool = 1;
- for (t1reg = 0; ( (t1reg == EEREC_D) || (t1reg == EEREC_S) || (mode != 1 && t1reg == EEREC_T)
- || (t1reg == EEREC_TEMP) ); t1reg++); // Find unused reg (For first temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)temp_loc, t1reg); // Backup t1reg XMM reg
- }
- int t2regbool = -1;
- int t2reg = EEREC_TEMP;
- if (EEREC_TEMP == EEREC_D || EEREC_TEMP == EEREC_S || (mode != 1 && EEREC_TEMP == EEREC_T))
- {
- t2regbool = 0;
- t2reg = _vuGetTempXMMreg(info);
- if (t2reg < 0)
- {
- t2regbool = 1;
- for (t2reg = 0; ( (t2reg == EEREC_D) || (t2reg == EEREC_S) || (mode != 1 && t2reg == EEREC_T) ||
- (t2reg == t1reg) || (t2reg == EEREC_TEMP) ); t2reg++); // Find unused reg (For second temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)temp_loc2, t2reg); // Backup t2reg XMM reg
- }
- }
-
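- // First pass covers x/y: PSHUFD 0x50 expands {x,y,z,w} into {x,x,y,y}, giving each component a 64-bit lane to build its fake double in; PSHUFD 0x88 afterwards repacks the 32-bit results for the merge.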
- if (_X || _Y)
- {
- SSE2_PSHUFD_XMM_to_XMM(t1reg, EEREC_S, 0x50);
- SSE2_PAND_M128_to_XMM(t1reg, (uptr)special_mask);
- SSE2_POR_M128_to_XMM(t1reg, (uptr)special_mask2);
- if (mode == 0)
- SSE2_PSHUFD_XMM_to_XMM(t2reg, EEREC_T, 0x50);
- else if (mode == 1)
- {
- SSE2_MOVD_M32_to_XMM(t2reg, addr);
- SSE2_PSHUFD_XMM_to_XMM(t2reg, t2reg, 0x00);
- }
- else if (mode == 2)
- _unpackVF_xyzw(t2reg, EEREC_T, xyzw);
- SSE2_PAND_M128_to_XMM(t2reg, (uptr)special_mask);
- SSE2_POR_M128_to_XMM(t2reg, (uptr)special_mask2);
- if (min)
- SSE2_MINPD_XMM_to_XMM(t1reg, t2reg);
- else
- SSE2_MAXPD_XMM_to_XMM(t1reg, t2reg);
- SSE2_PSHUFD_XMM_to_XMM(t1reg, t1reg, 0x88);
- VU_MERGE_REGS_CUSTOM(EEREC_D, t1reg, 0xc & _X_Y_Z_W);
- }
-
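- // Second pass covers z/w the same way, expanding to {z,z,w,w} with PSHUFD 0xfa.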
- if (_Z || _W)
- {
- SSE2_PSHUFD_XMM_to_XMM(t1reg, EEREC_S, 0xfa);
- SSE2_PAND_M128_to_XMM(t1reg, (uptr)special_mask);
- SSE2_POR_M128_to_XMM(t1reg, (uptr)special_mask2);
- if (mode == 0)
- SSE2_PSHUFD_XMM_to_XMM(t2reg, EEREC_T, 0xfa);
- else if (mode == 1)
- {
- SSE2_MOVD_M32_to_XMM(t2reg, addr);
- SSE2_PSHUFD_XMM_to_XMM(t2reg, t2reg, 0x00);
- }
- else if (mode == 2)
- _unpackVF_xyzw(t2reg, EEREC_T, xyzw);
- SSE2_PAND_M128_to_XMM(t2reg, (uptr)special_mask);
- SSE2_POR_M128_to_XMM(t2reg, (uptr)special_mask2);
- if (min)
- SSE2_MINPD_XMM_to_XMM(t1reg, t2reg);
- else
- SSE2_MAXPD_XMM_to_XMM(t1reg, t2reg);
- SSE2_PSHUFD_XMM_to_XMM(t1reg, t1reg, 0x88);
- VU_MERGE_REGS_CUSTOM(EEREC_D, t1reg, 0x3 & _X_Y_Z_W);
- }
-
- if (t1regbool == 0)
- _freeXMMreg(t1reg);
- else if (t1regbool == 1)
- SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)temp_loc); // Restore t1reg XMM reg
- if (t2regbool == 0)
- _freeXMMreg(t2reg);
- else if (t2regbool == 1)
- SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)temp_loc2); // Restore t2reg XMM reg
-}
-
-//------------------------------------------------------------------
-// MAX
-//------------------------------------------------------------------
-
-void recVUMI_MAX(VURegs *VU, int info)
-{
- if ( _Fd_ == 0 ) return;
- //Console.WriteLn ("recVUMI_MAX");
-
- if (MINMAXFIX)
- MINMAXlogical(VU, info, 0, 0);
- else
- {
-
- if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping
- if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
-
- if( _X_Y_Z_W == 8 ) {
- if (EEREC_D == EEREC_S) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T);
- else if (EEREC_D == EEREC_T) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_S);
- else {
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T);
- }
- }
- else if (_X_Y_Z_W != 0xf) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_MAXPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
-
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else {
- if( EEREC_D == EEREC_S ) SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_T);
- else if( EEREC_D == EEREC_T ) SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_S);
- else {
- SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_T);
- }
- }
- }
-}
-
-void recVUMI_MAX_iq(VURegs *VU, uptr addr, int info)
-{
- if ( _Fd_ == 0 ) return;
- //Console.WriteLn ("recVUMI_MAX_iq");
-
- if (MINMAXFIX)
- MINMAXlogical(VU, info, 0, 1, addr);
- else
- {
- if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping
- vuFloat3(addr);
-
- if( _XYZW_SS ) {
- if( EEREC_D == EEREC_TEMP ) {
- _vuFlipRegSS(VU, EEREC_S);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_MAXSS_M32_to_XMM(EEREC_D, addr);
- _vuFlipRegSS(VU, EEREC_S);
-
- // have to flip over EEREC_D if computing flags!
- //if( (info & PROCESS_VU_UPDATEFLAGS) )
- _vuFlipRegSS(VU, EEREC_D);
- }
- else if( EEREC_D == EEREC_S ) {
- _vuFlipRegSS(VU, EEREC_D);
- SSE_MAXSS_M32_to_XMM(EEREC_D, addr);
- _vuFlipRegSS(VU, EEREC_D);
- }
- else {
- if( _X ) {
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_MAXSS_M32_to_XMM(EEREC_D, addr);
- }
- else {
- _vuMoveSS(VU, EEREC_TEMP, EEREC_S);
- _vuFlipRegSS(VU, EEREC_D);
- SSE_MAXSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- _vuFlipRegSS(VU, EEREC_D);
- }
- }
- }
- else if (_X_Y_Z_W != 0xf) {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
- SSE_MAXPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else {
- if(EEREC_D == EEREC_S) {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
- SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- else {
- SSE_MOVSS_M32_to_XMM(EEREC_D, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_D, EEREC_D, 0x00);
- SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_S);
- }
- }
- }
-}
-
-void recVUMI_MAX_xyzw(VURegs *VU, int xyzw, int info)
-{
- if ( _Fd_ == 0 ) return;
- //Console.WriteLn ("recVUMI_MAX_xyzw");
-
- if (_Fs_ == 0 && _Ft_ == 0)
- {
- if( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP)) {
- if( xyzw < 3 ) {
- SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- else {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)s_fones);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- }
- else if (_X_Y_Z_W != 0xf) {
- if( xyzw < 3 ) {
- if( _X_Y_Z_W & 1 ) SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[0]); // w included, so insert the whole reg
- else SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); // w not included, can zero out
- }
- else SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)s_fones);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else {
- //If VF0.w isn't chosen as the constant, then it's MAX( 0, VF0 ), so the result is VF0
- if( xyzw < 3 ) { SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)&VU->VF[0].UL[0]); }
- else SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)s_fones);
- }
- return;
- }
-
- if (MINMAXFIX)
- MINMAXlogical(VU, info, 0, 2, 0, xyzw);
- else
- {
- if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping
- if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
-
- if( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP)) {
- if( xyzw == 0 ) {
- if( EEREC_D == EEREC_S ) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T);
- else if( EEREC_D == EEREC_T ) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_S);
- else {
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T);
- }
- }
- else {
- _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- }
- else if (_X_Y_Z_W != 0xf) {
- _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- SSE_MAXPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else {
- if (EEREC_D == EEREC_S) {
- _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- else {
- _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw);
- SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_S);
- }
- }
- }
-}
-
-void recVUMI_MAXi(VURegs *VU, int info) { recVUMI_MAX_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
-void recVUMI_MAXx(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 0, info); }
-void recVUMI_MAXy(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 1, info); }
-void recVUMI_MAXz(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 2, info); }
-void recVUMI_MAXw(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 3, info); }
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// MINI
-//------------------------------------------------------------------
-void recVUMI_MINI(VURegs *VU, int info)
-{
- if ( _Fd_ == 0 ) return;
- //Console.WriteLn ("recVUMI_MINI");
-
- if (MINMAXFIX)
- MINMAXlogical(VU, info, 1, 0);
- else
- {
-
- if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping
- if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
-
- if( _X_Y_Z_W == 8 ) {
- if (EEREC_D == EEREC_S) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T);
- else if (EEREC_D == EEREC_T) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_S);
- else {
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T);
- }
- }
- else if (_X_Y_Z_W != 0xf) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_MINPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
-
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else {
- if( EEREC_D == EEREC_S ) {
- //ClampUnordered(EEREC_T, EEREC_TEMP, 0); // needed for GT4 vu0rec
- SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_T);
- }
- else if( EEREC_D == EEREC_T ) {
- //ClampUnordered(EEREC_S, EEREC_TEMP, 0); // needed for GT4 vu0rec
- SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_S);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_T);
- }
- }
- }
-}
-
-void recVUMI_MINI_iq(VURegs *VU, uptr addr, int info)
-{
- if ( _Fd_ == 0 ) return;
- //Console.WriteLn ("recVUMI_MINI_iq");
-
- if (MINMAXFIX)
- MINMAXlogical(VU, info, 1, 1, addr);
- else
- {
-
- if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping
- vuFloat3(addr);
-
- if( _XYZW_SS ) {
- if( EEREC_D == EEREC_TEMP ) {
- _vuFlipRegSS(VU, EEREC_S);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_MINSS_M32_to_XMM(EEREC_D, addr);
- _vuFlipRegSS(VU, EEREC_S);
-
- // have to flip over EEREC_D if computing flags!
- //if( (info & PROCESS_VU_UPDATEFLAGS) )
- _vuFlipRegSS(VU, EEREC_D);
- }
- else if( EEREC_D == EEREC_S ) {
- _vuFlipRegSS(VU, EEREC_D);
- SSE_MINSS_M32_to_XMM(EEREC_D, addr);
- _vuFlipRegSS(VU, EEREC_D);
- }
- else {
- if( _X ) {
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_MINSS_M32_to_XMM(EEREC_D, addr);
- }
- else {
- _vuMoveSS(VU, EEREC_TEMP, EEREC_S);
- _vuFlipRegSS(VU, EEREC_D);
- SSE_MINSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- _vuFlipRegSS(VU, EEREC_D);
- }
- }
- }
- else if (_X_Y_Z_W != 0xf) {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
- SSE_MINPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else {
- if(EEREC_D == EEREC_S) {
- SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
- SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- else {
- SSE_MOVSS_M32_to_XMM(EEREC_D, addr);
- SSE_SHUFPS_XMM_to_XMM(EEREC_D, EEREC_D, 0x00);
- SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_S);
- }
- }
- }
-}
-
-void recVUMI_MINI_xyzw(VURegs *VU, int xyzw, int info)
-{
- if ( _Fd_ == 0 ) return;
- //Console.WriteLn ("recVUMI_MINI_xyzw");
-
- if (_Fs_ == 0 && _Ft_ == 0)
- {
- if( _X_Y_Z_W == 0xf )
- {
- //If VF0.w is the constant, the result will match VF0, else it's all 0's
- if(xyzw == 3) SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)&VU->VF[0].UL[0]);
- else SSE_XORPS_XMM_to_XMM(EEREC_D, EEREC_D);
- }
- else
- {
- //If VF0.w is the constant, the result will match VF0, else it's all 0's
- if(xyzw == 3) SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[0]);
- else SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- return;
- }
- if (MINMAXFIX)
- MINMAXlogical(VU, info, 1, 2, 0, xyzw);
- else
- {
- if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping
- if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
-
- if( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP)) {
- if( xyzw == 0 ) {
- if( EEREC_D == EEREC_S ) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T);
- else if( EEREC_D == EEREC_T ) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_S);
- else {
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T);
- }
- }
- else {
- _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
- SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- }
- else if (_X_Y_Z_W != 0xf) {
- _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- SSE_MINPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
- }
- else {
- if (EEREC_D == EEREC_S) {
- _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
- SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
- }
- else {
- _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw);
- SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_S);
- }
- }
- }
-}
-
-void recVUMI_MINIi(VURegs *VU, int info) { recVUMI_MINI_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
-void recVUMI_MINIx(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 0, info); }
-void recVUMI_MINIy(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 1, info); }
-void recVUMI_MINIz(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 2, info); }
-void recVUMI_MINIw(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 3, info); }
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// OPMULA
-//------------------------------------------------------------------
-void recVUMI_OPMULA( VURegs *VU, int info )
-{
- //Console.WriteLn ("recVUMI_OPMULA");
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, 0xE);
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, 0xE);
- }
-
- SSE_MOVAPS_XMM_to_XMM( EEREC_TEMP, EEREC_S );
- SSE_SHUFPS_XMM_to_XMM( EEREC_T, EEREC_T, 0xD2 ); // EEREC_T = WYXZ
- SSE_SHUFPS_XMM_to_XMM( EEREC_TEMP, EEREC_TEMP, 0xC9 ); // EEREC_TEMP = WXZY
- SSE_MULPS_XMM_to_XMM( EEREC_TEMP, EEREC_T );
-
- VU_MERGE_REGS_CUSTOM(EEREC_ACC, EEREC_TEMP, 14);
-
- // revert EEREC_T
- if( EEREC_T != EEREC_ACC )
- SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0xC9);
-
- recUpdateFlags(VU, EEREC_ACC, info);
-}
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// OPMSUB
-//------------------------------------------------------------------
-void recVUMI_OPMSUB( VURegs *VU, int info )
-{
- //Console.WriteLn ("recVUMI_OPMSUB");
- if (CHECK_VU_EXTRA_OVERFLOW) {
- if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, 0xE);
- if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, 0xE);
- }
-
- if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0xD2); // EEREC_T = WYXZ
- SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0xC9); // EEREC_TEMP = WXZY
- SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
-
- // negate and add
- SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
- SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
- VU_MERGE_REGS_CUSTOM(EEREC_D, EEREC_TEMP, 14);
-
- // revert EEREC_T
- if( EEREC_T != EEREC_D ) SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0xC9);
-
- recUpdateFlags(VU, EEREC_D, info);
-}
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// NOP
-//------------------------------------------------------------------
-void recVUMI_NOP( VURegs *VU, int info )
-{
- //Console.WriteLn ("recVUMI_NOP");
-}
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// recVUMI_FTOI_Saturate() - Saturates result from FTOI Instructions
-//------------------------------------------------------------------
-
-// unused, but leaving here for possible reference..
-//static const __aligned16 int rec_const_0x8000000[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
-
-void recVUMI_FTOI_Saturate(int rec_s, int rec_t, int rec_tmp1, int rec_tmp2)
-{
- //Console.WriteLn ("recVUMI_FTOI_Saturate");
- //Duplicate the xor'd sign bit to the whole value
- //FFFF FFFF for positive, 0 for negative
- SSE_MOVAPS_XMM_to_XMM(rec_tmp1, rec_s);
- SSE2_PXOR_M128_to_XMM(rec_tmp1, (uptr)&const_clip[4]);
- SSE2_PSRAD_I8_to_XMM(rec_tmp1, 31);
-
- //Create mask: 0 where !=8000 0000
- SSE_MOVAPS_XMM_to_XMM(rec_tmp2, rec_t);
- SSE2_PCMPEQD_M128_to_XMM(rec_tmp2, (uptr)&const_clip[4]);
-
- //AND the mask w/ the edit values
- SSE_ANDPS_XMM_to_XMM(rec_tmp1, rec_tmp2);
-
- //if v==8000 0000 && positive -> 8000 0000 + FFFF FFFF -> 7FFF FFFF
- //if v==8000 0000 && negative -> 8000 0000 + 0 -> 8000 0000
- //if v!=8000 0000 -> v+0 (masked from the and)
-
- //Add the values as needed
- SSE2_PADDD_XMM_to_XMM(rec_t, rec_tmp1);
-}
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// FTOI 0/4/12/15
-//------------------------------------------------------------------
-static __aligned16 float FTIO_Temp1[4] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000 };
-static __aligned16 float FTIO_Temp2[4] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000 };
-void recVUMI_FTOI0(VURegs *VU, int info)
-{
- int t1reg, t2reg; // Temp XMM regs
-
- if ( _Ft_ == 0 ) return;
-
- //Console.WriteLn ("recVUMI_FTOI0");
-
- if (_X_Y_Z_W != 0xf) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
- SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
-
- t1reg = _vuGetTempXMMreg(info);
-
- if( t1reg >= 0 ) { // If there's a temp XMM reg available
- for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++)
- ; // Find unused reg (For second temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg
-
- recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
-
- SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg
- _freeXMMreg(t1reg); // Free temp reg
- }
- else { // No temp reg available
- for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++)
- ; // Find unused reg (For first temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg
-
- for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++)
- ; // Find unused reg (For second temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg
-
- recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
-
- SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg
- SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg
- }
-
- VU_MERGE_REGS(EEREC_T, EEREC_TEMP);
- }
- else {
- if (EEREC_T != EEREC_S) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_S);
- vuFloat_useEAX( info, EEREC_T, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
- SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_T, EEREC_T);
-
- t1reg = _vuGetTempXMMreg(info);
-
- if( t1reg >= 0 ) { // If there's a temp XMM reg available
- recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result
- _freeXMMreg(t1reg); // Free temp reg
- }
- else { // No temp reg available
- for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++)
- ; // Find unused reg
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg
-
- recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result
-
- SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg
- }
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
- SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
-
- t1reg = _vuGetTempXMMreg(info);
-
- if( t1reg >= 0 ) { // If there's a temp XMM reg available
- for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg)); t2reg++)
- ; // Find unused reg (For second temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg
-
- recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
-
- SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg
- _freeXMMreg(t1reg); // Free temp reg
- }
- else { // No temp reg available
- for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++)
- ; // Find unused reg (For first temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg
-
- for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++)
- ; // Find unused reg (For second temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg
-
- recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
-
- SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg
- SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg
- }
-
- SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_TEMP);
- }
- }
-}
-
-void recVUMI_FTOIX(VURegs *VU, int addr, int info)
-{
- int t1reg, t2reg; // Temp XMM regs
-
- if ( _Ft_ == 0 ) return;
-
- //Console.WriteLn ("recVUMI_FTOIX");
- if (_X_Y_Z_W != 0xf) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_MULPS_M128_to_XMM(EEREC_TEMP, addr);
- vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
- SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
-
- t1reg = _vuGetTempXMMreg(info);
-
- if( t1reg >= 0 ) { // If there's a temp XMM reg available
- for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg)); t2reg++)
- ; // Find unused reg (For second temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg
-
- recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
-
- SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg
- _freeXMMreg(t1reg); // Free temp reg
- }
- else { // No temp reg available
- for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++)
- ; // Find unused reg (For first temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg
-
- for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++)
- ; // Find unused reg (For second temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg
-
- recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
-
- SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg
- SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg
- }
-
- VU_MERGE_REGS(EEREC_T, EEREC_TEMP);
- }
- else {
- if (EEREC_T != EEREC_S) {
- SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_S);
- SSE_MULPS_M128_to_XMM(EEREC_T, addr);
- vuFloat_useEAX( info, EEREC_T, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
- SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_T, EEREC_T);
-
- t1reg = _vuGetTempXMMreg(info);
-
- if( t1reg >= 0 ) { // If there's a temp XMM reg available
- recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result
- _freeXMMreg(t1reg); // Free temp reg
- }
- else { // No temp reg available
- for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++)
- ; // Find unused reg
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg
-
- recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result
-
- SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg
- }
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_MULPS_M128_to_XMM(EEREC_TEMP, addr);
- vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
- SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
-
- t1reg = _vuGetTempXMMreg(info);
-
- if( t1reg >= 0 ) { // If there's a temp XMM reg available
- for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg)); t2reg++)
- ; // Find unused reg (For second temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg
-
- recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
-
- SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg
- _freeXMMreg(t1reg); // Free temp reg
- }
- else { // No temp reg available
- for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++)
- ; // Find unused reg (For first temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg
-
- for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++)
- ; // Find unused reg (For second temp reg)
- SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg
-
- recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
-
- SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg
- SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg
- }
-
- SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_TEMP);
- }
- }
-}
-
-void recVUMI_FTOI4( VURegs *VU, int info ) { recVUMI_FTOIX(VU, (uptr)&recMult_float_to_int4[0], info); }
-void recVUMI_FTOI12( VURegs *VU, int info ) { recVUMI_FTOIX(VU, (uptr)&recMult_float_to_int12[0], info); }
-void recVUMI_FTOI15( VURegs *VU, int info ) { recVUMI_FTOIX(VU, (uptr)&recMult_float_to_int15[0], info); }
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// ITOF 0/4/12/15
-//------------------------------------------------------------------
-void recVUMI_ITOF0( VURegs *VU, int info )
-{
- if ( _Ft_ == 0 ) return;
-
- //Console.WriteLn ("recVUMI_ITOF0");
- if (_X_Y_Z_W != 0xf) {
- SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- vuFloat_useEAX( info, EEREC_TEMP, 15); // Clamp infinities
- VU_MERGE_REGS(EEREC_T, EEREC_TEMP);
- xmmregs[EEREC_T].mode |= MODE_WRITE;
- }
- else {
- SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_T, EEREC_S);
- vuFloat2(EEREC_T, EEREC_TEMP, 15); // Clamp infinities
- }
-}
-
-void recVUMI_ITOFX(VURegs *VU, int addr, int info)
-{
- if ( _Ft_ == 0 ) return;
-
- //Console.WriteLn ("recVUMI_ITOFX");
- if (_X_Y_Z_W != 0xf) {
- SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
- SSE_MULPS_M128_to_XMM(EEREC_TEMP, addr);
- vuFloat_useEAX( info, EEREC_TEMP, 15); // Clamp infinities
- VU_MERGE_REGS(EEREC_T, EEREC_TEMP);
- xmmregs[EEREC_T].mode |= MODE_WRITE;
- }
- else {
- SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_T, EEREC_S);
- SSE_MULPS_M128_to_XMM(EEREC_T, addr);
- vuFloat2(EEREC_T, EEREC_TEMP, 15); // Clamp infinities
- }
-}
-
-void recVUMI_ITOF4( VURegs *VU, int info ) { recVUMI_ITOFX(VU, (uptr)&recMult_int_to_float4[0], info); }
-void recVUMI_ITOF12( VURegs *VU, int info ) { recVUMI_ITOFX(VU, (uptr)&recMult_int_to_float12[0], info); }
-void recVUMI_ITOF15( VURegs *VU, int info ) { recVUMI_ITOFX(VU, (uptr)&recMult_int_to_float15[0], info); }
-//------------------------------------------------------------------
-
-
-//------------------------------------------------------------------
-// CLIP
-//------------------------------------------------------------------
-void recVUMI_CLIP(VURegs *VU, int info)
-{
- int t1reg = EEREC_D;
- int t2reg = EEREC_ACC;
- int x86temp1, x86temp2;
-
- u32 clipaddr = VU_VI_ADDR(REG_CLIP_FLAG, 0);
- u32 prevclipaddr = VU_VI_ADDR(REG_CLIP_FLAG, 2);
-
- if( clipaddr == 0 ) { // battle star has a clip right before fcset
- Console.WriteLn("skipping vu clip");
- return;
- }
-
- //Flush the clip flag before processing, in case of double clip commands (GoW)
-
- if( prevclipaddr != (uptr)&VU->VI[REG_CLIP_FLAG] ) {
- MOV32MtoR(EAX, prevclipaddr);
- MOV32RtoM((uptr)&VU->VI[REG_CLIP_FLAG], EAX);
- }
-
- assert( clipaddr != 0 );
- assert( t1reg != t2reg && t1reg != EEREC_TEMP && t2reg != EEREC_TEMP );
-
- x86temp1 = ALLOCTEMPX86(MODE_8BITREG);
- x86temp2 = ALLOCTEMPX86(MODE_8BITREG);
-
- //if ( (x86temp1 == 0) || (x86temp2 == 0) ) Console.Error("VU CLIP Allocation Error: EAX being allocated!");
-
- _freeXMMreg(t1reg); // These should have been freed at allocation in eeVURecompileCode()
- _freeXMMreg(t2reg); // but if they've been used since then, free them. (just doing this in case :p (cottonvibes))
-
- if( _Ft_ == 0 ) {
- SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)&s_fones[0]); // all 1s
- SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)&s_fones[4]);
- }
- else {
- _unpackVF_xyzw(EEREC_TEMP, EEREC_T, 3);
- SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[0]);
- SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_TEMP);
- SSE_ORPS_M128_to_XMM(t1reg, (uptr)&const_clip[4]);
- }
-
- MOV32MtoR(EAX, prevclipaddr);
-
- SSE_CMPNLEPS_XMM_to_XMM(t1reg, EEREC_S); //-w, -z, -y, -x
- SSE_CMPLTPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); //+w, +z, +y, +x
-
- SHL32ItoR(EAX, 6);
-
- SSE_MOVAPS_XMM_to_XMM(t2reg, EEREC_TEMP); //t2 = +w, +z, +y, +x
- SSE_UNPCKLPS_XMM_to_XMM(EEREC_TEMP, t1reg); //EEREC_TEMP = -y,+y,-x,+x
- SSE_UNPCKHPS_XMM_to_XMM(t2reg, t1reg); //t2reg = -w,+w,-z,+z
- SSE_MOVMSKPS_XMM_to_R32(x86temp2, EEREC_TEMP); // -y,+y,-x,+x
- SSE_MOVMSKPS_XMM_to_R32(x86temp1, t2reg); // -w,+w,-z,+z
-
- AND8ItoR(x86temp1, 0x3);
- SHL8ItoR(x86temp1, 4);
- OR8RtoR(EAX, x86temp1);
- AND8ItoR(x86temp2, 0xf);
- OR8RtoR(EAX, x86temp2);
- AND32ItoR(EAX, 0xffffff);
-
- MOV32RtoM(clipaddr, EAX);
-
- if (( !(info & (PROCESS_VU_SUPER|PROCESS_VU_COP2)) ) ) //Instantly update the flag if it's called from elsewhere (unlikely, but ok)
- MOV32RtoM((uptr)&VU->VI[REG_CLIP_FLAG], EAX);
-
- _freeX86reg(x86temp1);
- _freeX86reg(x86temp2);
-}
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "PrecompiledHeader.h"
+
+#include "Common.h"
+#include "GS.h"
+#include "R5900OpcodeTables.h"
+#include "iR5900.h"
+#include "iMMI.h"
+#include "iFPU.h"
+#include "iCOP0.h"
+#include "VUmicro.h"
+#include "VUflags.h"
+#include "sVU_Micro.h"
+#include "sVU_Debug.h"
+#include "sVU_zerorec.h"
+//------------------------------------------------------------------
+#define MINMAXFIX 1
+//------------------------------------------------------------------
+// Helper Macros
+//------------------------------------------------------------------
+#define _Ft_ (( VU->code >> 16) & 0x1F) // The rt part of the instruction register
+#define _Fs_ (( VU->code >> 11) & 0x1F) // The rd part of the instruction register
+#define _Fd_ (( VU->code >> 6) & 0x1F) // The sa part of the instruction register
+
+#define _X (( VU->code>>24) & 0x1)
+#define _Y (( VU->code>>23) & 0x1)
+#define _Z (( VU->code>>22) & 0x1)
+#define _W (( VU->code>>21) & 0x1)
+
+#define _XYZW_SS (_X+_Y+_Z+_W==1)
+
+#define _Fsf_ (( VU->code >> 21) & 0x03)
+#define _Ftf_ (( VU->code >> 23) & 0x03)
+
+#define _Imm11_ (s32)(VU->code & 0x400 ? 0xfffffc00 | (VU->code & 0x3ff) : VU->code & 0x3ff)
+#define _UImm11_ (s32)(VU->code & 0x7ff)
+
+#define VU_VFx_ADDR(x) (uptr)&VU->VF[x].UL[0]
+#define VU_VFy_ADDR(x) (uptr)&VU->VF[x].UL[1]
+#define VU_VFz_ADDR(x) (uptr)&VU->VF[x].UL[2]
+#define VU_VFw_ADDR(x) (uptr)&VU->VF[x].UL[3]
+
+#define VU_REGR_ADDR (uptr)&VU->VI[REG_R]
+#define VU_REGQ_ADDR (uptr)&VU->VI[REG_Q]
+#define VU_REGMAC_ADDR (uptr)&VU->VI[REG_MAC_FLAG]
+
+#define VU_VI_ADDR(x, read) GetVIAddr(VU, x, read, info)
+
+#define VU_ACCx_ADDR (uptr)&VU->ACC.UL[0]
+#define VU_ACCy_ADDR (uptr)&VU->ACC.UL[1]
+#define VU_ACCz_ADDR (uptr)&VU->ACC.UL[2]
+#define VU_ACCw_ADDR (uptr)&VU->ACC.UL[3]
+
+#define _X_Y_Z_W ((( VU->code >> 21 ) & 0xF ) )
+//------------------------------------------------------------------
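+
+// Worked example of the dest-field decode above (illustrative only; the helper
+// below is hypothetical and left uncompiled): an upper instruction writing .xw
+// has bits 24..21 == 1001b, so _X=1, _Y=0, _Z=0, _W=1, _X_Y_Z_W == 0x9, and
+// _XYZW_SS is false since two components are written. _X_Y_Z_W == 0x8 is the
+// common "x only" scalar case the recompilers special-case below.
+#if 0
+static bool writesSingleComponent(u32 code)
+{
+ int x = (code >> 24) & 1, y = (code >> 23) & 1;
+ int z = (code >> 22) & 1, w = (code >> 21) & 1;
+ return (x + y + z + w) == 1; // same test as _XYZW_SS
+}
+#endif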
+
+
+//------------------------------------------------------------------
+// Global Variables
+//------------------------------------------------------------------
+static const __aligned16 int SSEmovMask[ 16 ][ 4 ] =
+{
+ { 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+ { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+ { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 },
+ { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 },
+ { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
+ { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
+ { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 },
+ { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 },
+ { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
+};
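+
+// Note: SSEmovMask[m] is 0xFFFFFFFF in exactly the lanes selected by dest mask m
+// (bit 3 = x .. bit 0 = w, stored in x,y,z,w memory order), so ANDPS with entry m
+// keeps only the written lanes while entry [15-m] keeps only the unwritten ones.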
+
+static const __aligned16 u32 const_abs_table[16][4] =
+{
+ { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //0000
+ { 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff }, //0001
+ { 0xffffffff, 0xffffffff, 0x7fffffff, 0xffffffff }, //0010
+ { 0xffffffff, 0xffffffff, 0x7fffffff, 0x7fffffff }, //0011
+ { 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff }, //0100
+ { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }, //0101
+ { 0xffffffff, 0x7fffffff, 0x7fffffff, 0xffffffff }, //0110
+ { 0xffffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //0111
+ { 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000
+ { 0x7fffffff, 0xffffffff, 0xffffffff, 0x7fffffff }, //1001
+ { 0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff }, //1010
+ { 0x7fffffff, 0xffffffff, 0x7fffffff, 0x7fffffff }, //1011
+ { 0x7fffffff, 0x7fffffff, 0xffffffff, 0xffffffff }, //1100
+ { 0x7fffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }, //1101
+ { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0xffffffff }, //1110
+ { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1111
+};
+
+static const __aligned16 float recMult_float_to_int4[4] = { 16.0, 16.0, 16.0, 16.0 };
+static const __aligned16 float recMult_float_to_int12[4] = { 4096.0, 4096.0, 4096.0, 4096.0 };
+static const __aligned16 float recMult_float_to_int15[4] = { 32768.0, 32768.0, 32768.0, 32768.0 };
+
+static const __aligned16 float recMult_int_to_float4[4] = { 0.0625f, 0.0625f, 0.0625f, 0.0625f };
+static const __aligned16 float recMult_int_to_float12[4] = { 0.000244140625, 0.000244140625, 0.000244140625, 0.000244140625 };
+static const __aligned16 float recMult_int_to_float15[4] = { 0.000030517578125, 0.000030517578125, 0.000030517578125, 0.000030517578125 };
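+
+// These scales implement the VU's 4/12/15-bit fixed-point formats: FTOIx scales by
+// 2^x before truncating, ITOFx scales by 2^-x after converting, and the constants
+// are exact reciprocals (0.0625 == 1/16, 0.000244140625 == 1/4096,
+// 0.000030517578125 == 1/32768). Scalar sketch of the 12-bit case (illustrative
+// only, left uncompiled; saturation is handled separately by the recompiler):
+#if 0
+static s32 ftoi12(float f) { return (s32)(f * 4096.0f); } // float -> 1:19:12 fixed
+static float itof12(s32 i) { return (float)i * 0.000244140625f; } // fixed -> float
+#endif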
+
+static const __aligned16 u32 VU_Underflow_Mask1[4] = {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
+static const __aligned16 u32 VU_Underflow_Mask2[4] = {0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff};
+static const __aligned16 u32 VU_Zero_Mask[4] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
+static const __aligned16 u32 VU_Zero_Helper_Mask[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+static const __aligned16 u32 VU_Signed_Zero_Mask[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+static const __aligned16 u32 VU_Pos_Infinity[4] = {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
+static const __aligned16 u32 VU_Neg_Infinity[4] = {0xff800000, 0xff800000, 0xff800000, 0xff800000};
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// recUpdateFlags() - Computes the flags for the Upper Opcodes
+//
+// Note: Computes under/overflow flags if CHECK_VU_EXTRA_FLAGS is 1
+//------------------------------------------------------------------
+static __aligned16 u64 TEMPXMMData[2];
+void recUpdateFlags(VURegs * VU, int reg, int info)
+{
+ static u8 *pjmp, *pjmp2;
+ static u32 *pjmp32;
+ static u32 macaddr, stataddr, prevstataddr;
+ static int x86macflag, x86statflag, x86temp;
+ static int t1reg, t1regBoolean;
+ static const int flipMask[16] = {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15};
+
+ if( !(info & PROCESS_VU_UPDATEFLAGS) ) {
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (reg != EEREC_TEMP) vuFloat2(reg, EEREC_TEMP, _X_Y_Z_W);
+ else vuFloat_useEAX(info, reg, _X_Y_Z_W);
+ }
+ return;
+ }
+
+ //Console.WriteLn ("recUpdateFlags");
+
+ macaddr = VU_VI_ADDR(REG_MAC_FLAG, 0);
+ stataddr = VU_VI_ADDR(REG_STATUS_FLAG, 0); // write address
+ prevstataddr = VU_VI_ADDR(REG_STATUS_FLAG, 2); // previous address
+
+ if( stataddr == 0 ) stataddr = prevstataddr;
+ if( macaddr == 0 ) {
+ Console.WriteLn( "VU ALLOCATION WARNING: Using Mac Flag Previous Address!" );
+ macaddr = VU_VI_ADDR(REG_MAC_FLAG, 2);
+ }
+
+ x86macflag = ALLOCTEMPX86(0);
+ x86statflag = ALLOCTEMPX86(0);
+
+ if (reg == EEREC_TEMP) {
+ t1reg = _vuGetTempXMMreg(info);
+ if (t1reg < 0) {
+ //Console.WriteLn( "VU ALLOCATION ERROR: Temp reg can't be allocated!!!!" );
+ t1reg = (reg == 0) ? 1 : 0; // Make t1reg != reg
+ SSE_MOVAPS_XMM_to_M128( (uptr)TEMPXMMData, t1reg ); // Backup data to temp address
+ t1regBoolean = 1;
+ }
+ else t1regBoolean = 0;
+ }
+ else {
+ t1reg = EEREC_TEMP;
+ t1regBoolean = 2;
+ }
+
+ SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip wzyx to xyzw
+ MOV32MtoR(x86statflag, prevstataddr); // Load the previous status in to x86statflag
+ AND16ItoR(x86statflag, 0xff0); // Keep Sticky and D/I flags
+
+
+ if (CHECK_VU_EXTRA_FLAGS) { // Checks all flags
+
+ x86temp = ALLOCTEMPX86(0);
+
+ //-------------------------Check for Overflow flags------------------------------
+
+ //SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
+ //SSE_CMPUNORDPS_XMM_to_XMM(t1reg, reg); // If reg == NaN then set Vector to 0xFFFFFFFF
+
+ //SSE_MOVAPS_XMM_to_XMM(t1reg, reg);
+ //SSE_MINPS_M128_to_XMM(t1reg, (uptr)g_maxvals);
+ //SSE_MAXPS_M128_to_XMM(t1reg, (uptr)g_minvals);
+ //SSE_CMPNEPS_XMM_to_XMM(t1reg, reg); // If they're not equal, then overflow has occurred
+
+ SSE_MOVAPS_XMM_to_XMM(t1reg, reg);
+ SSE_ANDPS_M128_to_XMM(t1reg, (uptr)VU_Zero_Helper_Mask);
+ SSE_CMPEQPS_M128_to_XMM(t1reg, (uptr)VU_Pos_Infinity); // If infinity, then overflow has occurred (NaNs don't report as overflow)
+
+ SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the previous calculation
+
+ AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being modified)
+ pjmp = JZ8(0); // Skip if none are
+ OR16ItoR(x86statflag, 0x208); // OS, O flags
+ SHL16ItoR(x86macflag, 12);
+ if (_XYZW_SS) pjmp32 = JMP32(0); // Skip Underflow Check
+ x86SetJ8(pjmp);
+
+ //-------------------------Check for Underflow flags------------------------------
+
+ SSE_MOVAPS_XMM_to_XMM(t1reg, reg); // t1reg <- reg
+
+ SSE_ANDPS_M128_to_XMM(t1reg, (uptr)&VU_Underflow_Mask1[ 0 ]);
+ SSE_CMPEQPS_M128_to_XMM(t1reg, (uptr)&VU_Zero_Mask[ 0 ]); // If (t1reg == zero exponent) then set Vector to 0xFFFFFFFF
+
+ SSE_ANDPS_XMM_to_XMM(t1reg, reg);
+ SSE_ANDPS_M128_to_XMM(t1reg, (uptr)&VU_Underflow_Mask2[ 0 ]);
+ SSE_CMPNEPS_M128_to_XMM(t1reg, (uptr)&VU_Zero_Mask[ 0 ]); // If (t1reg != zero mantissa) then set Vector to 0xFFFFFFFF
+
+ SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation
+
+ AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Has Underflowed" bits from the previous calculation
+ pjmp = JZ8(0); // Skip if none are
+ OR16ItoR(x86statflag, 0x104); // US, U flags
+ SHL16ItoR(EAX, 8);
+ OR32RtoR(x86macflag, EAX);
+ x86SetJ8(pjmp);
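+ // (Scalar view of the two masked compares above: a lane has underflowed when its
+ // exponent field is zero but its mantissa is non-zero, i.e.
+ //   (bits & 0x7f800000) == 0 && (bits & 0x007fffff) != 0
+ // which is exactly what VU_Underflow_Mask1/Mask2 select per lane.)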
+
+ //-------------------------Optional Code: Denormals Are Zero------------------------------
+ if (CHECK_VU_UNDERFLOW) { // Sets underflow/denormals to zero
+ SSE_ANDNPS_XMM_to_XMM(t1reg, reg); // t1reg = !t1reg & reg (t1reg = denormals are positive zero)
+ VU_MERGE_REGS_SAFE(t1reg, reg, (15 - flipMask[_X_Y_Z_W])); // Send t1reg the vectors that shouldn't be modified (since reg was flipped, we need a mask to get the unmodified vectors)
+ // Now we have Denormals are Positive Zero in t1reg; the next two lines take Signed Zero into account
+ SSE_ANDPS_M128_to_XMM(reg, (uptr)&VU_Signed_Zero_Mask[ 0 ]); // Only keep the sign bit for each vector
+ SSE_ORPS_XMM_to_XMM(reg, t1reg); // Denormals are Signed Zero, and unmodified vectors stay the same!
+ }
+
+ if (_XYZW_SS) x86SetJ32(pjmp32); // If we skipped the Underflow Flag Checking (when we had an Overflow), return here
+
+ vuFloat2(reg, t1reg, flipMask[_X_Y_Z_W]); // Clamp overflowed vectors that were modified (remember reg's vectors have been flipped, so have to use a flipmask)
+
+ //-------------------------Check for Signed flags------------------------------
+
+ // The following code makes sure the Signed Bit isn't set with Negative Zero
+ SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
+ SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
+ SSE_MOVMSKPS_XMM_to_R32(x86temp, t1reg); // Used for Zero Flag Calculation
+ SSE_ANDNPS_XMM_to_XMM(t1reg, reg);
+
+ SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg
+
+ AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
+ pjmp = JZ8(0); // Skip if none are
+ OR16ItoR(x86statflag, 0x82); // SS, S flags
+ SHL16ItoR(EAX, 4);
+ OR32RtoR(x86macflag, EAX);
+ if (_XYZW_SS) pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking
+ x86SetJ8(pjmp);
+
+ //-------------------------Check for Zero flags------------------------------
+
+ AND16ItoR(x86temp, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
+ pjmp = JZ8(0); // Skip if none are
+ OR16ItoR(x86statflag, 0x41); // ZS, Z flags
+ OR32RtoR(x86macflag, x86temp);
+ x86SetJ8(pjmp);
+
+ _freeX86reg(x86temp);
+ }
+ else { // Only Checks for Sign and Zero Flags
+
+ vuFloat2(reg, t1reg, flipMask[_X_Y_Z_W]); // Clamp overflowed vectors that were modified (remember reg's vectors have been flipped, so have to use a flipmask)
+
+ //-------------------------Check for Signed flags------------------------------
+
+ // The following code makes sure the Signed Bit isn't set with Negative Zero
+ SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
+ SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
+ SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Used for Zero Flag Calculation
+ SSE_ANDNPS_XMM_to_XMM(t1reg, reg);
+
+ SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the t1reg
+
+ AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
+ pjmp = JZ8(0); // Skip if none are
+ OR16ItoR(x86statflag, 0x82); // SS, S flags
+ SHL16ItoR(x86macflag, 4);
+ if (_XYZW_SS) pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking
+ x86SetJ8(pjmp);
+
+ //-------------------------Check for Zero flags------------------------------
+
+ AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
+ pjmp = JZ8(0); // Skip if none are
+ OR16ItoR(x86statflag, 0x41); // ZS, Z flags
+ OR32RtoR(x86macflag, EAX);
+ x86SetJ8(pjmp);
+ }
+ //-------------------------Finally: Send the Flags to the Mac Flag Address------------------------------
+
+ if (_XYZW_SS) x86SetJ8(pjmp2); // If we skipped the Zero Flag Checking, return here
+
+ if (t1regBoolean == 2) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip back reg to wzyx (have to do this because reg != EEREC_TEMP)
+ else if (t1regBoolean == 1) SSE_MOVAPS_M128_to_XMM( t1reg, (uptr)TEMPXMMData ); // Restore data from temp address
+ else _freeXMMreg(t1reg); // Free temp reg
+
+ MOV16RtoM(macaddr, x86macflag);
+ MOV16RtoM(stataddr, x86statflag);
+
+ _freeX86reg(x86macflag);
+ _freeX86reg(x86statflag);
+}
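+
+// For reference, the flag constants used above follow the VU STATUS bit layout
+// (bits 0..5 = Z,S,U,O,I,D; bits 6..11 = their sticky copies ZS,SS,US,OS,IS,DS):
+// 0x41 = ZS|Z, 0x82 = SS|S, 0x104 = US|U, 0x208 = OS|O, and the 0xff0 mask keeps
+// I/D plus every sticky bit from the previous status word. Illustrative names
+// (not used by the code above):
+#if 0
+enum VUStatusBits {
+ VUSF_Z  = 0x001, VUSF_S  = 0x002, VUSF_U  = 0x004, VUSF_O  = 0x008,
+ VUSF_ZS = 0x040, VUSF_SS = 0x080, VUSF_US = 0x100, VUSF_OS = 0x200,
+};
+#endif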
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// Custom VU ADD/SUB routines by Nneeve
+//
+// Note: See FPU_ADD_SUB() for more info on what this is doing.
+//------------------------------------------------------------------
+static __aligned16 u32 VU_addsuband[2][4];
+static __aligned16 u32 VU_addsub_reg[2][4];
+
+static u32 tempECX;
+
+void VU_ADD_SUB(u32 regd, u32 regt, int is_sub, int info)
+{
+ u8 *localptr[4][8];
+
+ MOV32RtoM((uptr)&tempECX, ECX);
+
+ int temp1 = ECX; //receives regd
+ int temp2 = ALLOCTEMPX86(0);
+
+ if (temp2 == ECX)
+ {
+ temp2 = ALLOCTEMPX86(0);
+ _freeX86reg(ECX);
+ }
+
+ SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
+ SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
+
+ SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
+ SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[0][0], regd);
+ SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[1][0], regd);
+ SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]);
+
+ SSE2_PSLLD_I8_to_XMM(regd, 1);
+ SSE2_PSLLD_I8_to_XMM(regt, 1);
+
+ SSE2_PSRLD_I8_to_XMM(regd, 24);
+ SSE2_PSRLD_I8_to_XMM(regt, 24);
+
+ SSE2_PSUBD_XMM_to_XMM(regd, regt);
+
+#define PERFORM(i) \
+ \
+ SSE_PEXTRW_XMM_to_R32(temp1, regd, i*2); \
+ MOVSX32R16toR(temp1, temp1); \
+ CMP32ItoR(temp1, 25);\
+ localptr[i][0] = JGE8(0);\
+ CMP32ItoR(temp1, 0);\
+ localptr[i][1] = JG8(0);\
+ localptr[i][2] = JE8(0);\
+ CMP32ItoR(temp1, -25);\
+ localptr[i][3] = JLE8(0);\
+ \
+ NEG32R(temp1); \
+ DEC32R(temp1);\
+ MOV32ItoR(temp2, 0xffffffff); \
+ SHL32CLtoR(temp2); \
+ MOV32RtoM((uptr)&VU_addsuband[0][i], temp2);\
+ localptr[i][4] = JMP8(0);\
+ \
+ x86SetJ8(localptr[i][0]);\
+ MOV32ItoM((uptr)&VU_addsuband[1][i], 0x80000000);\
+ localptr[i][5] = JMP8(0);\
+ \
+ x86SetJ8(localptr[i][1]);\
+ DEC32R(temp1);\
+ MOV32ItoR(temp2, 0xffffffff);\
+ SHL32CLtoR(temp2); \
+ MOV32RtoM((uptr)&VU_addsuband[1][i], temp2);\
+ localptr[i][6] = JMP8(0);\
+ \
+ x86SetJ8(localptr[i][3]);\
+ MOV32ItoM((uptr)&VU_addsuband[0][i], 0x80000000);\
+ localptr[i][7] = JMP8(0);\
+ \
+ x86SetJ8(localptr[i][2]);\
+ \
+ x86SetJ8(localptr[i][4]);\
+ x86SetJ8(localptr[i][5]);\
+ x86SetJ8(localptr[i][6]);\
+ x86SetJ8(localptr[i][7]);
+
+ PERFORM(0);
+ PERFORM(1);
+ PERFORM(2);
+ PERFORM(3);
+#undef PERFORM
+
+ SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]);
+ SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]);
+
+ SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsuband[0][0]);
+ SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsuband[1][0]);
+
+ if (is_sub) SSE_SUBPS_XMM_to_XMM(regd, regt);
+ else SSE_ADDPS_XMM_to_XMM(regd, regt);
+
+ SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]);
+
+ _freeX86reg(temp2);
+
+ MOV32MtoR(ECX, (uptr)&tempECX);
+}
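+
+// Scalar outline of the masking rule PERFORM implements per lane (sketch only,
+// left uncompiled; alignMasks is a hypothetical helper): the VU has no guard or
+// sticky bits, so the operand with the smaller exponent simply loses the mantissa
+// bits that alignment would shift out, and is flushed to signed zero once the
+// exponents differ by 25 or more.
+#if 0
+static void alignMasks(int diff /* exp(d) - exp(t) */, u32& maskD, u32& maskT)
+{
+ maskD = maskT = 0xffffffff;
+ if      (diff >=  25) maskT = 0x80000000;                // t keeps only its sign
+ else if (diff >    0) maskT = 0xffffffffu << (diff - 1);  // drop t's shifted-out bits
+ else if (diff <= -25) maskD = 0x80000000;                // d keeps only its sign
+ else if (diff <    0) maskD = 0xffffffffu << (-diff - 1); // drop d's shifted-out bits
+ // diff == 0: exponents already match, nothing is dropped
+}
+#endif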
+
+void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
+{
+ u8 *localptr[8];
+ u32 addrt = regt; // used as a memory address in the is_mem case
+
+ MOV32RtoM((uptr)&tempECX, ECX);
+
+ int temp1 = ECX; //receives regd
+ int temp2 = ALLOCTEMPX86(0);
+
+ if (temp2 == ECX)
+ {
+ temp2 = ALLOCTEMPX86(0);
+ _freeX86reg(ECX);
+ }
+
+ SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
+ if (!is_mem) SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
+
+ SSE2_MOVD_XMM_to_R(temp1, regd);
+ SHR32ItoR(temp1, 23);
+
+ if (is_mem) {
+ MOV32MtoR(temp2, addrt);
+ MOV32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
+ SHR32ItoR(temp2, 23);
+ }
+ else {
+ SSE2_MOVD_XMM_to_R(temp2, regt);
+ SHR32ItoR(temp2, 23);
+ }
+
+ AND32ItoR(temp1, 0xff);
+ AND32ItoR(temp2, 0xff);
+
+ SUB32RtoR(temp1, temp2); //temp1 = exponent difference
+
+ CMP32ItoR(temp1, 25);
+ localptr[0] = JGE8(0);
+ CMP32ItoR(temp1, 0);
+ localptr[1] = JG8(0);
+ localptr[2] = JE8(0);
+ CMP32ItoR(temp1, -25);
+ localptr[3] = JLE8(0);
+
+ NEG32R(temp1);
+ DEC32R(temp1);
+ MOV32ItoR(temp2, 0xffffffff);
+ SHL32CLtoR(temp2);
+ SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
+ if (is_mem) {
+ SSE_PINSRW_R32_to_XMM(regd, temp2, 0);
+ SHR32ItoR(temp2, 16);
+ SSE_PINSRW_R32_to_XMM(regd, temp2, 1);
+ }
+ else {
+ SSE2_MOVD_R_to_XMM(regt, temp2);
+ SSE_MOVSS_XMM_to_XMM(regd, regt);
+ SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
+ }
+ localptr[4] = JMP8(0);
+
+ x86SetJ8(localptr[0]);
+ MOV32ItoR(temp2, 0x80000000);
+ if (is_mem)
+ AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
+ else {
+ SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
+ SSE2_MOVD_R_to_XMM(regd, temp2);
+ SSE_MOVSS_XMM_to_XMM(regt, regd);
+ }
+ SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
+ localptr[5] = JMP8(0);
+
+ x86SetJ8(localptr[1]);
+ DEC32R(temp1);
+ MOV32ItoR(temp2, 0xffffffff);
+ SHL32CLtoR(temp2);
+ if (is_mem)
+ AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
+ else {
+ SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
+ SSE2_MOVD_R_to_XMM(regd, temp2);
+ SSE_MOVSS_XMM_to_XMM(regt, regd);
+ }
+ SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
+ localptr[6] = JMP8(0);
+
+ x86SetJ8(localptr[3]);
+ MOV32ItoR(temp2, 0x80000000);
+ SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
+ if (is_mem) {
+ SSE_PINSRW_R32_to_XMM(regd, temp2, 0);
+ SHR32ItoR(temp2, 16);
+ SSE_PINSRW_R32_to_XMM(regd, temp2, 1);
+ }
+ else {
+ SSE2_MOVD_R_to_XMM(regt, temp2);
+ SSE_MOVSS_XMM_to_XMM(regd, regt);
+ SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
+ }
+ localptr[7] = JMP8(0);
+
+ x86SetJ8(localptr[2]);
+ x86SetJ8(localptr[4]);
+ x86SetJ8(localptr[5]);
+ x86SetJ8(localptr[6]);
+ x86SetJ8(localptr[7]);
+
+ if (is_mem)
+ {
+ SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
+
+ if (is_sub) SSE_SUBSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
+ else SSE_ADDSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
+ }
+ else
+ {
+ SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
+ SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); //regt contains mask
+
+ if (is_sub) SSE_SUBSS_XMM_to_XMM(regd, regt);
+ else SSE_ADDSS_XMM_to_XMM(regd, regt);
+
+ SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]);
+ }
+
+ _freeX86reg(temp2);
+
+ MOV32MtoR(ECX, (uptr)&tempECX);
+}
+
+void SSE_ADDPS_XMM_to_XMM_custom(int info, int regd, int regt) {
+ if (CHECK_VUADDSUBHACK) {
+ VU_ADD_SUB(regd, regt, 0, info);
+ }
+ else SSE_ADDPS_XMM_to_XMM(regd, regt);
+}
+void SSE_SUBPS_XMM_to_XMM_custom(int info, int regd, int regt) {
+ if (CHECK_VUADDSUBHACK) {
+ VU_ADD_SUB(regd, regt, 1, info);
+ }
+ else SSE_SUBPS_XMM_to_XMM(regd, regt);
+}
+void SSE_ADDSS_XMM_to_XMM_custom(int info, int regd, int regt) {
+ if (CHECK_VUADDSUBHACK) {
+ VU_ADD_SUB_SS(regd, regt, 0, 0, info);
+ }
+ else SSE_ADDSS_XMM_to_XMM(regd, regt);
+}
+void SSE_SUBSS_XMM_to_XMM_custom(int info, int regd, int regt) {
+ if (CHECK_VUADDSUBHACK) {
+ VU_ADD_SUB_SS(regd, regt, 1, 0, info);
+ }
+ else SSE_SUBSS_XMM_to_XMM(regd, regt);
+}
+void SSE_ADDSS_M32_to_XMM_custom(int info, int regd, int regt) {
+ if (CHECK_VUADDSUBHACK) {
+ VU_ADD_SUB_SS(regd, regt, 0, 1, info);
+ }
+ else SSE_ADDSS_M32_to_XMM(regd, regt);
+}
+void SSE_SUBSS_M32_to_XMM_custom(int info, int regd, int regt) {
+ if (CHECK_VUADDSUBHACK) {
+ VU_ADD_SUB_SS(regd, regt, 1, 1, info);
+ }
+ else SSE_SUBSS_M32_to_XMM(regd, regt);
+}
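+
+// The _custom wrappers above are drop-in replacements for the plain SSE helpers:
+// callers pass the same (regd, regt) pair plus 'info', and the slower exponent-
+// aligned path is only taken when the CHECK_VUADDSUBHACK gamefix is enabled.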
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// *VU Upper Instructions!*
+//
+// Note: * = Checked for errors by cottonvibes
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// ABS*
+//------------------------------------------------------------------
+void recVUMI_ABS(VURegs *VU, int info)
+{
+ //Console.WriteLn("recVUMI_ABS()");
+ if ( (_Ft_ == 0) || (_X_Y_Z_W == 0) ) return;
+
+ if ((_X_Y_Z_W == 0x8) || (_X_Y_Z_W == 0xf)) {
+ VU_MERGE_REGS(EEREC_T, EEREC_S);
+ SSE_ANDPS_M128_to_XMM(EEREC_T, (uptr)&const_abs_table[ _X_Y_Z_W ][ 0 ] );
+ }
+ else { // Use a temp reg because VU_MERGE_REGS() modifies source reg!
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_abs_table[ _X_Y_Z_W ][ 0 ] );
+ VU_MERGE_REGS(EEREC_T, EEREC_TEMP);
+ }
+}
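+
+// const_abs_table[_X_Y_Z_W] clears the sign bit only in the written lanes: each
+// selected lane gets 0x7fffffff, each untouched lane 0xffffffff. Per-lane scalar
+// equivalent (sketch only, left uncompiled):
+#if 0
+static float vuAbs(float f)
+{
+ u32 bits;
+ memcpy(&bits, &f, sizeof(bits)); // read the raw IEEE-754 bits
+ bits &= 0x7fffffff;              // drop the sign bit, keep exponent/mantissa
+ memcpy(&f, &bits, sizeof(bits));
+ return f;
+}
+#endif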
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// ADD*, ADD_iq*, ADD_xyzw*
+//------------------------------------------------------------------
+static const __aligned16 float s_two[4] = {0,0,0,2};
+void recVUMI_ADD(VURegs *VU, int info)
+{
+ //Console.WriteLn("recVUMI_ADD()");
+ if ( _X_Y_Z_W == 0 ) goto flagUpdate; // Don't do anything and just clear flags
+ if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP);
+
+ if ( _Fs_ == 0 && _Ft_ == 0 ) { // if adding VF00 with VF00, then the result is always 0,0,0,2
+ if ( _X_Y_Z_W != 0xf ) {
+ SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)s_two);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)s_two);
+ }
+ else {
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
+ }
+ if( _X_Y_Z_W == 8 ) { // If only adding x, then we can do a Scalar Add
+ if (EEREC_D == EEREC_S) SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ else if (EEREC_D == EEREC_T) SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ else {
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ }
+ }
+ else if (_X_Y_Z_W != 0xf) { // If xyzw != 1111, then we have to use a temp reg
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else { // All xyzw being modified (xyzw == 1111)
+ if (EEREC_D == EEREC_S) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_T);
+ else if (EEREC_D == EEREC_T) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ else {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_T);
+ }
+ }
+ }
+flagUpdate:
+ recUpdateFlags(VU, EEREC_D, info);
+}
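+
+// The _Fs_ == _Ft_ == 0 shortcut above holds because VF00 is hardwired to
+// (0,0,0,1), so VF00 + VF00 == (0,0,0,2), which is exactly the s_two constant.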
+
+void recVUMI_ADD_iq(VURegs *VU, uptr addr, int info)
+{
+ //Console.WriteLn("recVUMI_ADD_iq()");
+ if ( _X_Y_Z_W == 0 ) goto flagUpdate;
+ if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP);
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ vuFloat3(addr);
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ }
+
+ if ( _XYZW_SS ) {
+ if ( EEREC_D == EEREC_TEMP ) {
+ _vuFlipRegSS(VU, EEREC_S);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_ADDSS_M32_to_XMM(EEREC_D, addr);
+ _vuFlipRegSS(VU, EEREC_S);
+ _vuFlipRegSS(VU, EEREC_D); // have to flip over EEREC_D for computing flags!
+ }
+ else if ( EEREC_D == EEREC_S ) {
+ _vuFlipRegSS(VU, EEREC_D);
+ SSE_ADDSS_M32_to_XMM(EEREC_D, addr);
+ _vuFlipRegSS(VU, EEREC_D);
+ }
+ else {
+ if ( _X ) {
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_ADDSS_M32_to_XMM_custom(info, EEREC_D, addr);
+ }
+ else {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+ SSE_ADDPS_XMM_to_XMM_custom(info, EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ }
+ }
+ else {
+ if ( (_X_Y_Z_W != 0xf) || (EEREC_D == EEREC_S) || (EEREC_D == EEREC_TEMP) ) {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+ }
+
+ if (_X_Y_Z_W != 0xf) {
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ if ( EEREC_D == EEREC_TEMP ) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ else if ( EEREC_D == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ else {
+ SSE_MOVSS_M32_to_XMM(EEREC_D, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_D, EEREC_D, 0x00);
+ SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ }
+ }
+ }
+flagUpdate:
+ recUpdateFlags(VU, EEREC_D, info);
+}
+
+void recVUMI_ADD_xyzw(VURegs *VU, int xyzw, int info)
+{
+ //Console.WriteLn("recVUMI_ADD_xyzw()");
+ if ( _X_Y_Z_W == 0 ) goto flagUpdate;
+ if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP);
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
+ }
+
+ if ( _Ft_ == 0 && xyzw < 3 ) { // just move since adding zero
+ if ( _X_Y_Z_W == 0x8 ) { VU_MERGE_REGS(EEREC_D, EEREC_S); }
+ else if ( _X_Y_Z_W != 0xf ) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ }
+ else if ( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP) ) {
+ if ( xyzw == 0 ) {
+ if ( EEREC_D == EEREC_T ) SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ else {
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ }
+ }
+ else {
+ _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ }
+ else if( _Fs_ == 0 && !_W ) { // just move
+ _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ if ( _X_Y_Z_W != 0xf ) {
+ _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ if( EEREC_D == EEREC_TEMP ) { _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); }
+ else if( EEREC_D == EEREC_S ) { _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); }
+ else { _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); }
+ }
+ }
+flagUpdate:
+ recUpdateFlags(VU, EEREC_D, info);
+}
+
+void recVUMI_ADDi(VURegs *VU, int info) { recVUMI_ADD_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
+void recVUMI_ADDq(VURegs *VU, int info) { recVUMI_ADD_iq(VU, VU_REGQ_ADDR, info); }
+void recVUMI_ADDx(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 0, info); }
+void recVUMI_ADDy(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 1, info); }
+void recVUMI_ADDz(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 2, info); }
+void recVUMI_ADDw(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 3, info); }
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// ADDA*, ADDA_iq*, ADDA_xyzw*
+//------------------------------------------------------------------
+void recVUMI_ADDA(VURegs *VU, int info)
+{
+ //Console.WriteLn("recVUMI_ADDA()");
+ if ( _X_Y_Z_W == 0 ) goto flagUpdate;
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
+ }
+
+ if( _X_Y_Z_W == 8 ) {
+ if (EEREC_ACC == EEREC_S) SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_T); // Can this case happen? (cottonvibes)
+ else if (EEREC_ACC == EEREC_T) SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_S); // Can this case happen?
+ else {
+ SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_T);
+ }
+ }
+ else if (_X_Y_Z_W != 0xf) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+
+ VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
+ }
+ else {
+ if( EEREC_ACC == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_T); // Can this case happen?
+ else if( EEREC_ACC == EEREC_T ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_S); // Can this case happen?
+ else {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_T);
+ }
+ }
+flagUpdate:
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_ADDA_iq(VURegs *VU, uptr addr, int info)
+{
+ //Console.WriteLn("recVUMI_ADDA_iq()");
+ if ( _X_Y_Z_W == 0 ) goto flagUpdate;
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ vuFloat3(addr);
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ }
+
+ if( _XYZW_SS ) {
+ assert( EEREC_ACC != EEREC_TEMP );
+ if( EEREC_ACC == EEREC_S ) {
+ _vuFlipRegSS(VU, EEREC_ACC);
+ SSE_ADDSS_M32_to_XMM(EEREC_ACC, addr);
+ _vuFlipRegSS(VU, EEREC_ACC);
+ }
+ else {
+ if( _X ) {
+ SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ SSE_ADDSS_M32_to_XMM(EEREC_ACC, addr);
+ }
+ else {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
+ }
+ }
+ }
+ else {
+ if( _X_Y_Z_W != 0xf || EEREC_ACC == EEREC_S ) {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+ }
+
+ if (_X_Y_Z_W != 0xf) {
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
+ }
+ else {
+ if( EEREC_ACC == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
+ else {
+ SSE_MOVSS_M32_to_XMM(EEREC_ACC, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_ACC, EEREC_ACC, 0x00);
+ SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ }
+ }
+ }
+flagUpdate:
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_ADDA_xyzw(VURegs *VU, int xyzw, int info)
+{
+ //Console.WriteLn("recVUMI_ADDA_xyzw()");
+ if ( _X_Y_Z_W == 0 ) goto flagUpdate;
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
+ }
+
+ if( _X_Y_Z_W == 8 ) {
+ assert( EEREC_ACC != EEREC_T );
+ if( xyzw == 0 ) {
+ SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_T);
+ }
+ else {
+ _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ if( _Fs_ == 0 ) {
+ SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
+ }
+ else {
+ SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
+ }
+ }
+ }
+ else {
+ if( _X_Y_Z_W != 0xf || EEREC_ACC == EEREC_S )
+ _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+
+ if (_X_Y_Z_W != 0xf) {
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
+ }
+ else {
+ if( EEREC_ACC == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
+ else {
+ _unpackVF_xyzw(EEREC_ACC, EEREC_T, xyzw);
+ SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ }
+ }
+ }
+flagUpdate:
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_ADDAi(VURegs *VU, int info) { recVUMI_ADDA_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
+void recVUMI_ADDAq(VURegs *VU, int info) { recVUMI_ADDA_iq(VU, VU_REGQ_ADDR, info); }
+void recVUMI_ADDAx(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 0, info); }
+void recVUMI_ADDAy(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 1, info); }
+void recVUMI_ADDAz(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 2, info); }
+void recVUMI_ADDAw(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 3, info); }
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// SUB*, SUB_iq*, SUB_xyzw*
+//------------------------------------------------------------------
+void recVUMI_SUB(VURegs *VU, int info)
+{
+ //Console.WriteLn("recVUMI_SUB()");
+ if ( _X_Y_Z_W == 0 ) goto flagUpdate;
+ if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP);
+
+ if( EEREC_S == EEREC_T ) {
+ if (_X_Y_Z_W != 0xf) SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)&SSEmovMask[15-_X_Y_Z_W][0]);
+ else SSE_XORPS_XMM_to_XMM(EEREC_D, EEREC_D);
+ }
+ else if( _X_Y_Z_W == 8 ) {
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
+ }
+ if (EEREC_D == EEREC_S) {
+ if (_Ft_) SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ }
+ else if (EEREC_D == EEREC_T) {
+ if (_Ft_) {
+ SSE_MOVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_SUBSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ else SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ }
+ else {
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ if (_Ft_) SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ }
+ }
+ else {
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
+ }
+ if (_X_Y_Z_W != 0xf) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ if( ( _Ft_ > 0 ) || _W ) SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ if (EEREC_D == EEREC_S) SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_T);
+ else if (EEREC_D == EEREC_T) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+ SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_T);
+ }
+ }
+ }
+flagUpdate:
+ recUpdateFlags(VU, EEREC_D, info);
+}
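+
+// When Fs == Ft the written lanes are exactly zero, so rather than subtracting,
+// the code ANDs EEREC_D with SSEmovMask[15 - _X_Y_Z_W]: all-ones in the unwritten
+// lanes (preserving them) and zero in the written lanes (producing the result).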
+
+void recVUMI_SUB_iq(VURegs *VU, uptr addr, int info)
+{
+ //Console.WriteLn("recVUMI_SUB_iq()");
+ if ( _X_Y_Z_W == 0 ) goto flagUpdate;
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ vuFloat3(addr);
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ }
+ if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP);
+
+ if( _XYZW_SS ) {
+ if( EEREC_D == EEREC_TEMP ) {
+ _vuFlipRegSS(VU, EEREC_S);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_SUBSS_M32_to_XMM(EEREC_D, addr);
+ _vuFlipRegSS(VU, EEREC_S);
+ _vuFlipRegSS(VU, EEREC_D);
+ }
+ else if( EEREC_D == EEREC_S ) {
+ _vuFlipRegSS(VU, EEREC_D);
+ SSE_SUBSS_M32_to_XMM(EEREC_D, addr);
+ _vuFlipRegSS(VU, EEREC_D);
+ }
+ else {
+ if( _X ) {
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_SUBSS_M32_to_XMM(EEREC_D, addr);
+ }
+ else {
+ _vuMoveSS(VU, EEREC_TEMP, EEREC_S);
+ _vuFlipRegSS(VU, EEREC_D);
+ SSE_SUBSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ _vuFlipRegSS(VU, EEREC_D);
+ }
+ }
+ }
+ else {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+
+ if (_X_Y_Z_W != 0xf) {
+ int t1reg = _vuGetTempXMMreg(info);
+
+ if( t1reg >= 0 ) {
+ SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S);
+ SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP);
+
+ VU_MERGE_REGS(EEREC_D, t1reg);
+ _freeXMMreg(t1reg);
+ }
+ else {
+ // negate
+ SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ }
+ else {
+ if( EEREC_D == EEREC_TEMP ) {
+ SSE_XORPS_M128_to_XMM(EEREC_D, (uptr)&const_clip[4]);
+ SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ }
+ }
+flagUpdate:
+ recUpdateFlags(VU, EEREC_D, info);
+}
+
+void recVUMI_SUB_xyzw(VURegs *VU, int xyzw, int info)
+{
+ //Console.WriteLn("recVUMI_SUB_xyzw()");
+ if ( _X_Y_Z_W == 0 ) goto flagUpdate;
+ if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP);
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
+ }
+
+ if ( _X_Y_Z_W == 8 ) {
+ if ( (xyzw == 0) && (_Ft_ == _Fs_) ) {
+ SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)&SSEmovMask[7][0]);
+ }
+ else if ( EEREC_D == EEREC_TEMP ) {
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ if ( (_Ft_ > 0) || (xyzw == 3) ) {
+ _vuFlipRegSS_xyzw(EEREC_T, xyzw);
+ SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ _vuFlipRegSS_xyzw(EEREC_T, xyzw);
+ }
+ }
+ else {
+ if ( (_Ft_ > 0) || (xyzw == 3) ) {
+ _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ else SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ }
+ }
+ else {
+ _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+
+ if (_X_Y_Z_W != 0xf) {
+ int t1reg = _vuGetTempXMMreg(info);
+
+ if( t1reg >= 0 ) {
+ SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S);
+ SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP);
+
+ VU_MERGE_REGS(EEREC_D, t1reg);
+ _freeXMMreg(t1reg);
+ }
+ else {
+ // negate
+ SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ }
+ else {
+ if( EEREC_D == EEREC_TEMP ) {
+ SSE_XORPS_M128_to_XMM(EEREC_D, (uptr)&const_clip[4]);
+ SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ }
+ }
+flagUpdate:
+ recUpdateFlags(VU, EEREC_D, info);
+}
+
+void recVUMI_SUBi(VURegs *VU, int info) { recVUMI_SUB_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
+void recVUMI_SUBq(VURegs *VU, int info) { recVUMI_SUB_iq(VU, VU_REGQ_ADDR, info); }
+void recVUMI_SUBx(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 0, info); }
+void recVUMI_SUBy(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 1, info); }
+void recVUMI_SUBz(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 2, info); }
+void recVUMI_SUBw(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 3, info); }
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// SUBA*, SUBA_iq, SUBA_xyzw
+//------------------------------------------------------------------
+void recVUMI_SUBA(VURegs *VU, int info)
+{
+ //Console.WriteLn("recVUMI_SUBA()");
+ if ( _X_Y_Z_W == 0 ) goto flagUpdate;
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
+ }
+
+ if( EEREC_S == EEREC_T ) {
+ if (_X_Y_Z_W != 0xf) SSE_ANDPS_M128_to_XMM(EEREC_ACC, (uptr)&SSEmovMask[15-_X_Y_Z_W][0]);
+ else SSE_XORPS_XMM_to_XMM(EEREC_ACC, EEREC_ACC);
+ }
+ else if( _X_Y_Z_W == 8 ) {
+ if (EEREC_ACC == EEREC_S) SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_T);
+ else if (EEREC_ACC == EEREC_T) {
+ SSE_MOVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_SUBSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+ SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
+ }
+ else {
+ SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_T);
+ }
+ }
+ else if (_X_Y_Z_W != 0xf) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+
+ VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
+ }
+ else {
+ if( EEREC_ACC == EEREC_S ) SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_T);
+ else if( EEREC_ACC == EEREC_T ) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+ SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_T);
+ }
+ }
+flagUpdate:
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_SUBA_iq(VURegs *VU, uptr addr, int info)
+{
+ //Console.WriteLn ("recVUMI_SUBA_iq");
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ vuFloat3(addr);
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ }
+
+ if( _XYZW_SS ) {
+ if( EEREC_ACC == EEREC_S ) {
+ _vuFlipRegSS(VU, EEREC_ACC);
+ SSE_SUBSS_M32_to_XMM(EEREC_ACC, addr);
+ _vuFlipRegSS(VU, EEREC_ACC);
+ }
+ else {
+ if( _X ) {
+ SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ SSE_SUBSS_M32_to_XMM(EEREC_ACC, addr);
+ }
+ else {
+ _vuMoveSS(VU, EEREC_TEMP, EEREC_S);
+ _vuFlipRegSS(VU, EEREC_ACC);
+ SSE_SUBSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
+ _vuFlipRegSS(VU, EEREC_ACC);
+ }
+ }
+ }
+ else {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+
+ if (_X_Y_Z_W != 0xf) {
+ int t1reg = _vuGetTempXMMreg(info);
+
+ if( t1reg >= 0 ) {
+ SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S);
+ SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP);
+
+ VU_MERGE_REGS(EEREC_ACC, t1reg);
+ _freeXMMreg(t1reg);
+ }
+ else {
+ // negate
+ SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
+ }
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
+ }
+ }
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_SUBA_xyzw(VURegs *VU, int xyzw, int info)
+{
+ //Console.WriteLn ("recVUMI_SUBA_xyzw");
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
+ }
+
+ if( _X_Y_Z_W == 8 ) {
+ if( xyzw == 0 ) {
+ SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_T);
+ }
+ else {
+ _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
+ }
+ }
+ else {
+ _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+
+ if (_X_Y_Z_W != 0xf) {
+ int t1reg = _vuGetTempXMMreg(info);
+
+ if( t1reg >= 0 ) {
+ SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S);
+ SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP);
+
+ VU_MERGE_REGS(EEREC_ACC, t1reg);
+ _freeXMMreg(t1reg);
+ }
+ else {
+ // negate
+ SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP);
+ }
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S);
+ SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP);
+ }
+ }
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_SUBAi(VURegs *VU, int info) { recVUMI_SUBA_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
+void recVUMI_SUBAq(VURegs *VU, int info) { recVUMI_SUBA_iq(VU, VU_REGQ_ADDR, info); }
+void recVUMI_SUBAx(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 0, info); }
+void recVUMI_SUBAy(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 1, info); }
+void recVUMI_SUBAz(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 2, info); }
+void recVUMI_SUBAw(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 3, info); }
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// MUL
+//------------------------------------------------------------------
+void recVUMI_MUL_toD(VURegs *VU, int regd, int info)
+{
+ //Console.WriteLn ("recVUMI_MUL_toD");
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
+ }
+
+ if (_X_Y_Z_W == 1 && (_Ft_ == 0 || _Fs_ == 0) ) { // only W is written, and VF0.w == 1.0f, so the result is just the other operand
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, _Ft_ ? EEREC_T : EEREC_S);
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ else if( _Fd_ == _Fs_ && _Fs_ == _Ft_ && _XYZW_SS ) {
+ _vuFlipRegSS(VU, EEREC_D);
+ SSE_MULSS_XMM_to_XMM(EEREC_D, EEREC_D);
+ _vuFlipRegSS(VU, EEREC_D);
+ }
+ else if( _X_Y_Z_W == 8 ) {
+ if (regd == EEREC_S) SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
+ else if (regd == EEREC_T) SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
+ else {
+ SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
+ SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
+ }
+ }
+ else if (_X_Y_Z_W != 0xf) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ else {
+ if (regd == EEREC_S) SSE_MULPS_XMM_to_XMM(regd, EEREC_T);
+ else if (regd == EEREC_T) SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S);
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_T);
+ }
+ }
+}
+
+void recVUMI_MUL_iq_toD(VURegs *VU, uptr addr, int regd, int info)
+{
+ //Console.WriteLn ("recVUMI_MUL_iq_toD");
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ vuFloat3(addr);
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ }
+
+ if( _XYZW_SS ) {
+ if( regd == EEREC_TEMP ) {
+ _vuFlipRegSS(VU, EEREC_S);
+ SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
+ SSE_MULSS_M32_to_XMM(regd, addr);
+ _vuFlipRegSS(VU, EEREC_S);
+ _vuFlipRegSS(VU, regd);
+ }
+ else if( regd == EEREC_S ) {
+ _vuFlipRegSS(VU, regd);
+ SSE_MULSS_M32_to_XMM(regd, addr);
+ _vuFlipRegSS(VU, regd);
+ }
+ else {
+ if( _X ) {
+ SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
+ SSE_MULSS_M32_to_XMM(regd, addr);
+ }
+ else {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ }
+ }
+ else {
+ if( _X_Y_Z_W != 0xf || regd == EEREC_TEMP || regd == EEREC_S ) {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+ }
+
+ if (_X_Y_Z_W != 0xf) {
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ else {
+ if( regd == EEREC_TEMP ) SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+ else if (regd == EEREC_S) SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP);
+ else {
+ SSE_MOVSS_M32_to_XMM(regd, addr);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x00);
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+ }
+ }
+ }
+}
+
+void recVUMI_MUL_xyzw_toD(VURegs *VU, int xyzw, int regd, int info)
+{
+ //Console.WriteLn ("recVUMI_MUL_xyzw_toD");
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
+ }
+ if (_Fs_) { // This is needed for a lot of games, so always clamp this operand
+ if (CHECK_VU_SIGN_OVERFLOW) vFloats4_useEAX[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless of whether CHECK_VU_OVERFLOW is set
+ else vFloats2[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless of whether CHECK_VU_OVERFLOW is set
+ }
+ if( _Ft_ == 0 ) {
+ if( xyzw < 3 ) {
+ if (_X_Y_Z_W != 0xf) {
+ SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ else SSE_XORPS_XMM_to_XMM(regd, regd);
+ }
+ else {
+ assert(xyzw==3);
+ if (_X_Y_Z_W != 0xf) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ else SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S);
+ }
+ }
+ else if( _X_Y_Z_W == 8 ) {
+ if( regd == EEREC_TEMP ) {
+ _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
+ }
+ else {
+ if( xyzw == 0 ) {
+ if( regd == EEREC_T ) {
+ SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
+ }
+ else {
+ SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
+ SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
+ }
+ }
+ else {
+ _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
+ SSE_MULSS_XMM_to_XMM(regd, EEREC_TEMP);
+ }
+ }
+ }
+ else {
+ if( _X_Y_Z_W != 0xf || regd == EEREC_TEMP || regd == EEREC_S )
+ _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+
+ if (_X_Y_Z_W != 0xf) {
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ else {
+ if( regd == EEREC_TEMP ) SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+ else if (regd == EEREC_S) SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP);
+ else {
+ _unpackVF_xyzw(regd, EEREC_T, xyzw);
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+ }
+ }
+ }
+}
+
+void recVUMI_MUL(VURegs *VU, int info)
+{
+ //Console.WriteLn ("recVUMI_MUL");
+ if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+ recVUMI_MUL_toD(VU, EEREC_D, info);
+ recUpdateFlags(VU, EEREC_D, info);
+}
+
+void recVUMI_MUL_iq(VURegs *VU, int addr, int info)
+{
+ //Console.WriteLn ("recVUMI_MUL_iq");
+ if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+ recVUMI_MUL_iq_toD(VU, addr, EEREC_D, info);
+ recUpdateFlags(VU, EEREC_D, info);
+ // spacefisherman needs overflow checking on MULi.z
+}
+
+void recVUMI_MUL_xyzw(VURegs *VU, int xyzw, int info)
+{
+ //Console.WriteLn ("recVUMI_MUL_xyzw");
+ if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+ recVUMI_MUL_xyzw_toD(VU, xyzw, EEREC_D, info);
+ recUpdateFlags(VU, EEREC_D, info);
+}
+
+void recVUMI_MULi(VURegs *VU, int info) { recVUMI_MUL_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
+void recVUMI_MULq(VURegs *VU, int info) { recVUMI_MUL_iq(VU, VU_REGQ_ADDR, info); }
+void recVUMI_MULx(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 0, info); }
+void recVUMI_MULy(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 1, info); }
+void recVUMI_MULz(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 2, info); }
+void recVUMI_MULw(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 3, info); }
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// MULA
+//------------------------------------------------------------------
+void recVUMI_MULA( VURegs *VU, int info )
+{
+ //Console.WriteLn ("recVUMI_MULA");
+ recVUMI_MUL_toD(VU, EEREC_ACC, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MULA_iq(VURegs *VU, int addr, int info)
+{
+ //Console.WriteLn ("recVUMI_MULA_iq");
+ recVUMI_MUL_iq_toD(VU, addr, EEREC_ACC, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MULA_xyzw(VURegs *VU, int xyzw, int info)
+{
+ //Console.WriteLn ("recVUMI_MULA_xyzw");
+ recVUMI_MUL_xyzw_toD(VU, xyzw, EEREC_ACC, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MULAi(VURegs *VU, int info) { recVUMI_MULA_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
+void recVUMI_MULAq(VURegs *VU, int info) { recVUMI_MULA_iq(VU, VU_REGQ_ADDR, info); }
+void recVUMI_MULAx(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 0, info); }
+void recVUMI_MULAy(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 1, info); }
+void recVUMI_MULAz(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 2, info); }
+void recVUMI_MULAw(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 3, info); }
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// MADD
+//------------------------------------------------------------------
+void recVUMI_MADD_toD(VURegs *VU, int regd, int info)
+{
+ //Console.WriteLn ("recVUMI_MADD_toD");
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
+ vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
+ }
+
+ if( _X_Y_Z_W == 8 ) {
+ if( regd == EEREC_ACC ) {
+ SSE_MOVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); }
+ SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP);
+ }
+ else if (regd == EEREC_T) {
+ SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); }
+ SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ else if (regd == EEREC_S) {
+ SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); }
+ SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ else {
+ SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
+ SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); }
+ SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ }
+ else if (_X_Y_Z_W != 0xf) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
+
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ else {
+ if( regd == EEREC_ACC ) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP);
+ }
+ else if (regd == EEREC_T) {
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ else if (regd == EEREC_S) {
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_T);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S);
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_T);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ }
+}
+
+void recVUMI_MADD_iq_toD(VURegs *VU, uptr addr, int regd, int info)
+{
+ //Console.WriteLn ("recVUMI_MADD_iq_toD");
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ vuFloat3(addr);
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
+ }
+
+ if( _X_Y_Z_W == 8 ) {
+ if( _Fs_ == 0 ) {
+ // do nothing if regd == ACC (ACCx <= ACCx + 0.0 * *addr)
+ if( regd != EEREC_ACC ) {
+ SSE_MOVSS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ return;
+ }
+
+ if( regd == EEREC_ACC ) {
+ assert( EEREC_TEMP < iREGCNT_XMM );
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); }
+ SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP);
+ }
+ else if( regd == EEREC_S ) {
+ SSE_MULSS_M32_to_XMM(regd, addr);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ else {
+ SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
+ SSE_MULSS_M32_to_XMM(regd, addr);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ }
+ else {
+ if( _Fs_ == 0 ) {
+ if( regd == EEREC_ACC ) { // ACCxyz is unchanged, ACCw <= ACCw + *addr
+ if( _W ) { // if _W is zero, do nothing
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); // { *addr, 0, 0, 0 }
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x27); // { 0, 0, 0, *addr }
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP); // { ACCx, ACCy, ACCz, ACCw + *addr }
+ }
+ }
+ else { // DESTxyz <= ACCxyz, DESTw <= ACCw + *addr
+ if( _W ) {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); // { *addr, 0, 0, 0 }
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x27); // { 0, 0, 0, *addr }
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); // { ACCx, ACCy, ACCz, ACCw + *addr }
+ }
+ else SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+
+ return;
+ }
+
+ if( _X_Y_Z_W != 0xf || regd == EEREC_ACC || regd == EEREC_TEMP || regd == EEREC_S ) {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+ }
+
+ if (_X_Y_Z_W != 0xf) {
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
+
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ else {
+ if( regd == EEREC_ACC ) {
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP);
+ }
+ else if( regd == EEREC_S ) {
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ else if( regd == EEREC_TEMP ) {
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ else {
+ SSE_MOVSS_M32_to_XMM(regd, addr);
+ SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x00);
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ }
+ }
+}
+
+void recVUMI_MADD_xyzw_toD(VURegs *VU, int xyzw, int regd, int info)
+{
+ //Console.WriteLn ("recVUMI_MADD_xyzw_toD");
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
+ vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
+ }
+ if (_Fs_) { // This is needed for a lot of games, so always clamp this operand
+ if (CHECK_VU_SIGN_OVERFLOW) vFloats4_useEAX[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless of whether CHECK_VU_OVERFLOW is set
+ else vFloats2[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless of whether CHECK_VU_OVERFLOW is set
+ }
+ if( _Ft_ == 0 ) {
+
+ if( xyzw == 3 ) {
+ // just add
+ if( _X_Y_Z_W == 8 ) {
+ if( regd == EEREC_S ) SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
+ else {
+ SSE_MOVSS_XMM_to_XMM(regd, EEREC_ACC);
+ SSE_ADDSS_XMM_to_XMM(regd, EEREC_S);
+ }
+ }
+ else {
+ if( _X_Y_Z_W != 0xf ) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
+
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ else {
+ if( regd == EEREC_S ) SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC);
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_S);
+ }
+ }
+ }
+ }
+ else {
+ // just move acc to regd
+ if( _X_Y_Z_W != 0xf ) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ else SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+
+ return;
+ }
+
+ if( _X_Y_Z_W == 8 ) {
+ _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+
+ if( regd == EEREC_ACC ) {
+ SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); }
+ SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP);
+ }
+ else if( regd == EEREC_S ) {
+ SSE_MULSS_XMM_to_XMM(regd, EEREC_TEMP);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); }
+ SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ else if( regd == EEREC_TEMP ) {
+ SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); }
+ SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ else {
+ SSE_MOVSS_XMM_to_XMM(regd, EEREC_ACC);
+ SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); }
+ SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP);
+ }
+ }
+ else {
+ if( _X_Y_Z_W != 0xf || regd == EEREC_ACC || regd == EEREC_TEMP || regd == EEREC_S ) {
+ _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ }
+
+ if (_X_Y_Z_W != 0xf) {
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
+
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ else {
+ if( regd == EEREC_ACC ) {
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP);
+ }
+ else if( regd == EEREC_S ) {
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ else if( regd == EEREC_TEMP ) {
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ else {
+ _unpackVF_xyzw(regd, EEREC_T, xyzw);
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+ }
+ }
+ }
+}
+
+void recVUMI_MADD(VURegs *VU, int info)
+{
+ //Console.WriteLn ("recVUMI_MADD");
+ if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+ recVUMI_MADD_toD(VU, EEREC_D, info);
+ recUpdateFlags(VU, EEREC_D, info);
+}
+
+void recVUMI_MADD_iq(VURegs *VU, int addr, int info)
+{
+ //Console.WriteLn ("recVUMI_MADD_iq");
+ if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+ recVUMI_MADD_iq_toD(VU, addr, EEREC_D, info);
+ recUpdateFlags(VU, EEREC_D, info);
+}
+
+void recVUMI_MADD_xyzw(VURegs *VU, int xyzw, int info)
+{
+ //Console.WriteLn ("recVUMI_MADD_xyzw");
+ if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+ recVUMI_MADD_xyzw_toD(VU, xyzw, EEREC_D, info);
+ recUpdateFlags(VU, EEREC_D, info);
+ // Super Bust-A-Move's arrows need overflow clamping
+}
+
+void recVUMI_MADDi(VURegs *VU, int info) { recVUMI_MADD_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
+void recVUMI_MADDq(VURegs *VU, int info) { recVUMI_MADD_iq(VU, VU_REGQ_ADDR, info); }
+void recVUMI_MADDx(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 0, info); }
+void recVUMI_MADDy(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 1, info); }
+void recVUMI_MADDz(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 2, info); }
+void recVUMI_MADDw(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 3, info); }
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// MADDA
+//------------------------------------------------------------------
+void recVUMI_MADDA( VURegs *VU, int info )
+{
+ //Console.WriteLn ("recVUMI_MADDA");
+ recVUMI_MADD_toD(VU, EEREC_ACC, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MADDAi( VURegs *VU , int info)
+{
+ //Console.WriteLn ("recVUMI_MADDAi");
+ recVUMI_MADD_iq_toD( VU, VU_VI_ADDR(REG_I, 1), EEREC_ACC, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MADDAq( VURegs *VU , int info)
+{
+ //Console.WriteLn ("recVUMI_MADDAq ");
+ recVUMI_MADD_iq_toD( VU, VU_REGQ_ADDR, EEREC_ACC, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MADDAx( VURegs *VU , int info)
+{
+ //Console.WriteLn ("recVUMI_MADDAx");
+ recVUMI_MADD_xyzw_toD(VU, 0, EEREC_ACC, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MADDAy( VURegs *VU , int info)
+{
+ //Console.WriteLn ("recVUMI_MADDAy");
+ recVUMI_MADD_xyzw_toD(VU, 1, EEREC_ACC, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MADDAz( VURegs *VU , int info)
+{
+ //Console.WriteLn ("recVUMI_MADDAz");
+ recVUMI_MADD_xyzw_toD(VU, 2, EEREC_ACC, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MADDAw( VURegs *VU , int info)
+{
+ //Console.WriteLn ("recVUMI_MADDAw");
+ recVUMI_MADD_xyzw_toD(VU, 3, EEREC_ACC, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// MSUB
+//------------------------------------------------------------------
+void recVUMI_MSUB_toD(VURegs *VU, int regd, int info)
+{
+ //Console.WriteLn ("recVUMI_MSUB_toD");
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
+ vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
+ }
+
+ if (_X_Y_Z_W != 0xf) {
+ int t1reg = _vuGetTempXMMreg(info);
+
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
+
+ if( t1reg >= 0 ) {
+ SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_ACC);
+ SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP);
+
+ VU_MERGE_REGS(regd, t1reg);
+ _freeXMMreg(t1reg);
+ }
+ else {
+ SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ }
+ else {
+ if( regd == EEREC_S ) {
+ assert( regd != EEREC_ACC );
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_T);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC);
+ SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]);
+ }
+ else if( regd == EEREC_T ) {
+ assert( regd != EEREC_ACC );
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC);
+ SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]);
+ }
+ else if( regd == EEREC_TEMP ) {
+ SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S);
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_T);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC);
+ SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC);
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
+ SSE_SUBPS_XMM_to_XMM(regd, EEREC_TEMP);
+ }
+ }
+}
+
+void recVUMI_MSUB_temp_toD(VURegs *VU, int regd, int info)
+{
+ //Console.WriteLn ("recVUMI_MSUB_temp_toD");
+
+ if (_X_Y_Z_W != 0xf) {
+ int t1reg = _vuGetTempXMMreg(info);
+
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
+
+ if( t1reg >= 0 ) {
+ SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_ACC);
+ SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP);
+
+ if ( regd != EEREC_TEMP ) { VU_MERGE_REGS(regd, t1reg); }
+ else SSE_MOVAPS_XMM_to_XMM(regd, t1reg);
+
+ _freeXMMreg(t1reg);
+ }
+ else {
+ SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
+ VU_MERGE_REGS(regd, EEREC_TEMP);
+ }
+ }
+ else {
+ if( regd == EEREC_ACC ) {
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
+ SSE_SUBPS_XMM_to_XMM(regd, EEREC_TEMP);
+ }
+ else if( regd == EEREC_S ) {
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC);
+ SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]);
+ }
+ else if( regd == EEREC_TEMP ) {
+ SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+ SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC);
+ SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC);
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
+ SSE_SUBPS_XMM_to_XMM(regd, EEREC_TEMP);
+ }
+ }
+}
+
+void recVUMI_MSUB_iq_toD(VURegs *VU, int regd, int addr, int info)
+{
+ //Console.WriteLn ("recVUMI_MSUB_iq_toD");
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
+ vuFloat3(addr);
+ }
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+ recVUMI_MSUB_temp_toD(VU, regd, info);
+}
+
+void recVUMI_MSUB_xyzw_toD(VURegs *VU, int regd, int xyzw, int info)
+{
+ //Console.WriteLn ("recVUMI_MSUB_xyzw_toD");
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, 1 << (3 - xyzw));
+ vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
+ }
+ _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ recVUMI_MSUB_temp_toD(VU, regd, info);
+}
+
+void recVUMI_MSUB(VURegs *VU, int info)
+{
+ //Console.WriteLn ("recVUMI_MSUB");
+ if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+ recVUMI_MSUB_toD(VU, EEREC_D, info);
+ recUpdateFlags(VU, EEREC_D, info);
+}
+
+void recVUMI_MSUB_iq(VURegs *VU, int addr, int info)
+{
+ //Console.WriteLn ("recVUMI_MSUB_iq");
+ if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+ recVUMI_MSUB_iq_toD(VU, EEREC_D, addr, info);
+ recUpdateFlags(VU, EEREC_D, info);
+}
+
+void recVUMI_MSUBi(VURegs *VU, int info) { recVUMI_MSUB_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
+void recVUMI_MSUBq(VURegs *VU, int info) { recVUMI_MSUB_iq(VU, VU_REGQ_ADDR, info); }
+void recVUMI_MSUBx(VURegs *VU, int info)
+{
+ //Console.WriteLn ("recVUMI_MSUBx");
+ if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+ recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 0, info);
+ recUpdateFlags(VU, EEREC_D, info);
+}
+
+void recVUMI_MSUBy(VURegs *VU, int info)
+{
+ //Console.WriteLn ("recVUMI_MSUBy");
+ if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+ recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 1, info);
+ recUpdateFlags(VU, EEREC_D, info);
+}
+
+void recVUMI_MSUBz(VURegs *VU, int info)
+{
+ //Console.WriteLn ("recVUMI_MSUBz");
+ if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+ recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 2, info);
+ recUpdateFlags(VU, EEREC_D, info);
+}
+
+void recVUMI_MSUBw(VURegs *VU, int info)
+{
+ //Console.WriteLn ("recVUMI_MSUBw");
+ if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+ recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 3, info);
+ recUpdateFlags(VU, EEREC_D, info);
+}
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// MSUBA
+//------------------------------------------------------------------
+void recVUMI_MSUBA( VURegs *VU, int info )
+{
+ //Console.WriteLn ("recVUMI_MSUBA");
+ recVUMI_MSUB_toD(VU, EEREC_ACC, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MSUBAi( VURegs *VU, int info )
+{
+ //Console.WriteLn ("recVUMI_MSUBAi ");
+ recVUMI_MSUB_iq_toD( VU, EEREC_ACC, VU_VI_ADDR(REG_I, 1), info );
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MSUBAq( VURegs *VU, int info )
+{
+ //Console.WriteLn ("recVUMI_MSUBAq");
+ recVUMI_MSUB_iq_toD( VU, EEREC_ACC, VU_REGQ_ADDR, info );
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MSUBAx( VURegs *VU, int info )
+{
+ //Console.WriteLn ("recVUMI_MSUBAx");
+ recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 0, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MSUBAy( VURegs *VU, int info )
+{
+ //Console.WriteLn ("recVUMI_MSUBAy");
+ recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 1, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MSUBAz( VURegs *VU, int info )
+{
+ //Console.WriteLn ("recVUMI_MSUBAz ");
+ recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 2, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+
+void recVUMI_MSUBAw( VURegs *VU, int info )
+{
+ //Console.WriteLn ("recVUMI_MSUBAw");
+ recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 3, info);
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+//------------------------------------------------------------------
+
+
+static const __aligned16 u32 special_mask[4] = {0xffffffff, 0x80000000, 0xffffffff, 0x80000000};
+static const __aligned16 u32 special_mask2[4] = {0, 0x40000000, 0, 0x40000000};
+
+__aligned16 u32 temp_loc[4];
+__aligned16 u32 temp_loc2[4];
+
+//MAX/MINI are non-arithmetic operations, and as such they implicitly support
+//numbers whose EXP field is 0 ("denormals").
+//
+//Because of this, games sometimes use them for integer moves and (positive!)
+//integer max/min, relying on integers that happen to encode denormals not
+//being flushed to 0.
+//
+//This implementation therefore performs a non-arithmetic comparison that
+//preserves "denormals" and "infs/nans". There may be an easier way, but here
+//MAX/MIN is performed with PMAXPD/PMINPD on fake double-precision numbers.
+//These are constructed by copying the sign of the original number, clearing
+//the upper 32 bits, setting bit 62 (to ensure the double-precision number is
+//"normalized"), and keeping the lower 32 bits identical to the original number.
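+//
+//Worked example (illustrative only): for the float -10.0f, bits 0xC1200000,
+//the constructed 64-bit pattern is
+//    high dword = (0xC1200000 & 0x80000000) | 0x40000000 = 0xC0000000
+//    low  dword =  0xC1200000
+//The sign is copied and all of the original bits land in the low mantissa, so
+//PMINPD/PMAXPD order these patterns exactly as the original 32-bit values
+//(denormals included) would be ordered as floats.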
+
+// min: 1 = MINI, 0 = MAX. mode: 0 = normal (reg/reg), 1 = iq (operand at addr), 2 = broadcast of EEREC_T component xyzw
+void MINMAXlogical(VURegs *VU, int info, int min, int mode, uptr addr = 0, int xyzw = 0)
+{
+ int t1regbool = 0;
+ int t1reg = _vuGetTempXMMreg(info);
+ if (t1reg < 0)
+ {
+ t1regbool = 1;
+ for (t1reg = 0; ( (t1reg == EEREC_D) || (t1reg == EEREC_S) || (mode != 1 && t1reg == EEREC_T)
+ || (t1reg == EEREC_TEMP) ); t1reg++); // Find unused reg (For first temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)temp_loc, t1reg); // Backup t1reg XMM reg
+ }
+ int t2regbool = -1;
+ int t2reg = EEREC_TEMP;
+ if (EEREC_TEMP == EEREC_D || EEREC_TEMP == EEREC_S || (mode != 1 && EEREC_TEMP == EEREC_T))
+ {
+ t2regbool = 0;
+ t2reg = _vuGetTempXMMreg(info);
+ if (t2reg < 0)
+ {
+ t2regbool = 1;
+ for (t2reg = 0; ( (t2reg == EEREC_D) || (t2reg == EEREC_S) || (mode != 1 && t2reg == EEREC_T) ||
+ (t2reg == t1reg) || (t2reg == EEREC_TEMP) ); t2reg++); // Find unused reg (For second temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)temp_loc2, t2reg); // Backup t2reg XMM reg
+ }
+ }
+
+ if (_X || _Y)
+ {
+ SSE2_PSHUFD_XMM_to_XMM(t1reg, EEREC_S, 0x50);
+ SSE2_PAND_M128_to_XMM(t1reg, (uptr)special_mask);
+ SSE2_POR_M128_to_XMM(t1reg, (uptr)special_mask2);
+ if (mode == 0)
+ SSE2_PSHUFD_XMM_to_XMM(t2reg, EEREC_T, 0x50);
+ else if (mode == 1)
+ {
+ SSE2_MOVD_M32_to_XMM(t2reg, addr);
+ SSE2_PSHUFD_XMM_to_XMM(t2reg, t2reg, 0x00);
+ }
+ else if (mode == 2)
+ _unpackVF_xyzw(t2reg, EEREC_T, xyzw);
+ SSE2_PAND_M128_to_XMM(t2reg, (uptr)special_mask);
+ SSE2_POR_M128_to_XMM(t2reg, (uptr)special_mask2);
+ if (min)
+ SSE2_MINPD_XMM_to_XMM(t1reg, t2reg);
+ else
+ SSE2_MAXPD_XMM_to_XMM(t1reg, t2reg);
+ SSE2_PSHUFD_XMM_to_XMM(t1reg, t1reg, 0x88);
+ VU_MERGE_REGS_CUSTOM(EEREC_D, t1reg, 0xc & _X_Y_Z_W);
+ }
+
+ if (_Z || _W)
+ {
+ SSE2_PSHUFD_XMM_to_XMM(t1reg, EEREC_S, 0xfa);
+ SSE2_PAND_M128_to_XMM(t1reg, (uptr)special_mask);
+ SSE2_POR_M128_to_XMM(t1reg, (uptr)special_mask2);
+ if (mode == 0)
+ SSE2_PSHUFD_XMM_to_XMM(t2reg, EEREC_T, 0xfa);
+ else if (mode == 1)
+ {
+ SSE2_MOVD_M32_to_XMM(t2reg, addr);
+ SSE2_PSHUFD_XMM_to_XMM(t2reg, t2reg, 0x00);
+ }
+ else if (mode == 2)
+ _unpackVF_xyzw(t2reg, EEREC_T, xyzw);
+ SSE2_PAND_M128_to_XMM(t2reg, (uptr)special_mask);
+ SSE2_POR_M128_to_XMM(t2reg, (uptr)special_mask2);
+ if (min)
+ SSE2_MINPD_XMM_to_XMM(t1reg, t2reg);
+ else
+ SSE2_MAXPD_XMM_to_XMM(t1reg, t2reg);
+ SSE2_PSHUFD_XMM_to_XMM(t1reg, t1reg, 0x88);
+ VU_MERGE_REGS_CUSTOM(EEREC_D, t1reg, 0x3 & _X_Y_Z_W);
+ }
+
+ if (t1regbool == 0)
+ _freeXMMreg(t1reg);
+ else if (t1regbool == 1)
+ SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)temp_loc); // Restore t1reg XMM reg
+ if (t2regbool == 0)
+ _freeXMMreg(t2reg);
+ else if (t2regbool == 1)
+ SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)temp_loc2); // Restore t2reg XMM reg
+}
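+
+// For reference, the callers below invoke this as, e.g.:
+//   MINMAXlogical(VU, info, 0, 1, addr);    // MAX against the iq operand (MAXi)
+//   MINMAXlogical(VU, info, 1, 2, 0, xyzw); // MINI against a broadcast component (MINIx..MINIw)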
+
+//------------------------------------------------------------------
+// MAX
+//------------------------------------------------------------------
+
+void recVUMI_MAX(VURegs *VU, int info)
+{
+ if ( _Fd_ == 0 ) return;
+ //Console.WriteLn ("recVUMI_MAX");
+
+ if (MINMAXFIX)
+ MINMAXlogical(VU, info, 0, 0);
+ else
+ {
+
+ if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping
+ if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
+
+ if( _X_Y_Z_W == 8 ) {
+ if (EEREC_D == EEREC_S) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ else if (EEREC_D == EEREC_T) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ else {
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ }
+ }
+ else if (_X_Y_Z_W != 0xf) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_MAXPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ if( EEREC_D == EEREC_S ) SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_T);
+ else if( EEREC_D == EEREC_T ) SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ else {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_T);
+ }
+ }
+ }
+}
+
+void recVUMI_MAX_iq(VURegs *VU, uptr addr, int info)
+{
+ if ( _Fd_ == 0 ) return;
+ //Console.WriteLn ("recVUMI_MAX_iq");
+
+ if (MINMAXFIX)
+ MINMAXlogical(VU, info, 0, 1, addr);
+ else
+ {
+ if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping
+ vuFloat3(addr);
+
+ if( _XYZW_SS ) {
+ if( EEREC_D == EEREC_TEMP ) {
+ _vuFlipRegSS(VU, EEREC_S);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_MAXSS_M32_to_XMM(EEREC_D, addr);
+ _vuFlipRegSS(VU, EEREC_S);
+
+ // have to flip over EEREC_D if computing flags!
+ //if( (info & PROCESS_VU_UPDATEFLAGS) )
+ _vuFlipRegSS(VU, EEREC_D);
+ }
+ else if( EEREC_D == EEREC_S ) {
+ _vuFlipRegSS(VU, EEREC_D);
+ SSE_MAXSS_M32_to_XMM(EEREC_D, addr);
+ _vuFlipRegSS(VU, EEREC_D);
+ }
+ else {
+ if( _X ) {
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_MAXSS_M32_to_XMM(EEREC_D, addr);
+ }
+ else {
+ _vuMoveSS(VU, EEREC_TEMP, EEREC_S);
+ _vuFlipRegSS(VU, EEREC_D);
+ SSE_MAXSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ _vuFlipRegSS(VU, EEREC_D);
+ }
+ }
+ }
+ else if (_X_Y_Z_W != 0xf) {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+ SSE_MAXPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ if(EEREC_D == EEREC_S) {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+ SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ SSE_MOVSS_M32_to_XMM(EEREC_D, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_D, EEREC_D, 0x00);
+ SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ }
+ }
+ }
+}
+
+void recVUMI_MAX_xyzw(VURegs *VU, int xyzw, int info)
+{
+ if ( _Fd_ == 0 ) return;
+ //Console.WriteLn ("recVUMI_MAX_xyzw");
+
+ if (_Fs_ == 0 && _Ft_ == 0)
+ {
+ if( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP)) {
+ if( xyzw < 3 ) {
+ SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)s_fones);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ }
+ else if (_X_Y_Z_W != 0xf) {
+ if( xyzw < 3 ) {
+ if( _X_Y_Z_W & 1 ) SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[0]); // w included, so insert the whole reg
+ else SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); // w not included, can zero out
+ }
+ else SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)s_fones);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ //If VF0.w isn't chosen as the constant, then it's going to be MAX( 0, VF0 ), so the result is VF0
+ if( xyzw < 3 ) { SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)&VU->VF[0].UL[0]); }
+ else SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)s_fones);
+ }
+ return;
+ }
+
+ if (MINMAXFIX)
+ MINMAXlogical(VU, info, 0, 2, 0, xyzw);
+ else
+ {
+ if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping
+ if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
+
+ if( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP)) {
+ if( xyzw == 0 ) {
+ if( EEREC_D == EEREC_S ) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ else if( EEREC_D == EEREC_T ) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ else {
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ }
+ }
+ else {
+ _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ }
+ else if (_X_Y_Z_W != 0xf) {
+ _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ SSE_MAXPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ if (EEREC_D == EEREC_S) {
+ _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw);
+ SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ }
+ }
+ }
+}
+
+void recVUMI_MAXi(VURegs *VU, int info) { recVUMI_MAX_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
+void recVUMI_MAXx(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 0, info); }
+void recVUMI_MAXy(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 1, info); }
+void recVUMI_MAXz(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 2, info); }
+void recVUMI_MAXw(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 3, info); }
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// MINI
+//------------------------------------------------------------------
+void recVUMI_MINI(VURegs *VU, int info)
+{
+ if ( _Fd_ == 0 ) return;
+ //Console.WriteLn ("recVUMI_MINI");
+
+ if (MINMAXFIX)
+ MINMAXlogical(VU, info, 1, 0);
+ else
+ {
+
+ if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping
+ if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
+
+ if( _X_Y_Z_W == 8 ) {
+ if (EEREC_D == EEREC_S) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ else if (EEREC_D == EEREC_T) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ else {
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ }
+ }
+ else if (_X_Y_Z_W != 0xf) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_MINPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ if( EEREC_D == EEREC_S ) {
+ //ClampUnordered(EEREC_T, EEREC_TEMP, 0); // needed for GT4 vu0rec
+ SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_T);
+ }
+ else if( EEREC_D == EEREC_T ) {
+ //ClampUnordered(EEREC_S, EEREC_TEMP, 0); // needed for GT4 vu0rec
+ SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_T);
+ }
+ }
+ }
+}
+
+void recVUMI_MINI_iq(VURegs *VU, uptr addr, int info)
+{
+ if ( _Fd_ == 0 ) return;
+ //Console.WriteLn ("recVUMI_MINI_iq");
+
+ if (MINMAXFIX)
+ MINMAXlogical(VU, info, 1, 1, addr);
+ else
+ {
+
+ if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping
+ vuFloat3(addr);
+
+ if( _XYZW_SS ) {
+ if( EEREC_D == EEREC_TEMP ) {
+ _vuFlipRegSS(VU, EEREC_S);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_MINSS_M32_to_XMM(EEREC_D, addr);
+ _vuFlipRegSS(VU, EEREC_S);
+
+ // have to flip over EEREC_D if computing flags!
+ //if( (info & PROCESS_VU_UPDATEFLAGS) )
+ _vuFlipRegSS(VU, EEREC_D);
+ }
+ else if( EEREC_D == EEREC_S ) {
+ _vuFlipRegSS(VU, EEREC_D);
+ SSE_MINSS_M32_to_XMM(EEREC_D, addr);
+ _vuFlipRegSS(VU, EEREC_D);
+ }
+ else {
+ if( _X ) {
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_MINSS_M32_to_XMM(EEREC_D, addr);
+ }
+ else {
+ _vuMoveSS(VU, EEREC_TEMP, EEREC_S);
+ _vuFlipRegSS(VU, EEREC_D);
+ SSE_MINSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ _vuFlipRegSS(VU, EEREC_D);
+ }
+ }
+ }
+ else if (_X_Y_Z_W != 0xf) {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+ SSE_MINPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ if(EEREC_D == EEREC_S) {
+ SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00);
+ SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ SSE_MOVSS_M32_to_XMM(EEREC_D, addr);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_D, EEREC_D, 0x00);
+ SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ }
+ }
+ }
+}
+
+void recVUMI_MINI_xyzw(VURegs *VU, int xyzw, int info)
+{
+ if ( _Fd_ == 0 ) return;
+ //Console.WriteLn ("recVUMI_MINI_xyzw");
+
+ if (_Fs_ == 0 && _Ft_ == 0)
+ {
+ if( _X_Y_Z_W == 0xf )
+ {
+ //If VF0.w is the constant, the result will match VF0; otherwise it's all 0s
+ if(xyzw == 3) SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)&VU->VF[0].UL[0]);
+ else SSE_XORPS_XMM_to_XMM(EEREC_D, EEREC_D);
+ }
+ else
+ {
+ //If VF0.w is the constant, the result will match VF0; otherwise it's all 0s
+ if(xyzw == 3) SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[0]);
+ else SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ return;
+ }
+ if (MINMAXFIX)
+ MINMAXlogical(VU, info, 1, 2, 0, xyzw);
+ else
+ {
+ if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping
+ if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
+
+ if( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP)) {
+ if( xyzw == 0 ) {
+ if( EEREC_D == EEREC_S ) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ else if( EEREC_D == EEREC_T ) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ else {
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T);
+ }
+ }
+ else {
+ _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S);
+ SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ }
+ else if (_X_Y_Z_W != 0xf) {
+ _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ SSE_MINPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ VU_MERGE_REGS(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ if (EEREC_D == EEREC_S) {
+ _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+ SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
+ }
+ else {
+ _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw);
+ SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_S);
+ }
+ }
+ }
+}
+
+void recVUMI_MINIi(VURegs *VU, int info) { recVUMI_MINI_iq(VU, VU_VI_ADDR(REG_I, 1), info); }
+void recVUMI_MINIx(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 0, info); }
+void recVUMI_MINIy(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 1, info); }
+void recVUMI_MINIz(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 2, info); }
+void recVUMI_MINIw(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 3, info); }
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// OPMULA
+//------------------------------------------------------------------
+void recVUMI_OPMULA( VURegs *VU, int info )
+{
+ //Console.WriteLn ("recVUMI_OPMULA");
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, 0xE);
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, 0xE);
+ }
+
+ SSE_MOVAPS_XMM_to_XMM( EEREC_TEMP, EEREC_S );
+ SSE_SHUFPS_XMM_to_XMM( EEREC_T, EEREC_T, 0xD2 ); // EEREC_T = WYXZ
+ SSE_SHUFPS_XMM_to_XMM( EEREC_TEMP, EEREC_TEMP, 0xC9 ); // EEREC_TEMP = WXZY
+ SSE_MULPS_XMM_to_XMM( EEREC_TEMP, EEREC_T );
+
+ VU_MERGE_REGS_CUSTOM(EEREC_ACC, EEREC_TEMP, 14);
+
+ // revert EEREC_T
+ if( EEREC_T != EEREC_ACC )
+ SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0xC9);
+
+ recUpdateFlags(VU, EEREC_ACC, info);
+}
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// OPMSUB
+//------------------------------------------------------------------
+void recVUMI_OPMSUB( VURegs *VU, int info )
+{
+ //Console.WriteLn ("recVUMI_OPMSUB");
+ if (CHECK_VU_EXTRA_OVERFLOW) {
+ if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, 0xE);
+ if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, 0xE);
+ }
+
+ if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0xD2); // EEREC_T = WYXZ
+ SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0xC9); // EEREC_TEMP = WXZY
+ SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
+
+ // negate and add
+ SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]);
+ SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
+ VU_MERGE_REGS_CUSTOM(EEREC_D, EEREC_TEMP, 14);
+
+ // revert EEREC_T
+ if( EEREC_T != EEREC_D ) SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0xC9);
+
+ recUpdateFlags(VU, EEREC_D, info);
+}
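+
+// Usage sketch (the common idiom, not something this function enforces): a VU
+// cross product is OPMULA ACC, vA, vB followed by OPMSUB vD, vB, vA, giving
+//   vD.xyz = (Ay*Bz - Az*By, Az*Bx - Ax*Bz, Ax*By - Ay*Bx) = A x B
+// via the WYXZ/WXZY shuffles used above.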
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// NOP
+//------------------------------------------------------------------
+void recVUMI_NOP( VURegs *VU, int info )
+{
+ //Console.WriteLn ("recVUMI_NOP");
+}
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// recVUMI_FTOI_Saturate() - Saturates result from FTOI Instructions
+//------------------------------------------------------------------
+
+// unused, but leaving here for possible reference..
+//static const __aligned16 int rec_const_0x8000000[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+
+void recVUMI_FTOI_Saturate(int rec_s, int rec_t, int rec_tmp1, int rec_tmp2)
+{
+ //Console.WriteLn ("recVUMI_FTOI_Saturate");
+ //Duplicate the xor'd sign bit to the whole value
+ //FFFF FFFF for positive, 0 for negative
+ SSE_MOVAPS_XMM_to_XMM(rec_tmp1, rec_s);
+ SSE2_PXOR_M128_to_XMM(rec_tmp1, (uptr)&const_clip[4]);
+ SSE2_PSRAD_I8_to_XMM(rec_tmp1, 31);
+
+ //Create mask: all-ones where v == 8000 0000 (an overflowed conversion), 0 elsewhere
+ SSE_MOVAPS_XMM_to_XMM(rec_tmp2, rec_t);
+ SSE2_PCMPEQD_M128_to_XMM(rec_tmp2, (uptr)&const_clip[4]);
+
+ //AND the mask with the fixup values
+ SSE_ANDPS_XMM_to_XMM(rec_tmp1, rec_tmp2);
+
+ //if v==8000 0000 && positive -> 8000 0000 + FFFF FFFF -> 7FFF FFFF
+ //if v==8000 0000 && negative -> 8000 0000 + 0 -> 8000 0000
+ //if v!=8000 0000 -> v+0 (masked from the and)
+
+ //Add the values as needed
+ SSE2_PADDD_XMM_to_XMM(rec_t, rec_tmp1);
+}
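+
+// Reference-only scalar model of the fixup above (a hypothetical helper kept
+// purely as documentation; nothing in the recompiler calls it):
+#if 0
+static s32 ftoi_saturate_model(u32 srcBits, s32 converted)
+{
+	// CVTTPS2DQ returns 0x80000000 ("integer indefinite") on any overflow.
+	u32 fixup = (u32)((s32)(srcBits ^ 0x80000000) >> 31); // all-ones if the source float was positive
+	if ((u32)converted == 0x80000000u)
+		converted = (s32)((u32)converted + fixup); // positive overflow -> 0x7FFFFFFF
+	return converted;
+}
+#endif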
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// FTOI 0/4/12/15
+//------------------------------------------------------------------
+static __aligned16 float FTIO_Temp1[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+static __aligned16 float FTIO_Temp2[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+void recVUMI_FTOI0(VURegs *VU, int info)
+{
+ int t1reg, t2reg; // Temp XMM regs
+
+ if ( _Ft_ == 0 ) return;
+
+ //Console.WriteLn ("recVUMI_FTOI0");
+
+ if (_X_Y_Z_W != 0xf) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
+ SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
+
+ t1reg = _vuGetTempXMMreg(info);
+
+ if( t1reg >= 0 ) { // If there's a temp XMM reg available
+ for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++)
+ ; // Find unused reg (For second temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg
+
+ recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
+
+ SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg
+ _freeXMMreg(t1reg); // Free temp reg
+ }
+ else { // No temp reg available
+ for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++)
+ ; // Find unused reg (For first temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg
+
+ for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++)
+ ; // Find unused reg (For second temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg
+
+ recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
+
+ SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg
+ SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg
+ }
+
+ VU_MERGE_REGS(EEREC_T, EEREC_TEMP);
+ }
+ else {
+ if (EEREC_T != EEREC_S) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_S);
+ vuFloat_useEAX( info, EEREC_T, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
+ SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_T, EEREC_T);
+
+ t1reg = _vuGetTempXMMreg(info);
+
+ if( t1reg >= 0 ) { // If there's a temp XMM reg available
+ recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result
+ _freeXMMreg(t1reg); // Free temp reg
+ }
+ else { // No temp reg available
+ for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++)
+ ; // Find unused reg
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg
+
+ recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result
+
+ SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg
+ }
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
+ SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
+
+ t1reg = _vuGetTempXMMreg(info);
+
+ if( t1reg >= 0 ) { // If there's a temp XMM reg available
+ for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg)); t2reg++)
+ ; // Find unused reg (For second temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg
+
+ recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
+
+ SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg
+ _freeXMMreg(t1reg); // Free temp reg
+ }
+ else { // No temp reg available
+ for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++)
+ ; // Find unused reg (For first temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg
+
+ for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++)
+ ; // Find unused reg (For second temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg
+
+ recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
+
+ SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg
+ SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg
+ }
+
+ SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_TEMP);
+ }
+ }
+}
+
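The comma-less `for` scans above are a register scavenger: they walk the XMM register indices upward until one is found that aliases none of EEREC_S, EEREC_T, EEREC_TEMP (nor t1reg on the second pass), spill it to the FTIO_Temp1/FTIO_Temp2 scratch buffers, and restore it once recVUMI_FTOI_Saturate is done. A minimal standalone sketch of the scan itself, with plain ints standing in for XMM indices (findUnusedReg is a hypothetical helper, not part of the patch):

#include <cstdio>

// Return the lowest register index that appears nowhere in 'live'.
static int findUnusedReg(const int* live, int count)
{
    for (int reg = 0; ; reg++) {
        bool inUse = false;
        for (int i = 0; i < count; i++)
            if (live[i] == reg) { inUse = true; break; }
        if (!inUse)
            return reg;
    }
}

int main()
{
    int live1[] = { 0, 2, 3 };             // say EEREC_S/T/TEMP landed here
    int t1reg   = findUnusedReg(live1, 3); // -> 1
    int live2[] = { 0, 2, 3, t1reg };      // exclude t1reg on the second pass
    int t2reg   = findUnusedReg(live2, 4); // -> 4
    printf("t1reg=%d t2reg=%d\n", t1reg, t2reg);
    return 0;
}

The scan always terminates: at most four registers are excluded, and x86-32 has eight XMM registers, so an index no higher than 4 is always found.
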
+void recVUMI_FTOIX(VURegs *VU, int addr, int info)
+{
+ int t1reg, t2reg; // Temp XMM regs
+
+ if ( _Ft_ == 0 ) return;
+
+ //Console.WriteLn ("recVUMI_FTOIX");
+ if (_X_Y_Z_W != 0xf) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_MULPS_M128_to_XMM(EEREC_TEMP, addr);
+ vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
+ SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
+
+ t1reg = _vuGetTempXMMreg(info);
+
+ if( t1reg >= 0 ) { // If there's a temp XMM reg available
+ for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg)); t2reg++)
+ ; // Find unused reg (For second temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg
+
+ recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
+
+ SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg
+ _freeXMMreg(t1reg); // Free temp reg
+ }
+ else { // No temp reg available
+ for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++)
+ ; // Find unused reg (For first temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg
+
+ for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++)
+ ; // Find unused reg (For second temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg
+
+ recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
+
+ SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg
+ SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg
+ }
+
+ VU_MERGE_REGS(EEREC_T, EEREC_TEMP);
+ }
+ else {
+ if (EEREC_T != EEREC_S) {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_S);
+ SSE_MULPS_M128_to_XMM(EEREC_T, addr);
+ vuFloat_useEAX( info, EEREC_T, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
+ SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_T, EEREC_T);
+
+ t1reg = _vuGetTempXMMreg(info);
+
+ if( t1reg >= 0 ) { // If there's a temp XMM reg available
+ recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result
+ _freeXMMreg(t1reg); // Free temp reg
+ }
+ else { // No temp reg available
+ for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++)
+ ; // Find unused reg
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg
+
+ recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result
+
+ SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg
+ }
+ }
+ else {
+ SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_MULPS_M128_to_XMM(EEREC_TEMP, addr);
+ vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
+ SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
+
+ t1reg = _vuGetTempXMMreg(info);
+
+ if( t1reg >= 0 ) { // If there's a temp XMM reg available
+ for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg)); t2reg++)
+ ; // Find unused reg (For second temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg
+
+ recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
+
+ SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg
+ _freeXMMreg(t1reg); // Free temp reg
+ }
+ else { // No temp reg available
+ for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++)
+ ; // Find unused reg (For first temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg
+
+ for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++)
+ ; // Find unused reg (For second temp reg)
+ SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg
+
+ recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
+
+ SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg
+ SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg
+ }
+
+ SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_TEMP);
+ }
+ }
+}
+
+void recVUMI_FTOI4( VURegs *VU, int info ) { recVUMI_FTOIX(VU, (uptr)&recMult_float_to_int4[0], info); }
+void recVUMI_FTOI12( VURegs *VU, int info ) { recVUMI_FTOIX(VU, (uptr)&recMult_float_to_int12[0], info); }
+void recVUMI_FTOI15( VURegs *VU, int info ) { recVUMI_FTOIX(VU, (uptr)&recMult_float_to_int15[0], info); }
+//------------------------------------------------------------------
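
Per lane, FTOI4/12/15 compute "scale by 2^n, truncate toward zero, saturate". CVTTPS2DQ alone returns the SSE2 integer-indefinite value 0x80000000 for any out-of-range input, which is why the recVUMI_FTOI_Saturate pass exists: it patches positive overflows up to 0x7fffffff. A scalar sketch of the combined result (ftoi is a hypothetical helper; the actual code clamps NaN/Inf to +/-fmax with vuFloat_useEAX before converting):

#include <cstdint>
#include <cmath>
#include <cstdio>

static int32_t ftoi(float f, int fractionalBits)
{
    if (std::isnan(f))
        return INT32_MAX;                           // vuFloat clamps NaN to +fmax first
    double scaled = (double)f * (double)(1 << fractionalBits);
    if (scaled >= 2147483647.0)  return INT32_MAX;  // positive overflow saturates
    if (scaled <= -2147483648.0) return INT32_MIN;  // negative overflow saturates
    return (int32_t)scaled;                         // the cast truncates toward zero
}

int main()
{
    printf("%d\n", ftoi(1.5f, 4));    // 24          (FTOI4: 1.5 * 16)
    printf("%d\n", ftoi(-2.25f, 12)); // -9216       (FTOI12: -2.25 * 4096)
    printf("%d\n", ftoi(1e30f, 15));  // 2147483647  (FTOI15: saturated)
    return 0;
}
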
+
+
+//------------------------------------------------------------------
+// ITOF 0/4/12/15
+//------------------------------------------------------------------
+void recVUMI_ITOF0( VURegs *VU, int info )
+{
+ if ( _Ft_ == 0 ) return;
+
+ //Console.WriteLn ("recVUMI_ITOF0");
+ if (_X_Y_Z_W != 0xf) {
+ SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ vuFloat_useEAX( info, EEREC_TEMP, 15); // Clamp infinities
+ VU_MERGE_REGS(EEREC_T, EEREC_TEMP);
+ xmmregs[EEREC_T].mode |= MODE_WRITE;
+ }
+ else {
+ SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_T, EEREC_S);
+ vuFloat2(EEREC_T, EEREC_TEMP, 15); // Clamp infinities
+ }
+}
+
+void recVUMI_ITOFX(VURegs *VU, int addr, int info)
+{
+ if ( _Ft_ == 0 ) return;
+
+ //Console.WriteLn ("recVUMI_ITOFX");
+ if (_X_Y_Z_W != 0xf) {
+ SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+ SSE_MULPS_M128_to_XMM(EEREC_TEMP, addr);
+ vuFloat_useEAX( info, EEREC_TEMP, 15); // Clamp infinities
+ VU_MERGE_REGS(EEREC_T, EEREC_TEMP);
+ xmmregs[EEREC_T].mode |= MODE_WRITE;
+ }
+ else {
+ SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_T, EEREC_S);
+ SSE_MULPS_M128_to_XMM(EEREC_T, addr);
+ vuFloat2(EEREC_T, EEREC_TEMP, 15); // Clamp infinities
+ }
+}
+
+void recVUMI_ITOF4( VURegs *VU, int info ) { recVUMI_ITOFX(VU, (uptr)&recMult_int_to_float4[0], info); }
+void recVUMI_ITOF12( VURegs *VU, int info ) { recVUMI_ITOFX(VU, (uptr)&recMult_int_to_float12[0], info); }
+void recVUMI_ITOF15( VURegs *VU, int info ) { recVUMI_ITOFX(VU, (uptr)&recMult_int_to_float15[0], info); }
+//------------------------------------------------------------------
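
ITOF is the inverse mapping: CVTDQ2PS converts the raw 32-bit integers to floats, and the recMult_int_to_float4/12/15 tables presumably hold 1/16, 1/4096 and 1/32768 broadcast across four lanes, turning the VU's fixed-point formats back into plain floats. A scalar sketch (itof is a hypothetical helper):

#include <cstdint>
#include <cstdio>

static float itof(int32_t i, int fractionalBits)
{
    return (float)i * (1.0f / (float)(1 << fractionalBits));
}

int main()
{
    printf("%f\n", itof(24, 4));     // 1.500000  (inverse of the FTOI4 example)
    printf("%f\n", itof(-9216, 12)); // -2.250000
    printf("%f\n", itof(1, 15));     // 0.000031  (1/32768, rounded for display)
    return 0;
}
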
+
+
+//------------------------------------------------------------------
+// CLIP
+//------------------------------------------------------------------
+void recVUMI_CLIP(VURegs *VU, int info)
+{
+ int t1reg = EEREC_D;
+ int t2reg = EEREC_ACC;
+ int x86temp1, x86temp2;
+
+ u32 clipaddr = VU_VI_ADDR(REG_CLIP_FLAG, 0);
+ u32 prevclipaddr = VU_VI_ADDR(REG_CLIP_FLAG, 2);
+
+ if( clipaddr == 0 ) { // battle star has a clip right before fcset
+ Console.WriteLn("skipping vu clip");
+ return;
+ }
+
+ //Flush the clip flag before processing, in case of double clip commands (GoW)
+
+ if( prevclipaddr != (uptr)&VU->VI[REG_CLIP_FLAG] ) {
+ MOV32MtoR(EAX, prevclipaddr);
+ MOV32RtoM((uptr)&VU->VI[REG_CLIP_FLAG], EAX);
+ }
+
+ assert( clipaddr != 0 );
+ assert( t1reg != t2reg && t1reg != EEREC_TEMP && t2reg != EEREC_TEMP );
+
+ x86temp1 = ALLOCTEMPX86(MODE_8BITREG);
+ x86temp2 = ALLOCTEMPX86(MODE_8BITREG);
+
+ //if ( (x86temp1 == 0) || (x86temp2 == 0) ) Console.Error("VU CLIP Allocation Error: EAX being allocated!");
+
+ _freeXMMreg(t1reg); // These should have been freed at allocation in eeVURecompileCode(),
+ _freeXMMreg(t2reg); // but if they've been used since then, free them here. (just doing this in case :p (cottonvibes))
+
+ if( _Ft_ == 0 ) {
+ SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)&s_fones[0]); // all 1s
+ SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)&s_fones[4]);
+ }
+ else {
+ _unpackVF_xyzw(EEREC_TEMP, EEREC_T, 3);
+ SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[0]);
+ SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_TEMP);
+ SSE_ORPS_M128_to_XMM(t1reg, (uptr)&const_clip[4]);
+ }
+
+ MOV32MtoR(EAX, prevclipaddr);
+
+ SSE_CMPNLEPS_XMM_to_XMM(t1reg, EEREC_S); //-w, -z, -y, -x
+ SSE_CMPLTPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); //+w, +z, +y, +x
+
+ SHL32ItoR(EAX, 6);
+
+ SSE_MOVAPS_XMM_to_XMM(t2reg, EEREC_TEMP); //t2 = +w, +z, +y, +x
+ SSE_UNPCKLPS_XMM_to_XMM(EEREC_TEMP, t1reg); //EEREC_TEMP = -y,+y,-x,+x
+ SSE_UNPCKHPS_XMM_to_XMM(t2reg, t1reg); //t2reg = -w,+w,-z,+z
+ SSE_MOVMSKPS_XMM_to_R32(x86temp2, EEREC_TEMP); // -y,+y,-x,+x
+ SSE_MOVMSKPS_XMM_to_R32(x86temp1, t2reg); // -w,+w,-z,+z
+
+ AND8ItoR(x86temp1, 0x3);
+ SHL8ItoR(x86temp1, 4);
+ OR8RtoR(EAX, x86temp1);
+ AND8ItoR(x86temp2, 0xf);
+ OR8RtoR(EAX, x86temp2);
+ AND32ItoR(EAX, 0xffffff);
+
+ MOV32RtoM(clipaddr, EAX);
+
+ if ( !(info & (PROCESS_VU_SUPER|PROCESS_VU_COP2)) ) //Instantly update the flag if it's called from elsewhere (unlikely, but ok)
+ MOV32RtoM((uptr)&VU->VI[REG_CLIP_FLAG], EAX);
+
+ _freeX86reg(x86temp1);
+ _freeX86reg(x86temp2);
+}
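
The CLIP block above packs six judge bits per vertex: CMPLTPS/CMPNLEPS test x, y and z against +|w| and -|w|, the UNPCK/MOVMSKPS pair extracts the resulting sign bits, and the previous flags are shifted left by 6 before masking to a 24-bit history (four vertices deep). A scalar sketch of that packing (clipFlags is a hypothetical helper; the bit order follows the unpack sequence above: +x, -x, +y, -y, +z, -z from bit 0 up):

#include <cstdint>
#include <cmath>
#include <cstdio>

static uint32_t clipFlags(uint32_t prev, float x, float y, float z, float w)
{
    float aw = std::fabs(w);                 // ANDPS with const_clip strips the sign
    uint32_t bits = 0;
    if (x >  aw) bits |= 0x01;               // +x
    if (x < -aw) bits |= 0x02;               // -x
    if (y >  aw) bits |= 0x04;               // +y
    if (y < -aw) bits |= 0x08;               // -y
    if (z >  aw) bits |= 0x10;               // +z
    if (z < -aw) bits |= 0x20;               // -z
    return ((prev << 6) | bits) & 0xffffff;  // SHL32ItoR(EAX,6) + the 24-bit AND mask
}

int main()
{
    uint32_t f = clipFlags(0, 2.0f, 0.0f, -3.0f, 1.0f);
    printf("%06x\n", (unsigned)f);           // 000021: +x and -z set
    f = clipFlags(f, 0.0f, 0.0f, 0.0f, 1.0f);
    printf("%06x\n", (unsigned)f);           // 000840: history shifted up by 6
    return 0;
}
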
diff --git a/pcsx2/x86/sVU_zerorec.h b/pcsx2/x86/sVU_zerorec.h
index 9582c7981c..5e252e5d0c 100644
--- a/pcsx2/x86/sVU_zerorec.h
+++ b/pcsx2/x86/sVU_zerorec.h
@@ -1,73 +1,73 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-// Super VU recompiler - author: zerofrog(@gmail.com)
-
-#pragma once
-
-#include "sVU_Micro.h"
-
-//Using assembly code from an external file.
-#ifdef __LINUX__
-extern "C" {
-#endif
-extern void SuperVUExecuteProgram(u32 startpc, int vuindex);
-extern void SuperVUEndProgram();
-extern void svudispfntemp();
-#ifdef __LINUX__
-}
-#endif
-
-extern void SuperVUDestroy(int vuindex);
-extern void SuperVUReset(int vuindex);
-
-// read = 0, will write to reg
-// read = 1, will read from reg
-// read = 2, addr of previously written reg (used for status and clip flags)
-extern u32 SuperVUGetVIAddr(int reg, int read);
-
-// if p == 0, flush q else flush p; if wait is != 0, waits for p/q
-extern void SuperVUFlush(int p, int wait);
-
-
-class recSuperVU0 : public BaseVUmicroCPU
-{
-public:
- recSuperVU0();
-
- const char* GetShortName() const { return "sVU0"; }
- wxString GetLongName() const { return L"SuperVU0 Recompiler"; }
-
- void Allocate();
- void Shutdown() throw();
- void Reset();
- void ExecuteBlock();
- void Clear(u32 Addr, u32 Size);
-};
-
-class recSuperVU1 : public BaseVUmicroCPU
-{
-public:
- recSuperVU1();
-
- const char* GetShortName() const { return "sVU1"; }
- wxString GetLongName() const { return L"SuperVU1 Recompiler"; }
-
- void Allocate();
- void Shutdown() throw();
- void Reset();
- void ExecuteBlock();
- void Clear(u32 Addr, u32 Size);
-};
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// Super VU recompiler - author: zerofrog(@gmail.com)
+
+#pragma once
+
+#include "sVU_Micro.h"
+
+//Using assembly code from an external file.
+#ifdef __LINUX__
+extern "C" {
+#endif
+extern void SuperVUExecuteProgram(u32 startpc, int vuindex);
+extern void SuperVUEndProgram();
+extern void svudispfntemp();
+#ifdef __LINUX__
+}
+#endif
+
+extern void SuperVUDestroy(int vuindex);
+extern void SuperVUReset(int vuindex);
+
+// read = 0, will write to reg
+// read = 1, will read from reg
+// read = 2, addr of previously written reg (used for status and clip flags)
+extern u32 SuperVUGetVIAddr(int reg, int read);
+
+// if p == 0, flush q else flush p; if wait is != 0, waits for p/q
+extern void SuperVUFlush(int p, int wait);
+
+
+class recSuperVU0 : public BaseVUmicroCPU
+{
+public:
+ recSuperVU0();
+
+ const char* GetShortName() const { return "sVU0"; }
+ wxString GetLongName() const { return L"SuperVU0 Recompiler"; }
+
+ void Allocate();
+ void Shutdown() throw();
+ void Reset();
+ void ExecuteBlock();
+ void Clear(u32 Addr, u32 Size);
+};
+
+class recSuperVU1 : public BaseVUmicroCPU
+{
+public:
+ recSuperVU1();
+
+ const char* GetShortName() const { return "sVU1"; }
+ wxString GetLongName() const { return L"SuperVU1 Recompiler"; }
+
+ void Allocate();
+ void Shutdown() throw();
+ void Reset();
+ void ExecuteBlock();
+ void Clear(u32 Addr, u32 Size);
+};