diff --git a/common/include/x86emitter/x86types.h b/common/include/x86emitter/x86types.h index 951b0f21b7..1b0db38900 100644 --- a/common/include/x86emitter/x86types.h +++ b/common/include/x86emitter/x86types.h @@ -157,9 +157,12 @@ template< typename T > void xWrite( T val ); class ModSibBase; extern void xSetPtr( void* ptr ); - extern u8* xGetPtr(); extern void xAlignPtr( uint bytes ); extern void xAdvancePtr( uint bytes ); + extern void xAlignCallTarget(); + + extern u8* xGetPtr(); + extern u8* xGetAlignedCallTarget(); extern JccComparisonType xInvertCond( JccComparisonType src ); diff --git a/common/src/x86emitter/x86emitter.cpp b/common/src/x86emitter/x86emitter.cpp index f540415b49..6de0c6a7e2 100644 --- a/common/src/x86emitter/x86emitter.cpp +++ b/common/src/x86emitter/x86emitter.cpp @@ -395,6 +395,32 @@ __emitinline void xAlignPtr( uint bytes ) x86Ptr = (u8*)( ( (uptr)x86Ptr + bytes - 1) & ~(bytes - 1) ); } +// Performs best-case alignment for the target CPU, for use prior to starting a new +// function. This is not meant to be used prior to jump targets, since it doesn't +// add padding (additionally, speed benefit from jump alignment is minimal, and often +// a loss). +__emitinline void xAlignCallTarget() +{ + // Core2/i7 CPUs prefer unaligned addresses. Checking for SSSE3 is a decent filter. + // (also align in debug modes for disasm convenience) + + if( IsDebugBuild || !x86caps.hasSupplementalStreamingSIMD3Extensions ) + { + // - P4's and earlier prefer 16 byte alignment. + // - AMD Athlons and Phenoms prefer 8 byte alignment, but I don't have an easy + // heuristic for it yet. + // - AMD Phenom IIs are unknown (either prefer 8 byte, or unaligned). + + xAlignPtr( 16 ); + } +} + +__emitinline u8* xGetAlignedCallTarget() +{ + xAlignCallTarget(); + return x86Ptr; +} + __emitinline void xAdvancePtr( uint bytes ) { if( IsDevBuild ) diff --git a/pcsx2/Vif1Dma.cpp b/pcsx2/Vif1Dma.cpp index 8c7df26cb3..873ffd8c33 100644 --- a/pcsx2/Vif1Dma.cpp +++ b/pcsx2/Vif1Dma.cpp @@ -58,6 +58,11 @@ __forceinline void vif1FLUSH() void vif1Init() { +#ifdef newVif1 + extern void initNewVif(int idx); + initNewVif(1); +#endif + SetNewMask(g_vif1Masks, g_vif1HasMask3, 0, 0xffffffff); } @@ -313,19 +318,13 @@ static int __fastcall Vif1TransDirectHL(u32 *data) return ret; } -#ifdef newVif1 - extern void initNewVif(int idx); - extern int nVifUnpack(int idx, u32 *data); - static int testVif = 0; -#endif static int __fastcall Vif1TransUnpack(u32 *data) { #ifdef newVif1 - if (!testVif) { initNewVif(1); testVif = 1; } - //int temp = nVifUnpack(1, data); - //if (temp >= 0) return temp; + extern int nVifUnpack(int idx, u32 *data); return nVifUnpack(1, data); #endif + XMMRegisters::Freeze(); if (vif1.vifpacketsize < vif1.tag.size) diff --git a/pcsx2/VifDma_internal.h b/pcsx2/VifDma_internal.h index 8c02fd7576..7ce8556ebf 100644 --- a/pcsx2/VifDma_internal.h +++ b/pcsx2/VifDma_internal.h @@ -60,7 +60,7 @@ static __forceinline u32 vif_size(u8 num) return (num == 0) ? 
0x1000 : 0x4000; } -#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code) -#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined) +//#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code) +//#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined) //#define newVif0 // Use New Code for Vif0 Unpacks (not implemented) #endif diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp index b1441f63f4..8bfc45bbf0 100644 --- a/pcsx2/x86/ix86-32/iR5900-32.cpp +++ b/pcsx2/x86/ix86-32/iR5900-32.cpp @@ -371,7 +371,7 @@ static DynGenFunc* _DynGen_JITCompile() { pxAssertMsg( DispatcherReg != NULL, "Please compile the DispatcherReg subroutine *before* JITComple. Thanks." ); - u8* retval = xGetPtr(); + u8* retval = xGetAlignedCallTarget(); _DynGen_StackFrameCheck(); xMOV( ecx, &cpuRegs.pc ); @@ -388,7 +388,7 @@ static DynGenFunc* _DynGen_JITCompile() static DynGenFunc* _DynGen_JITCompileInBlock() { - u8* retval = xGetPtr(); + u8* retval = xGetAlignedCallTarget(); xJMP( JITCompile ); return (DynGenFunc*)retval; } @@ -396,7 +396,7 @@ static DynGenFunc* _DynGen_JITCompileInBlock() // called when jumping to variable pc address static DynGenFunc* _DynGen_DispatcherReg() { - u8* retval = xGetPtr(); + u8* retval = xGetPtr(); // fallthrough target, can't align it! _DynGen_StackFrameCheck(); xMOV( eax, &cpuRegs.pc ); @@ -410,7 +410,7 @@ static DynGenFunc* _DynGen_DispatcherReg() static DynGenFunc* _DynGen_EnterRecompiledCode() { - u8* retval = xGetPtr(); + u8* retval = xGetAlignedCallTarget(); // "standard" frame pointer setup for aligned stack: Record the original // esp into ebp, and then align esp. ebp references the original esp base @@ -446,6 +446,8 @@ static DynGenFunc* _DynGen_EnterRecompiledCode() xMOV( &s_store_ebp, ebp ); xJMP( ptr32[&DispatcherReg] ); + + xAlignCallTarget(); imm = (uptr)xGetPtr(); ExitRecompiledCode = (DynGenFunc*)xGetPtr(); @@ -1254,7 +1256,7 @@ void recompileNextInstruction(int delayslot) // _flushCachedRegs(); // g_cpuHasConstReg = 1; - if (!delayslot && x86Ptr - recPtr > 0x1000) + if (!delayslot && (xGetPtr() - recPtr > 0x1000) ) s_nEndBlock = pc; } @@ -1335,9 +1337,8 @@ static void __fastcall recRecompile( const u32 startpc ) recResetEE(); } - x86SetPtr( recPtr ); - x86Align(16); - recPtr = x86Ptr; + xSetPtr( recPtr ); + recPtr = xGetAlignedCallTarget(); s_nBlockFF = false; if (HWADDR(startpc) == 0x81fc0) @@ -1718,14 +1719,14 @@ StartRecomp: } } - pxAssert( x86Ptr < recMem+REC_CACHEMEM ); + pxAssert( xGetPtr() < recMem+REC_CACHEMEM ); pxAssert( recConstBufPtr < recConstBuf + RECCONSTBUF_SIZE ); pxAssert( x86FpuState == 0 ); - pxAssert(x86Ptr - recPtr < 0x10000); - s_pCurBlockEx->x86size = x86Ptr - recPtr; + pxAssert(xGetPtr() - recPtr < 0x10000); + s_pCurBlockEx->x86size = xGetPtr() - recPtr; - recPtr = x86Ptr; + recPtr = xGetPtr(); pxAssert( (g_cpuHasConstReg&g_cpuFlushedConstReg) == g_cpuHasConstReg ); diff --git a/pcsx2/x86/microVU_Analyze.inl b/pcsx2/x86/microVU_Analyze.inl index ddb2133d97..52f76cadd6 100644 --- a/pcsx2/x86/microVU_Analyze.inl +++ b/pcsx2/x86/microVU_Analyze.inl @@ -1,441 +1,441 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2009 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later 
version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - -#pragma once - -//------------------------------------------------------------------ -// Micro VU - Pass 1 Functions -//------------------------------------------------------------------ - -//------------------------------------------------------------------ -// Helper Macros -//------------------------------------------------------------------ - -#define aReg(x) mVUregs.VF[x] -#define bReg(x, y) mVUregsTemp.VFreg[y] = x; mVUregsTemp.VF[y] -#define aMax(x, y) ((x > y) ? x : y) -#define aMin(x, y) ((x < y) ? x : y) - -// Read a VF reg -#define analyzeReg1(xReg, vfRead) { \ - if (xReg) { \ - if (_X) { mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; } \ - if (_Y) { mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; } \ - if (_Z) { mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; } \ - if (_W) { mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; } \ - } \ -} - -// Write to a VF reg -#define analyzeReg2(xReg, vfWrite, isLowOp) { \ - if (xReg) { \ - if (_X) { bReg(xReg, isLowOp).x = 4; vfWrite.reg = xReg; vfWrite.x = 4; } \ - if (_Y) { bReg(xReg, isLowOp).y = 4; vfWrite.reg = xReg; vfWrite.y = 4; } \ - if (_Z) { bReg(xReg, isLowOp).z = 4; vfWrite.reg = xReg; vfWrite.z = 4; } \ - if (_W) { bReg(xReg, isLowOp).w = 4; vfWrite.reg = xReg; vfWrite.w = 4; } \ - } \ -} - -// Read a VF reg (BC opcodes) -#define analyzeReg3(xReg, vfRead) { \ - if (xReg) { \ - if (_bc_x) { mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; } \ - else if (_bc_y) { mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; } \ - else if (_bc_z) { mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; } \ - else { mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; } \ - } \ -} - -// For Clip Opcode -#define analyzeReg4(xReg, vfRead) { \ - if (xReg) { \ - mVUstall = aMax(mVUstall, aReg(xReg).w); \ - vfRead.reg = xReg; vfRead.w = 1; \ - } \ -} - -// Read VF reg (FsF/FtF) -#define analyzeReg5(xReg, fxf, vfRead) { \ - if (xReg) { \ - switch (fxf) { \ - case 0: mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; break; \ - case 1: mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; break; \ - case 2: mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; break; \ - case 3: mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; break; \ - } \ - } \ -} - -// Flips xyzw stalls to yzwx (MR32 Opcode) -#define analyzeReg6(xReg, vfRead) { \ - if (xReg) { \ - if (_X) { mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; } \ - if (_Y) { mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; } \ - if (_Z) { mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; } \ - if (_W) { mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; } \ - } \ -} - -// Reading a VI reg -#define analyzeVIreg1(xReg, viRead) { \ - if (xReg) { \ - mVUstall = aMax(mVUstall, mVUregs.VI[xReg]); \ - viRead.reg = xReg; viRead.used = 1; \ - } \ -} - -// Writing to a VI reg 
-#define analyzeVIreg2(xReg, viWrite, aCycles) { \ - if (xReg) { \ - mVUconstReg[xReg].isValid = 0; \ - mVUregsTemp.VIreg = xReg; \ - mVUregsTemp.VI = aCycles; \ - viWrite.reg = xReg; \ - viWrite.used = aCycles; \ - } \ -} - -#define analyzeQreg(x) { mVUregsTemp.q = x; mVUstall = aMax(mVUstall, mVUregs.q); } -#define analyzePreg(x) { mVUregsTemp.p = x; mVUstall = aMax(mVUstall, ((mVUregs.p) ? (mVUregs.p - 1) : 0)); } -#define analyzeRreg() { mVUregsTemp.r = 1; } -#define analyzeXGkick1() { mVUstall = aMax(mVUstall, mVUregs.xgkick); } -#define analyzeXGkick2(x) { mVUregsTemp.xgkick = x; } -#define setConstReg(x, v) { if (x) { mVUconstReg[x].isValid = 1; mVUconstReg[x].regValue = v; } } - -//------------------------------------------------------------------ -// FMAC1 - Normal FMAC Opcodes -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeFMAC1(mV, int Fd, int Fs, int Ft) { - sFLAG.doFlag = 1; - analyzeReg1(Fs, mVUup.VF_read[0]); - analyzeReg1(Ft, mVUup.VF_read[1]); - analyzeReg2(Fd, mVUup.VF_write, 0); -} - -//------------------------------------------------------------------ -// FMAC2 - ABS/FTOI/ITOF Opcodes -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeFMAC2(mV, int Fs, int Ft) { - analyzeReg1(Fs, mVUup.VF_read[0]); - analyzeReg2(Ft, mVUup.VF_write, 0); -} - -//------------------------------------------------------------------ -// FMAC3 - BC(xyzw) FMAC Opcodes -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeFMAC3(mV, int Fd, int Fs, int Ft) { - sFLAG.doFlag = 1; - analyzeReg1(Fs, mVUup.VF_read[0]); - analyzeReg3(Ft, mVUup.VF_read[1]); - analyzeReg2(Fd, mVUup.VF_write, 0); -} - -//------------------------------------------------------------------ -// FMAC4 - Clip FMAC Opcode -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeFMAC4(mV, int Fs, int Ft) { - cFLAG.doFlag = 1; - analyzeReg1(Fs, mVUup.VF_read[0]); - analyzeReg4(Ft, mVUup.VF_read[1]); -} - -//------------------------------------------------------------------ -// IALU - IALU Opcodes -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeIALU1(mV, int Id, int Is, int It) { - if (!Id) { mVUlow.isNOP = 1; } - analyzeVIreg1(Is, mVUlow.VI_read[0]); - analyzeVIreg1(It, mVUlow.VI_read[1]); - analyzeVIreg2(Id, mVUlow.VI_write, 1); -} - -microVUt(void) mVUanalyzeIALU2(mV, int Is, int It) { - if (!It) { mVUlow.isNOP = 1; } - analyzeVIreg1(Is, mVUlow.VI_read[0]); - analyzeVIreg2(It, mVUlow.VI_write, 1); -} - -microVUt(void) mVUanalyzeIADDI(mV, int Is, int It, s16 imm) { - mVUanalyzeIALU2(mVU, Is, It); - if (!Is) { setConstReg(It, imm); } -} - -//------------------------------------------------------------------ -// MR32 - MR32 Opcode -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeMR32(mV, int Fs, int Ft) { - if (!Ft) { mVUlow.isNOP = 1; } - analyzeReg6(Fs, mVUlow.VF_read[0]); - analyzeReg2(Ft, mVUlow.VF_write, 1); -} - -//------------------------------------------------------------------ -// FDIV - DIV/SQRT/RSQRT Opcodes -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeFDIV(mV, int Fs, int Fsf, int Ft, int Ftf, u8 xCycles) { - mVUprint("microVU: DIV Opcode"); - analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]); - analyzeReg5(Ft, Ftf, mVUlow.VF_read[1]); - analyzeQreg(xCycles); -} - 
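// ------------------------------------------------------------------
// The analyzeReg*/analyzeVIreg* macros above all share one pattern: for
// every field an opcode reads, raise the stall to the cycles still
// pending on that field of the source register; for every field it
// writes, mark the field busy (4 cycles, the FMAC result latency) in
// mVUregsTemp. A minimal standalone sketch of the read side, using
// hypothetical names (the real code works through the macros directly):

struct FieldCycles { int x, y, z, w; }; // cycles until each field is ready

static int computeReadStall(const FieldCycles& vf, bool rX, bool rY,
                            bool rZ, bool rW, int stall)
{
    // Equivalent of analyzeReg1's aMax(mVUstall, aReg(reg).field) chain.
    if (rX && vf.x > stall) stall = vf.x;
    if (rY && vf.y > stall) stall = vf.y;
    if (rZ && vf.z > stall) stall = vf.z;
    if (rW && vf.w > stall) stall = vf.w;
    return stall;
}
// ------------------------------------------------------------------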
-//------------------------------------------------------------------ -// EFU - EFU Opcodes -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeEFU1(mV, int Fs, int Fsf, u8 xCycles) { - mVUprint("microVU: EFU Opcode"); - analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]); - analyzePreg(xCycles); -} - -microVUt(void) mVUanalyzeEFU2(mV, int Fs, u8 xCycles) { - mVUprint("microVU: EFU Opcode"); - analyzeReg1(Fs, mVUlow.VF_read[0]); - analyzePreg(xCycles); -} - -//------------------------------------------------------------------ -// MFP - MFP Opcode -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeMFP(mV, int Ft) { - if (!Ft) { mVUlow.isNOP = 1; } - analyzeReg2(Ft, mVUlow.VF_write, 1); -} - -//------------------------------------------------------------------ -// MOVE - MOVE Opcode -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeMOVE(mV, int Fs, int Ft) { - if (!Ft || (Ft == Fs)) { mVUlow.isNOP = 1; } - analyzeReg1(Fs, mVUlow.VF_read[0]); - analyzeReg2(Ft, mVUlow.VF_write, 1); -} - -//------------------------------------------------------------------ -// LQx - LQ/LQD/LQI Opcodes -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeLQ(mV, int Ft, int Is, bool writeIs) { - analyzeVIreg1(Is, mVUlow.VI_read[0]); - analyzeReg2 (Ft, mVUlow.VF_write, 1); - if (!Ft) { if (writeIs && Is) { mVUlow.noWriteVF = 1; } else { mVUlow.isNOP = 1; } } - if (writeIs) { analyzeVIreg2(Is, mVUlow.VI_write, 1); } -} - -//------------------------------------------------------------------ -// SQx - SQ/SQD/SQI Opcodes -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeSQ(mV, int Fs, int It, bool writeIt) { - analyzeReg1 (Fs, mVUlow.VF_read[0]); - analyzeVIreg1(It, mVUlow.VI_read[0]); - if (writeIt) { analyzeVIreg2(It, mVUlow.VI_write, 1); } -} - -//------------------------------------------------------------------ -// R*** - R Reg Opcodes -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeR1(mV, int Fs, int Fsf) { - analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]); - analyzeRreg(); -} - -microVUt(void) mVUanalyzeR2(mV, int Ft, bool canBeNOP) { - if (!Ft) { if (canBeNOP) { mVUlow.isNOP = 1; } else { mVUlow.noWriteVF = 1; } } - analyzeReg2(Ft, mVUlow.VF_write, 1); - analyzeRreg(); -} - -//------------------------------------------------------------------ -// Sflag - Status Flag Opcodes -//------------------------------------------------------------------ -microVUt(void) flagSet(mV, bool setMacFlag) { - int curPC = iPC; - for (int i = mVUcount, j = 0; i > 0; i--, j++) { - j += mVUstall; - incPC2(-2); - if (sFLAG.doFlag && (j >= 3)) { - if (setMacFlag) { mFLAG.doFlag = 1; } - else { sFLAG.doNonSticky = 1; } - break; - } - } - iPC = curPC; -} - -microVUt(void) mVUanalyzeSflag(mV, int It) { - mVUlow.readFlags = 1; - analyzeVIreg2(It, mVUlow.VI_write, 1); - if (!It) { mVUlow.isNOP = 1; } - else { - mVUsFlagHack = 0; // Don't Optimize Out Status Flags for this block - mVUinfo.swapOps = 1; - flagSet(mVU, 0); - if (mVUcount < 4) { mVUpBlock->pState.needExactMatch |= 1; } - } -} - -microVUt(void) mVUanalyzeFSSET(mV) { - mVUlow.isFSSET = 1; - mVUlow.readFlags = 1; -} - -//------------------------------------------------------------------ -// Mflag - Mac Flag Opcodes -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeMflag(mV, 
int Is, int It) { - mVUlow.readFlags = 1; - analyzeVIreg1(Is, mVUlow.VI_read[0]); - analyzeVIreg2(It, mVUlow.VI_write, 1); - if (!It) { mVUlow.isNOP = 1; } - else { - mVUinfo.swapOps = 1; - flagSet(mVU, 1); - if (mVUcount < 4) { mVUpBlock->pState.needExactMatch |= 2; } - } -} - -//------------------------------------------------------------------ -// Cflag - Clip Flag Opcodes -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeCflag(mV, int It) { - mVUinfo.swapOps = 1; - mVUlow.readFlags = 1; - if (mVUcount < 4) { mVUpBlock->pState.needExactMatch |= 4; } - analyzeVIreg2(It, mVUlow.VI_write, 1); -} - -//------------------------------------------------------------------ -// XGkick -//------------------------------------------------------------------ - -microVUt(void) mVUanalyzeXGkick(mV, int Fs, int xCycles) { - analyzeVIreg1(Fs, mVUlow.VI_read[0]); - analyzeXGkick1(); - analyzeXGkick2(xCycles); - // Note: Technically XGKICK should stall on the next instruction, - // this code stalls on the same instruction. The only case where this - // will be a problem with, is if you have very-specifically placed - // FMxxx or FSxxx opcodes checking flags near this instruction AND - // the XGKICK instruction stalls. No-game should be effected by - // this minor difference. -} - -//------------------------------------------------------------------ -// Branches - Branch Opcodes -//------------------------------------------------------------------ - -microVUt(void) analyzeBranchVI(mV, int xReg, bool &infoVar) { - if (!xReg) return; - int i; - int iEnd = aMin(5, (mVUcount+1)); - int bPC = iPC; - incPC2(-2); - for (i = 0; i < iEnd; i++) { - if ((i == mVUcount) && (i < 5)) { - if (mVUpBlock->pState.viBackUp == xReg) { - infoVar = 1; - i++; - } - break; - } - if ((mVUlow.VI_write.reg == xReg) && mVUlow.VI_write.used) { - if (mVUlow.readFlags || i == 5) break; - if (i == 0) { incPC2(-2); continue; } - if (((mVUlow.VI_read[0].reg == xReg) && (mVUlow.VI_read[0].used)) - || ((mVUlow.VI_read[1].reg == xReg) && (mVUlow.VI_read[1].used))) - { incPC2(-2); continue; } - } - break; - } - if (i) { - if (!infoVar) { - incPC2(2); - mVUlow.backupVI = 1; - infoVar = 1; - } - iPC = bPC; - Console.WriteLn( Color_Green, "microVU%d: Branch VI-Delay (%d) [%04x]", getIndex, i, xPC); - } - else iPC = bPC; -} - -// Branch in Branch Delay-Slots -microVUt(int) mVUbranchCheck(mV) { - if (!mVUcount) return 0; - incPC(-2); - if (mVUlow.branch) { - mVUlow.badBranch = 1; - incPC(2); - mVUlow.evilBranch = 1; - mVUregs.blockType = 2; - Console.Warning("microVU%d Warning: Branch in Branch delay slot! [%04x]", mVU->index, xPC); - return 1; - } - incPC(2); - return 0; -} - -microVUt(void) mVUanalyzeCondBranch1(mV, int Is) { - analyzeVIreg1(Is, mVUlow.VI_read[0]); - if (!mVUstall && !mVUbranchCheck(mVU)) { - analyzeBranchVI(mVU, Is, mVUlow.memReadIs); - } -} - -microVUt(void) mVUanalyzeCondBranch2(mV, int Is, int It) { - analyzeVIreg1(Is, mVUlow.VI_read[0]); - analyzeVIreg1(It, mVUlow.VI_read[1]); - if (!mVUstall && !mVUbranchCheck(mVU)) { - analyzeBranchVI(mVU, Is, mVUlow.memReadIs); - analyzeBranchVI(mVU, It, mVUlow.memReadIt); - } -} - -microVUt(void) mVUanalyzeNormBranch(mV, int It, bool isBAL) { - mVUbranchCheck(mVU); - if (isBAL) { - analyzeVIreg2(It, mVUlow.VI_write, 1); - setConstReg(It, bSaveAddr); - } -} - -microVUt(void) mVUanalyzeJump(mV, int Is, int It, bool isJALR) { - mVUbranchCheck(mVU); - mVUlow.branch = (isJALR) ? 
10 : 9; - if (mVUconstReg[Is].isValid && !CHECK_VU_CONSTHACK) { - mVUlow.constJump.isValid = 1; - mVUlow.constJump.regValue = mVUconstReg[Is].regValue; - //DevCon.Status("microVU%d: Constant JR/JALR Address Optimization", mVU->index); - } - analyzeVIreg1(Is, mVUlow.VI_read[0]); - if (isJALR) { - analyzeVIreg2(It, mVUlow.VI_write, 1); - setConstReg(It, bSaveAddr); - } -} +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . + */ + +#pragma once + +//------------------------------------------------------------------ +// Micro VU - Pass 1 Functions +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// Helper Macros +//------------------------------------------------------------------ + +#define aReg(x) mVUregs.VF[x] +#define bReg(x, y) mVUregsTemp.VFreg[y] = x; mVUregsTemp.VF[y] +#define aMax(x, y) ((x > y) ? x : y) +#define aMin(x, y) ((x < y) ? x : y) + +// Read a VF reg +#define analyzeReg1(xReg, vfRead) { \ + if (xReg) { \ + if (_X) { mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; } \ + if (_Y) { mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; } \ + if (_Z) { mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; } \ + if (_W) { mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; } \ + } \ +} + +// Write to a VF reg +#define analyzeReg2(xReg, vfWrite, isLowOp) { \ + if (xReg) { \ + if (_X) { bReg(xReg, isLowOp).x = 4; vfWrite.reg = xReg; vfWrite.x = 4; } \ + if (_Y) { bReg(xReg, isLowOp).y = 4; vfWrite.reg = xReg; vfWrite.y = 4; } \ + if (_Z) { bReg(xReg, isLowOp).z = 4; vfWrite.reg = xReg; vfWrite.z = 4; } \ + if (_W) { bReg(xReg, isLowOp).w = 4; vfWrite.reg = xReg; vfWrite.w = 4; } \ + } \ +} + +// Read a VF reg (BC opcodes) +#define analyzeReg3(xReg, vfRead) { \ + if (xReg) { \ + if (_bc_x) { mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; } \ + else if (_bc_y) { mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; } \ + else if (_bc_z) { mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; } \ + else { mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; } \ + } \ +} + +// For Clip Opcode +#define analyzeReg4(xReg, vfRead) { \ + if (xReg) { \ + mVUstall = aMax(mVUstall, aReg(xReg).w); \ + vfRead.reg = xReg; vfRead.w = 1; \ + } \ +} + +// Read VF reg (FsF/FtF) +#define analyzeReg5(xReg, fxf, vfRead) { \ + if (xReg) { \ + switch (fxf) { \ + case 0: mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; break; \ + case 1: mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; break; \ + case 2: mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; break; \ + case 3: mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; 
break; \ + } \ + } \ +} + +// Flips xyzw stalls to yzwx (MR32 Opcode) +#define analyzeReg6(xReg, vfRead) { \ + if (xReg) { \ + if (_X) { mVUstall = aMax(mVUstall, aReg(xReg).y); vfRead.reg = xReg; vfRead.y = 1; } \ + if (_Y) { mVUstall = aMax(mVUstall, aReg(xReg).z); vfRead.reg = xReg; vfRead.z = 1; } \ + if (_Z) { mVUstall = aMax(mVUstall, aReg(xReg).w); vfRead.reg = xReg; vfRead.w = 1; } \ + if (_W) { mVUstall = aMax(mVUstall, aReg(xReg).x); vfRead.reg = xReg; vfRead.x = 1; } \ + } \ +} + +// Reading a VI reg +#define analyzeVIreg1(xReg, viRead) { \ + if (xReg) { \ + mVUstall = aMax(mVUstall, mVUregs.VI[xReg]); \ + viRead.reg = xReg; viRead.used = 1; \ + } \ +} + +// Writing to a VI reg +#define analyzeVIreg2(xReg, viWrite, aCycles) { \ + if (xReg) { \ + mVUconstReg[xReg].isValid = 0; \ + mVUregsTemp.VIreg = xReg; \ + mVUregsTemp.VI = aCycles; \ + viWrite.reg = xReg; \ + viWrite.used = aCycles; \ + } \ +} + +#define analyzeQreg(x) { mVUregsTemp.q = x; mVUstall = aMax(mVUstall, mVUregs.q); } +#define analyzePreg(x) { mVUregsTemp.p = x; mVUstall = aMax(mVUstall, ((mVUregs.p) ? (mVUregs.p - 1) : 0)); } +#define analyzeRreg() { mVUregsTemp.r = 1; } +#define analyzeXGkick1() { mVUstall = aMax(mVUstall, mVUregs.xgkick); } +#define analyzeXGkick2(x) { mVUregsTemp.xgkick = x; } +#define setConstReg(x, v) { if (x) { mVUconstReg[x].isValid = 1; mVUconstReg[x].regValue = v; } } + +//------------------------------------------------------------------ +// FMAC1 - Normal FMAC Opcodes +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeFMAC1(mV, int Fd, int Fs, int Ft) { + sFLAG.doFlag = 1; + analyzeReg1(Fs, mVUup.VF_read[0]); + analyzeReg1(Ft, mVUup.VF_read[1]); + analyzeReg2(Fd, mVUup.VF_write, 0); +} + +//------------------------------------------------------------------ +// FMAC2 - ABS/FTOI/ITOF Opcodes +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeFMAC2(mV, int Fs, int Ft) { + analyzeReg1(Fs, mVUup.VF_read[0]); + analyzeReg2(Ft, mVUup.VF_write, 0); +} + +//------------------------------------------------------------------ +// FMAC3 - BC(xyzw) FMAC Opcodes +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeFMAC3(mV, int Fd, int Fs, int Ft) { + sFLAG.doFlag = 1; + analyzeReg1(Fs, mVUup.VF_read[0]); + analyzeReg3(Ft, mVUup.VF_read[1]); + analyzeReg2(Fd, mVUup.VF_write, 0); +} + +//------------------------------------------------------------------ +// FMAC4 - Clip FMAC Opcode +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeFMAC4(mV, int Fs, int Ft) { + cFLAG.doFlag = 1; + analyzeReg1(Fs, mVUup.VF_read[0]); + analyzeReg4(Ft, mVUup.VF_read[1]); +} + +//------------------------------------------------------------------ +// IALU - IALU Opcodes +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeIALU1(mV, int Id, int Is, int It) { + if (!Id) { mVUlow.isNOP = 1; } + analyzeVIreg1(Is, mVUlow.VI_read[0]); + analyzeVIreg1(It, mVUlow.VI_read[1]); + analyzeVIreg2(Id, mVUlow.VI_write, 1); +} + +microVUt(void) mVUanalyzeIALU2(mV, int Is, int It) { + if (!It) { mVUlow.isNOP = 1; } + analyzeVIreg1(Is, mVUlow.VI_read[0]); + analyzeVIreg2(It, mVUlow.VI_write, 1); +} + +microVUt(void) mVUanalyzeIADDI(mV, int Is, int It, s16 imm) { + mVUanalyzeIALU2(mVU, Is, It); + if (!Is) { setConstReg(It, imm); } +} + +//------------------------------------------------------------------ +// MR32 - 
MR32 Opcode +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeMR32(mV, int Fs, int Ft) { + if (!Ft) { mVUlow.isNOP = 1; } + analyzeReg6(Fs, mVUlow.VF_read[0]); + analyzeReg2(Ft, mVUlow.VF_write, 1); +} + +//------------------------------------------------------------------ +// FDIV - DIV/SQRT/RSQRT Opcodes +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeFDIV(mV, int Fs, int Fsf, int Ft, int Ftf, u8 xCycles) { + mVUprint("microVU: DIV Opcode"); + analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]); + analyzeReg5(Ft, Ftf, mVUlow.VF_read[1]); + analyzeQreg(xCycles); +} + +//------------------------------------------------------------------ +// EFU - EFU Opcodes +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeEFU1(mV, int Fs, int Fsf, u8 xCycles) { + mVUprint("microVU: EFU Opcode"); + analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]); + analyzePreg(xCycles); +} + +microVUt(void) mVUanalyzeEFU2(mV, int Fs, u8 xCycles) { + mVUprint("microVU: EFU Opcode"); + analyzeReg1(Fs, mVUlow.VF_read[0]); + analyzePreg(xCycles); +} + +//------------------------------------------------------------------ +// MFP - MFP Opcode +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeMFP(mV, int Ft) { + if (!Ft) { mVUlow.isNOP = 1; } + analyzeReg2(Ft, mVUlow.VF_write, 1); +} + +//------------------------------------------------------------------ +// MOVE - MOVE Opcode +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeMOVE(mV, int Fs, int Ft) { + if (!Ft || (Ft == Fs)) { mVUlow.isNOP = 1; } + analyzeReg1(Fs, mVUlow.VF_read[0]); + analyzeReg2(Ft, mVUlow.VF_write, 1); +} + +//------------------------------------------------------------------ +// LQx - LQ/LQD/LQI Opcodes +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeLQ(mV, int Ft, int Is, bool writeIs) { + analyzeVIreg1(Is, mVUlow.VI_read[0]); + analyzeReg2 (Ft, mVUlow.VF_write, 1); + if (!Ft) { if (writeIs && Is) { mVUlow.noWriteVF = 1; } else { mVUlow.isNOP = 1; } } + if (writeIs) { analyzeVIreg2(Is, mVUlow.VI_write, 1); } +} + +//------------------------------------------------------------------ +// SQx - SQ/SQD/SQI Opcodes +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeSQ(mV, int Fs, int It, bool writeIt) { + analyzeReg1 (Fs, mVUlow.VF_read[0]); + analyzeVIreg1(It, mVUlow.VI_read[0]); + if (writeIt) { analyzeVIreg2(It, mVUlow.VI_write, 1); } +} + +//------------------------------------------------------------------ +// R*** - R Reg Opcodes +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeR1(mV, int Fs, int Fsf) { + analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]); + analyzeRreg(); +} + +microVUt(void) mVUanalyzeR2(mV, int Ft, bool canBeNOP) { + if (!Ft) { if (canBeNOP) { mVUlow.isNOP = 1; } else { mVUlow.noWriteVF = 1; } } + analyzeReg2(Ft, mVUlow.VF_write, 1); + analyzeRreg(); +} + +//------------------------------------------------------------------ +// Sflag - Status Flag Opcodes +//------------------------------------------------------------------ +microVUt(void) flagSet(mV, bool setMacFlag) { + int curPC = iPC; + for (int i = mVUcount, j = 0; i > 0; i--, j++) { + j += mVUstall; + incPC2(-2); + if (sFLAG.doFlag && (j >= 3)) { + if (setMacFlag) { mFLAG.doFlag = 1; } + else { sFLAG.doNonSticky = 1; } + break; 
+ } + } + iPC = curPC; +} + +microVUt(void) mVUanalyzeSflag(mV, int It) { + mVUlow.readFlags = 1; + analyzeVIreg2(It, mVUlow.VI_write, 1); + if (!It) { mVUlow.isNOP = 1; } + else { + mVUsFlagHack = 0; // Don't Optimize Out Status Flags for this block + mVUinfo.swapOps = 1; + flagSet(mVU, 0); + if (mVUcount < 4) { mVUpBlock->pState.needExactMatch |= 1; } + } +} + +microVUt(void) mVUanalyzeFSSET(mV) { + mVUlow.isFSSET = 1; + mVUlow.readFlags = 1; +} + +//------------------------------------------------------------------ +// Mflag - Mac Flag Opcodes +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeMflag(mV, int Is, int It) { + mVUlow.readFlags = 1; + analyzeVIreg1(Is, mVUlow.VI_read[0]); + analyzeVIreg2(It, mVUlow.VI_write, 1); + if (!It) { mVUlow.isNOP = 1; } + else { + mVUinfo.swapOps = 1; + flagSet(mVU, 1); + if (mVUcount < 4) { mVUpBlock->pState.needExactMatch |= 2; } + } +} + +//------------------------------------------------------------------ +// Cflag - Clip Flag Opcodes +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeCflag(mV, int It) { + mVUinfo.swapOps = 1; + mVUlow.readFlags = 1; + if (mVUcount < 4) { mVUpBlock->pState.needExactMatch |= 4; } + analyzeVIreg2(It, mVUlow.VI_write, 1); +} + +//------------------------------------------------------------------ +// XGkick +//------------------------------------------------------------------ + +microVUt(void) mVUanalyzeXGkick(mV, int Fs, int xCycles) { + analyzeVIreg1(Fs, mVUlow.VI_read[0]); + analyzeXGkick1(); + analyzeXGkick2(xCycles); + // Note: Technically XGKICK should stall on the next instruction, + // but this code stalls on the same instruction. The only case where this + // will be a problem is if you have very specifically placed + // FMxxx or FSxxx opcodes checking flags near this instruction AND + // the XGKICK instruction stalls. No game should be affected by + // this minor difference. +} + +//------------------------------------------------------------------ +// Branches - Branch Opcodes +//------------------------------------------------------------------ + +microVUt(void) analyzeBranchVI(mV, int xReg, bool &infoVar) { + if (!xReg) return; + int i; + int iEnd = aMin(5, (mVUcount+1)); + int bPC = iPC; + incPC2(-2); + for (i = 0; i < iEnd; i++) { + if ((i == mVUcount) && (i < 5)) { + if (mVUpBlock->pState.viBackUp == xReg) { + infoVar = 1; + i++; + } + break; + } + if ((mVUlow.VI_write.reg == xReg) && mVUlow.VI_write.used) { + if (mVUlow.readFlags || i == 5) break; + if (i == 0) { incPC2(-2); continue; } + if (((mVUlow.VI_read[0].reg == xReg) && (mVUlow.VI_read[0].used)) + || ((mVUlow.VI_read[1].reg == xReg) && (mVUlow.VI_read[1].used))) + { incPC2(-2); continue; } + } + break; + } + if (i) { + if (!infoVar) { + incPC2(2); + mVUlow.backupVI = 1; + infoVar = 1; + } + iPC = bPC; + Console.WriteLn( Color_Green, "microVU%d: Branch VI-Delay (%d) [%04x]", getIndex, i, xPC); + } + else iPC = bPC; +} + +// Branch in Branch Delay-Slots +microVUt(int) mVUbranchCheck(mV) { + if (!mVUcount) return 0; + incPC(-2); + if (mVUlow.branch) { + mVUlow.badBranch = 1; + incPC(2); + mVUlow.evilBranch = 1; + mVUregs.blockType = 2; + Console.Warning("microVU%d Warning: Branch in Branch delay slot!
[%04x]", mVU->index, xPC); + return 1; + } + incPC(2); + return 0; +} + +microVUt(void) mVUanalyzeCondBranch1(mV, int Is) { + analyzeVIreg1(Is, mVUlow.VI_read[0]); + if (!mVUstall && !mVUbranchCheck(mVU)) { + analyzeBranchVI(mVU, Is, mVUlow.memReadIs); + } +} + +microVUt(void) mVUanalyzeCondBranch2(mV, int Is, int It) { + analyzeVIreg1(Is, mVUlow.VI_read[0]); + analyzeVIreg1(It, mVUlow.VI_read[1]); + if (!mVUstall && !mVUbranchCheck(mVU)) { + analyzeBranchVI(mVU, Is, mVUlow.memReadIs); + analyzeBranchVI(mVU, It, mVUlow.memReadIt); + } +} + +microVUt(void) mVUanalyzeNormBranch(mV, int It, bool isBAL) { + mVUbranchCheck(mVU); + if (isBAL) { + analyzeVIreg2(It, mVUlow.VI_write, 1); + setConstReg(It, bSaveAddr); + } +} + +microVUt(void) mVUanalyzeJump(mV, int Is, int It, bool isJALR) { + mVUbranchCheck(mVU); + mVUlow.branch = (isJALR) ? 10 : 9; + if (mVUconstReg[Is].isValid && !CHECK_VU_CONSTHACK) { + mVUlow.constJump.isValid = 1; + mVUlow.constJump.regValue = mVUconstReg[Is].regValue; + //DevCon.Status("microVU%d: Constant JR/JALR Address Optimization", mVU->index); + } + analyzeVIreg1(Is, mVUlow.VI_read[0]); + if (isJALR) { + analyzeVIreg2(It, mVUlow.VI_write, 1); + setConstReg(It, bSaveAddr); + } +} diff --git a/pcsx2/x86/microVU_Clamp.inl b/pcsx2/x86/microVU_Clamp.inl index e0b2b6fa67..61c5139e4d 100644 --- a/pcsx2/x86/microVU_Clamp.inl +++ b/pcsx2/x86/microVU_Clamp.inl @@ -1,106 +1,106 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2009 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - -#pragma once - -//------------------------------------------------------------------ -// Micro VU - Clamp Functions -//------------------------------------------------------------------ - -const __aligned16 u32 sse4_minvals[2][4] = { - { 0xff7fffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000 - { 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //1111 -}; -const __aligned16 u32 sse4_maxvals[2][4] = { - { 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1000 - { 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //1111 -}; - -// Used for Result Clamping -// Note: This function will not preserve NaN values' sign. -// The theory behind this is that when we compute a result, and we've -// gotten a NaN value, then something went wrong; and the NaN's sign -// is not to be trusted. Games like positive values better usually, -// and its faster... so just always make NaNs into positive infinity. 
-void mVUclamp1(int reg, int regT1, int xyzw, bool bClampE = 0) { - if ((!clampE && CHECK_VU_OVERFLOW) || (clampE && bClampE)) { - switch (xyzw) { - case 1: case 2: case 4: case 8: - SSE_MINSS_M32_to_XMM(reg, (uptr)mVUglob.maxvals); - SSE_MAXSS_M32_to_XMM(reg, (uptr)mVUglob.minvals); - break; - default: - SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals); - SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals); - break; - } - } -} - -// Used for Operand Clamping -// Note 1: If 'preserve sign' mode is on, it will preserve the sign of NaN values. -// Note 2: Using regalloc here seems to contaminate some regs in certain games. -// Must be some specific case I've overlooked (or I used regalloc improperly on an opcode) -// so we just use a temporary mem location for our backup for now... (non-sse4 version only) -void mVUclamp2(microVU* mVU, int reg, int regT1, int xyzw, bool bClampE = 0) { - if ((!clampE && CHECK_VU_SIGN_OVERFLOW) || (clampE && bClampE && CHECK_VU_SIGN_OVERFLOW)) { - if (x86caps.hasStreamingSIMD4Extensions) { - int i = (xyzw==1||xyzw==2||xyzw==4||xyzw==8) ? 0: 1; - SSE4_PMINSD_M128_to_XMM(reg, (uptr)&sse4_maxvals[i][0]); - SSE4_PMINUD_M128_to_XMM(reg, (uptr)&sse4_minvals[i][0]); - return; - } - int regT1b = 0; - if (regT1 < 0) { - regT1b = 1; regT1=(reg+1)%8; - SSE_MOVAPS_XMM_to_M128((uptr)mVU->xmmCTemp, regT1); - //regT1 = mVU->regAlloc->allocReg(); - } - switch (xyzw) { - case 1: case 2: case 4: case 8: - SSE_MOVAPS_XMM_to_XMM(regT1, reg); - SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit); - SSE_MINSS_M32_to_XMM (reg, (uptr)mVUglob.maxvals); - SSE_MAXSS_M32_to_XMM (reg, (uptr)mVUglob.minvals); - SSE_ORPS_XMM_to_XMM (reg, regT1); - break; - default: - SSE_MOVAPS_XMM_to_XMM(regT1, reg); - SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit); - SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals); - SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals); - SSE_ORPS_XMM_to_XMM (reg, regT1); - break; - } - //if (regT1b) mVU->regAlloc->clearNeeded(regT1); - if (regT1b) SSE_MOVAPS_M128_to_XMM(regT1, (uptr)mVU->xmmCTemp); - } - else mVUclamp1(reg, regT1, xyzw, bClampE); -} - -// Used for operand clamping on every SSE instruction (add/sub/mul/div) -void mVUclamp3(microVU* mVU, int reg, int regT1, int xyzw) { - if (clampE) mVUclamp2(mVU, reg, regT1, xyzw, 1); -} - -// Used for result clamping on every SSE instruction (add/sub/mul/div) -// Note: Disabled in "preserve sign" mode because in certain cases it -// makes too much code-gen, and you get jump8-overflows in certain -// emulated opcodes (causing crashes). Since we're clamping the operands -// with mVUclamp3, we should almost never be getting a NaN result, -// but this clamp is just a precaution just-in-case. -void mVUclamp4(int reg, int regT1, int xyzw) { - if (clampE && !CHECK_VU_SIGN_OVERFLOW) mVUclamp1(reg, regT1, xyzw, 1); -} +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . 
+ */ + +#pragma once + +//------------------------------------------------------------------ +// Micro VU - Clamp Functions +//------------------------------------------------------------------ + +const __aligned16 u32 sse4_minvals[2][4] = { + { 0xff7fffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000 + { 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //1111 +}; +const __aligned16 u32 sse4_maxvals[2][4] = { + { 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1000 + { 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //1111 +}; + +// Used for Result Clamping +// Note: This function will not preserve NaN values' sign. +// The theory behind this is that when we compute a result, and we've +// gotten a NaN value, then something went wrong; and the NaN's sign +// is not to be trusted. Games like positive values better usually, +// and its faster... so just always make NaNs into positive infinity. +void mVUclamp1(int reg, int regT1, int xyzw, bool bClampE = 0) { + if ((!clampE && CHECK_VU_OVERFLOW) || (clampE && bClampE)) { + switch (xyzw) { + case 1: case 2: case 4: case 8: + SSE_MINSS_M32_to_XMM(reg, (uptr)mVUglob.maxvals); + SSE_MAXSS_M32_to_XMM(reg, (uptr)mVUglob.minvals); + break; + default: + SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals); + SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals); + break; + } + } +} + +// Used for Operand Clamping +// Note 1: If 'preserve sign' mode is on, it will preserve the sign of NaN values. +// Note 2: Using regalloc here seems to contaminate some regs in certain games. +// Must be some specific case I've overlooked (or I used regalloc improperly on an opcode) +// so we just use a temporary mem location for our backup for now... (non-sse4 version only) +void mVUclamp2(microVU* mVU, int reg, int regT1, int xyzw, bool bClampE = 0) { + if ((!clampE && CHECK_VU_SIGN_OVERFLOW) || (clampE && bClampE && CHECK_VU_SIGN_OVERFLOW)) { + if (x86caps.hasStreamingSIMD4Extensions) { + int i = (xyzw==1||xyzw==2||xyzw==4||xyzw==8) ? 0: 1; + SSE4_PMINSD_M128_to_XMM(reg, (uptr)&sse4_maxvals[i][0]); + SSE4_PMINUD_M128_to_XMM(reg, (uptr)&sse4_minvals[i][0]); + return; + } + int regT1b = 0; + if (regT1 < 0) { + regT1b = 1; regT1=(reg+1)%8; + SSE_MOVAPS_XMM_to_M128((uptr)mVU->xmmCTemp, regT1); + //regT1 = mVU->regAlloc->allocReg(); + } + switch (xyzw) { + case 1: case 2: case 4: case 8: + SSE_MOVAPS_XMM_to_XMM(regT1, reg); + SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit); + SSE_MINSS_M32_to_XMM (reg, (uptr)mVUglob.maxvals); + SSE_MAXSS_M32_to_XMM (reg, (uptr)mVUglob.minvals); + SSE_ORPS_XMM_to_XMM (reg, regT1); + break; + default: + SSE_MOVAPS_XMM_to_XMM(regT1, reg); + SSE_ANDPS_M128_to_XMM(regT1, (uptr)mVUglob.signbit); + SSE_MINPS_M128_to_XMM(reg, (uptr)mVUglob.maxvals); + SSE_MAXPS_M128_to_XMM(reg, (uptr)mVUglob.minvals); + SSE_ORPS_XMM_to_XMM (reg, regT1); + break; + } + //if (regT1b) mVU->regAlloc->clearNeeded(regT1); + if (regT1b) SSE_MOVAPS_M128_to_XMM(regT1, (uptr)mVU->xmmCTemp); + } + else mVUclamp1(reg, regT1, xyzw, bClampE); +} + +// Used for operand clamping on every SSE instruction (add/sub/mul/div) +void mVUclamp3(microVU* mVU, int reg, int regT1, int xyzw) { + if (clampE) mVUclamp2(mVU, reg, regT1, xyzw, 1); +} + +// Used for result clamping on every SSE instruction (add/sub/mul/div) +// Note: Disabled in "preserve sign" mode because in certain cases it +// makes too much code-gen, and you get jump8-overflows in certain +// emulated opcodes (causing crashes). 
Since we're clamping the operands +// with mVUclamp3, we should almost never be getting a NaN result, +// but this clamp is just a precaution just-in-case. +void mVUclamp4(int reg, int regT1, int xyzw) { + if (clampE && !CHECK_VU_SIGN_OVERFLOW) mVUclamp1(reg, regT1, xyzw, 1); +} diff --git a/pcsx2/x86/microVU_IR.h b/pcsx2/x86/microVU_IR.h index 014ff69a9f..7f27848dbe 100644 --- a/pcsx2/x86/microVU_IR.h +++ b/pcsx2/x86/microVU_IR.h @@ -1,337 +1,337 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2009 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - -#pragma once - -union regInfo { - u32 reg; - struct { - u8 x; - u8 y; - u8 z; - u8 w; - }; -}; - -#ifdef _MSC_VER -# pragma pack(1) -# pragma warning(disable:4996) // 'function': was declared deprecated -#endif - -struct __aligned16 microRegInfo { // Ordered for Faster Compares - u32 vi15; // Constant Prop Info for vi15 (only valid if sign-bit set) - u8 needExactMatch; // If set, block needs an exact match of pipeline state - u8 q; - u8 p; - u8 r; - u8 xgkick; - u8 viBackUp; - u8 VI[16]; - regInfo VF[32]; - u8 flags; // clip x2 :: status x2 - u8 blockType; // 0 = Normal; 1,2 = Compile one instruction (E-bit/Branch Ending) - u8 padding[5]; // 160 bytes -} __packed; - -struct __aligned16 microBlock { - microRegInfo pState; // Detailed State of Pipeline - microRegInfo pStateEnd; // Detailed State of Pipeline at End of Block (needed by JR/JALR opcodes) - u8* x86ptrStart; // Start of code -} __packed; - -#ifdef _MSC_VER -# pragma pack() -#endif - -struct microTempRegInfo { - regInfo VF[2]; // Holds cycle info for Fd, VF[0] = Upper Instruction, VF[1] = Lower Instruction - u8 VFreg[2]; // Index of the VF reg - u8 VI; // Holds cycle info for Id - u8 VIreg; // Index of the VI reg - u8 q; // Holds cycle info for Q reg - u8 p; // Holds cycle info for P reg - u8 r; // Holds cycle info for R reg (Will never cause stalls, but useful to know if R is modified) - u8 xgkick; // Holds the cycle info for XGkick -}; - -struct microVFreg { - u8 reg; // Reg Index - u8 x; // X vector read/written to? - u8 y; // Y vector read/written to? - u8 z; // Z vector read/written to? - u8 w; // W vector read/written to? -}; - -struct microVIreg { - u8 reg; // Reg Index - u8 used; // Reg is Used? (Read/Written) -}; - -struct microConstInfo { - u8 isValid; // Is the constant in regValue valid? 
- u32 regValue; // Constant Value -}; - -struct microUpperOp { - bool eBit; // Has E-bit set - bool iBit; // Has I-bit set - bool mBit; // Has M-bit set - microVFreg VF_write; // VF Vectors written to by this instruction - microVFreg VF_read[2]; // VF Vectors read by this instruction -}; - -struct microLowerOp { - microVFreg VF_write; // VF Vectors written to by this instruction - microVFreg VF_read[2]; // VF Vectors read by this instruction - microVIreg VI_write; // VI reg written to by this instruction - microVIreg VI_read[2]; // VI regs read by this instruction - microConstInfo constJump; // Constant Reg Info for JR/JARL instructions - u32 branch; // Branch Type (0 = Not a Branch, 1 = B. 2 = BAL, 3~8 = Conditional Branches, 9 = JALR, 10 = JR) - bool badBranch; // This instruction is a Branch who has another branch in its Delay Slot - bool evilBranch;// This instruction is a Branch in a Branch Delay Slot (Instruction after badBranch) - bool isNOP; // This instruction is a NOP - bool isFSSET; // This instruction is a FSSET - bool noWriteVF; // Don't write back the result of a lower op to VF reg if upper op writes to same reg (or if VF = 0) - bool backupVI; // Backup VI reg to memory if modified before branch (branch uses old VI value unless opcode is ILW or ILWR) - bool memReadIs; // Read Is (VI reg) from memory (used by branches) - bool memReadIt; // Read If (VI reg) from memory (used by branches) - bool readFlags; // Current Instruction reads Status, Mac, or Clip flags -}; - -struct microFlagInst { - bool doFlag; // Update Flag on this Instruction - bool doNonSticky; // Update O,U,S,Z (non-sticky) bits on this Instruction (status flag only) - u8 write; // Points to the instance that should be written to (s-stage write) - u8 lastWrite; // Points to the instance that was last written to (most up-to-date flag) - u8 read; // Points to the instance that should be read by a lower instruction (t-stage read) -}; - -struct microFlagCycles { - int xStatus[4]; - int xMac[4]; - int xClip[4]; - int cycles; -}; - -struct microOp { - u8 stall; // Info on how much current instruction stalled - bool isEOB; // Cur Instruction is last instruction in block (End of Block) - bool isBdelay; // Cur Instruction in Branch Delay slot - bool swapOps; // Run Lower Instruction before Upper Instruction - bool backupVF; // Backup mVUlow.VF_write.reg, and restore it before the Upper Instruction is called - bool doXGKICK; // Do XGKICK transfer on this instruction - bool doDivFlag; // Transfer Div flag to Status Flag on this instruction - int readQ; // Q instance for reading - int writeQ; // Q instance for writing - int readP; // P instance for reading - int writeP; // P instance for writing - microFlagInst sFlag; // Status Flag Instance Info - microFlagInst mFlag; // Mac Flag Instance Info - microFlagInst cFlag; // Clip Flag Instance Info - microUpperOp uOp; // Upper Op Info - microLowerOp lOp; // Lower Op Info -}; - -template -struct microIR { - microBlock block; // Block/Pipeline info - microBlock* pBlock; // Pointer to a block in mVUblocks - microTempRegInfo regsTemp; // Temp Pipeline info (used so that new pipeline info isn't conflicting between upper and lower instructions in the same cycle) - microOp info[pSize/2]; // Info for Instructions in current block - microConstInfo constReg[16]; // Simple Const Propagation Info for VI regs within blocks - u8 branch; - u32 cycles; // Cycles for current block - u32 count; // Number of VU 64bit instructions ran (starts at 0 for each block) - u32 curPC; // Current PC - u32 
startPC; // Start PC for Cur Block - u32 sFlagHack; // Optimize out all Status flag updates if microProgram doesn't use Status flags -}; - -//------------------------------------------------------------------ -// Reg Alloc -//------------------------------------------------------------------ - -void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW); -void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW); -void mVUloadReg(int reg, uptr offset, int xyzw); -void mVUloadIreg(int reg, int xyzw, VURegs* vuRegs); - -struct microXMM { - int reg; // VF Reg Number Stored (-1 = Temp; 0 = vf0 and will not be written back; 32 = ACC; 33 = I reg) - int xyzw; // xyzw to write back (0 = Don't write back anything AND cached vfReg has all vectors valid) - int count; // Count of when last used - bool isNeeded; // Is needed for current instruction -}; - -#define xmmTotal 7 // Don't allocate PQ? -class microRegAlloc { -private: - microXMM xmmReg[xmmTotal]; - VURegs* vuRegs; - int counter; - int findFreeRegRec(int startIdx) { - for (int i = startIdx; i < xmmTotal; i++) { - if (!xmmReg[i].isNeeded) { - int x = findFreeRegRec(i+1); - if (x == -1) return i; - return ((xmmReg[i].count < xmmReg[x].count) ? i : x); - } - } - return -1; - } - int findFreeReg() { - for (int i = 0; i < xmmTotal; i++) { - if (!xmmReg[i].isNeeded && (xmmReg[i].reg < 0)) { - return i; // Reg is not needed and was a temp reg - } - } - int x = findFreeRegRec(0); - if (x < 0) { DevCon.Error("microVU Allocation Error!"); return 0; } - return x; - } - -public: - microRegAlloc(VURegs* vuRegsPtr) { - vuRegs = vuRegsPtr; - reset(); - } - void reset() { - for (int i = 0; i < xmmTotal; i++) { - clearReg(i); - } - counter = 0; - } - void flushAll(bool clearState = 1) { - for (int i = 0; i < xmmTotal; i++) { - writeBackReg(i); - if (clearState) clearReg(i); - } - } - void clearReg(int reg) { - xmmReg[reg].reg = -1; - xmmReg[reg].count = 0; - xmmReg[reg].xyzw = 0; - xmmReg[reg].isNeeded = 0; - } - void clearRegVF(int VFreg) { - for (int i = 0; i < xmmTotal; i++) { - if (xmmReg[i].reg == VFreg) clearReg(i); - } - } - void writeBackReg(int reg, bool invalidateRegs = 1) { - if ((xmmReg[reg].reg > 0) && xmmReg[reg].xyzw) { // Reg was modified and not Temp or vf0 - if (xmmReg[reg].reg == 33) SSE_MOVSS_XMM_to_M32((uptr)&vuRegs->VI[REG_I].UL, reg); - else if (xmmReg[reg].reg == 32) mVUsaveReg(reg, (uptr)&vuRegs->ACC.UL[0], xmmReg[reg].xyzw, 1); - else mVUsaveReg(reg, (uptr)&vuRegs->VF[xmmReg[reg].reg].UL[0], xmmReg[reg].xyzw, 1); - if (invalidateRegs) { - for (int i = 0; i < xmmTotal; i++) { - if ((i == reg) || xmmReg[i].isNeeded) continue; - if (xmmReg[i].reg == xmmReg[reg].reg) { - if (xmmReg[i].xyzw && xmmReg[i].xyzw < 0xf) DevCon.Error("microVU Error: writeBackReg() [%d]", xmmReg[i].reg); - clearReg(i); // Invalidate any Cached Regs of same vf Reg - } - } - } - if (xmmReg[reg].xyzw == 0xf) { // Make Cached Reg if All Vectors were Modified - xmmReg[reg].count = counter; - xmmReg[reg].xyzw = 0; - xmmReg[reg].isNeeded = 0; - return; - } - } - clearReg(reg); // Clear Reg - } - void clearNeeded(int reg) { - if ((reg < 0) || (reg >= xmmTotal)) return; - xmmReg[reg].isNeeded = 0; - if (xmmReg[reg].xyzw) { // Reg was modified - if (xmmReg[reg].reg > 0) { - int mergeRegs = 0; - if (xmmReg[reg].xyzw < 0xf) { mergeRegs = 1; } // Try to merge partial writes - for (int i = 0; i < xmmTotal; i++) { // Invalidate any other read-only regs of same vfReg - if (i == reg) continue; - if (xmmReg[i].reg == xmmReg[reg].reg) { - if (xmmReg[i].xyzw && xmmReg[i].xyzw < 
0xf) DevCon.Error("microVU Error: clearNeeded() [%d]", xmmReg[i].reg); - if (mergeRegs == 1) { - mVUmergeRegs(i, reg, xmmReg[reg].xyzw, 1); - xmmReg[i].xyzw = 0xf; - xmmReg[i].count = counter; - mergeRegs = 2; - } - else clearReg(i); - } - } - if (mergeRegs == 2) clearReg(reg); // Clear Current Reg if Merged - else if (mergeRegs) writeBackReg(reg); // Write Back Partial Writes if couldn't merge - } - else clearReg(reg); // If Reg was temp or vf0, then invalidate itself - } - } - int allocReg(int vfLoadReg = -1, int vfWriteReg = -1, int xyzw = 0, bool cloneWrite = 1) { - counter++; - if (vfLoadReg >= 0) { // Search For Cached Regs - for (int i = 0; i < xmmTotal; i++) { - if ((xmmReg[i].reg == vfLoadReg) && (!xmmReg[i].xyzw // Reg Was Not Modified - || (xmmReg[i].reg && (xmmReg[i].xyzw==0xf)))) { // Reg Had All Vectors Modified and != VF0 - int z = i; - if (vfWriteReg >= 0) { // Reg will be modified - if (cloneWrite) { // Clone Reg so as not to use the same Cached Reg - z = findFreeReg(); - writeBackReg(z); - if (z!=i && xyzw==8) SSE_MOVAPS_XMM_to_XMM (z, i); - else if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1); - else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2); - else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3); - else if (z != i) SSE_MOVAPS_XMM_to_XMM (z, i); - xmmReg[i].count = counter; // Reg i was used, so update counter - } - else { // Don't clone reg, but shuffle to adjust for SS ops - if ((vfLoadReg != vfWriteReg) || (xyzw != 0xf)) { writeBackReg(z); } - if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1); - else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2); - else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3); - } - xmmReg[z].reg = vfWriteReg; - xmmReg[z].xyzw = xyzw; - } - xmmReg[z].count = counter; - xmmReg[z].isNeeded = 1; - return z; - } - } - } - int x = findFreeReg(); - writeBackReg(x); - - if (vfWriteReg >= 0) { // Reg Will Be Modified (allow partial reg loading) - if ((vfLoadReg == 0) && !(xyzw & 1)) { SSE2_PXOR_XMM_to_XMM(x, x); } - else if (vfLoadReg == 33) mVUloadIreg(x, xyzw, vuRegs); - else if (vfLoadReg == 32) mVUloadReg (x, (uptr)&vuRegs->ACC.UL[0], xyzw); - else if (vfLoadReg >= 0) mVUloadReg (x, (uptr)&vuRegs->VF[vfLoadReg].UL[0], xyzw); - xmmReg[x].reg = vfWriteReg; - xmmReg[x].xyzw = xyzw; - } - else { // Reg Will Not Be Modified (always load full reg for caching) - if (vfLoadReg == 33) mVUloadIreg(x, 0xf, vuRegs); - else if (vfLoadReg == 32) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->ACC.UL[0]); - else if (vfLoadReg >= 0) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->VF[vfLoadReg].UL[0]); - xmmReg[x].reg = vfLoadReg; - xmmReg[x].xyzw = 0; - } - xmmReg[x].count = counter; - xmmReg[x].isNeeded = 1; - return x; - } -}; +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . 
+ */
+
+#pragma once
+
+union regInfo {
+ u32 reg;
+ struct {
+ u8 x;
+ u8 y;
+ u8 z;
+ u8 w;
+ };
+};
+
+#ifdef _MSC_VER
+# pragma pack(1)
+# pragma warning(disable:4996) // 'function': was declared deprecated
+#endif
+
+struct __aligned16 microRegInfo { // Ordered for Faster Compares
+ u32 vi15; // Constant Prop Info for vi15 (only valid if sign-bit set)
+ u8 needExactMatch; // If set, block needs an exact match of pipeline state
+ u8 q;
+ u8 p;
+ u8 r;
+ u8 xgkick;
+ u8 viBackUp;
+ u8 VI[16];
+ regInfo VF[32];
+ u8 flags; // clip x2 :: status x2
+ u8 blockType; // 0 = Normal; 1,2 = Compile one instruction (E-bit/Branch Ending)
+ u8 padding[5]; // 160 bytes
+} __packed;
+
+struct __aligned16 microBlock {
+ microRegInfo pState; // Detailed State of Pipeline
+ microRegInfo pStateEnd; // Detailed State of Pipeline at End of Block (needed by JR/JALR opcodes)
+ u8* x86ptrStart; // Start of code
+} __packed;
+
+#ifdef _MSC_VER
+# pragma pack()
+#endif
+
+struct microTempRegInfo {
+ regInfo VF[2]; // Holds cycle info for Fd, VF[0] = Upper Instruction, VF[1] = Lower Instruction
+ u8 VFreg[2]; // Index of the VF reg
+ u8 VI; // Holds cycle info for Id
+ u8 VIreg; // Index of the VI reg
+ u8 q; // Holds cycle info for Q reg
+ u8 p; // Holds cycle info for P reg
+ u8 r; // Holds cycle info for R reg (Will never cause stalls, but useful to know if R is modified)
+ u8 xgkick; // Holds the cycle info for XGkick
+};
+
+struct microVFreg {
+ u8 reg; // Reg Index
+ u8 x; // X vector read/written to?
+ u8 y; // Y vector read/written to?
+ u8 z; // Z vector read/written to?
+ u8 w; // W vector read/written to?
+};
+
+struct microVIreg {
+ u8 reg; // Reg Index
+ u8 used; // Reg is Used? (Read/Written)
+};
+
+struct microConstInfo {
+ u8 isValid; // Is the constant in regValue valid?
+ u32 regValue; // Constant Value
+};
+
+struct microUpperOp {
+ bool eBit; // Has E-bit set
+ bool iBit; // Has I-bit set
+ bool mBit; // Has M-bit set
+ microVFreg VF_write; // VF Vectors written to by this instruction
+ microVFreg VF_read[2]; // VF Vectors read by this instruction
+};
+
+struct microLowerOp {
+ microVFreg VF_write; // VF Vectors written to by this instruction
+ microVFreg VF_read[2]; // VF Vectors read by this instruction
+ microVIreg VI_write; // VI reg written to by this instruction
+ microVIreg VI_read[2]; // VI regs read by this instruction
+ microConstInfo constJump; // Constant Reg Info for JR/JALR instructions
+ u32 branch; // Branch Type (0 = Not a Branch, 1 = B, 2 = BAL, 3~8 = Conditional Branches, 9 = JALR, 10 = JR)
+ bool badBranch; // This instruction is a Branch that has another branch in its Delay Slot
+ bool evilBranch; // This instruction is a Branch in a Branch Delay Slot (Instruction after badBranch)
+ bool isNOP; // This instruction is a NOP
+ bool isFSSET; // This instruction is a FSSET
+ bool noWriteVF; // Don't write back the result of a lower op to VF reg if upper op writes to same reg (or if VF = 0)
+ bool backupVI; // Backup VI reg to memory if modified before branch (branch uses old VI value unless opcode is ILW or ILWR)
+ bool memReadIs; // Read Is (VI reg) from memory (used by branches)
+ bool memReadIt; // Read It (VI reg) from memory (used by branches)
+ bool readFlags; // Current Instruction reads Status, Mac, or Clip flags
+};
+
+struct microFlagInst {
+ bool doFlag; // Update Flag on this Instruction
+ bool doNonSticky; // Update O,U,S,Z (non-sticky) bits on this Instruction (status flag only)
+ u8 write; // Points to the instance that should be written to (s-stage write)
+ u8 lastWrite; // Points to the instance that was last written to (most up-to-date flag)
+ u8 read; // Points to the instance that should be read by a lower instruction (t-stage read)
+};
+
+struct microFlagCycles {
+ int xStatus[4];
+ int xMac[4];
+ int xClip[4];
+ int cycles;
+};
+
+struct microOp {
+ u8 stall; // Info on how much current instruction stalled
+ bool isEOB; // Cur Instruction is last instruction in block (End of Block)
+ bool isBdelay; // Cur Instruction in Branch Delay slot
+ bool swapOps; // Run Lower Instruction before Upper Instruction
+ bool backupVF; // Backup mVUlow.VF_write.reg, and restore it before the Upper Instruction is called
+ bool doXGKICK; // Do XGKICK transfer on this instruction
+ bool doDivFlag; // Transfer Div flag to Status Flag on this instruction
+ int readQ; // Q instance for reading
+ int writeQ; // Q instance for writing
+ int readP; // P instance for reading
+ int writeP; // P instance for writing
+ microFlagInst sFlag; // Status Flag Instance Info
+ microFlagInst mFlag; // Mac Flag Instance Info
+ microFlagInst cFlag; // Clip Flag Instance Info
+ microUpperOp uOp; // Upper Op Info
+ microLowerOp lOp; // Lower Op Info
+};
+
+template<u32 pSize>
+struct microIR {
+ microBlock block; // Block/Pipeline info
+ microBlock* pBlock; // Pointer to a block in mVUblocks
+ microTempRegInfo regsTemp; // Temp Pipeline info (used so that new pipeline info isn't conflicting between upper and lower instructions in the same cycle)
+ microOp info[pSize/2]; // Info for Instructions in current block
+ microConstInfo constReg[16]; // Simple Const Propagation Info for VI regs within blocks
+ u8 branch;
+ u32 cycles; // Cycles for current block
+ u32 count; // Number of VU 64bit instructions ran (starts at 0 for each block)
+ u32 curPC; // Current PC
+ u32 startPC; // Start PC for Cur Block
+ u32 sFlagHack; // Optimize out all Status flag updates if microProgram doesn't use Status flags
+};
+
+//------------------------------------------------------------------
+// Reg Alloc
+//------------------------------------------------------------------
+
+void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW);
+void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW);
+void mVUloadReg(int reg, uptr offset, int xyzw);
+void mVUloadIreg(int reg, int xyzw, VURegs* vuRegs);
+
+struct microXMM {
+ int reg; // VF Reg Number Stored (-1 = Temp; 0 = vf0 and will not be written back; 32 = ACC; 33 = I reg)
+ int xyzw; // xyzw to write back (0 = Don't
write back anything AND cached vfReg has all vectors valid) + int count; // Count of when last used + bool isNeeded; // Is needed for current instruction +}; + +#define xmmTotal 7 // Don't allocate PQ? +class microRegAlloc { +private: + microXMM xmmReg[xmmTotal]; + VURegs* vuRegs; + int counter; + int findFreeRegRec(int startIdx) { + for (int i = startIdx; i < xmmTotal; i++) { + if (!xmmReg[i].isNeeded) { + int x = findFreeRegRec(i+1); + if (x == -1) return i; + return ((xmmReg[i].count < xmmReg[x].count) ? i : x); + } + } + return -1; + } + int findFreeReg() { + for (int i = 0; i < xmmTotal; i++) { + if (!xmmReg[i].isNeeded && (xmmReg[i].reg < 0)) { + return i; // Reg is not needed and was a temp reg + } + } + int x = findFreeRegRec(0); + if (x < 0) { DevCon.Error("microVU Allocation Error!"); return 0; } + return x; + } + +public: + microRegAlloc(VURegs* vuRegsPtr) { + vuRegs = vuRegsPtr; + reset(); + } + void reset() { + for (int i = 0; i < xmmTotal; i++) { + clearReg(i); + } + counter = 0; + } + void flushAll(bool clearState = 1) { + for (int i = 0; i < xmmTotal; i++) { + writeBackReg(i); + if (clearState) clearReg(i); + } + } + void clearReg(int reg) { + xmmReg[reg].reg = -1; + xmmReg[reg].count = 0; + xmmReg[reg].xyzw = 0; + xmmReg[reg].isNeeded = 0; + } + void clearRegVF(int VFreg) { + for (int i = 0; i < xmmTotal; i++) { + if (xmmReg[i].reg == VFreg) clearReg(i); + } + } + void writeBackReg(int reg, bool invalidateRegs = 1) { + if ((xmmReg[reg].reg > 0) && xmmReg[reg].xyzw) { // Reg was modified and not Temp or vf0 + if (xmmReg[reg].reg == 33) SSE_MOVSS_XMM_to_M32((uptr)&vuRegs->VI[REG_I].UL, reg); + else if (xmmReg[reg].reg == 32) mVUsaveReg(reg, (uptr)&vuRegs->ACC.UL[0], xmmReg[reg].xyzw, 1); + else mVUsaveReg(reg, (uptr)&vuRegs->VF[xmmReg[reg].reg].UL[0], xmmReg[reg].xyzw, 1); + if (invalidateRegs) { + for (int i = 0; i < xmmTotal; i++) { + if ((i == reg) || xmmReg[i].isNeeded) continue; + if (xmmReg[i].reg == xmmReg[reg].reg) { + if (xmmReg[i].xyzw && xmmReg[i].xyzw < 0xf) DevCon.Error("microVU Error: writeBackReg() [%d]", xmmReg[i].reg); + clearReg(i); // Invalidate any Cached Regs of same vf Reg + } + } + } + if (xmmReg[reg].xyzw == 0xf) { // Make Cached Reg if All Vectors were Modified + xmmReg[reg].count = counter; + xmmReg[reg].xyzw = 0; + xmmReg[reg].isNeeded = 0; + return; + } + } + clearReg(reg); // Clear Reg + } + void clearNeeded(int reg) { + if ((reg < 0) || (reg >= xmmTotal)) return; + xmmReg[reg].isNeeded = 0; + if (xmmReg[reg].xyzw) { // Reg was modified + if (xmmReg[reg].reg > 0) { + int mergeRegs = 0; + if (xmmReg[reg].xyzw < 0xf) { mergeRegs = 1; } // Try to merge partial writes + for (int i = 0; i < xmmTotal; i++) { // Invalidate any other read-only regs of same vfReg + if (i == reg) continue; + if (xmmReg[i].reg == xmmReg[reg].reg) { + if (xmmReg[i].xyzw && xmmReg[i].xyzw < 0xf) DevCon.Error("microVU Error: clearNeeded() [%d]", xmmReg[i].reg); + if (mergeRegs == 1) { + mVUmergeRegs(i, reg, xmmReg[reg].xyzw, 1); + xmmReg[i].xyzw = 0xf; + xmmReg[i].count = counter; + mergeRegs = 2; + } + else clearReg(i); + } + } + if (mergeRegs == 2) clearReg(reg); // Clear Current Reg if Merged + else if (mergeRegs) writeBackReg(reg); // Write Back Partial Writes if couldn't merge + } + else clearReg(reg); // If Reg was temp or vf0, then invalidate itself + } + } + int allocReg(int vfLoadReg = -1, int vfWriteReg = -1, int xyzw = 0, bool cloneWrite = 1) { + counter++; + if (vfLoadReg >= 0) { // Search For Cached Regs + for (int i = 0; i < xmmTotal; i++) { + if ((xmmReg[i].reg 
== vfLoadReg) && (!xmmReg[i].xyzw // Reg Was Not Modified + || (xmmReg[i].reg && (xmmReg[i].xyzw==0xf)))) { // Reg Had All Vectors Modified and != VF0 + int z = i; + if (vfWriteReg >= 0) { // Reg will be modified + if (cloneWrite) { // Clone Reg so as not to use the same Cached Reg + z = findFreeReg(); + writeBackReg(z); + if (z!=i && xyzw==8) SSE_MOVAPS_XMM_to_XMM (z, i); + else if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1); + else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2); + else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3); + else if (z != i) SSE_MOVAPS_XMM_to_XMM (z, i); + xmmReg[i].count = counter; // Reg i was used, so update counter + } + else { // Don't clone reg, but shuffle to adjust for SS ops + if ((vfLoadReg != vfWriteReg) || (xyzw != 0xf)) { writeBackReg(z); } + if (xyzw == 4) SSE2_PSHUFD_XMM_to_XMM(z, i, 1); + else if (xyzw == 2) SSE2_PSHUFD_XMM_to_XMM(z, i, 2); + else if (xyzw == 1) SSE2_PSHUFD_XMM_to_XMM(z, i, 3); + } + xmmReg[z].reg = vfWriteReg; + xmmReg[z].xyzw = xyzw; + } + xmmReg[z].count = counter; + xmmReg[z].isNeeded = 1; + return z; + } + } + } + int x = findFreeReg(); + writeBackReg(x); + + if (vfWriteReg >= 0) { // Reg Will Be Modified (allow partial reg loading) + if ((vfLoadReg == 0) && !(xyzw & 1)) { SSE2_PXOR_XMM_to_XMM(x, x); } + else if (vfLoadReg == 33) mVUloadIreg(x, xyzw, vuRegs); + else if (vfLoadReg == 32) mVUloadReg (x, (uptr)&vuRegs->ACC.UL[0], xyzw); + else if (vfLoadReg >= 0) mVUloadReg (x, (uptr)&vuRegs->VF[vfLoadReg].UL[0], xyzw); + xmmReg[x].reg = vfWriteReg; + xmmReg[x].xyzw = xyzw; + } + else { // Reg Will Not Be Modified (always load full reg for caching) + if (vfLoadReg == 33) mVUloadIreg(x, 0xf, vuRegs); + else if (vfLoadReg == 32) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->ACC.UL[0]); + else if (vfLoadReg >= 0) SSE_MOVAPS_M128_to_XMM(x, (uptr)&vuRegs->VF[vfLoadReg].UL[0]); + xmmReg[x].reg = vfLoadReg; + xmmReg[x].xyzw = 0; + } + xmmReg[x].count = counter; + xmmReg[x].isNeeded = 1; + return x; + } +}; diff --git a/pcsx2/x86/newVif.h b/pcsx2/x86/newVif.h index 9db218179e..6d369a8d00 100644 --- a/pcsx2/x86/newVif.h +++ b/pcsx2/x86/newVif.h @@ -1,67 +1,67 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2009 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . 
- */ - -#pragma once - -#ifdef newVif -#include "x86emitter/x86emitter.h" -using namespace x86Emitter; -extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0); -extern void _nVifUnpack(int idx, u8 *data, u32 size); - -typedef u32 (__fastcall *nVifCall)(void*, void*); - -static __pagealigned u8 nVifUpkExec[__pagesize*16]; -static __aligned16 nVifCall nVifUpk[(2*2*16)*4*4]; // ([USN][Masking][Unpack Type]) [curCycle][CyclesToWrite-1] -static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector] - -#define _v0 0 -#define _v1 0x55 -#define _v2 0xaa -#define _v3 0xff -#define aMax(x, y) std::max(x,y) -#define aMin(x, y) std::min(x,y) -#define _f __forceinline - -#define xShiftR(regX, n) { \ - if (usn) { xPSRL.D(regX, n); } \ - else { xPSRA.D(regX, n); } \ -} - -static const u32 nVifT[16] = { - 4, // S-32 - 2, // S-16 - 1, // S-8 - 0, // ---- - 8, // V2-32 - 4, // V2-16 - 2, // V2-8 - 0, // ---- - 12,// V3-32 - 6, // V3-16 - 3, // V3-8 - 0, // ---- - 16,// V4-32 - 8, // V4-16 - 4, // V4-8 - 2, // V4-5 -}; - -#include "newVif_BlockBuffer.h" -#include "newVif_OldUnpack.inl" -#include "newVif_UnpackGen.inl" -#include "newVif_Unpack.inl" - -#endif +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . 
+ */ + +#pragma once + +#ifdef newVif +#include "x86emitter/x86emitter.h" +using namespace x86Emitter; +extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0); +extern void _nVifUnpack(int idx, u8 *data, u32 size); + +typedef u32 (__fastcall *nVifCall)(void*, void*); + +static __pagealigned u8 nVifUpkExec[__pagesize*16]; +static __aligned16 nVifCall nVifUpk[(2*2*16)*4*4]; // ([USN][Masking][Unpack Type]) [curCycle][CyclesToWrite-1] +static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector] + +#define _v0 0 +#define _v1 0x55 +#define _v2 0xaa +#define _v3 0xff +#define aMax(x, y) std::max(x,y) +#define aMin(x, y) std::min(x,y) +#define _f __forceinline + +#define xShiftR(regX, n) { \ + if (usn) { xPSRL.D(regX, n); } \ + else { xPSRA.D(regX, n); } \ +} + +static const u32 nVifT[16] = { + 4, // S-32 + 2, // S-16 + 1, // S-8 + 0, // ---- + 8, // V2-32 + 4, // V2-16 + 2, // V2-8 + 0, // ---- + 12,// V3-32 + 6, // V3-16 + 3, // V3-8 + 0, // ---- + 16,// V4-32 + 8, // V4-16 + 4, // V4-8 + 2, // V4-5 +}; + +#include "newVif_BlockBuffer.h" +#include "newVif_OldUnpack.inl" +#include "newVif_UnpackGen.inl" +#include "newVif_Unpack.inl" + +#endif diff --git a/pcsx2/x86/newVif_BlockBuffer.h b/pcsx2/x86/newVif_BlockBuffer.h index 806565cff4..cb378c74e5 100644 --- a/pcsx2/x86/newVif_BlockBuffer.h +++ b/pcsx2/x86/newVif_BlockBuffer.h @@ -1,40 +1,40 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2009 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - -#pragma once - -class BlockBuffer { -private: - u32 mSize; // Cur Size - u32 mSizeT; // Total Size - u8* mData; // Data Ptr - void grow(u32 newSize) { - u8* temp = new u8[newSize]; - memcpy(temp, mData, mSizeT); - safe_delete( mData ); - mData = temp; - } -public: - BlockBuffer(u32 tSize) { mSizeT = tSize; mSize = 0; mData = new u8[mSizeT]; } - virtual ~BlockBuffer() { safe_delete(mData); } - void append(void *addr, u32 size) { - if (mSize + size > mSizeT) grow(mSize*2 + size); - memcpy(&mData[mSize], addr, size); - mSize += size; - } - void clear() { mSize = 0; } - u32 getSize() { return mSize; } - u8* getBlock() { return mData; } -}; +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . 
+ */
+
+#pragma once
+
+class BlockBuffer {
+private:
+ u32 mSize; // Cur Size
+ u32 mSizeT; // Total Size
+ u8* mData; // Data Ptr
+ void grow(u32 newSize) {
+ u8* temp = new u8[newSize];
+ memcpy(temp, mData, mSizeT);
+ safe_delete( mData );
+ mData = temp;
+ mSizeT = newSize; // record the new capacity, else append() re-grows on every later call
+ }
+public:
+ BlockBuffer(u32 tSize) { mSizeT = tSize; mSize = 0; mData = new u8[mSizeT]; }
+ virtual ~BlockBuffer() { safe_delete(mData); }
+ void append(void *addr, u32 size) {
+ if (mSize + size > mSizeT) grow(mSize*2 + size);
+ memcpy(&mData[mSize], addr, size);
+ mSize += size;
+ }
+ void clear() { mSize = 0; }
+ u32 getSize() { return mSize; }
+ u8* getBlock() { return mData; }
+};
diff --git a/pcsx2/x86/newVif_OldUnpack.inl b/pcsx2/x86/newVif_OldUnpack.inl
index 3f19e93a43..6dcaebf2b1 100644
--- a/pcsx2/x86/newVif_OldUnpack.inl
+++ b/pcsx2/x86/newVif_OldUnpack.inl
@@ -1,167 +1,167 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see .
- */
-
-// Old Vif Unpack Code
-// Only here for testing/reference
-// If newVif is defined and newVif1 isn't, vif1 will use this code
-// same goes for vif0...
-template void VIFunpack<0>(u32 *data, vifCode *v, u32 size);
-template void VIFunpack<1>(u32 *data, vifCode *v, u32 size);
-template void VIFunpack(u32 *data, vifCode *v, u32 size) {
- //if (!VIFdmanum) DevCon.WriteLn("vif#%d, size = %d [%x]", VIFdmanum, size, data);
- UNPACKFUNCTYPE func;
- const VIFUnpackFuncTable *ft;
- VURegs * VU;
- u8 *cdata = (u8*)data;
- u32 tempsize = 0;
- const u32 memlimit = vif_size(VIFdmanum);
-
- if (VIFdmanum == 0) {
- VU = &VU0;
- vifRegs = vif0Regs;
- vifMaskRegs = g_vif0Masks;
- vif = &vif0;
- vifRow = g_vifmask.Row0;
- }
- else {
- VU = &VU1;
- vifRegs = vif1Regs;
- vifMaskRegs = g_vif1Masks;
- vif = &vif1;
- vifRow = g_vifmask.Row1;
- }
-
- u32 *dest = (u32*)(VU->Mem + v->addr);
- u32 unpackType = v->cmd & 0xf;
-
- ft = &VIFfuncTable[ unpackType ];
- func = vif->usn ? ft->funcU : ft->funcS;
- size <<= 2;
-
- if (vifRegs->cycle.cl >= vifRegs->cycle.wl) { // skipping write
- if (v->addr >= memlimit) {
- DevCon.Warning("Overflown at the start");
- v->addr &= (memlimit - 1);
- dest = (u32*)(VU->Mem + v->addr);
- }
-
- size = min(size, (int)vifRegs->num * ft->gsize); //size will always be the same or smaller
-
- tempsize = v->addr + ((((vifRegs->num-1) / vifRegs->cycle.wl) *
- (vifRegs->cycle.cl - vifRegs->cycle.wl)) * 16) + (vifRegs->num * 16);
-
- //Sanity Check (memory overflow)
- if (tempsize > memlimit) {
- if (((vifRegs->cycle.cl != vifRegs->cycle.wl) &&
- ((memlimit + (vifRegs->cycle.cl - vifRegs->cycle.wl) * 16) == tempsize))) {
- //It's a red herring, so ignore it! SSE unpacks will be much quicker.
- DevCon.WriteLn("what!!!!!!!!!");
- //tempsize = 0;
- tempsize = size;
- size = 0;
- }
- else {
- DevCon.Warning("VIF%x Unpack ending %x > %x", VIFdmanum, tempsize, VIFdmanum ?
0x4000 : 0x1000); - tempsize = size; - size = 0; - } - } - else { - tempsize = size; - size = 0; - } - if (tempsize) { - int incdest = ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + 4; - size = 0; - int addrstart = v->addr; - //if((tempsize >> 2) != v->size) DevCon.Warning("split when size != tagsize"); - - VIFUNPACK_LOG("sorting tempsize :p, size %d, vifnum %d, addr %x", tempsize, vifRegs->num, v->addr); - - while ((tempsize >= ft->gsize) && (vifRegs->num > 0)) { - if(v->addr >= memlimit) { - DevCon.Warning("Mem limit overflow"); - v->addr &= (memlimit - 1); - dest = (u32*)(VU->Mem + v->addr); - } - - func(dest, (u32*)cdata, ft->qsize); - cdata += ft->gsize; - tempsize -= ft->gsize; - - vifRegs->num--; - vif->cl++; - - if (vif->cl == vifRegs->cycle.wl) { - dest += incdest; - v->addr +=(incdest * 4); - vif->cl = 0; - } - else { - dest += 4; - v->addr += 16; - } - } - if (v->addr >= memlimit) { - v->addr &=(memlimit - 1); - dest = (u32*)(VU->Mem + v->addr); - } - v->addr = addrstart; - if(tempsize > 0) size = tempsize; - } - - if (size >= ft->dsize && vifRegs->num > 0) { //Else write what we do have - DevCon.Warning("huh!!!!!!!!!!!!!!!!!!!!!!"); - VIF_LOG("warning, end with size = %d", size); - // unpack one qword - //v->addr += (size / ft->dsize) * 4; - func(dest, (u32*)cdata, size / ft->dsize); - size = 0; - VIFUNPACK_LOG("leftover done, size %d, vifnum %d, addr %x", size, vifRegs->num, v->addr); - } - } - else { // filling write - if(vifRegs->cycle.cl > 0) // Quicker and avoids zero division :P - if((u32)(((size / ft->gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num) - DevCon.Warning("Filling write warning! %x < %x and CL = %x WL = %x", (size / ft->gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl); - - DevCon.Warning("filling write %d cl %d, wl %d mask %x mode %x unpacktype %x addr %x", vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mask, vifRegs->mode, unpackType, vif->tag.addr); - while (vifRegs->num > 0) { - if (vif->cl == vifRegs->cycle.wl) { - vif->cl = 0; - } - // unpack one qword - if (vif->cl < vifRegs->cycle.cl) { - if(size < ft->gsize) { DevCon.WriteLn("Out of Filling write data!"); break; } - func(dest, (u32*)cdata, ft->qsize); - cdata += ft->gsize; - size -= ft->gsize; - vif->cl++; - vifRegs->num--; - if (vif->cl == vifRegs->cycle.wl) { - vif->cl = 0; - } - } - else { - func(dest, (u32*)cdata, ft->qsize); - v->addr += 16; - vifRegs->num--; - vif->cl++; - } - dest += 4; - if (vifRegs->num == 0) break; - } - } -} +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . + */ + +// Old Vif Unpack Code +// Only here for testing/reference +// If newVif is defined and newVif1 isn't, vif1 will use this code +// same goes for vif0... 
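+
+// A reference sketch, only here for illustration (nothing calls it, and the
+// helper name is made up). With the skipping-write stepping used below
+// (CL >= WL: write WL quadwords, then skip the next CL-WL quadwords of VU
+// memory), the n'th unpacked quadword lands at:
+//
+// static u32 skipWriteQwAddr(u32 base, u32 n, u32 cl, u32 wl) {
+// return base + (((n / wl) * cl) + (n % wl)) * 16; // qwords are 16 bytes; assumes wl != 0
+// }
+//
+// e.g. CL=4, WL=1 puts consecutive quadwords 64 bytes apart, which matches the
+// dest += incdest step below, where incdest = ((CL-WL) << 2) + 4 u32s,
+// i.e. (CL-WL+1) quadwords.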
+template void VIFunpack<0>(u32 *data, vifCode *v, u32 size); +template void VIFunpack<1>(u32 *data, vifCode *v, u32 size); +template void VIFunpack(u32 *data, vifCode *v, u32 size) { + //if (!VIFdmanum) DevCon.WriteLn("vif#%d, size = %d [%x]", VIFdmanum, size, data); + UNPACKFUNCTYPE func; + const VIFUnpackFuncTable *ft; + VURegs * VU; + u8 *cdata = (u8*)data; + u32 tempsize = 0; + const u32 memlimit = vif_size(VIFdmanum); + + if (VIFdmanum == 0) { + VU = &VU0; + vifRegs = vif0Regs; + vifMaskRegs = g_vif0Masks; + vif = &vif0; + vifRow = g_vifmask.Row0; + } + else { + VU = &VU1; + vifRegs = vif1Regs; + vifMaskRegs = g_vif1Masks; + vif = &vif1; + vifRow = g_vifmask.Row1; + } + + u32 *dest = (u32*)(VU->Mem + v->addr); + u32 unpackType = v->cmd & 0xf; + + ft = &VIFfuncTable[ unpackType ]; + func = vif->usn ? ft->funcU : ft->funcS; + size <<= 2; + + if (vifRegs->cycle.cl >= vifRegs->cycle.wl) { // skipping write + if (v->addr >= memlimit) { + DevCon.Warning("Overflown at the start"); + v->addr &= (memlimit - 1); + dest = (u32*)(VU->Mem + v->addr); + } + + size = min(size, (int)vifRegs->num * ft->gsize); //size will always be the same or smaller + + tempsize = v->addr + ((((vifRegs->num-1) / vifRegs->cycle.wl) * + (vifRegs->cycle.cl - vifRegs->cycle.wl)) * 16) + (vifRegs->num * 16); + + //Sanity Check (memory overflow) + if (tempsize > memlimit) { + if (((vifRegs->cycle.cl != vifRegs->cycle.wl) && + ((memlimit + (vifRegs->cycle.cl - vifRegs->cycle.wl) * 16) == tempsize))) { + //It's a red herring, so ignore it! SSE unpacks will be much quicker. + DevCon.WriteLn("what!!!!!!!!!"); + //tempsize = 0; + tempsize = size; + size = 0; + } + else { + DevCon.Warning("VIF%x Unpack ending %x > %x", VIFdmanum, tempsize, VIFdmanum ? 0x4000 : 0x1000); + tempsize = size; + size = 0; + } + } + else { + tempsize = size; + size = 0; + } + if (tempsize) { + int incdest = ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + 4; + size = 0; + int addrstart = v->addr; + //if((tempsize >> 2) != v->size) DevCon.Warning("split when size != tagsize"); + + VIFUNPACK_LOG("sorting tempsize :p, size %d, vifnum %d, addr %x", tempsize, vifRegs->num, v->addr); + + while ((tempsize >= ft->gsize) && (vifRegs->num > 0)) { + if(v->addr >= memlimit) { + DevCon.Warning("Mem limit overflow"); + v->addr &= (memlimit - 1); + dest = (u32*)(VU->Mem + v->addr); + } + + func(dest, (u32*)cdata, ft->qsize); + cdata += ft->gsize; + tempsize -= ft->gsize; + + vifRegs->num--; + vif->cl++; + + if (vif->cl == vifRegs->cycle.wl) { + dest += incdest; + v->addr +=(incdest * 4); + vif->cl = 0; + } + else { + dest += 4; + v->addr += 16; + } + } + if (v->addr >= memlimit) { + v->addr &=(memlimit - 1); + dest = (u32*)(VU->Mem + v->addr); + } + v->addr = addrstart; + if(tempsize > 0) size = tempsize; + } + + if (size >= ft->dsize && vifRegs->num > 0) { //Else write what we do have + DevCon.Warning("huh!!!!!!!!!!!!!!!!!!!!!!"); + VIF_LOG("warning, end with size = %d", size); + // unpack one qword + //v->addr += (size / ft->dsize) * 4; + func(dest, (u32*)cdata, size / ft->dsize); + size = 0; + VIFUNPACK_LOG("leftover done, size %d, vifnum %d, addr %x", size, vifRegs->num, v->addr); + } + } + else { // filling write + if(vifRegs->cycle.cl > 0) // Quicker and avoids zero division :P + if((u32)(((size / ft->gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num) + DevCon.Warning("Filling write warning! 
%x < %x and CL = %x WL = %x", (size / ft->gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl); + + DevCon.Warning("filling write %d cl %d, wl %d mask %x mode %x unpacktype %x addr %x", vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mask, vifRegs->mode, unpackType, vif->tag.addr); + while (vifRegs->num > 0) { + if (vif->cl == vifRegs->cycle.wl) { + vif->cl = 0; + } + // unpack one qword + if (vif->cl < vifRegs->cycle.cl) { + if(size < ft->gsize) { DevCon.WriteLn("Out of Filling write data!"); break; } + func(dest, (u32*)cdata, ft->qsize); + cdata += ft->gsize; + size -= ft->gsize; + vif->cl++; + vifRegs->num--; + if (vif->cl == vifRegs->cycle.wl) { + vif->cl = 0; + } + } + else { + func(dest, (u32*)cdata, ft->qsize); + v->addr += 16; + vifRegs->num--; + vif->cl++; + } + dest += 4; + if (vifRegs->num == 0) break; + } + } +} diff --git a/pcsx2/x86/newVif_Unpack.inl b/pcsx2/x86/newVif_Unpack.inl index 544560221d..6d1296f8c0 100644 --- a/pcsx2/x86/newVif_Unpack.inl +++ b/pcsx2/x86/newVif_Unpack.inl @@ -1,261 +1,279 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2009 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - -// newVif! - author: cottonvibes(@gmail.com) - -#pragma once - -struct nVifStruct { - u32 idx; // VIF0 or VIF1 - vifStruct* vif; // Vif Struct ptr - VIFregisters* vifRegs; // Vif Regs ptr - VURegs* VU; // VU Regs ptr - u8* vuMemEnd; // End of VU Memory - u32 vuMemLimit; // Use for fast AND - BlockBuffer* vifBlock; // Block Buffer -}; -nVifStruct nVif[2]; - -void initNewVif(int idx) { - nVif[idx].idx = idx; - nVif[idx].VU = idx ? &VU1 : &VU0; - nVif[idx].vif = idx ? &vif1 : &vif0; - nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs; - nVif[idx].vifBlock = new BlockBuffer(0x2000); // 8kb Block Buffer - nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000)); - nVif[idx].vuMemLimit= idx ? 
0x3ff0 : 0xff0; - - HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false); - memset8<0xcc>( nVifUpkExec ); - - xSetPtr( nVifUpkExec ); - - for (int a = 0; a < 2; a++) { - for (int b = 0; b < 2; b++) { - for (int c = 0; c < 4; c++) { - for (int d = 0; d < 3; d++) { - nVifGen(a, b, c, d); - }}}} - - HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true); -} - -int nVifUnpack(int idx, u32 *data) { - XMMRegisters::Freeze(); - //BlockBuffer* vB = nVif[idx].vifBlock; - int ret = aMin(vif1.vifpacketsize, vif1.tag.size); - vif1.tag.size -= ret; - _nVifUnpack(idx, (u8*)data, ret<<2); - if (vif1.tag.size <= 0) vif1.tag.size = 0; - if (vif1.tag.size <= 0) vif1.cmd = 0; - XMMRegisters::Thaw(); - return ret; -} - -_f u8* setVUptr(int idx, int offset) { - return (u8*)(nVif[idx].VU->Mem + (offset & nVif[idx].vuMemLimit)); -} - -_f void incVUptr(int idx, u8* &ptr, int amount) { - ptr += amount; - int diff = ptr - nVif[idx].vuMemEnd; - if (diff >= 0) { - ptr = nVif[idx].VU->Mem + diff; - } - if ((uptr)ptr & 0xf) DevCon.WriteLn("unaligned wtf :("); -} - -static void setMasks(const VIFregisters& v) { - for (int i = 0; i < 16; i++) { - int m = (v.mask >> (i*2)) & 3; - switch (m) { - case 0: // Data - nVifMask[0][i/4][i%4] = 0xffffffff; - nVifMask[1][i/4][i%4] = 0; - nVifMask[2][i/4][i%4] = 0; - break; - case 1: // Row - nVifMask[0][i/4][i%4] = 0; - nVifMask[1][i/4][i%4] = 0; - nVifMask[2][i/4][i%4] = ((u32*)&v.r0)[(i%4)*4]; - break; - case 2: // Col - nVifMask[0][i/4][i%4] = 0; - nVifMask[1][i/4][i%4] = 0; - nVifMask[2][i/4][i%4] = ((u32*)&v.c0)[(i/4)*4]; - break; - case 3: // Write Protect - nVifMask[0][i/4][i%4] = 0; - nVifMask[1][i/4][i%4] = 0xffffffff; - nVifMask[2][i/4][i%4] = 0; - break; - } - } -} - -// ---------------------------------------------------------------------------- -// Unpacking Optimization notes: -// ---------------------------------------------------------------------------- -// Some games send a LOT of small packets. This is a problem because the new VIF unpacker -// has a lot of setup code to establish which unpack function to call. The best way to -// optimize this is to cache the unpack function's base (see fnbase below) and update it -// when the variables it depends on are modified: writes to vif->tag.cmd and vif->usn. -// -// A secondary optimization would be adding special handlers for packets where vifRegs->num==1. -// (which would remove the loop, simplify the incVUptr code, etc). But checking for it has -// to be simple enough that it doesn't offset the benefits (which I'm not sure is possible). -// -- air - - -template< int idx, bool doMode, bool isFill > -__releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size ) -{ - // Eh... template attempt, tho not sure it helped much. There's too much setup code (see - // optimization note above) -- air - - const int usn = !!(vif->usn); - const int doMask = !!(vif->tag.cmd & 0x10); - const int upkNum = vif->tag.cmd & 0xf; - const u32& vift = nVifT[upkNum]; - - u8* dest = setVUptr(idx, vif->tag.addr); - const VIFUnpackFuncTable& ft = VIFfuncTable[vif->tag.cmd & 0xf]; - UNPACKFUNCTYPE func = vif->usn ? ft.funcU : ft.funcS; - - const nVifCall* fnbase = &nVifUpk[ - ((usn*2*16) + (doMask*16) + (upkNum)) * (4*4) - ]; - - const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl; - const int blockSize = isFill ? 
vifRegs->cycle.wl : vifRegs->cycle.cl; - - if (doMask) - setMasks(*vifRegs); - - if (vif->cl >= blockSize) { - vif->cl = 0; - } - - while (vifRegs->num > 0) { - if (vif->cl < cycleSize) { - //if (size <= 0) { DbgCon.WriteLn("_nVifUnpack: Out of Data!"); break; } - if (doMode /*|| doMask*/) { - //if (doMask) - //DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum); - func((u32*)dest, (u32*)data, ft.qsize); - data += ft.gsize; - size -= ft.gsize; - vifRegs->num--; - } - else if (1) { - //DevCon.WriteLn("SSE Unpack!"); - fnbase[aMin(vif->cl, 4) * 4](dest, data); - data += vift; - size -= vift; - vifRegs->num--; - } - else { - - //DevCon.WriteLn("SSE Unpack!"); - int c = aMin((cycleSize - vif->cl), 3); - size -= vift * c; - //if (c>1) { DevCon.WriteLn("C > 1!"); } - if (c<0||c>3) { DbgCon.WriteLn("C wtf!"); } - if (size < 0) { DbgCon.WriteLn("Size Shit"); size+=vift*c;c=1;size-=vift*c;} - fnbase[(aMin(vif->cl, 4) * 4) + c-1](dest, data); - data += vift * c; - vifRegs->num -= c; - } - } - else if (isFill) { - func((u32*)dest, (u32*)data, ft.qsize); - vifRegs->num--; - } - incVUptr(idx, dest, 16); - - // Removing this modulo was a huge speedup for God of War. (62->73 fps) - // (GoW uses a lot of blockSize==1 packets, resulting in tons of loops -- so the biggest - // factor in performance ends up being the top-level conditionals of the loop, and - // also the loop prep code.) --air - - //vif->cl = (vif->cl+1) % blockSize; - if( ++vif->cl == blockSize ) vif->cl = 0; - } -} - -void _nVifUnpack(int idx, u8 *data, u32 size) { - /*if (nVif[idx].vifRegs->cycle.cl >= nVif[idx].vifRegs->cycle.wl) { // skipping write - if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2); - else VIFunpack<1>((u32*)data, &vif1.tag, size>>2); - return; - } - else*/ { // filling write - vif = nVif[idx].vif; - vifRegs = nVif[idx].vifRegs; - - const bool doMode = !!vifRegs->mode; - const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl); - - //UnpackLoopTable[idx][doMode][isFill]( data, size ); - - if( idx ) - { - if( doMode ) - { - if( isFill ) - _nVifUnpackLoop<1,true,true>( data, size ); - else - _nVifUnpackLoop<1,true,false>( data, size ); - } - else - { - if( isFill ) - _nVifUnpackLoop<1,false,true>( data, size ); - else - _nVifUnpackLoop<1,false,false>( data, size ); - } - } - else - { - pxFailDev( "No VIF0 support yet, sorry!" ); - } - - //if (isFill) - //DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok")); - //DevCon.WriteLn("%s Write! 
[mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num); - - } -} - -//int nVifUnpack(int idx, u32 *data) { -// XMMRegisters::Freeze(); -// BlockBuffer* vB = nVif[idx].vifBlock; -// int ret = aMin(vif1.vifpacketsize, vif1.tag.size); -// //vB->append(data, ret<<2); -// vif1.tag.size -= ret; -// //DevCon.WriteLn("2 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2); -// //if (vif1.tag.size <= 0) { -// //DevCon.WriteLn("3 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2); -// //VIFunpack<1>(vB->getBlock(), &vif1.tag, vB->getSize()>>2); -// //_nVifUnpack(idx, vB->getBlock(), vB->getSize()); -// _nVifUnpack(idx, (u8*)data, ret<<2); -// if (vif1.tag.size <= 0) vif1.tag.size = 0; -// if (vif1.tag.size <= 0) vif1.cmd = 0; -// //vB->clear(); -// //} -// //else { vif1.tag.size+=ret; ret = -1; vB->clear(); } -// XMMRegisters::Thaw(); -// return ret; -//} +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . + */ + +// newVif! - author: cottonvibes(@gmail.com) + +#pragma once + +struct nVifStruct { + u32 idx; // VIF0 or VIF1 + vifStruct* vif; // Vif Struct ptr + VIFregisters* vifRegs; // Vif Regs ptr + VURegs* VU; // VU Regs ptr + u8* vuMemEnd; // End of VU Memory + u32 vuMemLimit; // Use for fast AND + BlockBuffer* vifBlock; // Block Buffer +}; + +static __aligned16 nVifStruct nVif[2]; + +void initNewVif(int idx) { + nVif[idx].idx = idx; + nVif[idx].VU = idx ? &VU1 : &VU0; + nVif[idx].vif = idx ? &vif1 : &vif0; + nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs; + nVif[idx].vifBlock = new BlockBuffer(0x2000); // 8kb Block Buffer + nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000)); + nVif[idx].vuMemLimit= idx ? 
0x3ff0 : 0xff0; + + HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false); + memset8<0xcc>( nVifUpkExec ); + + xSetPtr( nVifUpkExec ); + + for (int a = 0; a < 2; a++) { + for (int b = 0; b < 2; b++) { + for (int c = 0; c < 4; c++) { + for (int d = 0; d < 3; d++) { + nVifGen(a, b, c, d); + }}}} + + HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true); +} + +int nVifUnpack(int idx, u32 *data) { + XMMRegisters::Freeze(); + //BlockBuffer* vB = nVif[idx].vifBlock; + int ret = aMin(vif1.vifpacketsize, vif1.tag.size); + vif1.tag.size -= ret; + _nVifUnpack(idx, (u8*)data, ret<<2); + if (vif1.tag.size <= 0) vif1.tag.size = 0; + if (vif1.tag.size <= 0) vif1.cmd = 0; + XMMRegisters::Thaw(); + return ret; +} + +_f u8* setVUptr(int idx, int offset) { + return (u8*)(nVif[idx].VU->Mem + (offset & nVif[idx].vuMemLimit)); +} + +_f void incVUptr(int idx, u8* &ptr, int amount) { + ptr += amount; + int diff = ptr - nVif[idx].vuMemEnd; + if (diff >= 0) { + ptr = nVif[idx].VU->Mem + diff; + } + if ((uptr)ptr & 0xf) DevCon.WriteLn("unaligned wtf :("); +} + +static void setMasks(const VIFregisters& v) { + for (int i = 0; i < 16; i++) { + int m = (v.mask >> (i*2)) & 3; + switch (m) { + case 0: // Data + nVifMask[0][i/4][i%4] = 0xffffffff; + nVifMask[1][i/4][i%4] = 0; + nVifMask[2][i/4][i%4] = 0; + break; + case 1: // Row + nVifMask[0][i/4][i%4] = 0; + nVifMask[1][i/4][i%4] = 0; + nVifMask[2][i/4][i%4] = ((u32*)&v.r0)[(i%4)*4]; + break; + case 2: // Col + nVifMask[0][i/4][i%4] = 0; + nVifMask[1][i/4][i%4] = 0; + nVifMask[2][i/4][i%4] = ((u32*)&v.c0)[(i/4)*4]; + break; + case 3: // Write Protect + nVifMask[0][i/4][i%4] = 0; + nVifMask[1][i/4][i%4] = 0xffffffff; + nVifMask[2][i/4][i%4] = 0; + break; + } + } +} + +// ---------------------------------------------------------------------------- +// Unpacking Optimization notes: +// ---------------------------------------------------------------------------- +// Some games send a LOT of small packets. This is a problem because the new VIF unpacker +// has a lot of setup code to establish which unpack function to call. The best way to +// optimize this is to cache the unpack function's base (see fnbase below) and update it +// when the variables it depends on are modified: writes to vif->tag.cmd and vif->usn. +// Problem: vif->tag.cmd is modified a lot. Like, constantly. So won't work. +// +// A secondary optimization would be adding special handlers for packets where vifRegs->num==1. +// (which would remove the loop, simplify the incVUptr code, etc). But checking for it has +// to be simple enough that it doesn't offset the benefits (which I'm not sure is possible). +// -- air + + +//template< int idx, bool doMode, bool isFill > +//__releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size ) +__releaseinline void __fastcall _nVifUnpackLoop( int idx, u8 *data, u32 size ) +{ + // comment out the following 2 lines to test templated version... + const bool doMode = !!vifRegs->mode; + const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl); + + const int usn = !!(vif->usn); + const int doMask = !!(vif->tag.cmd & 0x10); + const int upkNum = vif->tag.cmd & 0xf; + const u32& vift = nVifT[upkNum]; + + u8* dest = setVUptr(idx, vif->tag.addr); + const VIFUnpackFuncTable& ft = VIFfuncTable[upkNum]; + UNPACKFUNCTYPE func = usn ? 
ft.funcU : ft.funcS; + + // Did a bunch of work to make it so I could optimize this index lookup to outside + // the main loop but it was for naught -- too often the loop is only 1-2 iterations, + // so this setup code ends up being slower (1 iter) or same speed (2 iters). + const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + (doMask*16) + (upkNum)) * (4*4) ]; + + const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl; + const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl; + + if (doMask) + setMasks(*vifRegs); + + if (vif->cl >= blockSize) { + + // This condition doesn't appear to ever occur, and really it never should. + // Normally it wouldn't matter, but even simple setup code matters here (see + // optimization notes above) >_< + + vif->cl = 0; + } + + while (vifRegs->num > 0) { + if (vif->cl < cycleSize) { + //if (size <= 0) { DbgCon.WriteLn("_nVifUnpack: Out of Data!"); break; } + if (doMode /*|| doMask*/) { + //if (doMask) + //DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum); + func((u32*)dest, (u32*)data, ft.qsize); + data += ft.gsize; + size -= ft.gsize; + vifRegs->num--; + } + else if (1) { + //DevCon.WriteLn("SSE Unpack!"); + fnbase[aMin(vif->cl, 4) * 4](dest, data); + data += vift; + size -= vift; + vifRegs->num--; + } + else { + //DevCon.WriteLn("SSE Unpack!"); + int c = aMin((cycleSize - vif->cl), 3); + size -= vift * c; + //if (c>1) { DevCon.WriteLn("C > 1!"); } + if (c<0||c>3) { DbgCon.WriteLn("C wtf!"); } + if (size < 0) { DbgCon.WriteLn("Size Shit"); size+=vift*c;c=1;size-=vift*c;} + fnbase[(aMin(vif->cl, 4) * 4) + c-1](dest, data); + data += vift * c; + vifRegs->num -= c; + } + } + else if (isFill) { + func((u32*)dest, (u32*)data, ft.qsize); + vifRegs->num--; + } + incVUptr(idx, dest, 16); + + // Removing this modulo was a huge speedup for God of War start menu. (62->73 fps) + // (GoW and tri-ace games both use a lot of blockSize==1 packets, resulting in tons + // of loops -- so the biggest factor in performance ends up being the top-level + // conditionals of the loop, and also the loop prep code.) --air + + //vif->cl = (vif->cl+1) % blockSize; + if( ++vif->cl == blockSize ) vif->cl = 0; + } +} + +void _nVifUnpack(int idx, u8 *data, u32 size) { + /*if (nVif[idx].vifRegs->cycle.cl >= nVif[idx].vifRegs->cycle.wl) { // skipping write + if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2); + else VIFunpack<1>((u32*)data, &vif1.tag, size>>2); + return; + } + else*/ { // filling write + + vif = nVif[idx].vif; + vifRegs = nVif[idx].vifRegs; + +#if 1 + _nVifUnpackLoop( idx, data, size ); +#else + // Eh... template attempt, tho it didn't help much. There's too much setup code, + // and the template only optimizes code inside the loop, which often times seems to + // only be run once or twice anyway. Better to use recompilation than templating + // anyway, but I'll leave it in for now for reference. -- air + + const bool doMode = !!vifRegs->mode; + const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl); + + //UnpackLoopTable[idx][doMode][isFill]( data, size ); + + if( idx ) + { + if( doMode ) + { + if( isFill ) + _nVifUnpackLoop<1,true,true>( data, size ); + else + _nVifUnpackLoop<1,true,false>( data, size ); + } + else + { + if( isFill ) + _nVifUnpackLoop<1,false,true>( data, size ); + else + _nVifUnpackLoop<1,false,false>( data, size ); + } + } + else + { + pxFailDev( "No VIF0 support yet, sorry!" ); + } +#endif + //if (isFill) + //DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" 
: "ok")); + //DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num); + + } +} + +//int nVifUnpack(int idx, u32 *data) { +// XMMRegisters::Freeze(); +// BlockBuffer* vB = nVif[idx].vifBlock; +// int ret = aMin(vif1.vifpacketsize, vif1.tag.size); +// //vB->append(data, ret<<2); +// vif1.tag.size -= ret; +// //DevCon.WriteLn("2 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2); +// //if (vif1.tag.size <= 0) { +// //DevCon.WriteLn("3 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2); +// //VIFunpack<1>(vB->getBlock(), &vif1.tag, vB->getSize()>>2); +// //_nVifUnpack(idx, vB->getBlock(), vB->getSize()); +// _nVifUnpack(idx, (u8*)data, ret<<2); +// if (vif1.tag.size <= 0) vif1.tag.size = 0; +// if (vif1.tag.size <= 0) vif1.cmd = 0; +// //vB->clear(); +// //} +// //else { vif1.tag.size+=ret; ret = -1; vB->clear(); } +// XMMRegisters::Thaw(); +// return ret; +//} diff --git a/pcsx2/x86/newVif_UnpackGen.inl b/pcsx2/x86/newVif_UnpackGen.inl index 8a6be76fc9..e735704e62 100644 --- a/pcsx2/x86/newVif_UnpackGen.inl +++ b/pcsx2/x86/newVif_UnpackGen.inl @@ -1,256 +1,255 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2009 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - -#pragma once - -#define xMaskWrite(regX, x) { \ - if (x==0) xMOVAPS(xmm7, ptr32[ecx]); \ - if (x==1) xMOVAPS(xmm7, ptr32[ecx+0x10]); \ - if (x==2) xMOVAPS(xmm7, ptr32[ecx+0x20]); \ - int offX = aMin(curCycle+x, 4); \ - xPAND(regX, ptr32[nVifMask[0][offX]]); \ - xPAND(xmm7, ptr32[nVifMask[1][offX]]); \ - xPOR (regX, ptr32[nVifMask[2][offX]]); \ - xPOR (regX, xmm7); \ - if (x==0) xMOVAPS(ptr32[ecx], regX); \ - if (x==1) xMOVAPS(ptr32[ecx+0x10], regX); \ - if (x==2) xMOVAPS(ptr32[ecx+0x20], regX); \ -} - -#define xMovDest(reg0, reg1, reg2) { \ - if (mask==0) { \ - if (cycles>=0) { xMOVAPS (ptr32[ecx], reg0); } \ - if (cycles>=1) { xMOVAPS (ptr32[ecx+0x10], reg1); } \ - if (cycles>=2) { xMOVAPS (ptr32[ecx+0x20], reg2); } \ - } \ - else { \ - if (cycles>=0) { xMaskWrite(reg0, 0); } \ - if (cycles>=1) { xMaskWrite(reg1, 1); } \ - if (cycles>=2) { xMaskWrite(reg2, 2); } \ - } \ -} - -// xmm2 gets result -void convertRGB() { - xPSLL.D (xmm1, 3); // ABG|R5.000 - xMOVAPS (xmm2, xmm1);// R5.000 (garbage upper bits) - xPSRL.D (xmm1, 8); // ABG - xPSLL.D (xmm1, 3); // AB|G5.000 - xMOVAPS (xmm3, xmm1);// G5.000 (garbage upper bits) - xPSRL.D (xmm1, 8); // AB - xPSLL.D (xmm1, 3); // A|B5.000 - xMOVAPS (xmm4, xmm1);// B5.000 (garbage upper bits) - xPSRL.D (xmm1, 8); // A - xPSLL.D (xmm1, 7); // A.0000000 - - xPSHUF.D (xmm1, xmm1, _v0); // A|A|A|A - xPSHUF.D (xmm3, xmm3, _v0); // G|G|G|G - xPSHUF.D (xmm4, xmm4, _v0); // B|B|B|B - mVUmergeRegs(XMM2, XMM1, 0x3); // A|x|x|R - mVUmergeRegs(XMM2, XMM3, 0x4); // A|x|G|R - mVUmergeRegs(XMM2, XMM4, 0x2); // A|B|G|R - - xPSLL.D (xmm2, 24); // can optimize to - xPSRL.D (xmm2, 24); // single AND... 
-} - -struct VifUnpackIndexer -{ - int usn, mask; - int curCycle, cyclesToWrite; - - nVifCall& GetCall( int packType ) const - { - int usnpart = usn*2*16; - int maskpart = mask*16; - int packpart = packType; - - int curpart = curCycle*4; - int cycpespart = cyclesToWrite; - - return nVifUpk[((usnpart+maskpart+packpart)*(4*4)) + (curpart+cycpespart)]; - } - - void xSetCall( int packType ) const - { - xAlignPtr(16); - GetCall( packType ) = (nVifCall)xGetPtr(); - } - - void xSetNullCall( int packType ) const - { - GetCall( packType ) = NULL; - } -}; - -// ecx = dest, edx = src -void nVifGen(int usn, int mask, int curCycle, int cycles) { - const VifUnpackIndexer indexer = { usn, mask, curCycle, cycles }; - - indexer.xSetCall(0x0); // S-32 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0); - if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1); - if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2); - if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); - xRET(); - - indexer.xSetCall(0x1); // S-16 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=0) xShiftR (xmm0, 16); - if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0); - if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1); - if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2); - if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); - xRET(); - - indexer.xSetCall(0x2); // S-8 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=0) xShiftR (xmm0, 24); - if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0); - if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1); - if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2); - if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); - xRET(); - - indexer.xSetNullCall(0x3); // ---- - - indexer.xSetCall(0x4); // V2-32 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x10]); - if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); - xRET(); - - indexer.xSetCall(0x5); // V2-16 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); - if (cycles>=0) xShiftR (xmm0, 16); - if (cycles>=2) xShiftR (xmm2, 16); - if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); - xRET(); - - indexer.xSetCall(0x6); // V2-8 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); - if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); - if (cycles>=0) xShiftR (xmm0, 24); - if (cycles>=2) xShiftR (xmm2, 24); - if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); - xRET(); - - indexer.xSetNullCall(0x7); // ---- - - indexer.xSetCall(0x8); // V3-32 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+12]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+24]); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); - xRET(); - - indexer.xSetCall(0x9); // V3-16 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+6]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+12]); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); - if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); - if (cycles>=0) xShiftR (xmm0, 16); - if (cycles>=1) xShiftR (xmm1, 16); - if (cycles>=2) xShiftR (xmm2, 16); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); - xRET(); - - 
indexer.xSetCall(0xa); // V3-8 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+3]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+6]); - if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); - if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1); - if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); - if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); - if (cycles>=0) xShiftR (xmm0, 24); - if (cycles>=1) xShiftR (xmm1, 24); - if (cycles>=2) xShiftR (xmm2, 24); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); - xRET(); - - indexer.xSetNullCall(0xb); // ---- - - indexer.xSetCall(0xc); // V4-32 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); - xRET(); - - indexer.xSetCall(0xd); // V4-16 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); - if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); - if (cycles>=0) xShiftR (xmm0, 16); - if (cycles>=1) xShiftR (xmm1, 16); - if (cycles>=2) xShiftR (xmm2, 16); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); - xRET(); - - indexer.xSetCall(0xe); // V4-8 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+4]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+8]); - if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); - if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1); - if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); - if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); - if (cycles>=0) xShiftR (xmm0, 24); - if (cycles>=1) xShiftR (xmm1, 24); - if (cycles>=2) xShiftR (xmm2, 24); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); - xRET(); - - // A | B5 | G5 | R5 - // ..0.. A 0000000 | ..0.. B 000 | ..0.. G 000 | ..0.. R 000 - indexer.xSetCall(0xf); // V4-5 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=0) xMOVAPS (xmm1, xmm0); - if (cycles>=0) convertRGB(); - if (cycles>=0) xMOVAPS (ptr32[ecx], xmm2); - if (cycles>=1) xMOVAPS (xmm1, xmm0); - if (cycles>=1) xPSRL.D (xmm1, 16); - if (cycles>=1) convertRGB(); - if (cycles>=1) xMOVAPS (ptr32[ecx+0x10], xmm2); - if (cycles>=2) xPSHUF.D (xmm1, xmm0, _v1); - if (cycles>=2) convertRGB(); - if (cycles>=2) xMOVAPS (ptr32[ecx+0x20], xmm2); - xRET(); - - pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) ); -} +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . 
+ */ + +#pragma once + +#define xMaskWrite(regX, x) { \ + if (x==0) xMOVAPS(xmm7, ptr32[ecx]); \ + if (x==1) xMOVAPS(xmm7, ptr32[ecx+0x10]); \ + if (x==2) xMOVAPS(xmm7, ptr32[ecx+0x20]); \ + int offX = aMin(curCycle+x, 4); \ + xPAND(regX, ptr32[nVifMask[0][offX]]); \ + xPAND(xmm7, ptr32[nVifMask[1][offX]]); \ + xPOR (regX, ptr32[nVifMask[2][offX]]); \ + xPOR (regX, xmm7); \ + if (x==0) xMOVAPS(ptr32[ecx], regX); \ + if (x==1) xMOVAPS(ptr32[ecx+0x10], regX); \ + if (x==2) xMOVAPS(ptr32[ecx+0x20], regX); \ +} + +#define xMovDest(reg0, reg1, reg2) { \ + if (mask==0) { \ + if (cycles>=0) { xMOVAPS (ptr32[ecx], reg0); } \ + if (cycles>=1) { xMOVAPS (ptr32[ecx+0x10], reg1); } \ + if (cycles>=2) { xMOVAPS (ptr32[ecx+0x20], reg2); } \ + } \ + else { \ + if (cycles>=0) { xMaskWrite(reg0, 0); } \ + if (cycles>=1) { xMaskWrite(reg1, 1); } \ + if (cycles>=2) { xMaskWrite(reg2, 2); } \ + } \ +} + +// xmm2 gets result +void convertRGB() { + xPSLL.D (xmm1, 3); // ABG|R5.000 + xMOVAPS (xmm2, xmm1);// R5.000 (garbage upper bits) + xPSRL.D (xmm1, 8); // ABG + xPSLL.D (xmm1, 3); // AB|G5.000 + xMOVAPS (xmm3, xmm1);// G5.000 (garbage upper bits) + xPSRL.D (xmm1, 8); // AB + xPSLL.D (xmm1, 3); // A|B5.000 + xMOVAPS (xmm4, xmm1);// B5.000 (garbage upper bits) + xPSRL.D (xmm1, 8); // A + xPSLL.D (xmm1, 7); // A.0000000 + + xPSHUF.D (xmm1, xmm1, _v0); // A|A|A|A + xPSHUF.D (xmm3, xmm3, _v0); // G|G|G|G + xPSHUF.D (xmm4, xmm4, _v0); // B|B|B|B + mVUmergeRegs(XMM2, XMM1, 0x3); // A|x|x|R + mVUmergeRegs(XMM2, XMM3, 0x4); // A|x|G|R + mVUmergeRegs(XMM2, XMM4, 0x2); // A|B|G|R + + xPSLL.D (xmm2, 24); // can optimize to + xPSRL.D (xmm2, 24); // single AND... +} + +struct VifUnpackIndexer +{ + int usn, mask; + int curCycle, cyclesToWrite; + + nVifCall& GetCall( int packType ) const + { + int usnpart = usn*2*16; + int maskpart = mask*16; + int packpart = packType; + + int curpart = curCycle*4; + int cycpespart = cyclesToWrite; + + return nVifUpk[((usnpart+maskpart+packpart)*(4*4)) + (curpart+cycpespart)]; + } + + void xSetCall( int packType ) const + { + GetCall( packType ) = (nVifCall)xGetAlignedCallTarget(); + } + + void xSetNullCall( int packType ) const + { + GetCall( packType ) = NULL; + } +}; + +// ecx = dest, edx = src +void nVifGen(int usn, int mask, int curCycle, int cycles) { + const VifUnpackIndexer indexer = { usn, mask, curCycle, cycles }; + + indexer.xSetCall(0x0); // S-32 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0); + if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1); + if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2); + if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); + xRET(); + + indexer.xSetCall(0x1); // S-16 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=0) xShiftR (xmm0, 16); + if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0); + if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1); + if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2); + if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); + xRET(); + + indexer.xSetCall(0x2); // S-8 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=0) xShiftR (xmm0, 24); + if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0); + if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1); + if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2); + if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); + xRET(); + + indexer.xSetNullCall(0x3); // ---- + + indexer.xSetCall(0x4); // V2-32 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x10]); + if 
(cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + indexer.xSetCall(0x5); // V2-16 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); + if (cycles>=0) xShiftR (xmm0, 16); + if (cycles>=2) xShiftR (xmm2, 16); + if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + indexer.xSetCall(0x6); // V2-8 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); + if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); + if (cycles>=0) xShiftR (xmm0, 24); + if (cycles>=2) xShiftR (xmm2, 24); + if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + indexer.xSetNullCall(0x7); // ---- + + indexer.xSetCall(0x8); // V3-32 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+12]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+24]); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + indexer.xSetCall(0x9); // V3-16 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+6]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+12]); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); + if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); + if (cycles>=0) xShiftR (xmm0, 16); + if (cycles>=1) xShiftR (xmm1, 16); + if (cycles>=2) xShiftR (xmm2, 16); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + indexer.xSetCall(0xa); // V3-8 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+3]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+6]); + if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); + if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1); + if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); + if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); + if (cycles>=0) xShiftR (xmm0, 24); + if (cycles>=1) xShiftR (xmm1, 24); + if (cycles>=2) xShiftR (xmm2, 24); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + indexer.xSetNullCall(0xb); // ---- + + indexer.xSetCall(0xc); // V4-32 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + indexer.xSetCall(0xd); // V4-16 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); + if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); + if (cycles>=0) xShiftR (xmm0, 16); + if (cycles>=1) xShiftR (xmm1, 16); + if (cycles>=2) xShiftR (xmm2, 16); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + indexer.xSetCall(0xe); // V4-8 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+4]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+8]); + if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); + if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1); + if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); + if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); + if (cycles>=0) xShiftR (xmm0, 24); + if (cycles>=1) xShiftR (xmm1, 24); + if (cycles>=2) xShiftR (xmm2, 24); + if (cycles>=0) xMovDest 
(xmm0, xmm1, xmm2);
+ xRET();
+
+ // A | B5 | G5 | R5
+ // ..0.. A 0000000 | ..0.. B 000 | ..0.. G 000 | ..0.. R 000
+ indexer.xSetCall(0xf); // V4-5
+ if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
+ if (cycles>=0) xMOVAPS (xmm1, xmm0);
+ if (cycles>=0) convertRGB();
+ if (cycles>=0) xMOVAPS (ptr32[ecx], xmm2);
+ if (cycles>=1) xMOVAPS (xmm1, xmm0);
+ if (cycles>=1) xPSRL.D (xmm1, 16);
+ if (cycles>=1) convertRGB();
+ if (cycles>=1) xMOVAPS (ptr32[ecx+0x10], xmm2);
+ if (cycles>=2) xPSHUF.D (xmm1, xmm0, _v1);
+ if (cycles>=2) convertRGB();
+ if (cycles>=2) xMOVAPS (ptr32[ecx+0x20], xmm2);
+ xRET();
+
+ pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );
+}
diff --git a/pcsx2/x86/sVU_Micro.cpp b/pcsx2/x86/sVU_Micro.cpp
index b5e28abc37..c5c01f6228 100644
--- a/pcsx2/x86/sVU_Micro.cpp
+++ b/pcsx2/x86/sVU_Micro.cpp
@@ -1,1739 +1,1739 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "PrecompiledHeader.h"
-
-#include "Common.h"
-#include "GS.h"
-#include "R5900OpcodeTables.h"
-#include "iR5900.h"
-#include "iMMI.h"
-#include "iFPU.h"
-#include "iCOP0.h"
-#include "VUmicro.h"
-#include "VUflags.h"
-#include "sVU_Micro.h"
-#include "sVU_Debug.h"
-#include "sVU_zerorec.h"
-
-#ifdef _WIN32
-#pragma warning(disable:4244)
-#pragma warning(disable:4761)
-#endif
-//------------------------------------------------------------------
-
-// fixme - VUmicro should really use its own static vars for pc and branch.
-// Sharing with the EE's copies of pc and branch is not cool! (air)
-
-//------------------------------------------------------------------
-// Helper Macros
-//------------------------------------------------------------------
-#define _Ft_ (( VU->code >> 16) & 0x1F) // The rt part of the instruction register
-#define _Fs_ (( VU->code >> 11) & 0x1F) // The rd part of the instruction register
-#define _Fd_ (( VU->code >> 6) & 0x1F) // The sa part of the instruction register
-#define _It_ (_Ft_ & 15)
-#define _Is_ (_Fs_ & 15)
-#define _Id_ (_Fd_ & 15)
-
-#define _X (( VU->code>>24) & 0x1)
-#define _Y (( VU->code>>23) & 0x1)
-#define _Z (( VU->code>>22) & 0x1)
-#define _W (( VU->code>>21) & 0x1)
-
-#define _XYZW_SS (_X+_Y+_Z+_W==1)
-
-#define _Fsf_ (( VU->code >> 21) & 0x03)
-#define _Ftf_ (( VU->code >> 23) & 0x03)
-
-#define _Imm11_ (s32)(VU->code & 0x400 ?
0xfffffc00 | (VU->code & 0x3ff) : VU->code & 0x3ff) -#define _UImm11_ (s32)(VU->code & 0x7ff) - -#define VU_VFx_ADDR(x) (uptr)&VU->VF[x].UL[0] -#define VU_VFy_ADDR(x) (uptr)&VU->VF[x].UL[1] -#define VU_VFz_ADDR(x) (uptr)&VU->VF[x].UL[2] -#define VU_VFw_ADDR(x) (uptr)&VU->VF[x].UL[3] - -#define VU_REGR_ADDR (uptr)&VU->VI[REG_R] -#define VU_REGQ_ADDR (uptr)&VU->VI[REG_Q] -#define VU_REGMAC_ADDR (uptr)&VU->VI[REG_MAC_FLAG] - -#define VU_VI_ADDR(x, read) GetVIAddr(VU, x, read, info) - -#define VU_ACCx_ADDR (uptr)&VU->ACC.UL[0] -#define VU_ACCy_ADDR (uptr)&VU->ACC.UL[1] -#define VU_ACCz_ADDR (uptr)&VU->ACC.UL[2] -#define VU_ACCw_ADDR (uptr)&VU->ACC.UL[3] - -#define _X_Y_Z_W ((( VU->code >> 21 ) & 0xF ) ) -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// Global Variables -//------------------------------------------------------------------ -int vucycle; - -const __aligned16 float s_fones[8] = {1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f}; -const __aligned16 u32 s_mask[4] = {0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff}; -const __aligned16 u32 s_expmask[4] = {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; -const __aligned16 u32 g_minvals[4] = {0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff}; -const __aligned16 u32 g_maxvals[4] = {0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff}; -const __aligned16 u32 const_clip[8] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, - 0x80000000, 0x80000000, 0x80000000, 0x80000000}; - -const __aligned(64) u32 g_ones[4] = {0x00000001, 0x00000001, 0x00000001, 0x00000001}; - -const __aligned16 u32 g_minvals_XYZW[16][4] = -{ - { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //0000 - { 0xffffffff, 0xffffffff, 0xffffffff, 0xff7fffff }, //0001 - { 0xffffffff, 0xffffffff, 0xff7fffff, 0xffffffff }, //0010 - { 0xffffffff, 0xffffffff, 0xff7fffff, 0xff7fffff }, //0011 - { 0xffffffff, 0xff7fffff, 0xffffffff, 0xffffffff }, //0100 - { 0xffffffff, 0xff7fffff, 0xffffffff, 0xff7fffff }, //0101 - { 0xffffffff, 0xff7fffff, 0xff7fffff, 0xffffffff }, //0110 - { 0xffffffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //0111 - { 0xff7fffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000 - { 0xff7fffff, 0xffffffff, 0xffffffff, 0xff7fffff }, //1001 - { 0xff7fffff, 0xffffffff, 0xff7fffff, 0xffffffff }, //1010 - { 0xff7fffff, 0xffffffff, 0xff7fffff, 0xff7fffff }, //1011 - { 0xff7fffff, 0xff7fffff, 0xffffffff, 0xffffffff }, //1100 - { 0xff7fffff, 0xff7fffff, 0xffffffff, 0xff7fffff }, //1101 - { 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xffffffff }, //1110 - { 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //1111 -}; -const __aligned16 u32 g_maxvals_XYZW[16][4] = -{ - { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //0000 - { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7f7fffff }, //0001 - { 0x7fffffff, 0x7fffffff, 0x7f7fffff, 0x7fffffff }, //0010 - { 0x7fffffff, 0x7fffffff, 0x7f7fffff, 0x7f7fffff }, //0011 - { 0x7fffffff, 0x7f7fffff, 0x7fffffff, 0x7fffffff }, //0100 - { 0x7fffffff, 0x7f7fffff, 0x7fffffff, 0x7f7fffff }, //0101 - { 0x7fffffff, 0x7f7fffff, 0x7f7fffff, 0x7fffffff }, //0110 - { 0x7fffffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //0111 - { 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1000 - { 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7f7fffff }, //1001 - { 0x7f7fffff, 0x7fffffff, 0x7f7fffff, 0x7fffffff }, //1010 - { 0x7f7fffff, 0x7fffffff, 0x7f7fffff, 0x7f7fffff }, //1011 - { 0x7f7fffff, 0x7f7fffff, 0x7fffffff, 0x7fffffff }, //1100 - { 0x7f7fffff, 0x7f7fffff, 0x7fffffff, 0x7f7fffff 
}, //1101 - { 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7fffffff }, //1110 - { 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //1111 -}; -//------------------------------------------------------------------ - -//------------------------------------------------------------------ -// VU Pipeline/Test Stalls/Analyzing Functions -//------------------------------------------------------------------ -void _recvuFMACflush(VURegs * VU, bool intermediate) { - int i; - - for (i=0; i<8; i++) { - if (VU->fmac[i].enable == 0) continue; - - if( intermediate ) { - if ((vucycle - VU->fmac[i].sCycle) > VU->fmac[i].Cycle) { -// VUM_LOG("flushing FMAC pipe[%d]", i); - VU->fmac[i].enable = 0; - } - } - else { - if ((vucycle - VU->fmac[i].sCycle) >= VU->fmac[i].Cycle) { -// VUM_LOG("flushing FMAC pipe[%d]", i); - VU->fmac[i].enable = 0; - } - } - } -} - -void _recvuFDIVflush(VURegs * VU, bool intermediate) { - if (VU->fdiv.enable == 0) return; - - if( intermediate ) { - if ((vucycle - VU->fdiv.sCycle) > VU->fdiv.Cycle) { -// Console.WriteLn("flushing FDIV pipe"); - VU->fdiv.enable = 0; - } - } - else { - if ((vucycle - VU->fdiv.sCycle) >= VU->fdiv.Cycle) { -// Console.WriteLn("flushing FDIV pipe"); - VU->fdiv.enable = 0; - } - } -} - -void _recvuEFUflush(VURegs * VU, bool intermediate) { - if (VU->efu.enable == 0) return; - - if( intermediate ) { - if ((vucycle - VU->efu.sCycle) > VU->efu.Cycle) { -// Console.WriteLn("flushing FDIV pipe"); - VU->efu.enable = 0; - } - } - else { - if ((vucycle - VU->efu.sCycle) >= VU->efu.Cycle) { -// Console.WriteLn("flushing FDIV pipe"); - VU->efu.enable = 0; - } - } -} - -void _recvuIALUflush(VURegs * VU, bool intermediate) { - int i; - - for (i=0; i<8; i++) { - if (VU->ialu[i].enable == 0) continue; - - if( intermediate ) { - if ((vucycle - VU->ialu[i].sCycle) > VU->ialu[i].Cycle) { -// VUM_LOG("flushing IALU pipe[%d]", i); - VU->ialu[i].enable = 0; - } - } - else { - if ((vucycle - VU->ialu[i].sCycle) >= VU->ialu[i].Cycle) { -// VUM_LOG("flushing IALU pipe[%d]", i); - VU->ialu[i].enable = 0; - } - } - } -} - -void _recvuTestPipes(VURegs * VU, bool intermediate) { // intermediate = true if called by upper FMAC stall detection - _recvuFMACflush(VU, intermediate); - _recvuFDIVflush(VU, intermediate); - _recvuEFUflush(VU, intermediate); - _recvuIALUflush(VU, intermediate); -} - -void _recvuFMACTestStall(VURegs * VU, int reg, int xyzw) { - int cycle; - int i; - u32 mask = 0; - - for (i=0; i<8; i++) { - if (VU->fmac[i].enable == 0) continue; - if (VU->fmac[i].reg == reg && (VU->fmac[i].xyzw & xyzw)) break; - } - - if (i == 8) return; - - // do a perchannel delay - // old code -// cycle = VU->fmac[i].Cycle - (vucycle - VU->fmac[i].sCycle); - - // new code - mask = 4; // w -// if( VU->fmac[i].xyzw & 1 ) mask = 4; // w -// else if( VU->fmac[i].xyzw & 2 ) mask = 3; // z -// else if( VU->fmac[i].xyzw & 4 ) mask = 2; // y -// else { -// assert(VU->fmac[i].xyzw & 8 ); -// mask = 1; // x -// } - -// mask = 0; -// if( VU->fmac[i].xyzw & 1 ) mask++; // w -// else if( VU->fmac[i].xyzw & 2 ) mask++; // z -// else if( VU->fmac[i].xyzw & 4 ) mask++; // y -// else if( VU->fmac[i].xyzw & 8 ) mask++; // x - - assert( (int)VU->fmac[i].sCycle < (int)vucycle ); - cycle = 0; - if( vucycle - VU->fmac[i].sCycle < mask ) - cycle = mask - (vucycle - VU->fmac[i].sCycle); - - VU->fmac[i].enable = 0; - vucycle+= cycle; - _recvuTestPipes(VU, true); // for lower instructions -} - -void _recvuIALUTestStall(VURegs * VU, int reg) { - int cycle; - int i; - u32 latency; - - for (i=0; i<8; i++) { - if 
(VU->ialu[i].enable == 0) continue;
- if (VU->ialu[i].reg == reg) break;
- }
-
- if (i == 8) return;
-
- latency = VU->ialu[i].Cycle + 1;
- cycle = 0;
- if( vucycle - VU->ialu[i].sCycle < latency )
- cycle = latency - (vucycle - VU->ialu[i].sCycle);
-
- VU->ialu[i].enable = 0;
- vucycle+= cycle;
- _recvuTestPipes(VU, true);
-}
-
-void _recvuFMACAdd(VURegs * VU, int reg, int xyzw) {
- int i;
-
- /* find a free fmac pipe */
- for (i=0; i<8; i++) {
- if (VU->fmac[i].enable == 1) continue;
- break;
- }
-
- if (i==8) Console.Error("*PCSX2*: error, out of fmacs");
-// VUM_LOG("adding FMAC pipe[%d]; reg %d", i, reg);
-
- VU->fmac[i].enable = 1;
- VU->fmac[i].sCycle = vucycle;
- VU->fmac[i].Cycle = 3;
- VU->fmac[i].xyzw = xyzw;
- VU->fmac[i].reg = reg;
-}
-
-void _recvuFDIVAdd(VURegs * VU, int cycles) {
-// Console.WriteLn("adding FDIV pipe");
- VU->fdiv.enable = 1;
- VU->fdiv.sCycle = vucycle;
- VU->fdiv.Cycle = cycles;
-}
-
-void _recvuEFUAdd(VURegs * VU, int cycles) {
-// Console.WriteLn("adding EFU pipe");
- VU->efu.enable = 1;
- VU->efu.sCycle = vucycle;
- VU->efu.Cycle = cycles;
-}
-
-void _recvuIALUAdd(VURegs * VU, int reg, int cycles) {
- int i;
-
- /* find a free ialu pipe */
- for (i=0; i<8; i++) {
- if (VU->ialu[i].enable == 1) continue;
- break;
- }
-
- if (i==8) Console.Error("*PCSX2*: error, out of ialus");
-
- VU->ialu[i].enable = 1;
- VU->ialu[i].sCycle = vucycle;
- VU->ialu[i].Cycle = cycles;
- VU->ialu[i].reg = reg;
-}
-
-void _recvuTestIALUStalls(VURegs * VU, _VURegsNum *VUregsn) {
-
- int VIread0 = 0, VIread1 = 0; // max 2 integer registers are read simultaneously
- int i;
-
- for(i=0;i<16;i++) { // find used integer(vi00-vi15) registers
- if( (VUregsn->VIread >> i) & 1 ) {
- if( VIread0 ) VIread1 = i;
- else VIread0 = i;
- }
- }
-
- if( VIread0 ) _recvuIALUTestStall(VU, VIread0);
- if( VIread1 ) _recvuIALUTestStall(VU, VIread1);
-}
-
-void _recvuAddIALUStalls(VURegs * VU, _VURegsNum *VUregsn) {
- if (VUregsn->VIwrite && VUregsn->cycles) {
- int VIWrite0 = 0;
- int i;
-
- for(i=0;i<16;i++) { // find used(vi00-vi15) registers
- if( (VUregsn->VIwrite >> i) & 1 ) {
- VIWrite0 = i;
- }
- }
- if( VIWrite0 ) _recvuIALUAdd(VU, VIWrite0, VUregsn->cycles);
- }
-}
-
-void _recvuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn, bool upper) {
-
- if( VUregsn->VFread0 && (VUregsn->VFread0 == VUregsn->VFread1) ) {
- _recvuFMACTestStall(VU, VUregsn->VFread0, VUregsn->VFr0xyzw|VUregsn->VFr1xyzw);
- }
- else {
- if (VUregsn->VFread0) _recvuFMACTestStall(VU, VUregsn->VFread0, VUregsn->VFr0xyzw);
- if (VUregsn->VFread1) _recvuFMACTestStall(VU, VUregsn->VFread1, VUregsn->VFr1xyzw);
- }
-
- if( !upper && VUregsn->VIread ) _recvuTestIALUStalls(VU, VUregsn); // for lower instructions which read integer reg
-}
-
-void _recvuAddFMACStalls(VURegs * VU, _VURegsNum *VUregsn) {
-
- if (VUregsn->VFwrite) _recvuFMACAdd(VU, VUregsn->VFwrite, VUregsn->VFwxyzw);
- else if (VUregsn->VIwrite & (1 << REG_CLIP_FLAG)) _recvuFMACAdd(VU, -REG_CLIP_FLAG, 0); // REG_CLIP_FLAG pipe
- else _recvuFMACAdd(VU, 0, 0); // cause no data dependency with fp registers
-}
-
-void _recvuFlushFDIV(VURegs * VU) {
- int cycle;
-
- if (VU->fdiv.enable == 0) return;
-
- cycle = VU->fdiv.Cycle + 1 - (vucycle - VU->fdiv.sCycle); //VU->fdiv.Cycle contains the latency minus 1 (6 or 12)
-// Console.WriteLn("waiting FDIV pipe %d", cycle);
- VU->fdiv.enable = 0;
- vucycle+= cycle;
-}
-
-void _recvuFlushEFU(VURegs * VU) {
- int cycle;
-
- if (VU->efu.enable == 0) return;
-
- cycle = VU->efu.Cycle - (vucycle - VU->efu.sCycle);
-//
Console.WriteLn("waiting FDIV pipe %d", cycle); - VU->efu.enable = 0; - vucycle+= cycle; -} - -void _recvuTestFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) { - _recvuTestFMACStalls(VU,VUregsn, false); - _recvuFlushFDIV(VU); -} - -void _recvuTestEFUStalls(VURegs * VU, _VURegsNum *VUregsn) { - _recvuTestFMACStalls(VU,VUregsn, false); - _recvuFlushEFU(VU); -} - -void _recvuAddFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) { -// _vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn); - if (VUregsn->VIwrite & (1 << REG_Q)) { - _recvuFDIVAdd(VU, VUregsn->cycles); - } -} - -void _recvuAddEFUStalls(VURegs * VU, _VURegsNum *VUregsn) { -// _vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn); - if (VUregsn->VIwrite & (1 << REG_P)) { - _recvuEFUAdd(VU, VUregsn->cycles); - } -} - -void _recvuTestUpperStalls(VURegs * VU, _VURegsNum *VUregsn) { - switch (VUregsn->pipe) { - case VUPIPE_FMAC: _recvuTestFMACStalls(VU, VUregsn, true); break; - } -} - -void _recvuTestLowerStalls(VURegs * VU, _VURegsNum *VUregsn) { - switch (VUregsn->pipe) { - case VUPIPE_FMAC: _recvuTestFMACStalls(VU, VUregsn, false); break; - case VUPIPE_FDIV: _recvuTestFDIVStalls(VU, VUregsn); break; - case VUPIPE_EFU: _recvuTestEFUStalls(VU, VUregsn); break; - case VUPIPE_IALU: _recvuTestIALUStalls(VU, VUregsn); break; - case VUPIPE_BRANCH: _recvuTestIALUStalls(VU, VUregsn); break; - } -} - -void _recvuAddUpperStalls(VURegs * VU, _VURegsNum *VUregsn) { - switch (VUregsn->pipe) { - case VUPIPE_FMAC: _recvuAddFMACStalls(VU, VUregsn); break; - } -} - -void _recvuAddLowerStalls(VURegs * VU, _VURegsNum *VUregsn) { - switch (VUregsn->pipe) { - case VUPIPE_FMAC: _recvuAddFMACStalls(VU, VUregsn); break; - case VUPIPE_FDIV: _recvuAddFDIVStalls(VU, VUregsn); break; - case VUPIPE_EFU: _recvuAddEFUStalls(VU, VUregsn); break; - case VUPIPE_IALU: _recvuAddIALUStalls(VU, VUregsn); break; // note: only ILW and ILWR cause stall in IALU pipe - } -} - -void SuperVUAnalyzeOp(VURegs *VU, _vuopinfo *info, _VURegsNum* pCodeRegs) -{ - _VURegsNum* lregs; - _VURegsNum* uregs; - int *ptr; - - lregs = pCodeRegs; - uregs = pCodeRegs+1; - - ptr = (int*)&VU->Micro[pc]; - pc += 8; - - if (ptr[1] & 0x40000000) { // EOP - branch |= 8; - } - - VU->code = ptr[1]; - if (VU == &VU1) VU1regs_UPPER_OPCODE[VU->code & 0x3f](uregs); - else VU0regs_UPPER_OPCODE[VU->code & 0x3f](uregs); - - _recvuTestUpperStalls(VU, uregs); - switch(VU->code & 0x3f) { - case 0x10: case 0x11: case 0x12: case 0x13: - case 0x14: case 0x15: case 0x16: case 0x17: - case 0x1d: case 0x1f: - case 0x2b: case 0x2f: - break; - - case 0x3c: - switch ((VU->code >> 6) & 0x1f) { - case 0x4: case 0x5: - break; - default: - info->statusflag = 4; - info->macflag = 4; - break; - } - break; - case 0x3d: - switch ((VU->code >> 6) & 0x1f) { - case 0x4: case 0x5: case 0x7: - break; - default: - info->statusflag = 4; - info->macflag = 4; - break; - } - break; - case 0x3e: - switch ((VU->code >> 6) & 0x1f) { - case 0x4: case 0x5: - break; - default: - info->statusflag = 4; - info->macflag = 4; - break; - } - break; - case 0x3f: - switch ((VU->code >> 6) & 0x1f) { - case 0x4: case 0x5: case 0x7: case 0xb: - break; - default: - info->statusflag = 4; - info->macflag = 4; - break; - } - break; - - default: - info->statusflag = 4; - info->macflag = 4; - break; - } - - if (uregs->VIread & (1 << REG_Q)) { info->q |= 2; } - if (uregs->VIread & (1 << REG_P)) { info->p |= 2; assert( VU == &VU1 ); } - - // check upper flags - if (ptr[1] & 0x80000000) { // I flag - info->cycle = vucycle; - memzero(*lregs); - } - else { - - VU->code = ptr[0]; 
- if (VU == &VU1) VU1regs_LOWER_OPCODE[VU->code >> 25](lregs); - else VU0regs_LOWER_OPCODE[VU->code >> 25](lregs); - - _recvuTestLowerStalls(VU, lregs); - info->cycle = vucycle; - - if (lregs->pipe == VUPIPE_BRANCH) { - branch |= 1; - } - - if (lregs->VIwrite & (1 << REG_Q)) { - info->q |= 4; - info->cycles = lregs->cycles; - info->pqinst = (VU->code&2)>>1; // rsqrt is 2 - } - else if (lregs->pipe == VUPIPE_FDIV) { - info->q |= 8|1; - info->pqinst = 0; - } - - if (lregs->VIwrite & (1 << REG_P)) { - assert( VU == &VU1 ); - info->p |= 4; - info->cycles = lregs->cycles; - - switch( VU->code & 0xff ) { - case 0xfd: info->pqinst = 0; break; //eatan - case 0x7c: info->pqinst = 0; break; //eatanxy - case 0x7d: info->pqinst = 0; break; //eatanzy - case 0xfe: info->pqinst = 1; break; //eexp - case 0xfc: info->pqinst = 2; break; //esin - case 0x3f: info->pqinst = 3; break; //erleng - case 0x3e: info->pqinst = 4; break; //eleng - case 0x3d: info->pqinst = 4; break; //ersadd - case 0xbd: info->pqinst = 4; break; //ersqrt - case 0xbe: info->pqinst = 5; break; //ercpr - case 0xbc: info->pqinst = 5; break; //esqrt - case 0x7e: info->pqinst = 5; break; //esum - case 0x3c: info->pqinst = 6; break; //esadd - default: assert(0); - } - } - else if (lregs->pipe == VUPIPE_EFU) { - info->p |= 8|1; - } - - if (lregs->VIread & (1 << REG_STATUS_FLAG)) info->statusflag|= VUOP_READ; - if (lregs->VIread & (1 << REG_MAC_FLAG)) info->macflag|= VUOP_READ; - - if (lregs->VIwrite & (1 << REG_STATUS_FLAG)) info->statusflag|= VUOP_WRITE; - if (lregs->VIwrite & (1 << REG_MAC_FLAG)) info->macflag|= VUOP_WRITE; - - if (lregs->VIread & (1 << REG_Q)) { info->q |= 2; } - if (lregs->VIread & (1 << REG_P)) { info->p |= 2; assert( VU == &VU1 ); } - - _recvuAddLowerStalls(VU, lregs); - } - - _recvuAddUpperStalls(VU, uregs); - _recvuTestPipes(VU, false); - - vucycle++; -} - -int eeVURecompileCode(VURegs *VU, _VURegsNum* regs) -{ - int info = 0; - int vfread0=-1, vfread1 = -1, vfwrite = -1, vfacc = -1, vftemp=-1; - - assert( regs != NULL ); - - if( regs->VFread0 ) _addNeededVFtoXMMreg(regs->VFread0); - if( regs->VFread1 ) _addNeededVFtoXMMreg(regs->VFread1); - if( regs->VFwrite ) _addNeededVFtoXMMreg(regs->VFwrite); - if( regs->VIread & (1<VIread & (1<VFread0 ) vfread0 = _allocVFtoXMMreg(VU, -1, regs->VFread0, MODE_READ); - else if( regs->VIread & (1<VFread1 ) vfread1 = _allocVFtoXMMreg(VU, -1, regs->VFread1, MODE_READ); - else if( (regs->VIread & (1<VFr1xyzw != 0xff) vfread1 = _allocVFtoXMMreg(VU, -1, 0, MODE_READ); - - if( regs->VIread & (1<VIwrite&(1<VIwrite & (1<VFwxyzw != 0xf?MODE_READ:0)); - } - - if( regs->VFwrite ) { - assert( !(regs->VIwrite&(1<VFwrite, MODE_WRITE|(regs->VFwxyzw != 0xf?MODE_READ:0)); - } - - if( vfacc>= 0 ) info |= PROCESS_EE_SET_ACC(vfacc); - if( vfwrite >= 0 ) { - if( regs->VFwrite == _Ft_ && vfread1 < 0 ) { - info |= PROCESS_EE_SET_T(vfwrite); - } - else { - assert( regs->VFwrite == _Fd_ ); - info |= PROCESS_EE_SET_D(vfwrite); - } - } - - if( vfread0 >= 0 ) info |= PROCESS_EE_SET_S(vfread0); - if( vfread1 >= 0 ) info |= PROCESS_EE_SET_T(vfread1); - - vftemp = _allocTempXMMreg(XMMT_FPS, -1); - info |= PROCESS_VU_SET_TEMP(vftemp); - - if( regs->VIwrite & (1 << REG_CLIP_FLAG) ) { - // CLIP inst, need two extra temp registers, put it EEREC_D and EEREC_ACC - int t1reg = _allocTempXMMreg(XMMT_FPS, -1); - int t2reg = _allocTempXMMreg(XMMT_FPS, -1); - - info |= PROCESS_EE_SET_D(t1reg); - info |= PROCESS_EE_SET_ACC(t2reg); - - _freeXMMreg(t1reg); // don't need - _freeXMMreg(t2reg); // don't need - } - else if( 
regs->VIwrite & (1<VI[reg].UL; - - if( read != 1 ) { - if( reg == REG_MAC_FLAG ) return (uptr)&VU->macflag; - if( reg == REG_CLIP_FLAG ) return (uptr)&VU->clipflag; - if( reg == REG_STATUS_FLAG ) return (uptr)&VU->statusflag; - if( reg == REG_Q ) return (uptr)&VU->q; - if( reg == REG_P ) return (uptr)&VU->p; - } - - return (uptr)&VU->VI[reg].UL; -} - -// gets a temp reg that is not EEREC_TEMP -int _vuGetTempXMMreg(int info) -{ - int t1reg = -1; - - if( _hasFreeXMMreg() ) { - t1reg = _allocTempXMMreg(XMMT_FPS, -1); - - if( t1reg == EEREC_TEMP ) { - if( _hasFreeXMMreg() ) { - int t = _allocTempXMMreg(XMMT_FPS, -1); - _freeXMMreg(t1reg); - t1reg = t; - } - else { - _freeXMMreg(t1reg); - t1reg = -1; - } - } - } - - return t1reg; -} -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// Misc VU Reg Flipping/Merging Functions -//------------------------------------------------------------------ -void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw) -{ - switch (xyzw) { - case 0: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x00); break; - case 1: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x55); break; - case 2: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xaa); break; - case 3: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xff); break; - } -} - -void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw) -{ - switch (xyzw) { - case 0: SSE_MOVSS_XMM_to_XMM(dstreg, srcreg); break; - case 1: if ( x86caps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(1, 0, 0)); - else SSE2_PSHUFLW_XMM_to_XMM(dstreg, srcreg, 0xee); - break; - case 2: SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); break; - case 3: if ( x86caps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(3, 0, 0)); - else { SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); SSE2_PSHUFLW_XMM_to_XMM(dstreg, dstreg, 0xee); } - break; - } -} - -void _vuFlipRegSS(VURegs * VU, int reg) -{ - assert( _XYZW_SS ); - if( _Y ) SSE2_PSHUFLW_XMM_to_XMM(reg, reg, 0x4e); - else if( _Z ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6); - else if( _W ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27); -} - -void _vuFlipRegSS_xyzw(int reg, int xyzw) -{ - switch ( xyzw ) { - case 1: SSE2_PSHUFLW_XMM_to_XMM(reg, reg, 0x4e); break; - case 2: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6); break; - case 3: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27); break; - } -} - -void _vuMoveSS(VURegs * VU, int dstreg, int srcreg) -{ - assert( _XYZW_SS ); - if( _Y ) _unpackVFSS_xyzw(dstreg, srcreg, 1); - else if( _Z ) _unpackVFSS_xyzw(dstreg, srcreg, 2); - else if( _W ) _unpackVFSS_xyzw(dstreg, srcreg, 3); - else _unpackVFSS_xyzw(dstreg, srcreg, 0); -} - -// 1 - src, 0 - dest wzyx -void VU_MERGE0(int dest, int src) { // 0000s -} -void VU_MERGE1(int dest, int src) { // 1000 - SSE_MOVHLPS_XMM_to_XMM(src, dest); - SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4); -} -void VU_MERGE1b(int dest, int src) { // 1000s - SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); -} -void VU_MERGE2(int dest, int src) { // 0100 - SSE_MOVHLPS_XMM_to_XMM(src, dest); - SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64); -} -void VU_MERGE2b(int dest, int src) { // 0100s - SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); -} -void 
VU_MERGE3(int dest, int src) { // 1100s - SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); -} -void VU_MERGE4(int dest, int src) { // 0010 - SSE_MOVSS_XMM_to_XMM(src, dest); - SSE2_MOVSD_XMM_to_XMM(dest, src); -} -void VU_MERGE4b(int dest, int src) { // 0010s - SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); -} -void VU_MERGE5(int dest, int src) { // 1010 - SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd8); -} -void VU_MERGE5b(int dest, int src) { // 1010s - SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); -} -void VU_MERGE6(int dest, int src) { // 0110 - SSE_SHUFPS_XMM_to_XMM(dest, src, 0x9c); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x78); -} -void VU_MERGE6b(int dest, int src) { // 0110s - SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); -} -void VU_MERGE7(int dest, int src) { // 1110 - SSE_MOVSS_XMM_to_XMM(src, dest); - SSE_MOVAPS_XMM_to_XMM(dest, src); -} -void VU_MERGE7b(int dest, int src) { // 1110s - SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); -} -void VU_MERGE8(int dest, int src) { // 0001s - SSE_MOVSS_XMM_to_XMM(dest, src); -} -void VU_MERGE9(int dest, int src) { // 1001 - SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc9); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd2); -} -void VU_MERGE9b(int dest, int src) { // 1001s - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); -} -void VU_MERGE10(int dest, int src) { // 0101 - SSE_SHUFPS_XMM_to_XMM(dest, src, 0x8d); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x72); -} -void VU_MERGE10b(int dest, int src) { // 0101s - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); -} -void VU_MERGE11(int dest, int src) { // 1101s - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); -} -void VU_MERGE12(int dest, int src) { // 0011 - SSE2_MOVSD_XMM_to_XMM(dest, src); -} -void VU_MERGE13(int dest, int src) { // 1011 - SSE_MOVHLPS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, dest, 0x64); - SSE_MOVAPS_XMM_to_XMM(dest, src); -} -void VU_MERGE13b(int dest, int src) { // 1011s - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); - SSE_MOVSS_XMM_to_XMM(dest, src); 
- SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); -} -void VU_MERGE14(int dest, int src) { // 0111 - SSE_MOVHLPS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, dest, 0xc4); - SSE_MOVAPS_XMM_to_XMM(dest, src); -} -void VU_MERGE14b(int dest, int src) { // 0111s - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); - SSE_MOVSS_XMM_to_XMM(dest, src); - SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); - SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); -} -void VU_MERGE15(int dest, int src) { // 1111s - SSE_MOVAPS_XMM_to_XMM(dest, src); -} - -typedef void (*VUMERGEFN)(int dest, int src); - -static VUMERGEFN s_VuMerge[16] = { - VU_MERGE0, VU_MERGE1, VU_MERGE2, VU_MERGE3, - VU_MERGE4, VU_MERGE5, VU_MERGE6, VU_MERGE7, - VU_MERGE8, VU_MERGE9, VU_MERGE10, VU_MERGE11, - VU_MERGE12, VU_MERGE13, VU_MERGE14, VU_MERGE15 }; - -static VUMERGEFN s_VuMerge2[16] = { - VU_MERGE0, VU_MERGE1b, VU_MERGE2b, VU_MERGE3, - VU_MERGE4b, VU_MERGE5b, VU_MERGE6b, VU_MERGE7b, - VU_MERGE8, VU_MERGE9b, VU_MERGE10b, VU_MERGE11, - VU_MERGE12, VU_MERGE13b, VU_MERGE14b, VU_MERGE15 }; - -// Modifies the Source Reg! -void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw) { - xyzw &= 0xf; - if ( (dest != src) && (xyzw != 0) ) { - if ( x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) { - xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3); - SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw); - } - else s_VuMerge[xyzw](dest, src); - } -} -// Doesn't Modify the Source Reg! 
(ToDo: s_VuMerge2() has room for optimization) -void VU_MERGE_REGS_SAFE(int dest, int src, int xyzw) { - xyzw &= 0xf; - if ( (dest != src) && (xyzw != 0) ) { - if ( x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) { - xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3); - SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw); - } - else s_VuMerge2[xyzw](dest, src); - } -} -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// Misc VU Reg Clamping/Overflow Functions -//------------------------------------------------------------------ -#define CLAMP_NORMAL_SSE4(n) \ - SSE_MOVAPS_XMM_to_XMM(regTemp, regd);\ - SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[n][0]);\ - SSE2_PSUBD_XMM_to_XMM(regTemp, regd);\ - SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_ones[0]);\ - SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[n][0]);\ - SSE2_PSLLD_I8_to_XMM(regTemp, 31);\ - SSE_XORPS_XMM_to_XMM(regd, regTemp); - -#define CLAMP_SIGN_SSE4(n) \ - SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[n][0]);\ - SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[n][0]); - -void vFloat0(int regd, int regTemp) { } //0000 -void vFloat1(int regd, int regTemp) { //1000 - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); -} -void vFloat1c(int regd, int regTemp) { //1000 - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(1); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat2(int regd, int regTemp) { //0100 - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); -} -void vFloat2c(int regd, int regTemp) { //0100 - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(2); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat3(int regd, int regTemp) { //1100 - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); -} -void vFloat3b(int regd, int regTemp) { //1100 //regTemp is Modified - SSE2_MOVSD_XMM_to_XMM(regTemp, regd); - SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); - SSE2_MOVSD_XMM_to_XMM(regd, regTemp); -} -void vFloat3c(int regd, int regTemp) { //1100 - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(3); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - 
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat4(int regd, int regTemp) { //0010 - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); -} -void vFloat4c(int regd, int regTemp) { //0010 - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(4); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat5(int regd, int regTemp) { //1010 - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); -} -void vFloat5b(int regd, int regTemp) { //1010 //regTemp is Modified - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_NORMAL_SSE4(5); - } - else { - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); - } -} -void vFloat5c(int regd, int regTemp) { //1010 - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(5); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat6(int regd, int regTemp) { //0110 - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); -} -void vFloat6b(int regd, int regTemp) { //0110 //regTemp is Modified - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_NORMAL_SSE4(6); - } - else { - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); - } -} -void vFloat6c(int regd, int regTemp) { //0110 - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(6); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, 
(uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat7(int regd, int regTemp) { //1110 - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39); -} -void vFloat7_useEAX(int regd, int regTemp) { //1110 //EAX is Modified - SSE2_MOVD_XMM_to_R(EAX, regd); - SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); - if ( x86caps.hasStreamingSIMD4Extensions ) - SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00); - else { - SSE_PINSRW_R32_to_XMM(regd, EAX, 0); - SHR32ItoR(EAX, 16); - SSE_PINSRW_R32_to_XMM(regd, EAX, 1); - } -} -void vFloat7b(int regd, int regTemp) { //1110 //regTemp is Modified - SSE_MOVSS_XMM_to_XMM(regTemp, regd); - SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); - SSE_MOVSS_XMM_to_XMM(regd, regTemp); -} -void vFloat7c(int regd, int regTemp) { //1110 - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(7); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat7c_useEAX(int regd, int regTemp) { //1110 //EAX is Modified - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(7); - } - else { - SSE2_MOVD_XMM_to_R(EAX, regd); - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - SSE2_MOVD_R_to_XMM(regTemp, EAX); - SSE_MOVSS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat8(int regd, int regTemp) { //0001 - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); -} -void vFloat8c(int regd, int regTemp) { //0001 - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(8); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat9(int regd, int regTemp) { //1001 - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); -} -void vFloat9b(int regd, int regTemp) { //1001 
//regTemp is Modified - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_NORMAL_SSE4(9); - } - else { - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - } -} -void vFloat9c(int regd, int regTemp) { //1001 - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(9); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat10(int regd, int regTemp) { //0101 - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); -} -void vFloat10b(int regd, int regTemp) { //0101 //regTemp is Modified - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_NORMAL_SSE4(10); - } - else { - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - } -} -void vFloat10c(int regd, int regTemp) { //0101 - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(10); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat11(int regd, int regTemp) { //1101 - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); -} -void vFloat11_useEAX(int regd, int regTemp) { //1101 //EAX is Modified - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE2_MOVD_XMM_to_R(EAX, regd); - SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); - if ( x86caps.hasStreamingSIMD4Extensions ) - SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00); - else { - SSE_PINSRW_R32_to_XMM(regd, EAX, 0); - SHR32ItoR(EAX, 16); - SSE_PINSRW_R32_to_XMM(regd, EAX, 1); - } - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); -} -void vFloat11b(int regd, int regTemp) { //1101 //regTemp is Modified - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); - SSE_MOVSS_XMM_to_XMM(regTemp, regd); - SSE2_MOVSD_XMM_to_XMM(regd, regTemp); -} -void vFloat11c(int regd, int regTemp) { //1101 - if ( 
x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(11); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat11c_useEAX(int regd, int regTemp) { //1101 // EAX is modified - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(11); - } - else { - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE2_MOVD_XMM_to_R(EAX, regd); - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - SSE2_MOVD_R_to_XMM(regTemp, EAX); - SSE_MOVSS_XMM_to_XMM(regd, regTemp); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - } -} -void vFloat12(int regd, int regTemp) { //0011 - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); -} -void vFloat12b(int regd, int regTemp) { //0011 //regTemp is Modified - SSE_MOVHLPS_XMM_to_XMM(regTemp, regd); - SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); - SSE2_PUNPCKLQDQ_XMM_to_XMM(regd, regTemp); -} -void vFloat12c(int regd, int regTemp) { //0011 - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(12); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat13(int regd, int regTemp) { //1011 - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); -} -void vFloat13_useEAX(int regd, int regTemp) { //1011 // EAX is modified - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE2_MOVD_XMM_to_R(EAX, regd); - SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); - if ( x86caps.hasStreamingSIMD4Extensions ) - SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00); - else { - SSE_PINSRW_R32_to_XMM(regd, EAX, 0); - SHR32ItoR(EAX, 16); - SSE_PINSRW_R32_to_XMM(regd, EAX, 1); - } - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); -} -void vFloat13b(int regd, int regTemp) { //1011 //regTemp is Modified - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); - SSE_MOVHLPS_XMM_to_XMM(regTemp, regd); 
- SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x64); -} -void vFloat13c(int regd, int regTemp) { //1011 - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(13); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat13c_useEAX(int regd, int regTemp) { //1011 // EAX is modified - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(13); - } - else { - SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0xc6); - SSE2_MOVD_XMM_to_R(EAX, regd); - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - SSE2_MOVD_R_to_XMM(regTemp, EAX); - SSE_MOVSS_XMM_to_XMM(regd, regTemp); - SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0xc6); - } -} -void vFloat14(int regd, int regTemp) { //0111 - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); -} -void vFloat14_useEAX(int regd, int regTemp) { //0111 // EAX is modified - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); - SSE2_MOVD_XMM_to_R(EAX, regd); - SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); - if ( x86caps.hasStreamingSIMD4Extensions ) - SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00); - else { - SSE_PINSRW_R32_to_XMM(regd, EAX, 0); - SHR32ItoR(EAX, 16); - SSE_PINSRW_R32_to_XMM(regd, EAX, 1); - } - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); -} -void vFloat14b(int regd, int regTemp) { //0111 //regTemp is Modified - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); - SSE_MOVHLPS_XMM_to_XMM(regTemp, regd); - SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xc4); -} -void vFloat14c(int regd, int regTemp) { //0111 - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(14); - } - else { - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); - SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); - SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); - SSE_ORPS_XMM_to_XMM(regd, regTemp); - } -} -void vFloat14c_useEAX(int regd, int regTemp) { //0111 // EAX is modified - if ( x86caps.hasStreamingSIMD4Extensions ) { - CLAMP_SIGN_SSE4(14); - } - else { - SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0x27); - SSE2_MOVD_XMM_to_R(EAX, regd); - SSE_MOVAPS_XMM_to_XMM(regTemp, regd); - 
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- SSE2_MOVD_R_to_XMM(regTemp, EAX);
- SSE_MOVSS_XMM_to_XMM(regd, regTemp);
- SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0x27);
- }
-}
-void vFloat15(int regd, int regTemp) { //1111
- SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
-}
-void vFloat15c(int regd, int regTemp) { //1111
- if ( x86caps.hasStreamingSIMD4Extensions ) {
- CLAMP_SIGN_SSE4(15);
- }
- else {
- SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
- SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
- SSE_MINPS_M128_to_XMM(regd, (uptr)&g_maxvals[0]);
- SSE_MAXPS_M128_to_XMM(regd, (uptr)&g_minvals[0]);
- SSE_ORPS_XMM_to_XMM(regd, regTemp);
- }
-}
-
-vFloat vFloats1[16] = { //regTemp is not modified
- vFloat0, vFloat1, vFloat2, vFloat3,
- vFloat4, vFloat5, vFloat6, vFloat7,
- vFloat8, vFloat9, vFloat10, vFloat11,
- vFloat12, vFloat13, vFloat14, vFloat15 };
-
-vFloat vFloats1_useEAX[16] = { //regTemp is not modified but EAX is used
- vFloat0, vFloat1, vFloat2, vFloat3,
- vFloat4, vFloat5, vFloat6, vFloat7_useEAX,
- vFloat8, vFloat9, vFloat10, vFloat11_useEAX,
- vFloat12, vFloat13_useEAX, vFloat14_useEAX, vFloat15 };
-
-vFloat vFloats2[16] = { //regTemp is modified
- vFloat0, vFloat1, vFloat2, vFloat3b,
- vFloat4, vFloat5b, vFloat6b, vFloat7b,
- vFloat8, vFloat9b, vFloat10b, vFloat11b,
- vFloat12b, vFloat13b, vFloat14b, vFloat15 };
-
-vFloat vFloats4[16] = { //regTemp is modified
- vFloat0, vFloat1c, vFloat2c, vFloat3c,
- vFloat4c, vFloat5c, vFloat6c, vFloat7c,
- vFloat8c, vFloat9c, vFloat10c, vFloat11c,
- vFloat12c, vFloat13c, vFloat14c, vFloat15c };
-
-vFloat vFloats4_useEAX[16] = { //regTemp is modified and EAX is used
- vFloat0, vFloat1c, vFloat2c, vFloat3c,
- vFloat4c, vFloat5c, vFloat6c, vFloat7c_useEAX,
- vFloat8c, vFloat9c, vFloat10c, vFloat11c_useEAX,
- vFloat12c, vFloat13c_useEAX, vFloat14c_useEAX, vFloat15c };
-
-//------------------------------------------------------------------
-// Clamping Functions (wrapper for vFloat* functions)
-// vuFloat : "normal" clamping
-// vuFloat_useEAX : "normal" clamping (faster but EAX is modified)
-// vuFloat2 : "normal" clamping (fastest but regTemp is modified)
-// vuFloat3 : "preserve sign" clamping for pointer
-// vuFloat4 : "preserve sign" clamping (regTemp is modified; *FASTEST* on SSE4 CPUs)
-// vuFloat4_useEAX : "preserve sign" clamping (faster but regTemp and EAX are modified)
-// vuFloat5 : wrapper function for vuFloat2 and vuFloat4
-// vuFloat5_useEAX : wrapper function for vuFloat2 and vuFloat4_useEAX
-// vuFloatExtra : for debugging
-//
-// Notice 1: vuFloat*_useEAX may be slower on AMD CPUs, which have independent execution pipelines for
-// vector and scalar instructions (needs checking)
-// Notice 2: recVUMI_MUL_xyzw_toD and recVUMI_MADD_xyzw_toD use vFloats directly!
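(Editorial aside, not part of the patch: the "normal" and "preserve sign" families above differ only in how they treat the sign of non-finite inputs. A minimal scalar model of the two behaviors, assuming fMax is FLT_MAX, i.e. the 0x7f7fffff pattern in g_maxvals; clamp_normal and clamp_sign are illustrative names, not functions from the codebase.)

    #include <float.h>
    #include <math.h>

    /* "normal" clamp (vuFloat/vuFloat2): any NaN collapses to +fMax, and
     * infinities saturate to +/-fMax. This is what the MINSS/MAXSS chains
     * compute, since SSE min/max return the second operand on unordered
     * (NaN) compares. */
    static float clamp_normal(float x)
    {
        if (isnan(x)) return FLT_MAX;
        if (x > FLT_MAX) return FLT_MAX;
        if (x < -FLT_MAX) return -FLT_MAX;
        return x;
    }

    /* "preserve sign" clamp (vuFloat4 and the vFloat*c variants): the sign
     * bit is saved via ANDPS against const_clip[4..7] (0x80000000) and OR'd
     * back after the same clamp, so -NaN and -Inf both become -fMax. */
    static float clamp_sign(float x)
    {
        if (isnan(x) || isinf(x)) return signbit(x) ? -FLT_MAX : FLT_MAX;
        return x;
    }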
-//------------------------------------------------------------------ - -// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (doesn't use any temp regs) -void vuFloat( int info, int regd, int XYZW) { - if( CHECK_VU_OVERFLOW ) { - /*if ( (XYZW != 0) && (XYZW != 8) && (XYZW != 0xF) ) { - int t1reg = _vuGetTempXMMreg(info); - if (t1reg >= 0) { - vuFloat2( regd, t1reg, XYZW ); - _freeXMMreg( t1reg ); - return; - } - }*/ - //vuFloatExtra(regd, XYZW); - vFloats1[XYZW](regd, regd); - } -} - -// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (uses EAX as a temp register; faster but **destroys EAX**) -void vuFloat_useEAX( int info, int regd, int XYZW) { - if( CHECK_VU_OVERFLOW ) { - vFloats1_useEAX[XYZW](regd, regd); - } -} - -// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (uses a temp reg) -void vuFloat2(int regd, int regTemp, int XYZW) { - if( CHECK_VU_OVERFLOW ) { - //vuFloatExtra(regd, XYZW); - vFloats2[XYZW](regd, regTemp); - } -} - -// Clamps +/-NaN and +/-Inf to +/-fMax (uses a temp reg) -void vuFloat4(int regd, int regTemp, int XYZW) { - if( CHECK_VU_OVERFLOW ) { - vFloats4[XYZW](regd, regTemp); - } -} - -// Clamps +/-NaN and +/-Inf to +/-fMax (uses a temp reg, and uses EAX as a temp register; faster but **destroys EAX**) -void vuFloat4_useEAX(int regd, int regTemp, int XYZW) { - if( CHECK_VU_OVERFLOW ) { - vFloats4_useEAX[XYZW](regd, regTemp); - } -} - -// Uses vuFloat4 or vuFloat2 depending on the CHECK_VU_SIGN_OVERFLOW setting -void vuFloat5(int regd, int regTemp, int XYZW) { - if (CHECK_VU_SIGN_OVERFLOW) { - vuFloat4(regd, regTemp, XYZW); - } - else vuFloat2(regd, regTemp, XYZW); -} - -// Uses vuFloat4_useEAX or vuFloat2 depending on the CHECK_VU_SIGN_OVERFLOW setting (uses EAX as a temp register; faster but **destoroyes EAX**) -void vuFloat5_useEAX(int regd, int regTemp, int XYZW) { - if (CHECK_VU_SIGN_OVERFLOW) { - vuFloat4_useEAX(regd, regTemp, XYZW); - } - else vuFloat2(regd, regTemp, XYZW); -} - -// Clamps +/-infs to +/-fMax, and +/-NaNs to +/-fMax -void vuFloat3(uptr x86ptr) { - u8* pjmp; - - if( CHECK_VU_OVERFLOW ) { - CMP32ItoM(x86ptr, 0x7f800000 ); - pjmp = JL8(0); // Signed Comparison - MOV32ItoM(x86ptr, 0x7f7fffff ); - x86SetJ8(pjmp); - - CMP32ItoM(x86ptr, 0xff800000 ); - pjmp = JB8(0); // Unsigned Comparison - MOV32ItoM(x86ptr, 0xff7fffff ); - x86SetJ8(pjmp); - } -} - -__aligned16 u64 vuFloatData[4]; - -// Makes NaN == 0, Infinities stay the same; Very Slow - Use only for debugging -void vuFloatExtra( int regd, int XYZW) { - int t1reg = (regd == 0) ? (regd + 1) : (regd - 1); - int t2reg = (regd <= 1) ? (regd + 2) : (regd - 2); - SSE_MOVAPS_XMM_to_M128( (uptr)&vuFloatData[0], t1reg ); - SSE_MOVAPS_XMM_to_M128( (uptr)&vuFloatData[2], t2reg ); - - SSE_XORPS_XMM_to_XMM(t1reg, t1reg); - SSE_CMPORDPS_XMM_to_XMM(t1reg, regd); - SSE_MOVAPS_XMM_to_XMM(t2reg, regd); - SSE_ANDPS_XMM_to_XMM(t2reg, t1reg); - VU_MERGE_REGS_CUSTOM(regd, t2reg, XYZW); - - SSE_MOVAPS_M128_to_XMM( t1reg, (uptr)&vuFloatData[0] ); - SSE_MOVAPS_M128_to_XMM( t2reg, (uptr)&vuFloatData[2] ); -} - -static __aligned16 u32 tempRegX[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000}; - -// Called by testWhenOverflow() function -void testPrintOverflow() { - tempRegX[0] &= 0xff800000; - tempRegX[1] &= 0xff800000; - tempRegX[2] &= 0xff800000; - tempRegX[3] &= 0xff800000; - if ( (tempRegX[0] == 0x7f800000) || (tempRegX[1] == 0x7f800000) || (tempRegX[2] == 0x7f800000) || (tempRegX[3] == 0x7f800000) ) - Console.Warning( "VU OVERFLOW!: Changing to +Fmax!!!!!!!!!!!!" 
);
- if ( (tempRegX[0] == 0xff800000) || (tempRegX[1] == 0xff800000) || (tempRegX[2] == 0xff800000) || (tempRegX[3] == 0xff800000) )
- Console.Warning( "VU OVERFLOW!: Changing to -Fmax!!!!!!!!!!!!" );
-}
-
-// Outputs to the console when overflow has occured.
-void testWhenOverflow(int info, int regd, int t0reg) {
- SSE_MOVAPS_XMM_to_M128((uptr)tempRegX, regd);
- CALLFunc((uptr)testPrintOverflow);
-}
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "PrecompiledHeader.h"
+
+#include "Common.h"
+#include "GS.h"
+#include "R5900OpcodeTables.h"
+#include "iR5900.h"
+#include "iMMI.h"
+#include "iFPU.h"
+#include "iCOP0.h"
+#include "VUmicro.h"
+#include "VUflags.h"
+#include "sVU_Micro.h"
+#include "sVU_Debug.h"
+#include "sVU_zerorec.h"
+
+#ifdef _WIN32
+#pragma warning(disable:4244)
+#pragma warning(disable:4761)
+#endif
+//------------------------------------------------------------------
+
+// fixme - VUmicro should really use its own static vars for pc and branch.
+// Sharing with the EE's copies of pc and branch is not cool! (air)
+
+//------------------------------------------------------------------
+// Helper Macros
+//------------------------------------------------------------------
+#define _Ft_ (( VU->code >> 16) & 0x1F) // The rt part of the instruction register
+#define _Fs_ (( VU->code >> 11) & 0x1F) // The rd part of the instruction register
+#define _Fd_ (( VU->code >> 6) & 0x1F) // The sa part of the instruction register
+#define _It_ (_Ft_ & 15)
+#define _Is_ (_Fs_ & 15)
+#define _Id_ (_Fd_ & 15)
+
+#define _X (( VU->code>>24) & 0x1)
+#define _Y (( VU->code>>23) & 0x1)
+#define _Z (( VU->code>>22) & 0x1)
+#define _W (( VU->code>>21) & 0x1)
+
+#define _XYZW_SS (_X+_Y+_Z+_W==1)
+
+#define _Fsf_ (( VU->code >> 21) & 0x03)
+#define _Ftf_ (( VU->code >> 23) & 0x03)
+
+#define _Imm11_ (s32)(VU->code & 0x400 ?
0xfffffc00 | (VU->code & 0x3ff) : VU->code & 0x3ff) +#define _UImm11_ (s32)(VU->code & 0x7ff) + +#define VU_VFx_ADDR(x) (uptr)&VU->VF[x].UL[0] +#define VU_VFy_ADDR(x) (uptr)&VU->VF[x].UL[1] +#define VU_VFz_ADDR(x) (uptr)&VU->VF[x].UL[2] +#define VU_VFw_ADDR(x) (uptr)&VU->VF[x].UL[3] + +#define VU_REGR_ADDR (uptr)&VU->VI[REG_R] +#define VU_REGQ_ADDR (uptr)&VU->VI[REG_Q] +#define VU_REGMAC_ADDR (uptr)&VU->VI[REG_MAC_FLAG] + +#define VU_VI_ADDR(x, read) GetVIAddr(VU, x, read, info) + +#define VU_ACCx_ADDR (uptr)&VU->ACC.UL[0] +#define VU_ACCy_ADDR (uptr)&VU->ACC.UL[1] +#define VU_ACCz_ADDR (uptr)&VU->ACC.UL[2] +#define VU_ACCw_ADDR (uptr)&VU->ACC.UL[3] + +#define _X_Y_Z_W ((( VU->code >> 21 ) & 0xF ) ) +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// Global Variables +//------------------------------------------------------------------ +int vucycle; + +const __aligned16 float s_fones[8] = {1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f}; +const __aligned16 u32 s_mask[4] = {0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff}; +const __aligned16 u32 s_expmask[4] = {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; +const __aligned16 u32 g_minvals[4] = {0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff}; +const __aligned16 u32 g_maxvals[4] = {0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff}; +const __aligned16 u32 const_clip[8] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, + 0x80000000, 0x80000000, 0x80000000, 0x80000000}; + +const __aligned(64) u32 g_ones[4] = {0x00000001, 0x00000001, 0x00000001, 0x00000001}; + +const __aligned16 u32 g_minvals_XYZW[16][4] = +{ + { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //0000 + { 0xffffffff, 0xffffffff, 0xffffffff, 0xff7fffff }, //0001 + { 0xffffffff, 0xffffffff, 0xff7fffff, 0xffffffff }, //0010 + { 0xffffffff, 0xffffffff, 0xff7fffff, 0xff7fffff }, //0011 + { 0xffffffff, 0xff7fffff, 0xffffffff, 0xffffffff }, //0100 + { 0xffffffff, 0xff7fffff, 0xffffffff, 0xff7fffff }, //0101 + { 0xffffffff, 0xff7fffff, 0xff7fffff, 0xffffffff }, //0110 + { 0xffffffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //0111 + { 0xff7fffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000 + { 0xff7fffff, 0xffffffff, 0xffffffff, 0xff7fffff }, //1001 + { 0xff7fffff, 0xffffffff, 0xff7fffff, 0xffffffff }, //1010 + { 0xff7fffff, 0xffffffff, 0xff7fffff, 0xff7fffff }, //1011 + { 0xff7fffff, 0xff7fffff, 0xffffffff, 0xffffffff }, //1100 + { 0xff7fffff, 0xff7fffff, 0xffffffff, 0xff7fffff }, //1101 + { 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xffffffff }, //1110 + { 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //1111 +}; +const __aligned16 u32 g_maxvals_XYZW[16][4] = +{ + { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //0000 + { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7f7fffff }, //0001 + { 0x7fffffff, 0x7fffffff, 0x7f7fffff, 0x7fffffff }, //0010 + { 0x7fffffff, 0x7fffffff, 0x7f7fffff, 0x7f7fffff }, //0011 + { 0x7fffffff, 0x7f7fffff, 0x7fffffff, 0x7fffffff }, //0100 + { 0x7fffffff, 0x7f7fffff, 0x7fffffff, 0x7f7fffff }, //0101 + { 0x7fffffff, 0x7f7fffff, 0x7f7fffff, 0x7fffffff }, //0110 + { 0x7fffffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //0111 + { 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1000 + { 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7f7fffff }, //1001 + { 0x7f7fffff, 0x7fffffff, 0x7f7fffff, 0x7fffffff }, //1010 + { 0x7f7fffff, 0x7fffffff, 0x7f7fffff, 0x7f7fffff }, //1011 + { 0x7f7fffff, 0x7f7fffff, 0x7fffffff, 0x7fffffff }, //1100 + { 0x7f7fffff, 0x7f7fffff, 0x7fffffff, 0x7f7fffff 
}, //1101 + { 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7fffffff }, //1110 + { 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //1111 +}; +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// VU Pipeline/Test Stalls/Analyzing Functions +//------------------------------------------------------------------ +void _recvuFMACflush(VURegs * VU, bool intermediate) { + int i; + + for (i=0; i<8; i++) { + if (VU->fmac[i].enable == 0) continue; + + if( intermediate ) { + if ((vucycle - VU->fmac[i].sCycle) > VU->fmac[i].Cycle) { +// VUM_LOG("flushing FMAC pipe[%d]", i); + VU->fmac[i].enable = 0; + } + } + else { + if ((vucycle - VU->fmac[i].sCycle) >= VU->fmac[i].Cycle) { +// VUM_LOG("flushing FMAC pipe[%d]", i); + VU->fmac[i].enable = 0; + } + } + } +} + +void _recvuFDIVflush(VURegs * VU, bool intermediate) { + if (VU->fdiv.enable == 0) return; + + if( intermediate ) { + if ((vucycle - VU->fdiv.sCycle) > VU->fdiv.Cycle) { +// Console.WriteLn("flushing FDIV pipe"); + VU->fdiv.enable = 0; + } + } + else { + if ((vucycle - VU->fdiv.sCycle) >= VU->fdiv.Cycle) { +// Console.WriteLn("flushing FDIV pipe"); + VU->fdiv.enable = 0; + } + } +} + +void _recvuEFUflush(VURegs * VU, bool intermediate) { + if (VU->efu.enable == 0) return; + + if( intermediate ) { + if ((vucycle - VU->efu.sCycle) > VU->efu.Cycle) { +// Console.WriteLn("flushing FDIV pipe"); + VU->efu.enable = 0; + } + } + else { + if ((vucycle - VU->efu.sCycle) >= VU->efu.Cycle) { +// Console.WriteLn("flushing FDIV pipe"); + VU->efu.enable = 0; + } + } +} + +void _recvuIALUflush(VURegs * VU, bool intermediate) { + int i; + + for (i=0; i<8; i++) { + if (VU->ialu[i].enable == 0) continue; + + if( intermediate ) { + if ((vucycle - VU->ialu[i].sCycle) > VU->ialu[i].Cycle) { +// VUM_LOG("flushing IALU pipe[%d]", i); + VU->ialu[i].enable = 0; + } + } + else { + if ((vucycle - VU->ialu[i].sCycle) >= VU->ialu[i].Cycle) { +// VUM_LOG("flushing IALU pipe[%d]", i); + VU->ialu[i].enable = 0; + } + } + } +} + +void _recvuTestPipes(VURegs * VU, bool intermediate) { // intermediate = true if called by upper FMAC stall detection + _recvuFMACflush(VU, intermediate); + _recvuFDIVflush(VU, intermediate); + _recvuEFUflush(VU, intermediate); + _recvuIALUflush(VU, intermediate); +} + +void _recvuFMACTestStall(VURegs * VU, int reg, int xyzw) { + int cycle; + int i; + u32 mask = 0; + + for (i=0; i<8; i++) { + if (VU->fmac[i].enable == 0) continue; + if (VU->fmac[i].reg == reg && (VU->fmac[i].xyzw & xyzw)) break; + } + + if (i == 8) return; + + // do a perchannel delay + // old code +// cycle = VU->fmac[i].Cycle - (vucycle - VU->fmac[i].sCycle); + + // new code + mask = 4; // w +// if( VU->fmac[i].xyzw & 1 ) mask = 4; // w +// else if( VU->fmac[i].xyzw & 2 ) mask = 3; // z +// else if( VU->fmac[i].xyzw & 4 ) mask = 2; // y +// else { +// assert(VU->fmac[i].xyzw & 8 ); +// mask = 1; // x +// } + +// mask = 0; +// if( VU->fmac[i].xyzw & 1 ) mask++; // w +// else if( VU->fmac[i].xyzw & 2 ) mask++; // z +// else if( VU->fmac[i].xyzw & 4 ) mask++; // y +// else if( VU->fmac[i].xyzw & 8 ) mask++; // x + + assert( (int)VU->fmac[i].sCycle < (int)vucycle ); + cycle = 0; + if( vucycle - VU->fmac[i].sCycle < mask ) + cycle = mask - (vucycle - VU->fmac[i].sCycle); + + VU->fmac[i].enable = 0; + vucycle+= cycle; + _recvuTestPipes(VU, true); // for lower instructions +} + +void _recvuIALUTestStall(VURegs * VU, int reg) { + int cycle; + int i; + u32 latency; + + for (i=0; i<8; i++) { + if 
(VU->ialu[i].enable == 0) continue;
+ if (VU->ialu[i].reg == reg) break;
+ }
+
+ if (i == 8) return;
+
+ latency = VU->ialu[i].Cycle + 1;
+ cycle = 0;
+ if( vucycle - VU->ialu[i].sCycle < latency )
+ cycle = latency - (vucycle - VU->ialu[i].sCycle);
+
+ VU->ialu[i].enable = 0;
+ vucycle+= cycle;
+ _recvuTestPipes(VU, true);
+}
+
+void _recvuFMACAdd(VURegs * VU, int reg, int xyzw) {
+ int i;
+
+ /* find a free fmac pipe */
+ for (i=0; i<8; i++) {
+ if (VU->fmac[i].enable == 1) continue;
+ break;
+ }
+
+ if (i==8) Console.Error("*PCSX2*: error, out of fmacs");
+// VUM_LOG("adding FMAC pipe[%d]; reg %d", i, reg);
+
+ VU->fmac[i].enable = 1;
+ VU->fmac[i].sCycle = vucycle;
+ VU->fmac[i].Cycle = 3;
+ VU->fmac[i].xyzw = xyzw;
+ VU->fmac[i].reg = reg;
+}
+
+void _recvuFDIVAdd(VURegs * VU, int cycles) {
+// Console.WriteLn("adding FDIV pipe");
+ VU->fdiv.enable = 1;
+ VU->fdiv.sCycle = vucycle;
+ VU->fdiv.Cycle = cycles;
+}
+
+void _recvuEFUAdd(VURegs * VU, int cycles) {
+// Console.WriteLn("adding EFU pipe");
+ VU->efu.enable = 1;
+ VU->efu.sCycle = vucycle;
+ VU->efu.Cycle = cycles;
+}
+
+void _recvuIALUAdd(VURegs * VU, int reg, int cycles) {
+ int i;
+
+ /* find a free ialu pipe */
+ for (i=0; i<8; i++) {
+ if (VU->ialu[i].enable == 1) continue;
+ break;
+ }
+
+ if (i==8) Console.Error("*PCSX2*: error, out of ialus");
+
+ VU->ialu[i].enable = 1;
+ VU->ialu[i].sCycle = vucycle;
+ VU->ialu[i].Cycle = cycles;
+ VU->ialu[i].reg = reg;
+}
+
+void _recvuTestIALUStalls(VURegs * VU, _VURegsNum *VUregsn) {
+
+ int VIread0 = 0, VIread1 = 0; // max 2 integer registers are read simultaneously
+ int i;
+
+ for(i=0;i<16;i++) { // find used integer(vi00-vi15) registers
+ if( (VUregsn->VIread >> i) & 1 ) {
+ if( VIread0 ) VIread1 = i;
+ else VIread0 = i;
+ }
+ }
+
+ if( VIread0 ) _recvuIALUTestStall(VU, VIread0);
+ if( VIread1 ) _recvuIALUTestStall(VU, VIread1);
+}
+
+void _recvuAddIALUStalls(VURegs * VU, _VURegsNum *VUregsn) {
+ if (VUregsn->VIwrite && VUregsn->cycles) {
+ int VIWrite0 = 0;
+ int i;
+
+ for(i=0;i<16;i++) { // find used(vi00-vi15) registers
+ if( (VUregsn->VIwrite >> i) & 1 ) {
+ VIWrite0 = i;
+ }
+ }
+ if( VIWrite0 ) _recvuIALUAdd(VU, VIWrite0, VUregsn->cycles);
+ }
+}
+
+void _recvuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn, bool upper) {
+
+ if( VUregsn->VFread0 && (VUregsn->VFread0 == VUregsn->VFread1) ) {
+ _recvuFMACTestStall(VU, VUregsn->VFread0, VUregsn->VFr0xyzw|VUregsn->VFr1xyzw);
+ }
+ else {
+ if (VUregsn->VFread0) _recvuFMACTestStall(VU, VUregsn->VFread0, VUregsn->VFr0xyzw);
+ if (VUregsn->VFread1) _recvuFMACTestStall(VU, VUregsn->VFread1, VUregsn->VFr1xyzw);
+ }
+
+ if( !upper && VUregsn->VIread ) _recvuTestIALUStalls(VU, VUregsn); // for lower instructions which read integer reg
+}
+
+void _recvuAddFMACStalls(VURegs * VU, _VURegsNum *VUregsn) {
+
+ if (VUregsn->VFwrite) _recvuFMACAdd(VU, VUregsn->VFwrite, VUregsn->VFwxyzw);
+ else if (VUregsn->VIwrite & (1 << REG_CLIP_FLAG)) _recvuFMACAdd(VU, -REG_CLIP_FLAG, 0); // REG_CLIP_FLAG pipe
+ else _recvuFMACAdd(VU, 0, 0); // cause no data dependency with fp registers
+}
+
+void _recvuFlushFDIV(VURegs * VU) {
+ int cycle;
+
+ if (VU->fdiv.enable == 0) return;
+
+ cycle = VU->fdiv.Cycle + 1 - (vucycle - VU->fdiv.sCycle); //VU->fdiv.Cycle contains the latency minus 1 (6 or 12)
+// Console.WriteLn("waiting FDIV pipe %d", cycle);
+ VU->fdiv.enable = 0;
+ vucycle+= cycle;
+}
+
+void _recvuFlushEFU(VURegs * VU) {
+ int cycle;
+
+ if (VU->efu.enable == 0) return;
+
+ cycle = VU->efu.Cycle - (vucycle - VU->efu.sCycle);
+// 
Console.WriteLn("waiting FDIV pipe %d", cycle); + VU->efu.enable = 0; + vucycle+= cycle; +} + +void _recvuTestFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) { + _recvuTestFMACStalls(VU,VUregsn, false); + _recvuFlushFDIV(VU); +} + +void _recvuTestEFUStalls(VURegs * VU, _VURegsNum *VUregsn) { + _recvuTestFMACStalls(VU,VUregsn, false); + _recvuFlushEFU(VU); +} + +void _recvuAddFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) { +// _vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn); + if (VUregsn->VIwrite & (1 << REG_Q)) { + _recvuFDIVAdd(VU, VUregsn->cycles); + } +} + +void _recvuAddEFUStalls(VURegs * VU, _VURegsNum *VUregsn) { +// _vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn); + if (VUregsn->VIwrite & (1 << REG_P)) { + _recvuEFUAdd(VU, VUregsn->cycles); + } +} + +void _recvuTestUpperStalls(VURegs * VU, _VURegsNum *VUregsn) { + switch (VUregsn->pipe) { + case VUPIPE_FMAC: _recvuTestFMACStalls(VU, VUregsn, true); break; + } +} + +void _recvuTestLowerStalls(VURegs * VU, _VURegsNum *VUregsn) { + switch (VUregsn->pipe) { + case VUPIPE_FMAC: _recvuTestFMACStalls(VU, VUregsn, false); break; + case VUPIPE_FDIV: _recvuTestFDIVStalls(VU, VUregsn); break; + case VUPIPE_EFU: _recvuTestEFUStalls(VU, VUregsn); break; + case VUPIPE_IALU: _recvuTestIALUStalls(VU, VUregsn); break; + case VUPIPE_BRANCH: _recvuTestIALUStalls(VU, VUregsn); break; + } +} + +void _recvuAddUpperStalls(VURegs * VU, _VURegsNum *VUregsn) { + switch (VUregsn->pipe) { + case VUPIPE_FMAC: _recvuAddFMACStalls(VU, VUregsn); break; + } +} + +void _recvuAddLowerStalls(VURegs * VU, _VURegsNum *VUregsn) { + switch (VUregsn->pipe) { + case VUPIPE_FMAC: _recvuAddFMACStalls(VU, VUregsn); break; + case VUPIPE_FDIV: _recvuAddFDIVStalls(VU, VUregsn); break; + case VUPIPE_EFU: _recvuAddEFUStalls(VU, VUregsn); break; + case VUPIPE_IALU: _recvuAddIALUStalls(VU, VUregsn); break; // note: only ILW and ILWR cause stall in IALU pipe + } +} + +void SuperVUAnalyzeOp(VURegs *VU, _vuopinfo *info, _VURegsNum* pCodeRegs) +{ + _VURegsNum* lregs; + _VURegsNum* uregs; + int *ptr; + + lregs = pCodeRegs; + uregs = pCodeRegs+1; + + ptr = (int*)&VU->Micro[pc]; + pc += 8; + + if (ptr[1] & 0x40000000) { // EOP + branch |= 8; + } + + VU->code = ptr[1]; + if (VU == &VU1) VU1regs_UPPER_OPCODE[VU->code & 0x3f](uregs); + else VU0regs_UPPER_OPCODE[VU->code & 0x3f](uregs); + + _recvuTestUpperStalls(VU, uregs); + switch(VU->code & 0x3f) { + case 0x10: case 0x11: case 0x12: case 0x13: + case 0x14: case 0x15: case 0x16: case 0x17: + case 0x1d: case 0x1f: + case 0x2b: case 0x2f: + break; + + case 0x3c: + switch ((VU->code >> 6) & 0x1f) { + case 0x4: case 0x5: + break; + default: + info->statusflag = 4; + info->macflag = 4; + break; + } + break; + case 0x3d: + switch ((VU->code >> 6) & 0x1f) { + case 0x4: case 0x5: case 0x7: + break; + default: + info->statusflag = 4; + info->macflag = 4; + break; + } + break; + case 0x3e: + switch ((VU->code >> 6) & 0x1f) { + case 0x4: case 0x5: + break; + default: + info->statusflag = 4; + info->macflag = 4; + break; + } + break; + case 0x3f: + switch ((VU->code >> 6) & 0x1f) { + case 0x4: case 0x5: case 0x7: case 0xb: + break; + default: + info->statusflag = 4; + info->macflag = 4; + break; + } + break; + + default: + info->statusflag = 4; + info->macflag = 4; + break; + } + + if (uregs->VIread & (1 << REG_Q)) { info->q |= 2; } + if (uregs->VIread & (1 << REG_P)) { info->p |= 2; assert( VU == &VU1 ); } + + // check upper flags + if (ptr[1] & 0x80000000) { // I flag + info->cycle = vucycle; + memzero(*lregs); + } + else { + + VU->code = ptr[0]; 
+ if (VU == &VU1) VU1regs_LOWER_OPCODE[VU->code >> 25](lregs);
+ else VU0regs_LOWER_OPCODE[VU->code >> 25](lregs);
+
+ _recvuTestLowerStalls(VU, lregs);
+ info->cycle = vucycle;
+
+ if (lregs->pipe == VUPIPE_BRANCH) {
+ branch |= 1;
+ }
+
+ if (lregs->VIwrite & (1 << REG_Q)) {
+ info->q |= 4;
+ info->cycles = lregs->cycles;
+ info->pqinst = (VU->code&2)>>1; // rsqrt is 2
+ }
+ else if (lregs->pipe == VUPIPE_FDIV) {
+ info->q |= 8|1;
+ info->pqinst = 0;
+ }
+
+ if (lregs->VIwrite & (1 << REG_P)) {
+ assert( VU == &VU1 );
+ info->p |= 4;
+ info->cycles = lregs->cycles;
+
+ switch( VU->code & 0xff ) {
+ case 0xfd: info->pqinst = 0; break; //eatan
+ case 0x7c: info->pqinst = 0; break; //eatanxy
+ case 0x7d: info->pqinst = 0; break; //eatanzy
+ case 0xfe: info->pqinst = 1; break; //eexp
+ case 0xfc: info->pqinst = 2; break; //esin
+ case 0x3f: info->pqinst = 3; break; //erleng
+ case 0x3e: info->pqinst = 4; break; //eleng
+ case 0x3d: info->pqinst = 4; break; //ersadd
+ case 0xbd: info->pqinst = 4; break; //ersqrt
+ case 0xbe: info->pqinst = 5; break; //ercpr
+ case 0xbc: info->pqinst = 5; break; //esqrt
+ case 0x7e: info->pqinst = 5; break; //esum
+ case 0x3c: info->pqinst = 6; break; //esadd
+ default: assert(0);
+ }
+ }
+ else if (lregs->pipe == VUPIPE_EFU) {
+ info->p |= 8|1;
+ }
+
+ if (lregs->VIread & (1 << REG_STATUS_FLAG)) info->statusflag|= VUOP_READ;
+ if (lregs->VIread & (1 << REG_MAC_FLAG)) info->macflag|= VUOP_READ;
+
+ if (lregs->VIwrite & (1 << REG_STATUS_FLAG)) info->statusflag|= VUOP_WRITE;
+ if (lregs->VIwrite & (1 << REG_MAC_FLAG)) info->macflag|= VUOP_WRITE;
+
+ if (lregs->VIread & (1 << REG_Q)) { info->q |= 2; }
+ if (lregs->VIread & (1 << REG_P)) { info->p |= 2; assert( VU == &VU1 ); }
+
+ _recvuAddLowerStalls(VU, lregs);
+ }
+
+ _recvuAddUpperStalls(VU, uregs);
+ _recvuTestPipes(VU, false);
+
+ vucycle++;
+}
+
+int eeVURecompileCode(VURegs *VU, _VURegsNum* regs)
+{
+ int info = 0;
+ int vfread0=-1, vfread1 = -1, vfwrite = -1, vfacc = -1, vftemp=-1;
+
+ assert( regs != NULL );
+
+ if( regs->VFread0 ) _addNeededVFtoXMMreg(regs->VFread0);
+ if( regs->VFread1 ) _addNeededVFtoXMMreg(regs->VFread1);
+ if( regs->VFwrite ) _addNeededVFtoXMMreg(regs->VFwrite);
+ if( regs->VIread & (1<<REG_ACC_FLAG) ) _addNeededACCtoXMMreg();
+ if( regs->VIread & (1<<REG_VF0_FLAG) ) _addNeededVFtoXMMreg(0);
+
+ if( regs->VFread0 ) vfread0 = _allocVFtoXMMreg(VU, -1, regs->VFread0, MODE_READ);
+ else if( regs->VIread & (1<<REG_VF0_FLAG) ) vfread0 = _allocVFtoXMMreg(VU, -1, 0, MODE_READ);
+ if( regs->VFread1 ) vfread1 = _allocVFtoXMMreg(VU, -1, regs->VFread1, MODE_READ);
+ else if( (regs->VIread & (1<<REG_VF0_FLAG)) && regs->VFr1xyzw != 0xff) vfread1 = _allocVFtoXMMreg(VU, -1, 0, MODE_READ);
+
+ if( regs->VIread & (1<<REG_ACC_FLAG) ) {
+ vfacc = _allocACCtoXMMreg(VU, -1, ((regs->VIwrite&(1<<REG_ACC_FLAG))?MODE_WRITE:0)|MODE_READ);
+ }
+ else if( regs->VIwrite & (1<<REG_ACC_FLAG) ) {
+ vfacc = _allocACCtoXMMreg(VU, -1, MODE_WRITE|(regs->VFwxyzw != 0xf?MODE_READ:0));
+ }
+
+ if( regs->VFwrite ) {
+ assert( !(regs->VIwrite&(1<<REG_VF0_FLAG)) );
+ vfwrite = _allocVFtoXMMreg(VU, -1, regs->VFwrite, MODE_WRITE|(regs->VFwxyzw != 0xf?MODE_READ:0));
+ }
+
+ if( vfacc>= 0 ) info |= PROCESS_EE_SET_ACC(vfacc);
+ if( vfwrite >= 0 ) {
+ if( regs->VFwrite == _Ft_ && vfread1 < 0 ) {
+ info |= PROCESS_EE_SET_T(vfwrite);
+ }
+ else {
+ assert( regs->VFwrite == _Fd_ );
+ info |= PROCESS_EE_SET_D(vfwrite);
+ }
+ }
+
+ if( vfread0 >= 0 ) info |= PROCESS_EE_SET_S(vfread0);
+ if( vfread1 >= 0 ) info |= PROCESS_EE_SET_T(vfread1);
+
+ vftemp = _allocTempXMMreg(XMMT_FPS, -1);
+ info |= PROCESS_VU_SET_TEMP(vftemp);
+
+ if( regs->VIwrite & (1 << REG_CLIP_FLAG) ) {
+ // CLIP inst, need two extra temp registers, put it EEREC_D and EEREC_ACC
+ int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
+ int t2reg = _allocTempXMMreg(XMMT_FPS, -1);
+
+ info |= PROCESS_EE_SET_D(t1reg);
+ info |= PROCESS_EE_SET_ACC(t2reg);
+
+ _freeXMMreg(t1reg); // don't need
+ _freeXMMreg(t2reg); // don't need
+ }
+ else if( regs->VIwrite & (1<<REG_P) ) {
+ // EFU inst, need extra reg
+ int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
+ info |= PROCESS_EE_SET_D(t1reg);
+ _freeXMMreg(t1reg); // don't need
+ }
+
+ return info;
+}
+
+// returns the correct VI addr
+u32 GetVIAddr(VURegs * VU, int reg, int read, int info)
+{
+ if( info & PROCESS_VU_SUPER ) return SuperVUGetVIAddr(reg, read);
+ if( info & PROCESS_VU_COP2 ) return (uptr)&VU->VI[reg].UL;
+
+ if( read != 1 ) {
+ if( reg == REG_MAC_FLAG ) return (uptr)&VU->macflag;
+ if( reg == REG_CLIP_FLAG ) return (uptr)&VU->clipflag;
+ if( reg == REG_STATUS_FLAG ) return (uptr)&VU->statusflag;
+ if( reg == REG_Q ) return (uptr)&VU->q;
+ if( reg == REG_P ) return (uptr)&VU->p;
+ }
+
+ return (uptr)&VU->VI[reg].UL;
+}
+
+// gets a temp reg that is not EEREC_TEMP
+int _vuGetTempXMMreg(int info)
+{
+ int t1reg = -1;
+
+ if( _hasFreeXMMreg() ) {
+ t1reg = _allocTempXMMreg(XMMT_FPS, -1);
+
+ if( t1reg == EEREC_TEMP ) {
+ if( _hasFreeXMMreg() ) {
+ int t = _allocTempXMMreg(XMMT_FPS, -1);
+ _freeXMMreg(t1reg);
+ t1reg = t;
+ }
+ else {
+ _freeXMMreg(t1reg);
+ t1reg = -1;
+ }
+ }
+ }
+
+ return t1reg;
+}
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// Misc VU Reg Flipping/Merging Functions
+//------------------------------------------------------------------
+void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw)
+{
+ switch (xyzw) {
+ case 0: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x00); break;
+ case 1: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x55); break;
+ case 2: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xaa); break;
+ case 3: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xff); break;
+ }
+}
+
+void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw)
+{
+ switch (xyzw) {
+ case 0: SSE_MOVSS_XMM_to_XMM(dstreg, srcreg); break;
+ case 1: if ( x86caps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(1, 0, 0));
+ else SSE2_PSHUFLW_XMM_to_XMM(dstreg, srcreg, 0xee);
+ break;
+ case 2: SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); break;
+ case 3: if ( x86caps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(3, 0, 0));
+ else { SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); SSE2_PSHUFLW_XMM_to_XMM(dstreg, dstreg, 0xee); }
+ break;
+ }
+}
+
+void _vuFlipRegSS(VURegs * VU, int reg)
+{
+ assert( _XYZW_SS );
+ if( _Y ) SSE2_PSHUFLW_XMM_to_XMM(reg, reg, 0x4e);
+ else if( _Z ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6);
+ else if( _W ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27);
+}
+
+void _vuFlipRegSS_xyzw(int reg, int xyzw)
+{
+ switch ( xyzw ) {
+ case 1: SSE2_PSHUFLW_XMM_to_XMM(reg, reg, 0x4e); break;
+ case 2: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6); break;
+ case 3: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27); break;
+ }
+}
+
+void _vuMoveSS(VURegs * VU, int dstreg, int srcreg)
+{
+ assert( _XYZW_SS );
+ if( _Y ) _unpackVFSS_xyzw(dstreg, srcreg, 1);
+ else if( _Z ) _unpackVFSS_xyzw(dstreg, srcreg, 2);
+ else if( _W ) _unpackVFSS_xyzw(dstreg, srcreg, 3);
+ else _unpackVFSS_xyzw(dstreg, srcreg, 0);
+}
+
+// 1 - src, 0 - dest wzyx
+void VU_MERGE0(int dest, int src) { // 0000s
+}
+void VU_MERGE1(int dest, int src) { // 1000
+ SSE_MOVHLPS_XMM_to_XMM(src, dest);
+ SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4);
+}
+void VU_MERGE1b(int dest, int src) { // 1000s
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
+}
+void VU_MERGE2(int dest, int src) { // 0100
+ SSE_MOVHLPS_XMM_to_XMM(src, dest);
+ SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64);
+}
+void VU_MERGE2b(int dest, int src) { // 0100s
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
+ SSE_MOVSS_XMM_to_XMM(dest, src);
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
+ SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
+}
+void 
VU_MERGE3(int dest, int src) { // 1100s + SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); +} +void VU_MERGE4(int dest, int src) { // 0010 + SSE_MOVSS_XMM_to_XMM(src, dest); + SSE2_MOVSD_XMM_to_XMM(dest, src); +} +void VU_MERGE4b(int dest, int src) { // 0010s + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); +} +void VU_MERGE5(int dest, int src) { // 1010 + SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd8); +} +void VU_MERGE5b(int dest, int src) { // 1010s + SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); +} +void VU_MERGE6(int dest, int src) { // 0110 + SSE_SHUFPS_XMM_to_XMM(dest, src, 0x9c); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x78); +} +void VU_MERGE6b(int dest, int src) { // 0110s + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); +} +void VU_MERGE7(int dest, int src) { // 1110 + SSE_MOVSS_XMM_to_XMM(src, dest); + SSE_MOVAPS_XMM_to_XMM(dest, src); +} +void VU_MERGE7b(int dest, int src) { // 1110s + SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); +} +void VU_MERGE8(int dest, int src) { // 0001s + SSE_MOVSS_XMM_to_XMM(dest, src); +} +void VU_MERGE9(int dest, int src) { // 1001 + SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc9); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd2); +} +void VU_MERGE9b(int dest, int src) { // 1001s + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); +} +void VU_MERGE10(int dest, int src) { // 0101 + SSE_SHUFPS_XMM_to_XMM(dest, src, 0x8d); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x72); +} +void VU_MERGE10b(int dest, int src) { // 0101s + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); +} +void VU_MERGE11(int dest, int src) { // 1101s + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); +} +void VU_MERGE12(int dest, int src) { // 0011 + SSE2_MOVSD_XMM_to_XMM(dest, src); +} +void VU_MERGE13(int dest, int src) { // 1011 + SSE_MOVHLPS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, dest, 0x64); + SSE_MOVAPS_XMM_to_XMM(dest, src); +} +void VU_MERGE13b(int dest, int src) { // 1011s + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); + SSE_MOVSS_XMM_to_XMM(dest, src); 
+ SSE_SHUFPS_XMM_to_XMM(src, src, 0x27); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); +} +void VU_MERGE14(int dest, int src) { // 0111 + SSE_MOVHLPS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, dest, 0xc4); + SSE_MOVAPS_XMM_to_XMM(dest, src); +} +void VU_MERGE14b(int dest, int src) { // 0111s + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); + SSE_MOVSS_XMM_to_XMM(dest, src); + SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6); + SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6); +} +void VU_MERGE15(int dest, int src) { // 1111s + SSE_MOVAPS_XMM_to_XMM(dest, src); +} + +typedef void (*VUMERGEFN)(int dest, int src); + +static VUMERGEFN s_VuMerge[16] = { + VU_MERGE0, VU_MERGE1, VU_MERGE2, VU_MERGE3, + VU_MERGE4, VU_MERGE5, VU_MERGE6, VU_MERGE7, + VU_MERGE8, VU_MERGE9, VU_MERGE10, VU_MERGE11, + VU_MERGE12, VU_MERGE13, VU_MERGE14, VU_MERGE15 }; + +static VUMERGEFN s_VuMerge2[16] = { + VU_MERGE0, VU_MERGE1b, VU_MERGE2b, VU_MERGE3, + VU_MERGE4b, VU_MERGE5b, VU_MERGE6b, VU_MERGE7b, + VU_MERGE8, VU_MERGE9b, VU_MERGE10b, VU_MERGE11, + VU_MERGE12, VU_MERGE13b, VU_MERGE14b, VU_MERGE15 }; + +// Modifies the Source Reg! +void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw) { + xyzw &= 0xf; + if ( (dest != src) && (xyzw != 0) ) { + if ( x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) { + xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3); + SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw); + } + else s_VuMerge[xyzw](dest, src); + } +} +// Doesn't Modify the Source Reg! 
(ToDo: s_VuMerge2() has room for optimization) +void VU_MERGE_REGS_SAFE(int dest, int src, int xyzw) { + xyzw &= 0xf; + if ( (dest != src) && (xyzw != 0) ) { + if ( x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) { + xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3); + SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw); + } + else s_VuMerge2[xyzw](dest, src); + } +} +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// Misc VU Reg Clamping/Overflow Functions +//------------------------------------------------------------------ +#define CLAMP_NORMAL_SSE4(n) \ + SSE_MOVAPS_XMM_to_XMM(regTemp, regd);\ + SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[n][0]);\ + SSE2_PSUBD_XMM_to_XMM(regTemp, regd);\ + SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_ones[0]);\ + SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[n][0]);\ + SSE2_PSLLD_I8_to_XMM(regTemp, 31);\ + SSE_XORPS_XMM_to_XMM(regd, regTemp); + +#define CLAMP_SIGN_SSE4(n) \ + SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[n][0]);\ + SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[n][0]); + +void vFloat0(int regd, int regTemp) { } //0000 +void vFloat1(int regd, int regTemp) { //1000 + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); +} +void vFloat1c(int regd, int regTemp) { //1000 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(1); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat2(int regd, int regTemp) { //0100 + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); +} +void vFloat2c(int regd, int regTemp) { //0100 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(2); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat3(int regd, int regTemp) { //1100 + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); +} +void vFloat3b(int regd, int regTemp) { //1100 //regTemp is Modified + SSE2_MOVSD_XMM_to_XMM(regTemp, regd); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + SSE2_MOVSD_XMM_to_XMM(regd, regTemp); +} +void vFloat3c(int regd, int regTemp) { //1100 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(3); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + 
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat4(int regd, int regTemp) { //0010 + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); +} +void vFloat4c(int regd, int regTemp) { //0010 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(4); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat5(int regd, int regTemp) { //1010 + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); +} +void vFloat5b(int regd, int regTemp) { //1010 //regTemp is Modified + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_NORMAL_SSE4(5); + } + else { + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); + } +} +void vFloat5c(int regd, int regTemp) { //1010 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(5); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat6(int regd, int regTemp) { //0110 + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); +} +void vFloat6b(int regd, int regTemp) { //0110 //regTemp is Modified + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_NORMAL_SSE4(6); + } + else { + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); + } +} +void vFloat6c(int regd, int regTemp) { //0110 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(6); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, 
(uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat7(int regd, int regTemp) { //1110 + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39); +} +void vFloat7_useEAX(int regd, int regTemp) { //1110 //EAX is Modified + SSE2_MOVD_XMM_to_R(EAX, regd); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + if ( x86caps.hasStreamingSIMD4Extensions ) + SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00); + else { + SSE_PINSRW_R32_to_XMM(regd, EAX, 0); + SHR32ItoR(EAX, 16); + SSE_PINSRW_R32_to_XMM(regd, EAX, 1); + } +} +void vFloat7b(int regd, int regTemp) { //1110 //regTemp is Modified + SSE_MOVSS_XMM_to_XMM(regTemp, regd); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + SSE_MOVSS_XMM_to_XMM(regd, regTemp); +} +void vFloat7c(int regd, int regTemp) { //1110 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(7); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat7c_useEAX(int regd, int regTemp) { //1110 //EAX is Modified + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(7); + } + else { + SSE2_MOVD_XMM_to_R(EAX, regd); + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + SSE2_MOVD_R_to_XMM(regTemp, EAX); + SSE_MOVSS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat8(int regd, int regTemp) { //0001 + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); +} +void vFloat8c(int regd, int regTemp) { //0001 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(8); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat9(int regd, int regTemp) { //1001 + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); +} +void vFloat9b(int regd, int regTemp) { //1001 
//regTemp is Modified + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_NORMAL_SSE4(9); + } + else { + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + } +} +void vFloat9c(int regd, int regTemp) { //1001 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(9); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat10(int regd, int regTemp) { //0101 + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); +} +void vFloat10b(int regd, int regTemp) { //0101 //regTemp is Modified + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_NORMAL_SSE4(10); + } + else { + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + } +} +void vFloat10c(int regd, int regTemp) { //0101 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(10); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat11(int regd, int regTemp) { //1101 + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); +} +void vFloat11_useEAX(int regd, int regTemp) { //1101 //EAX is Modified + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE2_MOVD_XMM_to_R(EAX, regd); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + if ( x86caps.hasStreamingSIMD4Extensions ) + SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00); + else { + SSE_PINSRW_R32_to_XMM(regd, EAX, 0); + SHR32ItoR(EAX, 16); + SSE_PINSRW_R32_to_XMM(regd, EAX, 1); + } + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); +} +void vFloat11b(int regd, int regTemp) { //1101 //regTemp is Modified + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + SSE_MOVSS_XMM_to_XMM(regTemp, regd); + SSE2_MOVSD_XMM_to_XMM(regd, regTemp); +} +void vFloat11c(int regd, int regTemp) { //1101 + if ( 
x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(11); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat11c_useEAX(int regd, int regTemp) { //1101 // EAX is modified + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(11); + } + else { + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE2_MOVD_XMM_to_R(EAX, regd); + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + SSE2_MOVD_R_to_XMM(regTemp, EAX); + SSE_MOVSS_XMM_to_XMM(regd, regTemp); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + } +} +void vFloat12(int regd, int regTemp) { //0011 + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); +} +void vFloat12b(int regd, int regTemp) { //0011 //regTemp is Modified + SSE_MOVHLPS_XMM_to_XMM(regTemp, regd); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + SSE2_PUNPCKLQDQ_XMM_to_XMM(regd, regTemp); +} +void vFloat12c(int regd, int regTemp) { //0011 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(12); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat13(int regd, int regTemp) { //1011 + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); +} +void vFloat13_useEAX(int regd, int regTemp) { //1011 // EAX is modified + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE2_MOVD_XMM_to_R(EAX, regd); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + if ( x86caps.hasStreamingSIMD4Extensions ) + SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00); + else { + SSE_PINSRW_R32_to_XMM(regd, EAX, 0); + SHR32ItoR(EAX, 16); + SSE_PINSRW_R32_to_XMM(regd, EAX, 1); + } + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); +} +void vFloat13b(int regd, int regTemp) { //1011 //regTemp is Modified + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + SSE_MOVHLPS_XMM_to_XMM(regTemp, regd); 
+ SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x64); +} +void vFloat13c(int regd, int regTemp) { //1011 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(13); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat13c_useEAX(int regd, int regTemp) { //1011 // EAX is modified + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(13); + } + else { + SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0xc6); + SSE2_MOVD_XMM_to_R(EAX, regd); + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + SSE2_MOVD_R_to_XMM(regTemp, EAX); + SSE_MOVSS_XMM_to_XMM(regd, regTemp); + SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0xc6); + } +} +void vFloat14(int regd, int regTemp) { //0111 + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); +} +void vFloat14_useEAX(int regd, int regTemp) { //0111 // EAX is modified + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); + SSE2_MOVD_XMM_to_R(EAX, regd); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + if ( x86caps.hasStreamingSIMD4Extensions ) + SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00); + else { + SSE_PINSRW_R32_to_XMM(regd, EAX, 0); + SHR32ItoR(EAX, 16); + SSE_PINSRW_R32_to_XMM(regd, EAX, 1); + } + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); +} +void vFloat14b(int regd, int regTemp) { //0111 //regTemp is Modified + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + SSE_MOVHLPS_XMM_to_XMM(regTemp, regd); + SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xc4); +} +void vFloat14c(int regd, int regTemp) { //0111 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(14); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); + SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} +void vFloat14c_useEAX(int regd, int regTemp) { //0111 // EAX is modified + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(14); + } + else { + SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0x27); + SSE2_MOVD_XMM_to_R(EAX, regd); + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + 
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + SSE2_MOVD_R_to_XMM(regTemp, EAX); + SSE_MOVSS_XMM_to_XMM(regd, regTemp); + SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0x27); + } +} +void vFloat15(int regd, int regTemp) { //1111 + SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); + SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); +} +void vFloat15c(int regd, int regTemp) { //1111 + if ( x86caps.hasStreamingSIMD4Extensions ) { + CLAMP_SIGN_SSE4(15); + } + else { + SSE_MOVAPS_XMM_to_XMM(regTemp, regd); + SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); + SSE_MINPS_M128_to_XMM(regd, (uptr)&g_maxvals[0]); + SSE_MAXPS_M128_to_XMM(regd, (uptr)&g_minvals[0]); + SSE_ORPS_XMM_to_XMM(regd, regTemp); + } +} + +vFloat vFloats1[16] = { //regTemp is not modified + vFloat0, vFloat1, vFloat2, vFloat3, + vFloat4, vFloat5, vFloat6, vFloat7, + vFloat8, vFloat9, vFloat10, vFloat11, + vFloat12, vFloat13, vFloat14, vFloat15 }; + +vFloat vFloats1_useEAX[16] = { //regTemp is not modified but EAX is used + vFloat0, vFloat1, vFloat2, vFloat3, + vFloat4, vFloat5, vFloat6, vFloat7_useEAX, + vFloat8, vFloat9, vFloat10, vFloat11_useEAX, + vFloat12, vFloat13_useEAX, vFloat14_useEAX, vFloat15 }; + +vFloat vFloats2[16] = { //regTemp is modified + vFloat0, vFloat1, vFloat2, vFloat3b, + vFloat4, vFloat5b, vFloat6b, vFloat7b, + vFloat8, vFloat9b, vFloat10b, vFloat11b, + vFloat12b, vFloat13b, vFloat14b, vFloat15 }; + +vFloat vFloats4[16] = { //regTemp is modified + vFloat0, vFloat1c, vFloat2c, vFloat3c, + vFloat4c, vFloat5c, vFloat6c, vFloat7c, + vFloat8c, vFloat9c, vFloat10c, vFloat11c, + vFloat12c, vFloat13c, vFloat14c, vFloat15c }; + +vFloat vFloats4_useEAX[16] = { //regTemp is modified and EAX is used + vFloat0, vFloat1c, vFloat2c, vFloat3c, + vFloat4c, vFloat5c, vFloat6c, vFloat7c_useEAX, + vFloat8c, vFloat9c, vFloat10c, vFloat11c_useEAX, + vFloat12c, vFloat13c_useEAX, vFloat14c_useEAX, vFloat15c }; + +//------------------------------------------------------------------ +// Clamping Functions (wrapper for vFloat* functions) +// vuFloat : "normal" clamping +// vuFloat_useEAX : "normal" clamping (faster but EAX is modified) +// vuFloat2 : "normal" clamping (fastest but regTemp is modified) +// vuFloat3 : "preserve sign" clamping for pointer +// vuFloat4 : "preserve sign" clamping (regTemp is modified; *FASTEST* on SSE4 CPUs) +// vuFloat4_useEAX : "preserve sign" clamping (faster but regTemp and EAX are modified) +// vuFloat5 : wrapper function for vuFloat2 and vuFloat4 +// vuFloat5_useEAX : wrapper function for vuFloat2 and vuFloat4_useEAX +// vuFloatExtra : for debugging +// +// Notice 1: vuFloat*_useEAX may be slower on AMD CPUs, which have independent execution pipeline for +// vector and scalar instructions (need checks) +// Notice 2: recVUMI_MUL_xyzw_toD and recVUMI_MADD_xyzw_toD use vFloats directly! 
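// For reference, a minimal sketch of how a recompiler op would pick among these
// wrappers. The helper recClampExample below is hypothetical and not part of this
// change; it reuses _vuGetTempXMMreg, _freeXMMreg, vuFloat5, vuFloat_useEAX and the
// _X_Y_Z_W write-mask macro defined in this file, and mirrors the commented-out
// path inside vuFloat below.
//
// // Hypothetical helper -- illustration only, not part of this change.
// // Clamps the write-masked lanes of regd using the fastest wrapper available.
// static void recClampExample(int info, int regd)
// {
//     int regTemp = _vuGetTempXMMreg(info);  // -1 when no XMM reg is free
//     if (regTemp >= 0) {
//         // Fast path: vuFloat5 dispatches to vuFloat4 (sign-preserving) or
//         // vuFloat2 depending on CHECK_VU_SIGN_OVERFLOW; regTemp is clobbered.
//         vuFloat5(regd, regTemp, _X_Y_Z_W);
//         _freeXMMreg(regTemp);
//     }
//     else {
//         // No spare register: clamp regd in place; EAX is clobbered instead.
//         vuFloat_useEAX(info, regd, _X_Y_Z_W);
//     }
// }
//
// vuFloat5 is preferred over calling vuFloat2 directly so that the sign-preserving
// variant is picked up automatically when CHECK_VU_SIGN_OVERFLOW is enabled.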
+ +// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (doesn't use any temp regs) +void vuFloat( int info, int regd, int XYZW) { + if( CHECK_VU_OVERFLOW ) { + /*if ( (XYZW != 0) && (XYZW != 8) && (XYZW != 0xF) ) { + int t1reg = _vuGetTempXMMreg(info); + if (t1reg >= 0) { + vuFloat2( regd, t1reg, XYZW ); + _freeXMMreg( t1reg ); + return; + } + }*/ + //vuFloatExtra(regd, XYZW); + vFloats1[XYZW](regd, regd); + } +} + +// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (uses EAX as a temp register; faster but **destroys EAX**) +void vuFloat_useEAX( int info, int regd, int XYZW) { + if( CHECK_VU_OVERFLOW ) { + vFloats1_useEAX[XYZW](regd, regd); + } +} + +// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (uses a temp reg) +void vuFloat2(int regd, int regTemp, int XYZW) { + if( CHECK_VU_OVERFLOW ) { + //vuFloatExtra(regd, XYZW); + vFloats2[XYZW](regd, regTemp); + } +} + +// Clamps +/-NaN and +/-Inf to +/-fMax (uses a temp reg) +void vuFloat4(int regd, int regTemp, int XYZW) { + if( CHECK_VU_OVERFLOW ) { + vFloats4[XYZW](regd, regTemp); + } +} + +// Clamps +/-NaN and +/-Inf to +/-fMax (uses a temp reg and EAX; faster but **destroys EAX**) +void vuFloat4_useEAX(int regd, int regTemp, int XYZW) { + if( CHECK_VU_OVERFLOW ) { + vFloats4_useEAX[XYZW](regd, regTemp); + } +} + +// Uses vuFloat4 or vuFloat2 depending on the CHECK_VU_SIGN_OVERFLOW setting +void vuFloat5(int regd, int regTemp, int XYZW) { + if (CHECK_VU_SIGN_OVERFLOW) { + vuFloat4(regd, regTemp, XYZW); + } + else vuFloat2(regd, regTemp, XYZW); +} + +// Uses vuFloat4_useEAX or vuFloat2 depending on the CHECK_VU_SIGN_OVERFLOW setting (uses EAX as a temp register; faster but **destroys EAX**) +void vuFloat5_useEAX(int regd, int regTemp, int XYZW) { + if (CHECK_VU_SIGN_OVERFLOW) { + vuFloat4_useEAX(regd, regTemp, XYZW); + } + else vuFloat2(regd, regTemp, XYZW); +} + +// Clamps +/-infs to +/-fMax, and +/-NaNs to +/-fMax +void vuFloat3(uptr x86ptr) { + u8* pjmp; + + if( CHECK_VU_OVERFLOW ) { + CMP32ItoM(x86ptr, 0x7f800000 ); + pjmp = JL8(0); // Signed Comparison + MOV32ItoM(x86ptr, 0x7f7fffff ); + x86SetJ8(pjmp); + + CMP32ItoM(x86ptr, 0xff800000 ); + pjmp = JB8(0); // Unsigned Comparison + MOV32ItoM(x86ptr, 0xff7fffff ); + x86SetJ8(pjmp); + } +}
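vuFloat3 is worth a closer look: it never loads the value into an XMM register, it patches the raw IEEE-754 bits in memory using the two compare/branch pairs emitted above. Positive Inf/NaN encodings are exactly the values that compare >= 0x7f800000 when the bits are read as a signed integer (hence the JL8), while negative Inf/NaN encodings are the values that compare >= 0xff800000 when read unsigned (hence the JB8). A hypothetical C equivalent of the generated code; clampBitsInPlace is an illustrative name only:

    #include <cstdint>

    // Clamp the float bits at *p without any SSE: +Inf/+NaN -> +fMax (0x7f7fffff),
    // -Inf/-NaN -> -fMax (0xff7fffff). Ordinary finite values fall through both tests.
    static inline void clampBitsInPlace(uint32_t* p)
    {
        if ((int32_t)*p >= 0x7f800000)   *p = 0x7f7fffff;  // mirrors CMP + JL8 (signed)
        else if (*p >= 0xff800000u)      *p = 0xff7fffffu; // mirrors CMP + JB8 (unsigned)
    }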
+ +__aligned16 u64 vuFloatData[4]; + +// Makes NaN == 0, Infinities stay the same; Very Slow - Use only for debugging +void vuFloatExtra( int regd, int XYZW) { + int t1reg = (regd == 0) ? (regd + 1) : (regd - 1); + int t2reg = (regd <= 1) ? (regd + 2) : (regd - 2); + SSE_MOVAPS_XMM_to_M128( (uptr)&vuFloatData[0], t1reg ); + SSE_MOVAPS_XMM_to_M128( (uptr)&vuFloatData[2], t2reg ); + + SSE_XORPS_XMM_to_XMM(t1reg, t1reg); + SSE_CMPORDPS_XMM_to_XMM(t1reg, regd); + SSE_MOVAPS_XMM_to_XMM(t2reg, regd); + SSE_ANDPS_XMM_to_XMM(t2reg, t1reg); + VU_MERGE_REGS_CUSTOM(regd, t2reg, XYZW); + + SSE_MOVAPS_M128_to_XMM( t1reg, (uptr)&vuFloatData[0] ); + SSE_MOVAPS_M128_to_XMM( t2reg, (uptr)&vuFloatData[2] ); +} + +static __aligned16 u32 tempRegX[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000}; + +// Called by testWhenOverflow() function +void testPrintOverflow() { + tempRegX[0] &= 0xff800000; + tempRegX[1] &= 0xff800000; + tempRegX[2] &= 0xff800000; + tempRegX[3] &= 0xff800000; + if ( (tempRegX[0] == 0x7f800000) || (tempRegX[1] == 0x7f800000) || (tempRegX[2] == 0x7f800000) || (tempRegX[3] == 0x7f800000) ) + Console.Warning( "VU OVERFLOW!: Changing to +Fmax!!!!!!!!!!!!" ); + if ( (tempRegX[0] == 0xff800000) || (tempRegX[1] == 0xff800000) || (tempRegX[2] == 0xff800000) || (tempRegX[3] == 0xff800000) ) + Console.Warning( "VU OVERFLOW!: Changing to -Fmax!!!!!!!!!!!!" ); +} + +// Outputs to the console when overflow has occurred. +void testWhenOverflow(int info, int regd, int t0reg) { + SSE_MOVAPS_XMM_to_M128((uptr)tempRegX, regd); + CALLFunc((uptr)testPrintOverflow); +} diff --git a/pcsx2/x86/sVU_Micro.h b/pcsx2/x86/sVU_Micro.h index 9d7415f380..4aec2425ed 100644 --- a/pcsx2/x86/sVU_Micro.h +++ b/pcsx2/x86/sVU_Micro.h @@ -1,283 +1,283 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2009 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - -#pragma once - -#include "VUmicro.h" - -extern u32 vudump; - -#define VU0_MEMSIZE 0x1000 -#define VU1_MEMSIZE 0x4000 - -void recResetVU0(); -void recExecuteVU0Block(); -void recClearVU0( u32 Addr, u32 Size ); - -void recVU1Init(); -void recVU1Shutdown(); -void recResetVU1(); -void recExecuteVU1Block(); -void recClearVU1( u32 Addr, u32 Size ); - - -u32 GetVIAddr(VURegs * VU, int reg, int read, int info); // returns the correct VI addr -void recUpdateFlags(VURegs * VU, int reg, int info); - -void _recvuTestPipes(VURegs * VU); -void _recvuFlushFDIV(VURegs * VU); -void _recvuTestUpperStalls(VURegs * VU, _VURegsNum *VUregsn); -void _recvuTestLowerStalls(VURegs * VU, _VURegsNum *VUregsn); -void _recvuAddUpperStalls(VURegs * VU, _VURegsNum *VUregsn); -void _recvuAddLowerStalls(VURegs * VU, _VURegsNum *VUregsn); - -#define VUOP_READ 2 -#define VUOP_WRITE 4 - -// save on mem -struct _vuopinfo { - int cycle; - int cycles; - u8 statusflag; - u8 macflag; - u8 clipflag; - u8 dummy; - u8 q; - u8 p; - u16 pqinst; // bit of instruction specifying index (srec only) -}; - -void SuperVUAnalyzeOp(VURegs *VU, _vuopinfo *info, _VURegsNum* pCodeRegs); -int eeVURecompileCode(VURegs *VU, _VURegsNum* regs); // allocates all the necessary regs and returns the indices -void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr); // used for MTGS in XGKICK - -extern int vucycle; -typedef void (*vFloat)(int regd, int regTemp); -extern vFloat vFloats1[16]; -extern vFloat vFloats1_useEAX[16]; -extern vFloat vFloats2[16]; -extern vFloat vFloats4[16]; -extern vFloat vFloats4_useEAX[16]; -extern const __aligned16 float s_fones[8]; -extern const __aligned16 u32 s_mask[4]; -extern const __aligned16 u32 s_expmask[4]; -extern const __aligned16 u32 g_minvals[4]; -extern const __aligned16 u32 g_maxvals[4]; -extern const __aligned16 u32 const_clip[8]; - -u32 GetVIAddr(VURegs * VU, int reg, int read, int info); -int _vuGetTempXMMreg(int info); -void vuFloat(int info, int regd, int XYZW); -void vuFloat_useEAX(int regd, int regTemp, int XYZW); -void vuFloat2(int regd, int regTemp, int XYZW); -void vuFloat3(uptr x86ptr); -void vuFloat4(int regd, int regTemp, int XYZW); -void vuFloat4_useEAX(int regd, int regTemp, int XYZW); -void vuFloat5(int regd, int regTemp, int XYZW); -void
vuFloat5_useEAX(int regd, int regTemp, int XYZW); -void _vuFlipRegSS(VURegs * VU, int reg); -void _vuFlipRegSS_xyzw(int reg, int xyzw); -void _vuMoveSS(VURegs * VU, int dstreg, int srcreg); -void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw); -void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw); -void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw); -void VU_MERGE_REGS_SAFE(int dest, int src, int xyzw); -#define VU_MERGE_REGS(dest, src) { \ - VU_MERGE_REGS_CUSTOM(dest, src, _X_Y_Z_W); \ -} - -// use for allocating vi regs -#define ALLOCTEMPX86(mode) _allocX86reg(-1, X86TYPE_TEMP, 0, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode) -#define ALLOCVI(vi, mode) _allocX86reg(-1, X86TYPE_VI|((VU==&VU1)?X86TYPE_VU1:0), vi, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode) -#define ADD_VI_NEEDED(vi) _addNeededX86reg(X86TYPE_VI|(VU==&VU1?X86TYPE_VU1:0), vi); - -#define SWAP(x, y) *(u32*)&y ^= *(u32*)&x ^= *(u32*)&y ^= *(u32*)&x; - -/***************************************** - VU Micromode Upper instructions -*****************************************/ - -void recVUMI_ABS(VURegs *vuRegs, int info); -void recVUMI_ADD(VURegs *vuRegs, int info); -void recVUMI_ADDi(VURegs *vuRegs, int info); -void recVUMI_ADDq(VURegs *vuRegs, int info); -void recVUMI_ADDx(VURegs *vuRegs, int info); -void recVUMI_ADDy(VURegs *vuRegs, int info); -void recVUMI_ADDz(VURegs *vuRegs, int info); -void recVUMI_ADDw(VURegs *vuRegs, int info); -void recVUMI_ADDA(VURegs *vuRegs, int info); -void recVUMI_ADDAi(VURegs *vuRegs, int info); -void recVUMI_ADDAq(VURegs *vuRegs, int info); -void recVUMI_ADDAx(VURegs *vuRegs, int info); -void recVUMI_ADDAy(VURegs *vuRegs, int info); -void recVUMI_ADDAz(VURegs *vuRegs, int info); -void recVUMI_ADDAw(VURegs *vuRegs, int info); -void recVUMI_SUB(VURegs *vuRegs, int info); -void recVUMI_SUBi(VURegs *vuRegs, int info); -void recVUMI_SUBq(VURegs *vuRegs, int info); -void recVUMI_SUBx(VURegs *vuRegs, int info); -void recVUMI_SUBy(VURegs *vuRegs, int info); -void recVUMI_SUBz(VURegs *vuRegs, int info); -void recVUMI_SUBw(VURegs *vuRegs, int info); -void recVUMI_SUBA(VURegs *vuRegs, int info); -void recVUMI_SUBAi(VURegs *vuRegs, int info); -void recVUMI_SUBAq(VURegs *vuRegs, int info); -void recVUMI_SUBAx(VURegs *vuRegs, int info); -void recVUMI_SUBAy(VURegs *vuRegs, int info); -void recVUMI_SUBAz(VURegs *vuRegs, int info); -void recVUMI_SUBAw(VURegs *vuRegs, int info); -void recVUMI_MUL(VURegs *vuRegs, int info); -void recVUMI_MULi(VURegs *vuRegs, int info); -void recVUMI_MULq(VURegs *vuRegs, int info); -void recVUMI_MULx(VURegs *vuRegs, int info); -void recVUMI_MULy(VURegs *vuRegs, int info); -void recVUMI_MULz(VURegs *vuRegs, int info); -void recVUMI_MULw(VURegs *vuRegs, int info); -void recVUMI_MULA(VURegs *vuRegs, int info); -void recVUMI_MULAi(VURegs *vuRegs, int info); -void recVUMI_MULAq(VURegs *vuRegs, int info); -void recVUMI_MULAx(VURegs *vuRegs, int info); -void recVUMI_MULAy(VURegs *vuRegs, int info); -void recVUMI_MULAz(VURegs *vuRegs, int info); -void recVUMI_MULAw(VURegs *vuRegs, int info); -void recVUMI_MADD(VURegs *vuRegs, int info); -void recVUMI_MADDi(VURegs *vuRegs, int info); -void recVUMI_MADDq(VURegs *vuRegs, int info); -void recVUMI_MADDx(VURegs *vuRegs, int info); -void recVUMI_MADDy(VURegs *vuRegs, int info); -void recVUMI_MADDz(VURegs *vuRegs, int info); -void recVUMI_MADDw(VURegs *vuRegs, int info); -void recVUMI_MADDA(VURegs *vuRegs, int info); -void recVUMI_MADDAi(VURegs *vuRegs, int info); -void recVUMI_MADDAq(VURegs *vuRegs, int info); -void recVUMI_MADDAx(VURegs 
*vuRegs, int info); -void recVUMI_MADDAy(VURegs *vuRegs, int info); -void recVUMI_MADDAz(VURegs *vuRegs, int info); -void recVUMI_MADDAw(VURegs *vuRegs, int info); -void recVUMI_MSUB(VURegs *vuRegs, int info); -void recVUMI_MSUBi(VURegs *vuRegs, int info); -void recVUMI_MSUBq(VURegs *vuRegs, int info); -void recVUMI_MSUBx(VURegs *vuRegs, int info); -void recVUMI_MSUBy(VURegs *vuRegs, int info); -void recVUMI_MSUBz(VURegs *vuRegs, int info); -void recVUMI_MSUBw(VURegs *vuRegs, int info); -void recVUMI_MSUBA(VURegs *vuRegs, int info); -void recVUMI_MSUBAi(VURegs *vuRegs, int info); -void recVUMI_MSUBAq(VURegs *vuRegs, int info); -void recVUMI_MSUBAx(VURegs *vuRegs, int info); -void recVUMI_MSUBAy(VURegs *vuRegs, int info); -void recVUMI_MSUBAz(VURegs *vuRegs, int info); -void recVUMI_MSUBAw(VURegs *vuRegs, int info); -void recVUMI_MAX(VURegs *vuRegs, int info); -void recVUMI_MAXi(VURegs *vuRegs, int info); -void recVUMI_MAXx(VURegs *vuRegs, int info); -void recVUMI_MAXy(VURegs *vuRegs, int info); -void recVUMI_MAXz(VURegs *vuRegs, int info); -void recVUMI_MAXw(VURegs *vuRegs, int info); -void recVUMI_MINI(VURegs *vuRegs, int info); -void recVUMI_MINIi(VURegs *vuRegs, int info); -void recVUMI_MINIx(VURegs *vuRegs, int info); -void recVUMI_MINIy(VURegs *vuRegs, int info); -void recVUMI_MINIz(VURegs *vuRegs, int info); -void recVUMI_MINIw(VURegs *vuRegs, int info); -void recVUMI_OPMULA(VURegs *vuRegs, int info); -void recVUMI_OPMSUB(VURegs *vuRegs, int info); -void recVUMI_NOP(VURegs *vuRegs, int info); -void recVUMI_FTOI0(VURegs *vuRegs, int info); -void recVUMI_FTOI4(VURegs *vuRegs, int info); -void recVUMI_FTOI12(VURegs *vuRegs, int info); -void recVUMI_FTOI15(VURegs *vuRegs, int info); -void recVUMI_ITOF0(VURegs *vuRegs, int info); -void recVUMI_ITOF4(VURegs *vuRegs, int info); -void recVUMI_ITOF12(VURegs *vuRegs, int info); -void recVUMI_ITOF15(VURegs *vuRegs, int info); -void recVUMI_CLIP(VURegs *vuRegs, int info); - -/***************************************** - VU Micromode Lower instructions -*****************************************/ - -void recVUMI_DIV(VURegs *vuRegs, int info); -void recVUMI_SQRT(VURegs *vuRegs, int info); -void recVUMI_RSQRT(VURegs *vuRegs, int info); -void recVUMI_IADD(VURegs *vuRegs, int info); -void recVUMI_IADDI(VURegs *vuRegs, int info); -void recVUMI_IADDIU(VURegs *vuRegs, int info); -void recVUMI_IAND(VURegs *vuRegs, int info); -void recVUMI_IOR(VURegs *vuRegs, int info); -void recVUMI_ISUB(VURegs *vuRegs, int info); -void recVUMI_ISUBIU(VURegs *vuRegs, int info); -void recVUMI_MOVE(VURegs *vuRegs, int info); -void recVUMI_MFIR(VURegs *vuRegs, int info); -void recVUMI_MTIR(VURegs *vuRegs, int info); -void recVUMI_MR32(VURegs *vuRegs, int info); -void recVUMI_LQ(VURegs *vuRegs, int info); -void recVUMI_LQD(VURegs *vuRegs, int info); -void recVUMI_LQI(VURegs *vuRegs, int info); -void recVUMI_SQ(VURegs *vuRegs, int info); -void recVUMI_SQD(VURegs *vuRegs, int info); -void recVUMI_SQI(VURegs *vuRegs, int info); -void recVUMI_ILW(VURegs *vuRegs, int info); -void recVUMI_ISW(VURegs *vuRegs, int info); -void recVUMI_ILWR(VURegs *vuRegs, int info); -void recVUMI_ISWR(VURegs *vuRegs, int info); -void recVUMI_LOI(VURegs *vuRegs, int info); -void recVUMI_RINIT(VURegs *vuRegs, int info); -void recVUMI_RGET(VURegs *vuRegs, int info); -void recVUMI_RNEXT(VURegs *vuRegs, int info); -void recVUMI_RXOR(VURegs *vuRegs, int info); -void recVUMI_WAITQ(VURegs *vuRegs, int info); -void recVUMI_FSAND(VURegs *vuRegs, int info); -void recVUMI_FSEQ(VURegs *vuRegs, int info); -void 
recVUMI_FSOR(VURegs *vuRegs, int info); -void recVUMI_FSSET(VURegs *vuRegs, int info); -void recVUMI_FMAND(VURegs *vuRegs, int info); -void recVUMI_FMEQ(VURegs *vuRegs, int info); -void recVUMI_FMOR(VURegs *vuRegs, int info); -void recVUMI_FCAND(VURegs *vuRegs, int info); -void recVUMI_FCEQ(VURegs *vuRegs, int info); -void recVUMI_FCOR(VURegs *vuRegs, int info); -void recVUMI_FCSET(VURegs *vuRegs, int info); -void recVUMI_FCGET(VURegs *vuRegs, int info); -void recVUMI_IBEQ(VURegs *vuRegs, int info); -void recVUMI_IBGEZ(VURegs *vuRegs, int info); -void recVUMI_IBGTZ(VURegs *vuRegs, int info); -void recVUMI_IBLTZ(VURegs *vuRegs, int info); -void recVUMI_IBLEZ(VURegs *vuRegs, int info); -void recVUMI_IBNE(VURegs *vuRegs, int info); -void recVUMI_B(VURegs *vuRegs, int info); -void recVUMI_BAL(VURegs *vuRegs, int info); -void recVUMI_JR(VURegs *vuRegs, int info); -void recVUMI_JALR(VURegs *vuRegs, int info); -void recVUMI_MFP(VURegs *vuRegs, int info); -void recVUMI_WAITP(VURegs *vuRegs, int info); -void recVUMI_ESADD(VURegs *vuRegs, int info); -void recVUMI_ERSADD(VURegs *vuRegs, int info); -void recVUMI_ELENG(VURegs *vuRegs, int info); -void recVUMI_ERLENG(VURegs *vuRegs, int info); -void recVUMI_EATANxy(VURegs *vuRegs, int info); -void recVUMI_EATANxz(VURegs *vuRegs, int info); -void recVUMI_ESUM(VURegs *vuRegs, int info); -void recVUMI_ERCPR(VURegs *vuRegs, int info); -void recVUMI_ESQRT(VURegs *vuRegs, int info); -void recVUMI_ERSQRT(VURegs *vuRegs, int info); -void recVUMI_ESIN(VURegs *vuRegs, int info); -void recVUMI_EATAN(VURegs *vuRegs, int info); -void recVUMI_EEXP(VURegs *vuRegs, int info); -void recVUMI_XGKICK(VURegs *vuRegs, int info); -void recVUMI_XTOP(VURegs *vuRegs, int info); -void recVUMI_XITOP(VURegs *vuRegs, int info); -void recVUMI_XTOP( VURegs *VU , int info); - +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . 
+ */ + +#pragma once + +#include "VUmicro.h" + +extern u32 vudump; + +#define VU0_MEMSIZE 0x1000 +#define VU1_MEMSIZE 0x4000 + +void recResetVU0(); +void recExecuteVU0Block(); +void recClearVU0( u32 Addr, u32 Size ); + +void recVU1Init(); +void recVU1Shutdown(); +void recResetVU1(); +void recExecuteVU1Block(); +void recClearVU1( u32 Addr, u32 Size ); + + +u32 GetVIAddr(VURegs * VU, int reg, int read, int info); // returns the correct VI addr +void recUpdateFlags(VURegs * VU, int reg, int info); + +void _recvuTestPipes(VURegs * VU); +void _recvuFlushFDIV(VURegs * VU); +void _recvuTestUpperStalls(VURegs * VU, _VURegsNum *VUregsn); +void _recvuTestLowerStalls(VURegs * VU, _VURegsNum *VUregsn); +void _recvuAddUpperStalls(VURegs * VU, _VURegsNum *VUregsn); +void _recvuAddLowerStalls(VURegs * VU, _VURegsNum *VUregsn); + +#define VUOP_READ 2 +#define VUOP_WRITE 4 + +// save on mem +struct _vuopinfo { + int cycle; + int cycles; + u8 statusflag; + u8 macflag; + u8 clipflag; + u8 dummy; + u8 q; + u8 p; + u16 pqinst; // bit of instruction specifying index (srec only) +}; + +void SuperVUAnalyzeOp(VURegs *VU, _vuopinfo *info, _VURegsNum* pCodeRegs); +int eeVURecompileCode(VURegs *VU, _VURegsNum* regs); // allocates all the necessary regs and returns the indices +void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr); // used for MTGS in XGKICK + +extern int vucycle; +typedef void (*vFloat)(int regd, int regTemp); +extern vFloat vFloats1[16]; +extern vFloat vFloats1_useEAX[16]; +extern vFloat vFloats2[16]; +extern vFloat vFloats4[16]; +extern vFloat vFloats4_useEAX[16]; +extern const __aligned16 float s_fones[8]; +extern const __aligned16 u32 s_mask[4]; +extern const __aligned16 u32 s_expmask[4]; +extern const __aligned16 u32 g_minvals[4]; +extern const __aligned16 u32 g_maxvals[4]; +extern const __aligned16 u32 const_clip[8]; + +u32 GetVIAddr(VURegs * VU, int reg, int read, int info); +int _vuGetTempXMMreg(int info); +void vuFloat(int info, int regd, int XYZW); +void vuFloat_useEAX(int regd, int regTemp, int XYZW); +void vuFloat2(int regd, int regTemp, int XYZW); +void vuFloat3(uptr x86ptr); +void vuFloat4(int regd, int regTemp, int XYZW); +void vuFloat4_useEAX(int regd, int regTemp, int XYZW); +void vuFloat5(int regd, int regTemp, int XYZW); +void vuFloat5_useEAX(int regd, int regTemp, int XYZW); +void _vuFlipRegSS(VURegs * VU, int reg); +void _vuFlipRegSS_xyzw(int reg, int xyzw); +void _vuMoveSS(VURegs * VU, int dstreg, int srcreg); +void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw); +void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw); +void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw); +void VU_MERGE_REGS_SAFE(int dest, int src, int xyzw); +#define VU_MERGE_REGS(dest, src) { \ + VU_MERGE_REGS_CUSTOM(dest, src, _X_Y_Z_W); \ +} + +// use for allocating vi regs +#define ALLOCTEMPX86(mode) _allocX86reg(-1, X86TYPE_TEMP, 0, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode) +#define ALLOCVI(vi, mode) _allocX86reg(-1, X86TYPE_VI|((VU==&VU1)?X86TYPE_VU1:0), vi, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode) +#define ADD_VI_NEEDED(vi) _addNeededX86reg(X86TYPE_VI|(VU==&VU1?X86TYPE_VU1:0), vi); + +#define SWAP(x, y) *(u32*)&y ^= *(u32*)&x ^= *(u32*)&y ^= *(u32*)&x; + +/***************************************** + VU Micromode Upper instructions +*****************************************/ + +void recVUMI_ABS(VURegs *vuRegs, int info); +void recVUMI_ADD(VURegs *vuRegs, int info); +void recVUMI_ADDi(VURegs *vuRegs, int info); +void recVUMI_ADDq(VURegs *vuRegs, int info); +void recVUMI_ADDx(VURegs 
*vuRegs, int info); +void recVUMI_ADDy(VURegs *vuRegs, int info); +void recVUMI_ADDz(VURegs *vuRegs, int info); +void recVUMI_ADDw(VURegs *vuRegs, int info); +void recVUMI_ADDA(VURegs *vuRegs, int info); +void recVUMI_ADDAi(VURegs *vuRegs, int info); +void recVUMI_ADDAq(VURegs *vuRegs, int info); +void recVUMI_ADDAx(VURegs *vuRegs, int info); +void recVUMI_ADDAy(VURegs *vuRegs, int info); +void recVUMI_ADDAz(VURegs *vuRegs, int info); +void recVUMI_ADDAw(VURegs *vuRegs, int info); +void recVUMI_SUB(VURegs *vuRegs, int info); +void recVUMI_SUBi(VURegs *vuRegs, int info); +void recVUMI_SUBq(VURegs *vuRegs, int info); +void recVUMI_SUBx(VURegs *vuRegs, int info); +void recVUMI_SUBy(VURegs *vuRegs, int info); +void recVUMI_SUBz(VURegs *vuRegs, int info); +void recVUMI_SUBw(VURegs *vuRegs, int info); +void recVUMI_SUBA(VURegs *vuRegs, int info); +void recVUMI_SUBAi(VURegs *vuRegs, int info); +void recVUMI_SUBAq(VURegs *vuRegs, int info); +void recVUMI_SUBAx(VURegs *vuRegs, int info); +void recVUMI_SUBAy(VURegs *vuRegs, int info); +void recVUMI_SUBAz(VURegs *vuRegs, int info); +void recVUMI_SUBAw(VURegs *vuRegs, int info); +void recVUMI_MUL(VURegs *vuRegs, int info); +void recVUMI_MULi(VURegs *vuRegs, int info); +void recVUMI_MULq(VURegs *vuRegs, int info); +void recVUMI_MULx(VURegs *vuRegs, int info); +void recVUMI_MULy(VURegs *vuRegs, int info); +void recVUMI_MULz(VURegs *vuRegs, int info); +void recVUMI_MULw(VURegs *vuRegs, int info); +void recVUMI_MULA(VURegs *vuRegs, int info); +void recVUMI_MULAi(VURegs *vuRegs, int info); +void recVUMI_MULAq(VURegs *vuRegs, int info); +void recVUMI_MULAx(VURegs *vuRegs, int info); +void recVUMI_MULAy(VURegs *vuRegs, int info); +void recVUMI_MULAz(VURegs *vuRegs, int info); +void recVUMI_MULAw(VURegs *vuRegs, int info); +void recVUMI_MADD(VURegs *vuRegs, int info); +void recVUMI_MADDi(VURegs *vuRegs, int info); +void recVUMI_MADDq(VURegs *vuRegs, int info); +void recVUMI_MADDx(VURegs *vuRegs, int info); +void recVUMI_MADDy(VURegs *vuRegs, int info); +void recVUMI_MADDz(VURegs *vuRegs, int info); +void recVUMI_MADDw(VURegs *vuRegs, int info); +void recVUMI_MADDA(VURegs *vuRegs, int info); +void recVUMI_MADDAi(VURegs *vuRegs, int info); +void recVUMI_MADDAq(VURegs *vuRegs, int info); +void recVUMI_MADDAx(VURegs *vuRegs, int info); +void recVUMI_MADDAy(VURegs *vuRegs, int info); +void recVUMI_MADDAz(VURegs *vuRegs, int info); +void recVUMI_MADDAw(VURegs *vuRegs, int info); +void recVUMI_MSUB(VURegs *vuRegs, int info); +void recVUMI_MSUBi(VURegs *vuRegs, int info); +void recVUMI_MSUBq(VURegs *vuRegs, int info); +void recVUMI_MSUBx(VURegs *vuRegs, int info); +void recVUMI_MSUBy(VURegs *vuRegs, int info); +void recVUMI_MSUBz(VURegs *vuRegs, int info); +void recVUMI_MSUBw(VURegs *vuRegs, int info); +void recVUMI_MSUBA(VURegs *vuRegs, int info); +void recVUMI_MSUBAi(VURegs *vuRegs, int info); +void recVUMI_MSUBAq(VURegs *vuRegs, int info); +void recVUMI_MSUBAx(VURegs *vuRegs, int info); +void recVUMI_MSUBAy(VURegs *vuRegs, int info); +void recVUMI_MSUBAz(VURegs *vuRegs, int info); +void recVUMI_MSUBAw(VURegs *vuRegs, int info); +void recVUMI_MAX(VURegs *vuRegs, int info); +void recVUMI_MAXi(VURegs *vuRegs, int info); +void recVUMI_MAXx(VURegs *vuRegs, int info); +void recVUMI_MAXy(VURegs *vuRegs, int info); +void recVUMI_MAXz(VURegs *vuRegs, int info); +void recVUMI_MAXw(VURegs *vuRegs, int info); +void recVUMI_MINI(VURegs *vuRegs, int info); +void recVUMI_MINIi(VURegs *vuRegs, int info); +void recVUMI_MINIx(VURegs *vuRegs, int info); +void recVUMI_MINIy(VURegs 
*vuRegs, int info); +void recVUMI_MINIz(VURegs *vuRegs, int info); +void recVUMI_MINIw(VURegs *vuRegs, int info); +void recVUMI_OPMULA(VURegs *vuRegs, int info); +void recVUMI_OPMSUB(VURegs *vuRegs, int info); +void recVUMI_NOP(VURegs *vuRegs, int info); +void recVUMI_FTOI0(VURegs *vuRegs, int info); +void recVUMI_FTOI4(VURegs *vuRegs, int info); +void recVUMI_FTOI12(VURegs *vuRegs, int info); +void recVUMI_FTOI15(VURegs *vuRegs, int info); +void recVUMI_ITOF0(VURegs *vuRegs, int info); +void recVUMI_ITOF4(VURegs *vuRegs, int info); +void recVUMI_ITOF12(VURegs *vuRegs, int info); +void recVUMI_ITOF15(VURegs *vuRegs, int info); +void recVUMI_CLIP(VURegs *vuRegs, int info); + +/***************************************** + VU Micromode Lower instructions +*****************************************/ + +void recVUMI_DIV(VURegs *vuRegs, int info); +void recVUMI_SQRT(VURegs *vuRegs, int info); +void recVUMI_RSQRT(VURegs *vuRegs, int info); +void recVUMI_IADD(VURegs *vuRegs, int info); +void recVUMI_IADDI(VURegs *vuRegs, int info); +void recVUMI_IADDIU(VURegs *vuRegs, int info); +void recVUMI_IAND(VURegs *vuRegs, int info); +void recVUMI_IOR(VURegs *vuRegs, int info); +void recVUMI_ISUB(VURegs *vuRegs, int info); +void recVUMI_ISUBIU(VURegs *vuRegs, int info); +void recVUMI_MOVE(VURegs *vuRegs, int info); +void recVUMI_MFIR(VURegs *vuRegs, int info); +void recVUMI_MTIR(VURegs *vuRegs, int info); +void recVUMI_MR32(VURegs *vuRegs, int info); +void recVUMI_LQ(VURegs *vuRegs, int info); +void recVUMI_LQD(VURegs *vuRegs, int info); +void recVUMI_LQI(VURegs *vuRegs, int info); +void recVUMI_SQ(VURegs *vuRegs, int info); +void recVUMI_SQD(VURegs *vuRegs, int info); +void recVUMI_SQI(VURegs *vuRegs, int info); +void recVUMI_ILW(VURegs *vuRegs, int info); +void recVUMI_ISW(VURegs *vuRegs, int info); +void recVUMI_ILWR(VURegs *vuRegs, int info); +void recVUMI_ISWR(VURegs *vuRegs, int info); +void recVUMI_LOI(VURegs *vuRegs, int info); +void recVUMI_RINIT(VURegs *vuRegs, int info); +void recVUMI_RGET(VURegs *vuRegs, int info); +void recVUMI_RNEXT(VURegs *vuRegs, int info); +void recVUMI_RXOR(VURegs *vuRegs, int info); +void recVUMI_WAITQ(VURegs *vuRegs, int info); +void recVUMI_FSAND(VURegs *vuRegs, int info); +void recVUMI_FSEQ(VURegs *vuRegs, int info); +void recVUMI_FSOR(VURegs *vuRegs, int info); +void recVUMI_FSSET(VURegs *vuRegs, int info); +void recVUMI_FMAND(VURegs *vuRegs, int info); +void recVUMI_FMEQ(VURegs *vuRegs, int info); +void recVUMI_FMOR(VURegs *vuRegs, int info); +void recVUMI_FCAND(VURegs *vuRegs, int info); +void recVUMI_FCEQ(VURegs *vuRegs, int info); +void recVUMI_FCOR(VURegs *vuRegs, int info); +void recVUMI_FCSET(VURegs *vuRegs, int info); +void recVUMI_FCGET(VURegs *vuRegs, int info); +void recVUMI_IBEQ(VURegs *vuRegs, int info); +void recVUMI_IBGEZ(VURegs *vuRegs, int info); +void recVUMI_IBGTZ(VURegs *vuRegs, int info); +void recVUMI_IBLTZ(VURegs *vuRegs, int info); +void recVUMI_IBLEZ(VURegs *vuRegs, int info); +void recVUMI_IBNE(VURegs *vuRegs, int info); +void recVUMI_B(VURegs *vuRegs, int info); +void recVUMI_BAL(VURegs *vuRegs, int info); +void recVUMI_JR(VURegs *vuRegs, int info); +void recVUMI_JALR(VURegs *vuRegs, int info); +void recVUMI_MFP(VURegs *vuRegs, int info); +void recVUMI_WAITP(VURegs *vuRegs, int info); +void recVUMI_ESADD(VURegs *vuRegs, int info); +void recVUMI_ERSADD(VURegs *vuRegs, int info); +void recVUMI_ELENG(VURegs *vuRegs, int info); +void recVUMI_ERLENG(VURegs *vuRegs, int info); +void recVUMI_EATANxy(VURegs *vuRegs, int info); +void 
recVUMI_EATANxz(VURegs *vuRegs, int info); +void recVUMI_ESUM(VURegs *vuRegs, int info); +void recVUMI_ERCPR(VURegs *vuRegs, int info); +void recVUMI_ESQRT(VURegs *vuRegs, int info); +void recVUMI_ERSQRT(VURegs *vuRegs, int info); +void recVUMI_ESIN(VURegs *vuRegs, int info); +void recVUMI_EATAN(VURegs *vuRegs, int info); +void recVUMI_EEXP(VURegs *vuRegs, int info); +void recVUMI_XGKICK(VURegs *vuRegs, int info); +void recVUMI_XTOP(VURegs *vuRegs, int info); +void recVUMI_XITOP(VURegs *vuRegs, int info); +void recVUMI_XTOP( VURegs *VU , int info); + diff --git a/pcsx2/x86/sVU_Upper.cpp b/pcsx2/x86/sVU_Upper.cpp index 525d770596..09aba70625 100644 --- a/pcsx2/x86/sVU_Upper.cpp +++ b/pcsx2/x86/sVU_Upper.cpp @@ -1,3069 +1,3069 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2009 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - -#include "PrecompiledHeader.h" - -#include "Common.h" -#include "GS.h" -#include "R5900OpcodeTables.h" -#include "iR5900.h" -#include "iMMI.h" -#include "iFPU.h" -#include "iCOP0.h" -#include "VUmicro.h" -#include "VUflags.h" -#include "sVU_Micro.h" -#include "sVU_Debug.h" -#include "sVU_zerorec.h" -//------------------------------------------------------------------ -#define MINMAXFIX 1 -//------------------------------------------------------------------ -// Helper Macros -//------------------------------------------------------------------ -#define _Ft_ (( VU->code >> 16) & 0x1F) // The rt part of the instruction register -#define _Fs_ (( VU->code >> 11) & 0x1F) // The rd part of the instruction register -#define _Fd_ (( VU->code >> 6) & 0x1F) // The sa part of the instruction register - -#define _X (( VU->code>>24) & 0x1) -#define _Y (( VU->code>>23) & 0x1) -#define _Z (( VU->code>>22) & 0x1) -#define _W (( VU->code>>21) & 0x1) - -#define _XYZW_SS (_X+_Y+_Z+_W==1) - -#define _Fsf_ (( VU->code >> 21) & 0x03) -#define _Ftf_ (( VU->code >> 23) & 0x03) - -#define _Imm11_ (s32)(VU->code & 0x400 ? 
0xfffffc00 | (VU->code & 0x3ff) : VU->code & 0x3ff) -#define _UImm11_ (s32)(VU->code & 0x7ff) - -#define VU_VFx_ADDR(x) (uptr)&VU->VF[x].UL[0] -#define VU_VFy_ADDR(x) (uptr)&VU->VF[x].UL[1] -#define VU_VFz_ADDR(x) (uptr)&VU->VF[x].UL[2] -#define VU_VFw_ADDR(x) (uptr)&VU->VF[x].UL[3] - -#define VU_REGR_ADDR (uptr)&VU->VI[REG_R] -#define VU_REGQ_ADDR (uptr)&VU->VI[REG_Q] -#define VU_REGMAC_ADDR (uptr)&VU->VI[REG_MAC_FLAG] - -#define VU_VI_ADDR(x, read) GetVIAddr(VU, x, read, info) - -#define VU_ACCx_ADDR (uptr)&VU->ACC.UL[0] -#define VU_ACCy_ADDR (uptr)&VU->ACC.UL[1] -#define VU_ACCz_ADDR (uptr)&VU->ACC.UL[2] -#define VU_ACCw_ADDR (uptr)&VU->ACC.UL[3] - -#define _X_Y_Z_W ((( VU->code >> 21 ) & 0xF ) ) -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// Global Variables -//------------------------------------------------------------------ -static const __aligned16 int SSEmovMask[ 16 ][ 4 ] = -{ - { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, - { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, - { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, - { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, - { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, - { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, - { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, - { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, - { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, - { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, - { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, - { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, - { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, - { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, - { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } -}; - -static const __aligned16 u32 const_abs_table[16][4] = -{ - { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //0000 - { 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff }, //0001 - { 0xffffffff, 0xffffffff, 0x7fffffff, 0xffffffff }, //0010 - { 0xffffffff, 0xffffffff, 0x7fffffff, 0x7fffffff }, //0011 - { 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff }, //0100 - { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }, //0101 - { 0xffffffff, 0x7fffffff, 0x7fffffff, 0xffffffff }, //0110 - { 0xffffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //0111 - { 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000 - { 0x7fffffff, 0xffffffff, 0xffffffff, 0x7fffffff }, //1001 - { 0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff }, //1010 - { 0x7fffffff, 0xffffffff, 0x7fffffff, 0x7fffffff }, //1011 - { 0x7fffffff, 0x7fffffff, 0xffffffff, 0xffffffff }, //1100 - { 0x7fffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }, //1101 - { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0xffffffff }, //1110 - { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1111 -}; - -static const __aligned16 float recMult_float_to_int4[4] = { 16.0, 16.0, 16.0, 16.0 }; -static const __aligned16 float recMult_float_to_int12[4] = { 4096.0, 4096.0, 4096.0, 4096.0 }; -static const __aligned16 float recMult_float_to_int15[4] = { 32768.0, 32768.0, 32768.0, 32768.0 }; - -static const __aligned16 float recMult_int_to_float4[4] = { 0.0625f, 0.0625f, 0.0625f, 0.0625f }; -static const __aligned16 float recMult_int_to_float12[4] = { 0.000244140625, 0.000244140625, 0.000244140625, 0.000244140625 }; -static const __aligned16 float recMult_int_to_float15[4] = { 0.000030517578125, 0.000030517578125, 0.000030517578125, 
0.000030517578125 }; - -static const __aligned16 u32 VU_Underflow_Mask1[4] = {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; -static const __aligned16 u32 VU_Underflow_Mask2[4] = {0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff}; -static const __aligned16 u32 VU_Zero_Mask[4] = {0x00000000, 0x00000000, 0x00000000, 0x00000000}; -static const __aligned16 u32 VU_Zero_Helper_Mask[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; -static const __aligned16 u32 VU_Signed_Zero_Mask[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; -static const __aligned16 u32 VU_Pos_Infinity[4] = {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; -static const __aligned16 u32 VU_Neg_Infinity[4] = {0xff800000, 0xff800000, 0xff800000, 0xff800000}; -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// recUpdateFlags() - Computes the flags for the Upper Opcodes -// -// Note: Computes under/overflow flags if CHECK_VU_EXTRA_FLAGS is 1 -//------------------------------------------------------------------ -static __aligned16 u64 TEMPXMMData[2]; -void recUpdateFlags(VURegs * VU, int reg, int info) -{ - static u8 *pjmp, *pjmp2; - static u32 *pjmp32; - static u32 macaddr, stataddr, prevstataddr; - static int x86macflag, x86statflag, x86temp; - static int t1reg, t1regBoolean; - static const int flipMask[16] = {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15}; - - if( !(info & PROCESS_VU_UPDATEFLAGS) ) { - if (CHECK_VU_EXTRA_OVERFLOW) { - if (reg != EEREC_TEMP) vuFloat2(reg, EEREC_TEMP, _X_Y_Z_W); - else vuFloat_useEAX(info, reg, _X_Y_Z_W); - } - return; - } - - //Console.WriteLn ("recUpdateFlags"); - - macaddr = VU_VI_ADDR(REG_MAC_FLAG, 0); - stataddr = VU_VI_ADDR(REG_STATUS_FLAG, 0); // write address - prevstataddr = VU_VI_ADDR(REG_STATUS_FLAG, 2); // previous address - - if( stataddr == 0 ) stataddr = prevstataddr; - if( macaddr == 0 ) { - Console.WriteLn( "VU ALLOCATION WARNING: Using Mac Flag Previous Address!" ); - macaddr = VU_VI_ADDR(REG_MAC_FLAG, 2); - } - - x86macflag = ALLOCTEMPX86(0); - x86statflag = ALLOCTEMPX86(0); - - if (reg == EEREC_TEMP) { - t1reg = _vuGetTempXMMreg(info); - if (t1reg < 0) { - //Console.WriteLn( "VU ALLOCATION ERROR: Temp reg can't be allocated!!!!" ); - t1reg = (reg == 0) ? 
1 : 0; // Make t1reg != reg - SSE_MOVAPS_XMM_to_M128( (uptr)TEMPXMMData, t1reg ); // Backup data to temp address - t1regBoolean = 1; - } - else t1regBoolean = 0; - } - else { - t1reg = EEREC_TEMP; - t1regBoolean = 2; - } - - SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip wzyx to xyzw - MOV32MtoR(x86statflag, prevstataddr); // Load the previous status in to x86statflag - AND16ItoR(x86statflag, 0xff0); // Keep Sticky and D/I flags - - - if (CHECK_VU_EXTRA_FLAGS) { // Checks all flags - - x86temp = ALLOCTEMPX86(0); - - //-------------------------Check for Overflow flags------------------------------ - - //SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg - //SSE_CMPUNORDPS_XMM_to_XMM(t1reg, reg); // If reg == NaN then set Vector to 0xFFFFFFFF - - //SSE_MOVAPS_XMM_to_XMM(t1reg, reg); - //SSE_MINPS_M128_to_XMM(t1reg, (uptr)g_maxvals); - //SSE_MAXPS_M128_to_XMM(t1reg, (uptr)g_minvals); - //SSE_CMPNEPS_XMM_to_XMM(t1reg, reg); // If they're not equal, then overflow has occured - - SSE_MOVAPS_XMM_to_XMM(t1reg, reg); - SSE_ANDPS_M128_to_XMM(t1reg, (uptr)VU_Zero_Helper_Mask); - SSE_CMPEQPS_M128_to_XMM(t1reg, (uptr)VU_Pos_Infinity); // If infinity, then overflow has occured (NaN's don't report as overflow) - - SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the previous calculation - - AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being modified) - pjmp = JZ8(0); // Skip if none are - OR16ItoR(x86statflag, 0x208); // OS, O flags - SHL16ItoR(x86macflag, 12); - if (_XYZW_SS) pjmp32 = JMP32(0); // Skip Underflow Check - x86SetJ8(pjmp); - - //-------------------------Check for Underflow flags------------------------------ - - SSE_MOVAPS_XMM_to_XMM(t1reg, reg); // t1reg <- reg - - SSE_ANDPS_M128_to_XMM(t1reg, (uptr)&VU_Underflow_Mask1[ 0 ]); - SSE_CMPEQPS_M128_to_XMM(t1reg, (uptr)&VU_Zero_Mask[ 0 ]); // If (t1reg == zero exponent) then set Vector to 0xFFFFFFFF - - SSE_ANDPS_XMM_to_XMM(t1reg, reg); - SSE_ANDPS_M128_to_XMM(t1reg, (uptr)&VU_Underflow_Mask2[ 0 ]); - SSE_CMPNEPS_M128_to_XMM(t1reg, (uptr)&VU_Zero_Mask[ 0 ]); // If (t1reg != zero mantisa) then set Vector to 0xFFFFFFFF - - SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation - - AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Has Underflowed" bits from the previous calculation - pjmp = JZ8(0); // Skip if none are - OR16ItoR(x86statflag, 0x104); // US, U flags - SHL16ItoR(EAX, 8); - OR32RtoR(x86macflag, EAX); - x86SetJ8(pjmp); - - //-------------------------Optional Code: Denormals Are Zero------------------------------ - if (CHECK_VU_UNDERFLOW) { // Sets underflow/denormals to zero - SSE_ANDNPS_XMM_to_XMM(t1reg, reg); // t1reg = !t1reg & reg (t1reg = denormals are positive zero) - VU_MERGE_REGS_SAFE(t1reg, reg, (15 - flipMask[_X_Y_Z_W])); // Send t1reg the vectors that shouldn't be modified (since reg was flipped, we need a mask to get the unmodified vectors) - // Now we have Denormals are Positive Zero in t1reg; the next two lines take Signed Zero into account - SSE_ANDPS_M128_to_XMM(reg, (uptr)&VU_Signed_Zero_Mask[ 0 ]); // Only keep the sign bit for each vector - SSE_ORPS_XMM_to_XMM(reg, t1reg); // Denormals are Signed Zero, and unmodified vectors stay the same! 
- } - - if (_XYZW_SS) x86SetJ32(pjmp32); // If we skipped the Underflow Flag Checking (when we had an Overflow), return here - - vuFloat2(reg, t1reg, flipMask[_X_Y_Z_W]); // Clamp overflowed vectors that were modified (remember reg's vectors have been flipped, so have to use a flipmask) - - //-------------------------Check for Signed flags------------------------------ - - // The following code makes sure the Signed Bit isn't set with Negative Zero - SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg - SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero - SSE_MOVMSKPS_XMM_to_R32(x86temp, t1reg); // Used for Zero Flag Calculation - SSE_ANDNPS_XMM_to_XMM(t1reg, reg); - - SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg - - AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation - pjmp = JZ8(0); // Skip if none are - OR16ItoR(x86statflag, 0x82); // SS, S flags - SHL16ItoR(EAX, 4); - OR32RtoR(x86macflag, EAX); - if (_XYZW_SS) pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking - x86SetJ8(pjmp); - - //-------------------------Check for Zero flags------------------------------ - - AND16ItoR(x86temp, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation - pjmp = JZ8(0); // Skip if none are - OR16ItoR(x86statflag, 0x41); // ZS, Z flags - OR32RtoR(x86macflag, x86temp); - x86SetJ8(pjmp); - - _freeX86reg(x86temp); - } - else { // Only Checks for Sign and Zero Flags - - vuFloat2(reg, t1reg, flipMask[_X_Y_Z_W]); // Clamp overflowed vectors that were modified (remember reg's vectors have been flipped, so have to use a flipmask) - - //-------------------------Check for Signed flags------------------------------ - - // The following code makes sure the Signed Bit isn't set with Negative Zero - SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg - SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero - SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Used for Zero Flag Calculation - SSE_ANDNPS_XMM_to_XMM(t1reg, reg); - - SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the t1reg - - AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation - pjmp = JZ8(0); // Skip if none are - OR16ItoR(x86statflag, 0x82); // SS, S flags - SHL16ItoR(x86macflag, 4); - if (_XYZW_SS) pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking - x86SetJ8(pjmp); - - //-------------------------Check for Zero flags------------------------------ - - AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation - pjmp = JZ8(0); // Skip if none are - OR16ItoR(x86statflag, 0x41); // ZS, Z flags - OR32RtoR(x86macflag, EAX); - x86SetJ8(pjmp); - } - //-------------------------Finally: Send the Flags to the Mac Flag Address------------------------------ - - if (_XYZW_SS) x86SetJ8(pjmp2); // If we skipped the Zero Flag Checking, return here - - if (t1regBoolean == 2) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip back reg to wzyx (have to do this because reg != EEREC_TEMP) - else if (t1regBoolean == 1) SSE_MOVAPS_M128_to_XMM( t1reg, (uptr)TEMPXMMData ); // Restore data from temo address - else _freeXMMreg(t1reg); // Free temp reg - - MOV16RtoM(macaddr, x86macflag); - MOV16RtoM(stataddr, x86statflag); - - _freeX86reg(x86macflag); - _freeX86reg(x86statflag); -} -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// Custom VU ADD/SUB routines 
by Nneeve -// -// Note: See FPU_ADD_SUB() for more info on what this is doing. -//------------------------------------------------------------------ -static __aligned16 u32 VU_addsuband[2][4]; -static __aligned16 u32 VU_addsub_reg[2][4]; - -static u32 tempECX; - -void VU_ADD_SUB(u32 regd, u32 regt, int is_sub, int info) -{ - u8 *localptr[4][8]; - - MOV32RtoM((uptr)&tempECX, ECX); - - int temp1 = ECX; //receives regd - int temp2 = ALLOCTEMPX86(0); - - if (temp2 == ECX) - { - temp2 = ALLOCTEMPX86(0); - _freeX86reg(ECX); - } - - SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd); - SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt); - - SSE2_PCMPEQB_XMM_to_XMM(regd, regd); - SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[0][0], regd); - SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[1][0], regd); - SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); - - SSE2_PSLLD_I8_to_XMM(regd, 1); - SSE2_PSLLD_I8_to_XMM(regt, 1); - - SSE2_PSRLD_I8_to_XMM(regd, 24); - SSE2_PSRLD_I8_to_XMM(regt, 24); - - SSE2_PSUBD_XMM_to_XMM(regd, regt); - -#define PERFORM(i) \ - \ - SSE_PEXTRW_XMM_to_R32(temp1, regd, i*2); \ - MOVSX32R16toR(temp1, temp1); \ - CMP32ItoR(temp1, 25);\ - localptr[i][0] = JGE8(0);\ - CMP32ItoR(temp1, 0);\ - localptr[i][1] = JG8(0);\ - localptr[i][2] = JE8(0);\ - CMP32ItoR(temp1, -25);\ - localptr[i][3] = JLE8(0);\ - \ - NEG32R(temp1); \ - DEC32R(temp1);\ - MOV32ItoR(temp2, 0xffffffff); \ - SHL32CLtoR(temp2); \ - MOV32RtoM((uptr)&VU_addsuband[0][i], temp2);\ - localptr[i][4] = JMP8(0);\ - \ - x86SetJ8(localptr[i][0]);\ - MOV32ItoM((uptr)&VU_addsuband[1][i], 0x80000000);\ - localptr[i][5] = JMP8(0);\ - \ - x86SetJ8(localptr[i][1]);\ - DEC32R(temp1);\ - MOV32ItoR(temp2, 0xffffffff);\ - SHL32CLtoR(temp2); \ - MOV32RtoM((uptr)&VU_addsuband[1][i], temp2);\ - localptr[i][6] = JMP8(0);\ - \ - x86SetJ8(localptr[i][3]);\ - MOV32ItoM((uptr)&VU_addsuband[0][i], 0x80000000);\ - localptr[i][7] = JMP8(0);\ - \ - x86SetJ8(localptr[i][2]);\ - \ - x86SetJ8(localptr[i][4]);\ - x86SetJ8(localptr[i][5]);\ - x86SetJ8(localptr[i][6]);\ - x86SetJ8(localptr[i][7]); - - PERFORM(0); - PERFORM(1); - PERFORM(2); - PERFORM(3); -#undef PERFORM - - SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); - SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); - - SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsuband[0][0]); - SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsuband[1][0]); - - if (is_sub) SSE_SUBPS_XMM_to_XMM(regd, regt); - else SSE_ADDPS_XMM_to_XMM(regd, regt); - - SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); - - _freeX86reg(temp2); - - MOV32MtoR(ECX, (uptr)&tempECX); -} - -void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info) -{ - u8 *localptr[8]; - u32 addrt = regt; //for case is_mem - - MOV32RtoM((uptr)&tempECX, ECX); - - int temp1 = ECX; //receives regd - int temp2 = ALLOCTEMPX86(0); - - if (temp2 == ECX) - { - temp2 = ALLOCTEMPX86(0); - _freeX86reg(ECX); - } - - SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd); - if (!is_mem) SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt); - - SSE2_MOVD_XMM_to_R(temp1, regd); - SHR32ItoR(temp1, 23); - - if (is_mem) { - MOV32MtoR(temp2, addrt); - MOV32RtoM((uptr)&VU_addsub_reg[1][0], temp2); - SHR32ItoR(temp2, 23); - } - else { - SSE2_MOVD_XMM_to_R(temp2, regt); - SHR32ItoR(temp2, 23); - } - - AND32ItoR(temp1, 0xff); - AND32ItoR(temp2, 0xff); - - SUB32RtoR(temp1, temp2); //temp1 = exponent difference - - CMP32ItoR(temp1, 25); - localptr[0] = JGE8(0); - CMP32ItoR(temp1, 0); - localptr[1] = JG8(0); - localptr[2] = JE8(0); - 
CMP32ItoR(temp1, -25); - localptr[3] = JLE8(0); - - NEG32R(temp1); - DEC32R(temp1); - MOV32ItoR(temp2, 0xffffffff); - SHL32CLtoR(temp2); - SSE2_PCMPEQB_XMM_to_XMM(regd, regd); - if (is_mem) { - SSE_PINSRW_R32_to_XMM(regd, temp2, 0); - SHR32ItoR(temp2, 16); - SSE_PINSRW_R32_to_XMM(regd, temp2, 1); - } - else { - SSE2_MOVD_R_to_XMM(regt, temp2); - SSE_MOVSS_XMM_to_XMM(regd, regt); - SSE2_PCMPEQB_XMM_to_XMM(regt, regt); - } - localptr[4] = JMP8(0); - - x86SetJ8(localptr[0]); - MOV32ItoR(temp2, 0x80000000); - if (is_mem) - AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2); - else { - SSE2_PCMPEQB_XMM_to_XMM(regt, regt); - SSE2_MOVD_R_to_XMM(regd, temp2); - SSE_MOVSS_XMM_to_XMM(regt, regd); - } - SSE2_PCMPEQB_XMM_to_XMM(regd, regd); - localptr[5] = JMP8(0); - - x86SetJ8(localptr[1]); - DEC32R(temp1); - MOV32ItoR(temp2, 0xffffffff); - SHL32CLtoR(temp2); - if (is_mem) - AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2); - else { - SSE2_PCMPEQB_XMM_to_XMM(regt, regt); - SSE2_MOVD_R_to_XMM(regd, temp2); - SSE_MOVSS_XMM_to_XMM(regt, regd); - } - SSE2_PCMPEQB_XMM_to_XMM(regd, regd); - localptr[6] = JMP8(0); - - x86SetJ8(localptr[3]); - MOV32ItoR(temp2, 0x80000000); - SSE2_PCMPEQB_XMM_to_XMM(regd, regd); - if (is_mem) { - SSE_PINSRW_R32_to_XMM(regd, temp2, 0); - SHR32ItoR(temp2, 16); - SSE_PINSRW_R32_to_XMM(regd, temp2, 1); - } - else { - SSE2_MOVD_R_to_XMM(regt, temp2); - SSE_MOVSS_XMM_to_XMM(regd, regt); - SSE2_PCMPEQB_XMM_to_XMM(regt, regt); - } - localptr[7] = JMP8(0); - - x86SetJ8(localptr[2]); - x86SetJ8(localptr[4]); - x86SetJ8(localptr[5]); - x86SetJ8(localptr[6]); - x86SetJ8(localptr[7]); - - if (is_mem) - { - SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask - - if (is_sub) SSE_SUBSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]); - else SSE_ADDSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]); - } - else - { - SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask - SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); //regt contains mask - - if (is_sub) SSE_SUBSS_XMM_to_XMM(regd, regt); - else SSE_ADDSS_XMM_to_XMM(regd, regt); - - SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); - } - - _freeX86reg(temp2); - - MOV32MtoR(ECX, (uptr)&tempECX); -} - -void SSE_ADDPS_XMM_to_XMM_custom(int info, int regd, int regt) { - if (CHECK_VUADDSUBHACK) { - VU_ADD_SUB(regd, regt, 0, info); - } - else SSE_ADDPS_XMM_to_XMM(regd, regt); -} -void SSE_SUBPS_XMM_to_XMM_custom(int info, int regd, int regt) { - if (CHECK_VUADDSUBHACK) { - VU_ADD_SUB(regd, regt, 1, info); - } - else SSE_SUBPS_XMM_to_XMM(regd, regt); -} -void SSE_ADDSS_XMM_to_XMM_custom(int info, int regd, int regt) { - if (CHECK_VUADDSUBHACK) { - VU_ADD_SUB_SS(regd, regt, 0, 0, info); - } - else SSE_ADDSS_XMM_to_XMM(regd, regt); -} -void SSE_SUBSS_XMM_to_XMM_custom(int info, int regd, int regt) { - if (CHECK_VUADDSUBHACK) { - VU_ADD_SUB_SS(regd, regt, 1, 0, info); - } - else SSE_SUBSS_XMM_to_XMM(regd, regt); -} -void SSE_ADDSS_M32_to_XMM_custom(int info, int regd, int regt) { - if (CHECK_VUADDSUBHACK) { - VU_ADD_SUB_SS(regd, regt, 0, 1, info); - } - else SSE_ADDSS_M32_to_XMM(regd, regt); -} -void SSE_SUBSS_M32_to_XMM_custom(int info, int regd, int regt) { - if (CHECK_VUADDSUBHACK) { - VU_ADD_SUB_SS(regd, regt, 1, 1, info); - } - else SSE_SUBSS_M32_to_XMM(regd, regt); -} -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// *VU Upper Instructions!* -// -// Note: * = Checked for errors by cottonvibes 
-//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// ABS* -//------------------------------------------------------------------ -void recVUMI_ABS(VURegs *VU, int info) -{ - //Console.WriteLn("recVUMI_ABS()"); - if ( (_Ft_ == 0) || (_X_Y_Z_W == 0) ) return; - - if ((_X_Y_Z_W == 0x8) || (_X_Y_Z_W == 0xf)) { - VU_MERGE_REGS(EEREC_T, EEREC_S); - SSE_ANDPS_M128_to_XMM(EEREC_T, (uptr)&const_abs_table[ _X_Y_Z_W ][ 0 ] ); - } - else { // Use a temp reg because VU_MERGE_REGS() modifies source reg! - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_abs_table[ _X_Y_Z_W ][ 0 ] ); - VU_MERGE_REGS(EEREC_T, EEREC_TEMP); - } -} -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// ADD*, ADD_iq*, ADD_xyzw* -//------------------------------------------------------------------ -static const __aligned16 float s_two[4] = {0,0,0,2}; -void recVUMI_ADD(VURegs *VU, int info) -{ - //Console.WriteLn("recVUMI_ADD()"); - if ( _X_Y_Z_W == 0 ) goto flagUpdate; // Don't do anything and just clear flags - if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); - - if ( _Fs_ == 0 && _Ft_ == 0 ) { // if adding VF00 with VF00, then the result is always 0,0,0,2 - if ( _X_Y_Z_W != 0xf ) { - SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)s_two); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)s_two); - } - else { - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); - } - if( _X_Y_Z_W == 8 ) { // If only adding x, then we can do a Scalar Add - if (EEREC_D == EEREC_S) SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T); - else if (EEREC_D == EEREC_T) SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_S); - else { - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T); - } - } - else if (_X_Y_Z_W != 0xf) { // If xyzw != 1111, then we have to use a temp reg - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else { // All xyzw being modified (xyzw == 1111) - if (EEREC_D == EEREC_S) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_T); - else if (EEREC_D == EEREC_T) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); - else { - SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_T); - } - } - } -flagUpdate: - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_ADD_iq(VURegs *VU, uptr addr, int info) -{ - //Console.WriteLn("recVUMI_ADD_iq()"); - if ( _X_Y_Z_W == 0 ) goto flagUpdate; - if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); - if (CHECK_VU_EXTRA_OVERFLOW) { - vuFloat3(addr); - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - } - - if ( _XYZW_SS ) { - if ( EEREC_D == EEREC_TEMP ) { - _vuFlipRegSS(VU, EEREC_S); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_ADDSS_M32_to_XMM(EEREC_D, addr); - _vuFlipRegSS(VU, EEREC_S); - _vuFlipRegSS(VU, EEREC_D); // have to flip over EEREC_D for computing flags! 
- } - else if ( EEREC_D == EEREC_S ) { - _vuFlipRegSS(VU, EEREC_D); - SSE_ADDSS_M32_to_XMM(EEREC_D, addr); - _vuFlipRegSS(VU, EEREC_D); - } - else { - if ( _X ) { - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_ADDSS_M32_to_XMM_custom(info, EEREC_D, addr); - } - else { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - SSE_ADDPS_XMM_to_XMM_custom(info, EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - } - } - else { - if ( (_X_Y_Z_W != 0xf) || (EEREC_D == EEREC_S) || (EEREC_D == EEREC_TEMP) ) { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - } - - if (_X_Y_Z_W != 0xf) { - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else { - if ( EEREC_D == EEREC_TEMP ) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); - else if ( EEREC_D == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - else { - SSE_MOVSS_M32_to_XMM(EEREC_D, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_D, EEREC_D, 0x00); - SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); - } - } - } -flagUpdate: - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_ADD_xyzw(VURegs *VU, int xyzw, int info) -{ - //Console.WriteLn("recVUMI_ADD_xyzw()"); - if ( _X_Y_Z_W == 0 ) goto flagUpdate; - if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); - } - - if ( _Ft_ == 0 && xyzw < 3 ) { // just move since adding zero - if ( _X_Y_Z_W == 0x8 ) { VU_MERGE_REGS(EEREC_D, EEREC_S); } - else if ( _X_Y_Z_W != 0xf ) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); - } - else if ( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP) ) { - if ( xyzw == 0 ) { - if ( EEREC_D == EEREC_T ) SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_S); - else { - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T); - } - } - else { - _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - } - else if( _Fs_ == 0 && !_W ) { // just move - _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else { - if ( _X_Y_Z_W != 0xf ) { - _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else { - if( EEREC_D == EEREC_TEMP ) { _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); } - else if( EEREC_D == EEREC_S ) { _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); } - else { _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); } - } - } -flagUpdate: - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_ADDi(VURegs *VU, int info) { recVUMI_ADD_iq(VU, VU_VI_ADDR(REG_I, 1), info); } -void recVUMI_ADDq(VURegs *VU, int info) { recVUMI_ADD_iq(VU, VU_REGQ_ADDR, info); } -void recVUMI_ADDx(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 0, info); } -void recVUMI_ADDy(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 1, info); } -void recVUMI_ADDz(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 2, info); } -void recVUMI_ADDw(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 3, info); } -//------------------------------------------------------------------ - - 
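Throughout the recVUMI_ADD family above, any partial-lane write (an _X_Y_Z_W mask other than x-only or all four) is computed into EEREC_TEMP and then blended into the destination with VU_MERGE_REGS. VU_MERGE_REGS_CUSTOM itself picks cheap MOVSS/SHUFPS sequences per mask, but the general idea it implements is the classic and/andnot lane blend that the SSEmovMask table shown earlier in this file encodes. A minimal intrinsics sketch of that idea; mergeLanes is an illustrative name, not a symbol from this patch:

    #include <xmmintrin.h>

    // Lanes whose mask bits are all-ones take 'src'; the remaining lanes keep
    // 'dest'. SSEmovMask[n] holds exactly these all-ones/all-zeros lane
    // patterns for each 4-bit XYZW combination.
    static inline __m128 mergeLanes(__m128 dest, __m128 src, __m128 mask)
    {
        return _mm_or_ps(_mm_and_ps(mask, src), _mm_andnot_ps(mask, dest));
    }

The same pattern explains the special cases in the code: xyzw == 1111 needs no merge at all (full packed add), and x-only writes can use ADDSS, which already leaves the other three lanes of the destination untouched.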
-//------------------------------------------------------------------ -// ADDA*, ADDA_iq*, ADDA_xyzw* -//------------------------------------------------------------------ -void recVUMI_ADDA(VURegs *VU, int info) -{ - //Console.WriteLn("recVUMI_ADDA()"); - if ( _X_Y_Z_W == 0 ) goto flagUpdate; - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); - } - - if( _X_Y_Z_W == 8 ) { - if (EEREC_ACC == EEREC_S) SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_T); // Can this case happen? (cottonvibes) - else if (EEREC_ACC == EEREC_T) SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_S); // Can this case happen? - else { - SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); - SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_T); - } - } - else if (_X_Y_Z_W != 0xf) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - - VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); - } - else { - if( EEREC_ACC == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_T); // Can this case happen? - else if( EEREC_ACC == EEREC_T ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_S); // Can this case happen? - else { - SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S); - SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_T); - } - } -flagUpdate: - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_ADDA_iq(VURegs *VU, uptr addr, int info) -{ - //Console.WriteLn("recVUMI_ADDA_iq()"); - if ( _X_Y_Z_W == 0 ) goto flagUpdate; - if (CHECK_VU_EXTRA_OVERFLOW) { - vuFloat3(addr); - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - } - - if( _XYZW_SS ) { - assert( EEREC_ACC != EEREC_TEMP ); - if( EEREC_ACC == EEREC_S ) { - _vuFlipRegSS(VU, EEREC_ACC); - SSE_ADDSS_M32_to_XMM(EEREC_ACC, addr); - _vuFlipRegSS(VU, EEREC_ACC); - } - else { - if( _X ) { - SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); - SSE_ADDSS_M32_to_XMM(EEREC_ACC, addr); - } - else { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); - } - } - } - else { - if( _X_Y_Z_W != 0xf || EEREC_ACC == EEREC_S ) { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - } - - if (_X_Y_Z_W != 0xf) { - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); - } - else { - if( EEREC_ACC == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); - else { - SSE_MOVSS_M32_to_XMM(EEREC_ACC, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_ACC, EEREC_ACC, 0x00); - SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_S); - } - } - } -flagUpdate: - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_ADDA_xyzw(VURegs *VU, int xyzw, int info) -{ - //Console.WriteLn("recVUMI_ADDA_xyzw()"); - if ( _X_Y_Z_W == 0 ) goto flagUpdate; - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); - } - - if( _X_Y_Z_W == 8 ) { - assert( EEREC_ACC != EEREC_T ); - if( xyzw == 0 ) { - SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); - SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_T); - } - else { - _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); - if( _Fs_ == 0 ) { - SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); - } - else { - SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); - SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); - } - } - } - else { - if( _X_Y_Z_W != 0xf || EEREC_ACC == EEREC_S ) - _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - - if (_X_Y_Z_W != 0xf) { - 
SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); - } - else { - if( EEREC_ACC == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); - else { - _unpackVF_xyzw(EEREC_ACC, EEREC_T, xyzw); - SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_S); - } - } - } -flagUpdate: - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_ADDAi(VURegs *VU, int info) { recVUMI_ADDA_iq(VU, VU_VI_ADDR(REG_I, 1), info); } -void recVUMI_ADDAq(VURegs *VU, int info) { recVUMI_ADDA_iq(VU, VU_REGQ_ADDR, info); } -void recVUMI_ADDAx(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 0, info); } -void recVUMI_ADDAy(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 1, info); } -void recVUMI_ADDAz(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 2, info); } -void recVUMI_ADDAw(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 3, info); } -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// SUB*, SUB_iq*, SUB_xyzw* -//------------------------------------------------------------------ -void recVUMI_SUB(VURegs *VU, int info) -{ - //Console.WriteLn("recVUMI_SUB()"); - if ( _X_Y_Z_W == 0 ) goto flagUpdate; - if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); - - if( EEREC_S == EEREC_T ) { - if (_X_Y_Z_W != 0xf) SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)&SSEmovMask[15-_X_Y_Z_W][0]); - else SSE_XORPS_XMM_to_XMM(EEREC_D, EEREC_D); - } - else if( _X_Y_Z_W == 8 ) { - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); - } - if (EEREC_D == EEREC_S) { - if (_Ft_) SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_T); - } - else if (EEREC_D == EEREC_T) { - if (_Ft_) { - SSE_MOVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_SUBSS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - else SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - } - else { - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - if (_Ft_) SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_T); - } - } - else { - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); - } - if (_X_Y_Z_W != 0xf) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - if( ( _Ft_ > 0 ) || _W ) SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else { - if (EEREC_D == EEREC_S) SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_T); - else if (EEREC_D == EEREC_T) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - else { - SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_T); - } - } - } -flagUpdate: - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_SUB_iq(VURegs *VU, uptr addr, int info) -{ - //Console.WriteLn("recVUMI_SUB_iq()"); - if ( _X_Y_Z_W == 0 ) goto flagUpdate; - if (CHECK_VU_EXTRA_OVERFLOW) { - vuFloat3(addr); - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - } - if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); - - if( _XYZW_SS ) { - if( EEREC_D == EEREC_TEMP ) { - _vuFlipRegSS(VU, EEREC_S); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_SUBSS_M32_to_XMM(EEREC_D, addr); - _vuFlipRegSS(VU, EEREC_S); - _vuFlipRegSS(VU, EEREC_D); - } - else if( EEREC_D == EEREC_S ) { - _vuFlipRegSS(VU, EEREC_D); - SSE_SUBSS_M32_to_XMM(EEREC_D, addr); - _vuFlipRegSS(VU, EEREC_D); - } - else { - if( 
_X ) { - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_SUBSS_M32_to_XMM(EEREC_D, addr); - } - else { - _vuMoveSS(VU, EEREC_TEMP, EEREC_S); - _vuFlipRegSS(VU, EEREC_D); - SSE_SUBSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - _vuFlipRegSS(VU, EEREC_D); - } - } - } - else { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - - if (_X_Y_Z_W != 0xf) { - int t1reg = _vuGetTempXMMreg(info); - - if( t1reg >= 0 ) { - SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S); - SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP); - - VU_MERGE_REGS(EEREC_D, t1reg); - _freeXMMreg(t1reg); - } - else { - // negate - SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - } - else { - if( EEREC_D == EEREC_TEMP ) { - SSE_XORPS_M128_to_XMM(EEREC_D, (uptr)&const_clip[4]); - SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); - } - else { - SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - } - } -flagUpdate: - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_SUB_xyzw(VURegs *VU, int xyzw, int info) -{ - //Console.WriteLn("recVUMI_SUB_xyzw()"); - if ( _X_Y_Z_W == 0 ) goto flagUpdate; - if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); - } - - if ( _X_Y_Z_W == 8 ) { - if ( (xyzw == 0) && (_Ft_ == _Fs_) ) { - SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)&SSEmovMask[7][0]); - } - else if ( EEREC_D == EEREC_TEMP ) { - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - if ( (_Ft_ > 0) || (xyzw == 3) ) { - _vuFlipRegSS_xyzw(EEREC_T, xyzw); - SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_T); - _vuFlipRegSS_xyzw(EEREC_T, xyzw); - } - } - else { - if ( (_Ft_ > 0) || (xyzw == 3) ) { - _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - else SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - } - } - else { - _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - - if (_X_Y_Z_W != 0xf) { - int t1reg = _vuGetTempXMMreg(info); - - if( t1reg >= 0 ) { - SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S); - SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP); - - VU_MERGE_REGS(EEREC_D, t1reg); - _freeXMMreg(t1reg); - } - else { - // negate - SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - } - else { - if( EEREC_D == EEREC_TEMP ) { - SSE_XORPS_M128_to_XMM(EEREC_D, (uptr)&const_clip[4]); - SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); - } - else { - SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - } - } -flagUpdate: - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_SUBi(VURegs *VU, int info) { recVUMI_SUB_iq(VU, VU_VI_ADDR(REG_I, 1), info); } -void recVUMI_SUBq(VURegs *VU, int info) { recVUMI_SUB_iq(VU, VU_REGQ_ADDR, info); } -void recVUMI_SUBx(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 0, info); } -void recVUMI_SUBy(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 1, info); } -void recVUMI_SUBz(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 2, info); } -void recVUMI_SUBw(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 3, info); } -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// SUBA*, SUBA_iq, SUBA_xyzw 
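[Editor's note, not part of the diff: the "// negate" fallback in the SUB paths above (taken when _vuGetTempXMMreg finds no free register) rewrites s - t as (-t) + s, so the broadcast subtrahend in EEREC_TEMP can be destroyed in place instead of needing a copy of s; negating is a single XORPS against a sign-bit mask (const_clip[4] in this file). A standalone sketch -- subViaXor and signMask are illustrative names:

    #include <emmintrin.h>

    static __m128 subViaXor(__m128 s, __m128 t)
    {
        // Flip the sign bit of every lane of t, then add: s + (-t) == s - t.
        const __m128 signMask = _mm_castsi128_ps(_mm_set1_epi32((int)0x80000000));
        return _mm_add_ps(_mm_xor_ps(t, signMask), s);
    }
]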
-//------------------------------------------------------------------ -void recVUMI_SUBA(VURegs *VU, int info) -{ - //Console.WriteLn("recVUMI_SUBA()"); - if ( _X_Y_Z_W == 0 ) goto flagUpdate; - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); - } - - if( EEREC_S == EEREC_T ) { - if (_X_Y_Z_W != 0xf) SSE_ANDPS_M128_to_XMM(EEREC_ACC, (uptr)&SSEmovMask[15-_X_Y_Z_W][0]); - else SSE_XORPS_XMM_to_XMM(EEREC_ACC, EEREC_ACC); - } - else if( _X_Y_Z_W == 8 ) { - if (EEREC_ACC == EEREC_S) SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_T); - else if (EEREC_ACC == EEREC_T) { - SSE_MOVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_SUBSS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); - } - else { - SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); - SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_T); - } - } - else if (_X_Y_Z_W != 0xf) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - - VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); - } - else { - if( EEREC_ACC == EEREC_S ) SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_T); - else if( EEREC_ACC == EEREC_T ) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); - } - else { - SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S); - SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_T); - } - } -flagUpdate: - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_SUBA_iq(VURegs *VU, uptr addr, int info) -{ - //Console.WriteLn ("recVUMI_SUBA_iq"); - if (CHECK_VU_EXTRA_OVERFLOW) { - vuFloat3(addr); - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - } - - if( _XYZW_SS ) { - if( EEREC_ACC == EEREC_S ) { - _vuFlipRegSS(VU, EEREC_ACC); - SSE_SUBSS_M32_to_XMM(EEREC_ACC, addr); - _vuFlipRegSS(VU, EEREC_ACC); - } - else { - if( _X ) { - SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); - SSE_SUBSS_M32_to_XMM(EEREC_ACC, addr); - } - else { - _vuMoveSS(VU, EEREC_TEMP, EEREC_S); - _vuFlipRegSS(VU, EEREC_ACC); - SSE_SUBSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); - _vuFlipRegSS(VU, EEREC_ACC); - } - } - } - else { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - - if (_X_Y_Z_W != 0xf) { - int t1reg = _vuGetTempXMMreg(info); - - if( t1reg >= 0 ) { - SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S); - SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP); - - VU_MERGE_REGS(EEREC_ACC, t1reg); - _freeXMMreg(t1reg); - } - else { - // negate - SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); - } - } - else { - SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S); - SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); - } - } - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_SUBA_xyzw(VURegs *VU, int xyzw, int info) -{ - //Console.WriteLn ("recVUMI_SUBA_xyzw"); - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); - } - - if( _X_Y_Z_W == 8 ) { - if( xyzw == 0 ) { - SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); - SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_T); - } - else { - _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); - SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); - SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); - } - } - else { - _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - - if (_X_Y_Z_W != 0xf) { - int t1reg = _vuGetTempXMMreg(info); - - 
if( t1reg >= 0 ) { - SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S); - SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP); - - VU_MERGE_REGS(EEREC_ACC, t1reg); - _freeXMMreg(t1reg); - } - else { - // negate - SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); - } - } - else { - SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S); - SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); - } - } - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_SUBAi(VURegs *VU, int info) { recVUMI_SUBA_iq(VU, VU_VI_ADDR(REG_I, 1), info); } -void recVUMI_SUBAq(VURegs *VU, int info) { recVUMI_SUBA_iq(VU, VU_REGQ_ADDR, info); } -void recVUMI_SUBAx(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 0, info); } -void recVUMI_SUBAy(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 1, info); } -void recVUMI_SUBAz(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 2, info); } -void recVUMI_SUBAw(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 3, info); } -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// MUL -//------------------------------------------------------------------ -void recVUMI_MUL_toD(VURegs *VU, int regd, int info) -{ - //Console.WriteLn ("recVUMI_MUL_toD"); - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); - } - - if (_X_Y_Z_W == 1 && (_Ft_ == 0 || _Fs_==0) ) { // W - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, _Ft_ ? EEREC_T : EEREC_S); - VU_MERGE_REGS(regd, EEREC_TEMP); - } - else if( _Fd_ == _Fs_ && _Fs_ == _Ft_ && _XYZW_SS ) { - _vuFlipRegSS(VU, EEREC_D); - SSE_MULSS_XMM_to_XMM(EEREC_D, EEREC_D); - _vuFlipRegSS(VU, EEREC_D); - } - else if( _X_Y_Z_W == 8 ) { - if (regd == EEREC_S) SSE_MULSS_XMM_to_XMM(regd, EEREC_T); - else if (regd == EEREC_T) SSE_MULSS_XMM_to_XMM(regd, EEREC_S); - else { - SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); - SSE_MULSS_XMM_to_XMM(regd, EEREC_T); - } - } - else if (_X_Y_Z_W != 0xf) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - - VU_MERGE_REGS(regd, EEREC_TEMP); - } - else { - if (regd == EEREC_S) SSE_MULPS_XMM_to_XMM(regd, EEREC_T); - else if (regd == EEREC_T) SSE_MULPS_XMM_to_XMM(regd, EEREC_S); - else { - SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S); - SSE_MULPS_XMM_to_XMM(regd, EEREC_T); - } - } -} - -void recVUMI_MUL_iq_toD(VURegs *VU, uptr addr, int regd, int info) -{ - //Console.WriteLn ("recVUMI_MUL_iq_toD"); - if (CHECK_VU_EXTRA_OVERFLOW) { - vuFloat3(addr); - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - } - - if( _XYZW_SS ) { - if( regd == EEREC_TEMP ) { - _vuFlipRegSS(VU, EEREC_S); - SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); - SSE_MULSS_M32_to_XMM(regd, addr); - _vuFlipRegSS(VU, EEREC_S); - _vuFlipRegSS(VU, regd); - } - else if( regd == EEREC_S ) { - _vuFlipRegSS(VU, regd); - SSE_MULSS_M32_to_XMM(regd, addr); - _vuFlipRegSS(VU, regd); - } - else { - if( _X ) { - SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); - SSE_MULSS_M32_to_XMM(regd, addr); - } - else { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(regd, EEREC_TEMP); - } - } - } - else { - if( _X_Y_Z_W != 0xf || regd == EEREC_TEMP || regd == EEREC_S ) { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - } - - if (_X_Y_Z_W != 0xf) { - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - 
VU_MERGE_REGS(regd, EEREC_TEMP); - } - else { - if( regd == EEREC_TEMP ) SSE_MULPS_XMM_to_XMM(regd, EEREC_S); - else if (regd == EEREC_S) SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP); - else { - SSE_MOVSS_M32_to_XMM(regd, addr); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x00); - SSE_MULPS_XMM_to_XMM(regd, EEREC_S); - } - } - } -} - -void recVUMI_MUL_xyzw_toD(VURegs *VU, int xyzw, int regd, int info) -{ - //Console.WriteLn ("recVUMI_MUL_xyzw_toD"); - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); - } - if (_Fs_) { // This is needed for alot of games; so always clamp this operand - if (CHECK_VU_SIGN_OVERFLOW) vFloats4_useEAX[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set - else vFloats2[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set - } - if( _Ft_ == 0 ) { - if( xyzw < 3 ) { - if (_X_Y_Z_W != 0xf) { - SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); - VU_MERGE_REGS(regd, EEREC_TEMP); - } - else SSE_XORPS_XMM_to_XMM(regd, regd); - } - else { - assert(xyzw==3); - if (_X_Y_Z_W != 0xf) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(regd, EEREC_TEMP); - } - else SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S); - } - } - else if( _X_Y_Z_W == 8 ) { - if( regd == EEREC_TEMP ) { - _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); - SSE_MULSS_XMM_to_XMM(regd, EEREC_S); - } - else { - if( xyzw == 0 ) { - if( regd == EEREC_T ) { - SSE_MULSS_XMM_to_XMM(regd, EEREC_S); - } - else { - SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); - SSE_MULSS_XMM_to_XMM(regd, EEREC_T); - } - } - else { - _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); - SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); - SSE_MULSS_XMM_to_XMM(regd, EEREC_TEMP); - } - } - } - else { - if( _X_Y_Z_W != 0xf || regd == EEREC_TEMP || regd == EEREC_S ) - _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - - if (_X_Y_Z_W != 0xf) { - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(regd, EEREC_TEMP); - } - else { - if( regd == EEREC_TEMP ) SSE_MULPS_XMM_to_XMM(regd, EEREC_S); - else if (regd == EEREC_S) SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP); - else { - _unpackVF_xyzw(regd, EEREC_T, xyzw); - SSE_MULPS_XMM_to_XMM(regd, EEREC_S); - } - } - } -} - -void recVUMI_MUL(VURegs *VU, int info) -{ - //Console.WriteLn ("recVUMI_MUL"); - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - recVUMI_MUL_toD(VU, EEREC_D, info); - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_MUL_iq(VURegs *VU, int addr, int info) -{ - //Console.WriteLn ("recVUMI_MUL_iq"); - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - recVUMI_MUL_iq_toD(VU, addr, EEREC_D, info); - recUpdateFlags(VU, EEREC_D, info); - // spacefisherman needs overflow checking on MULi.z -} - -void recVUMI_MUL_xyzw(VURegs *VU, int xyzw, int info) -{ - //Console.WriteLn ("recVUMI_MUL_xyzw"); - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - recVUMI_MUL_xyzw_toD(VU, xyzw, EEREC_D, info); - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_MULi(VURegs *VU, int info) { recVUMI_MUL_iq(VU, VU_VI_ADDR(REG_I, 1), info); } -void recVUMI_MULq(VURegs *VU, int info) { recVUMI_MUL_iq(VU, VU_REGQ_ADDR, info); } -void recVUMI_MULx(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 0, info); } -void recVUMI_MULy(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 1, info); } -void recVUMI_MULz(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 2, info); } -void recVUMI_MULw(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 3, info); } -//------------------------------------------------------------------ - - 
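[Editor's note, not part of the diff: the MUL_xyzw family above leans on _unpackVF_xyzw to broadcast one component of FT across all four lanes before the packed multiply; on plain SSE that is a single SHUFPS with a replicated selector. An illustrative sketch (splat is not a PCSX2 name):

    #include <xmmintrin.h>

    // Broadcast component c (0 = x ... 3 = w) of v into every lane.
    template<int c>
    static __m128 splat(__m128 v)
    {
        return _mm_shuffle_ps(v, v, _MM_SHUFFLE(c, c, c, c));
    }

    // e.g. MULy boils down to: vf_d = _mm_mul_ps(vf_s, splat<1>(vf_t));
]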
-//------------------------------------------------------------------ -// MULA -//------------------------------------------------------------------ -void recVUMI_MULA( VURegs *VU, int info ) -{ - //Console.WriteLn ("recVUMI_MULA"); - recVUMI_MUL_toD(VU, EEREC_ACC, info); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MULA_iq(VURegs *VU, int addr, int info) -{ - //Console.WriteLn ("recVUMI_MULA_iq"); - recVUMI_MUL_iq_toD(VU, addr, EEREC_ACC, info); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MULA_xyzw(VURegs *VU, int xyzw, int info) -{ - //Console.WriteLn ("recVUMI_MULA_xyzw"); - recVUMI_MUL_xyzw_toD(VU, xyzw, EEREC_ACC, info); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MULAi(VURegs *VU, int info) { recVUMI_MULA_iq(VU, VU_VI_ADDR(REG_I, 1), info); } -void recVUMI_MULAq(VURegs *VU, int info) { recVUMI_MULA_iq(VU, VU_REGQ_ADDR, info); } -void recVUMI_MULAx(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 0, info); } -void recVUMI_MULAy(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 1, info); } -void recVUMI_MULAz(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 2, info); } -void recVUMI_MULAw(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 3, info); } -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// MADD -//------------------------------------------------------------------ -void recVUMI_MADD_toD(VURegs *VU, int regd, int info) -{ - //Console.WriteLn ("recVUMI_MADD_toD"); - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); - vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W ); - } - - - if( _X_Y_Z_W == 8 ) { - if( regd == EEREC_ACC ) { - SSE_MOVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); } - SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP); - } - else if (regd == EEREC_T) { - SSE_MULSS_XMM_to_XMM(regd, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); } - SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC); - } - else if (regd == EEREC_S) { - SSE_MULSS_XMM_to_XMM(regd, EEREC_T); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); } - SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC); - } - else { - SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); - SSE_MULSS_XMM_to_XMM(regd, EEREC_T); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); } - SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC); - } - } - else if (_X_Y_Z_W != 0xf) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); - - VU_MERGE_REGS(regd, EEREC_TEMP); - } - else { - if( regd == EEREC_ACC ) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP); - } - else if (regd == EEREC_T) { - SSE_MULPS_XMM_to_XMM(regd, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); - } - else if (regd == EEREC_S) { - SSE_MULPS_XMM_to_XMM(regd, EEREC_T); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); - } - else { - SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S); - 
SSE_MULPS_XMM_to_XMM(regd, EEREC_T); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); - } - } -} - -void recVUMI_MADD_iq_toD(VURegs *VU, uptr addr, int regd, int info) -{ - //Console.WriteLn ("recVUMI_MADD_iq_toD"); - if (CHECK_VU_EXTRA_OVERFLOW) { - vuFloat3(addr); - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W ); - } - - if( _X_Y_Z_W == 8 ) { - if( _Fs_ == 0 ) { - // do nothing if regd == ACC (ACCx <= ACCx + 0.0 * *addr) - if( regd != EEREC_ACC ) { - SSE_MOVSS_XMM_to_XMM(regd, EEREC_ACC); - } - return; - } - - if( regd == EEREC_ACC ) { - assert( EEREC_TEMP < iREGCNT_XMM ); - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); } - SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP); - } - else if( regd == EEREC_S ) { - SSE_MULSS_M32_to_XMM(regd, addr); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC); - } - else { - SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); - SSE_MULSS_M32_to_XMM(regd, addr); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC); - } - } - else { - if( _Fs_ == 0 ) { - if( regd == EEREC_ACC ) { // ACCxyz is unchanged, ACCw <= ACCw + *addr - if( _W ) { // if _W is zero, do nothing - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); // { *addr, 0, 0, 0 } - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x27); // { 0, 0, 0, *addr } - SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP); // { ACCx, ACCy, ACCz, ACCw + *addr } - } - } - else { // DESTxyz <= ACCxyz, DESTw <= ACCw + *addr - if( _W ) { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); // { *addr, 0, 0, 0 } - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x27); // { 0, 0, 0, *addr } - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); // { ACCx, ACCy, ACCz, ACCw + *addr } - } - else SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); - VU_MERGE_REGS(regd, EEREC_TEMP); - } - - return; - } - - if( _X_Y_Z_W != 0xf || regd == EEREC_ACC || regd == EEREC_TEMP || regd == EEREC_S ) { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - } - - if (_X_Y_Z_W != 0xf) { - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); - - VU_MERGE_REGS(regd, EEREC_TEMP); - } - else { - if( regd == EEREC_ACC ) { - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP); - } - else if( regd == EEREC_S ) { - SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); - } - else if( regd == EEREC_TEMP ) { - SSE_MULPS_XMM_to_XMM(regd, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); - } - else { - SSE_MOVSS_M32_to_XMM(regd, addr); - SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x00); - SSE_MULPS_XMM_to_XMM(regd, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); - } - } - } -} - -void recVUMI_MADD_xyzw_toD(VURegs *VU, int xyzw, int regd, int info) -{ - //Console.WriteLn ("recVUMI_MADD_xyzw_toD"); - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Ft_) 
vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); - vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W ); - } - if (_Fs_) { // This is needed for alot of games; so always clamp this operand - if (CHECK_VU_SIGN_OVERFLOW) vFloats4_useEAX[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set - else vFloats2[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set - } - if( _Ft_ == 0 ) { - - if( xyzw == 3 ) { - // just add - if( _X_Y_Z_W == 8 ) { - if( regd == EEREC_S ) SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC); - else { - SSE_MOVSS_XMM_to_XMM(regd, EEREC_ACC); - SSE_ADDSS_XMM_to_XMM(regd, EEREC_S); - } - } - else { - if( _X_Y_Z_W != 0xf ) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); - - VU_MERGE_REGS(regd, EEREC_TEMP); - } - else { - if( regd == EEREC_S ) SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); - else { - SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC); - SSE_ADDPS_XMM_to_XMM(regd, EEREC_S); - } - } - } - } - else { - // just move acc to regd - if( _X_Y_Z_W != 0xf ) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); - VU_MERGE_REGS(regd, EEREC_TEMP); - } - else SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC); - } - - return; - } - - if( _X_Y_Z_W == 8 ) { - _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); - - if( regd == EEREC_ACC ) { - SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); } - SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP); - } - else if( regd == EEREC_S ) { - SSE_MULSS_XMM_to_XMM(regd, EEREC_TEMP); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); } - SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC); - } - else if( regd == EEREC_TEMP ) { - SSE_MULSS_XMM_to_XMM(regd, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); } - SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC); - } - else { - SSE_MOVSS_XMM_to_XMM(regd, EEREC_ACC); - SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); } - SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP); - } - } - else { - if( _X_Y_Z_W != 0xf || regd == EEREC_ACC || regd == EEREC_TEMP || regd == EEREC_S ) { - _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - } - - if (_X_Y_Z_W != 0xf) { - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); - - VU_MERGE_REGS(regd, EEREC_TEMP); - } - else { - if( regd == EEREC_ACC ) { - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP); - } - else if( regd == EEREC_S ) { - SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); - } - else if( regd == EEREC_TEMP ) { - SSE_MULPS_XMM_to_XMM(regd, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); - } - else { - _unpackVF_xyzw(regd, EEREC_T, xyzw); - SSE_MULPS_XMM_to_XMM(regd, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); - } - } - } -} - -void recVUMI_MADD(VURegs *VU, int info) -{ - //Console.WriteLn ("recVUMI_MADD"); - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - recVUMI_MADD_toD(VU, EEREC_D, info); - recUpdateFlags(VU, EEREC_D, info); -} - -void 
recVUMI_MADD_iq(VURegs *VU, int addr, int info) -{ - //Console.WriteLn ("recVUMI_MADD_iq"); - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - recVUMI_MADD_iq_toD(VU, addr, EEREC_D, info); - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_MADD_xyzw(VURegs *VU, int xyzw, int info) -{ - //Console.WriteLn ("recVUMI_MADD_xyzw"); - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - recVUMI_MADD_xyzw_toD(VU, xyzw, EEREC_D, info); - recUpdateFlags(VU, EEREC_D, info); - // super bust-a-move arrows needs overflow clamping -} - -void recVUMI_MADDi(VURegs *VU, int info) { recVUMI_MADD_iq(VU, VU_VI_ADDR(REG_I, 1), info); } -void recVUMI_MADDq(VURegs *VU, int info) { recVUMI_MADD_iq(VU, VU_REGQ_ADDR, info); } -void recVUMI_MADDx(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 0, info); } -void recVUMI_MADDy(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 1, info); } -void recVUMI_MADDz(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 2, info); } -void recVUMI_MADDw(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 3, info); } -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// MADDA -//------------------------------------------------------------------ -void recVUMI_MADDA( VURegs *VU, int info ) -{ - //Console.WriteLn ("recVUMI_MADDA"); - recVUMI_MADD_toD(VU, EEREC_ACC, info); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MADDAi( VURegs *VU , int info) -{ - //Console.WriteLn ("recVUMI_MADDAi"); - recVUMI_MADD_iq_toD( VU, VU_VI_ADDR(REG_I, 1), EEREC_ACC, info); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MADDAq( VURegs *VU , int info) -{ - //Console.WriteLn ("recVUMI_MADDAq "); - recVUMI_MADD_iq_toD( VU, VU_REGQ_ADDR, EEREC_ACC, info); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MADDAx( VURegs *VU , int info) -{ - //Console.WriteLn ("recVUMI_MADDAx"); - recVUMI_MADD_xyzw_toD(VU, 0, EEREC_ACC, info); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MADDAy( VURegs *VU , int info) -{ - //Console.WriteLn ("recVUMI_MADDAy"); - recVUMI_MADD_xyzw_toD(VU, 1, EEREC_ACC, info); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MADDAz( VURegs *VU , int info) -{ - //Console.WriteLn ("recVUMI_MADDAz"); - recVUMI_MADD_xyzw_toD(VU, 2, EEREC_ACC, info); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MADDAw( VURegs *VU , int info) -{ - //Console.WriteLn ("recVUMI_MADDAw"); - recVUMI_MADD_xyzw_toD(VU, 3, EEREC_ACC, info); - recUpdateFlags(VU, EEREC_ACC, info); -} -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// MSUB -//------------------------------------------------------------------ -void recVUMI_MSUB_toD(VURegs *VU, int regd, int info) -{ - //Console.WriteLn ("recVUMI_MSUB_toD"); - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); - vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W ); - } - - if (_X_Y_Z_W != 0xf) { - int t1reg = _vuGetTempXMMreg(info); - - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } - - if( t1reg >= 0 ) { - SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_ACC); - SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP); - - VU_MERGE_REGS(regd, t1reg); - _freeXMMreg(t1reg); - } - else { - SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); - 
SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); - VU_MERGE_REGS(regd, EEREC_TEMP); - } - } - else { - if( regd == EEREC_S ) { - assert( regd != EEREC_ACC ); - SSE_MULPS_XMM_to_XMM(regd, EEREC_T); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC); - SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]); - } - else if( regd == EEREC_T ) { - assert( regd != EEREC_ACC ); - SSE_MULPS_XMM_to_XMM(regd, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC); - SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]); - } - else if( regd == EEREC_TEMP ) { - SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S); - SSE_MULPS_XMM_to_XMM(regd, EEREC_T); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC); - SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]); - } - else { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC); - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } - SSE_SUBPS_XMM_to_XMM(regd, EEREC_TEMP); - } - } -} - -void recVUMI_MSUB_temp_toD(VURegs *VU, int regd, int info) -{ - //Console.WriteLn ("recVUMI_MSUB_temp_toD"); - - if (_X_Y_Z_W != 0xf) { - int t1reg = _vuGetTempXMMreg(info); - - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } - - if( t1reg >= 0 ) { - SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_ACC); - SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP); - - if ( regd != EEREC_TEMP ) { VU_MERGE_REGS(regd, t1reg); } - else SSE_MOVAPS_XMM_to_XMM(regd, t1reg); - - _freeXMMreg(t1reg); - } - else { - SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); - VU_MERGE_REGS(regd, EEREC_TEMP); - } - } - else { - if( regd == EEREC_ACC ) { - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } - SSE_SUBPS_XMM_to_XMM(regd, EEREC_TEMP); - } - else if( regd == EEREC_S ) { - SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC); - SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]); - } - else if( regd == EEREC_TEMP ) { - SSE_MULPS_XMM_to_XMM(regd, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } - SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC); - SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]); - } - else { - SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC); - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } - SSE_SUBPS_XMM_to_XMM(regd, EEREC_TEMP); - } - } -} - -void recVUMI_MSUB_iq_toD(VURegs *VU, int regd, int addr, int info) -{ - //Console.WriteLn ("recVUMI_MSUB_iq_toD"); - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W ); - vuFloat3(addr); - } - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - recVUMI_MSUB_temp_toD(VU, regd, info); -} - -void recVUMI_MSUB_xyzw_toD(VURegs *VU, int regd, int xyzw, int info) -{ - //Console.WriteLn ("recVUMI_MSUB_xyzw_toD"); - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, 1 << (3 - xyzw)); - 
vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W ); - } - _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - recVUMI_MSUB_temp_toD(VU, regd, info); -} - -void recVUMI_MSUB(VURegs *VU, int info) -{ - //Console.WriteLn ("recVUMI_MSUB"); - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - recVUMI_MSUB_toD(VU, EEREC_D, info); - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_MSUB_iq(VURegs *VU, int addr, int info) -{ - //Console.WriteLn ("recVUMI_MSUB_iq"); - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - recVUMI_MSUB_iq_toD(VU, EEREC_D, addr, info); - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_MSUBi(VURegs *VU, int info) { recVUMI_MSUB_iq(VU, VU_VI_ADDR(REG_I, 1), info); } -void recVUMI_MSUBq(VURegs *VU, int info) { recVUMI_MSUB_iq(VU, VU_REGQ_ADDR, info); } -void recVUMI_MSUBx(VURegs *VU, int info) -{ - //Console.WriteLn ("recVUMI_MSUBx"); - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 0, info); - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_MSUBy(VURegs *VU, int info) -{ - //Console.WriteLn ("recVUMI_MSUBy"); - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 1, info); - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_MSUBz(VURegs *VU, int info) -{ - //Console.WriteLn ("recVUMI_MSUBz"); - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 2, info); - recUpdateFlags(VU, EEREC_D, info); -} - -void recVUMI_MSUBw(VURegs *VU, int info) -{ - //Console.WriteLn ("recVUMI_MSUBw"); - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 3, info); - recUpdateFlags(VU, EEREC_D, info); -} -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// MSUBA -//------------------------------------------------------------------ -void recVUMI_MSUBA( VURegs *VU, int info ) -{ - //Console.WriteLn ("recVUMI_MSUBA"); - recVUMI_MSUB_toD(VU, EEREC_ACC, info); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MSUBAi( VURegs *VU, int info ) -{ - //Console.WriteLn ("recVUMI_MSUBAi "); - recVUMI_MSUB_iq_toD( VU, EEREC_ACC, VU_VI_ADDR(REG_I, 1), info ); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MSUBAq( VURegs *VU, int info ) -{ - //Console.WriteLn ("recVUMI_MSUBAq"); - recVUMI_MSUB_iq_toD( VU, EEREC_ACC, VU_REGQ_ADDR, info ); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MSUBAx( VURegs *VU, int info ) -{ - //Console.WriteLn ("recVUMI_MSUBAx"); - recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 0, info); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MSUBAy( VURegs *VU, int info ) -{ - //Console.WriteLn ("recVUMI_MSUBAy"); - recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 1, info); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MSUBAz( VURegs *VU, int info ) -{ - //Console.WriteLn ("recVUMI_MSUBAz "); - recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 2, info); - recUpdateFlags(VU, EEREC_ACC, info); -} - -void recVUMI_MSUBAw( VURegs *VU, int info ) -{ - //Console.WriteLn ("recVUMI_MSUBAw"); - recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 3, info); - recUpdateFlags(VU, EEREC_ACC, info); -} -//------------------------------------------------------------------ - - -static const __aligned16 u32 special_mask[4] = {0xffffffff, 0x80000000, 0xffffffff, 0x80000000}; -static const __aligned16 u32 special_mask2[4] = {0, 0x40000000, 0, 0x40000000}; - -__aligned16 u32 temp_loc[4]; -__aligned16 u32 temp_loc2[4]; - -//MAX/MINI are non-arithmetic operations 
that implicitly support numbers with the EXP field being 0 ("denormals"). -// -//As such, they are sometimes used for integer move and (positive!) integer max/min, knowing that integers that -//represent denormals will not be flushed to 0. -// -//As such, this implementation performs a non-arithmetic operation that supports "denormals" and "infs/nans". -//There might be an easier way to do it but here, MAX/MIN is performed with PMAXPD/PMINPD. -//Fake double-precision numbers are constructed by copying the sign of the original numbers, clearing the upper 32 bits, -//setting the 62nd bit to 1 (to ensure double-precision number is "normalized") and having the lower 32bits -//being the same as the original number. - -void MINMAXlogical(VURegs *VU, int info, int min, int mode, uptr addr = 0, int xyzw = 0) -//mode1 = iq, mode2 = xyzw, mode0 = normal -{ - int t1regbool = 0; - int t1reg = _vuGetTempXMMreg(info); - if (t1reg < 0) - { - t1regbool = 1; - for (t1reg = 0; ( (t1reg == EEREC_D) || (t1reg == EEREC_S) || (mode != 1 && t1reg == EEREC_T) - || (t1reg == EEREC_TEMP) ); t1reg++); // Find unused reg (For first temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)temp_loc, t1reg); // Backup t1reg XMM reg - } - int t2regbool = -1; - int t2reg = EEREC_TEMP; - if (EEREC_TEMP == EEREC_D || EEREC_TEMP == EEREC_S || (mode != 1 && EEREC_TEMP == EEREC_T)) - { - t2regbool = 0; - t2reg = _vuGetTempXMMreg(info); - if (t2reg < 0) - { - t2regbool = 1; - for (t2reg = 0; ( (t2reg == EEREC_D) || (t2reg == EEREC_S) || (mode != 1 && t2reg == EEREC_T) || - (t2reg == t1reg) || (t2reg == EEREC_TEMP) ); t2reg++); // Find unused reg (For second temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)temp_loc2, t2reg); // Backup t2reg XMM reg - } - } - - if (_X || _Y) - { - SSE2_PSHUFD_XMM_to_XMM(t1reg, EEREC_S, 0x50); - SSE2_PAND_M128_to_XMM(t1reg, (uptr)special_mask); - SSE2_POR_M128_to_XMM(t1reg, (uptr)special_mask2); - if (mode == 0) - SSE2_PSHUFD_XMM_to_XMM(t2reg, EEREC_T, 0x50); - else if (mode == 1) - { - SSE2_MOVD_M32_to_XMM(t2reg, addr); - SSE2_PSHUFD_XMM_to_XMM(t2reg, t2reg, 0x00); - } - else if (mode == 2) - _unpackVF_xyzw(t2reg, EEREC_T, xyzw); - SSE2_PAND_M128_to_XMM(t2reg, (uptr)special_mask); - SSE2_POR_M128_to_XMM(t2reg, (uptr)special_mask2); - if (min) - SSE2_MINPD_XMM_to_XMM(t1reg, t2reg); - else - SSE2_MAXPD_XMM_to_XMM(t1reg, t2reg); - SSE2_PSHUFD_XMM_to_XMM(t1reg, t1reg, 0x88); - VU_MERGE_REGS_CUSTOM(EEREC_D, t1reg, 0xc & _X_Y_Z_W); - } - - if (_Z || _W) - { - SSE2_PSHUFD_XMM_to_XMM(t1reg, EEREC_S, 0xfa); - SSE2_PAND_M128_to_XMM(t1reg, (uptr)special_mask); - SSE2_POR_M128_to_XMM(t1reg, (uptr)special_mask2); - if (mode == 0) - SSE2_PSHUFD_XMM_to_XMM(t2reg, EEREC_T, 0xfa); - else if (mode == 1) - { - SSE2_MOVD_M32_to_XMM(t2reg, addr); - SSE2_PSHUFD_XMM_to_XMM(t2reg, t2reg, 0x00); - } - else if (mode == 2) - _unpackVF_xyzw(t2reg, EEREC_T, xyzw); - SSE2_PAND_M128_to_XMM(t2reg, (uptr)special_mask); - SSE2_POR_M128_to_XMM(t2reg, (uptr)special_mask2); - if (min) - SSE2_MINPD_XMM_to_XMM(t1reg, t2reg); - else - SSE2_MAXPD_XMM_to_XMM(t1reg, t2reg); - SSE2_PSHUFD_XMM_to_XMM(t1reg, t1reg, 0x88); - VU_MERGE_REGS_CUSTOM(EEREC_D, t1reg, 0x3 & _X_Y_Z_W); - } - - if (t1regbool == 0) - _freeXMMreg(t1reg); - else if (t1regbool == 1) - SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)temp_loc); // Restore t1reg XMM reg - if (t2regbool == 0) - _freeXMMreg(t2reg); - else if (t2regbool == 1) - SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)temp_loc2); // Restore t2reg XMM reg -} - -//------------------------------------------------------------------ -// MAX 
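[Editor's note on the MINMAXlogical routine above, not part of the diff: the "fake double" construction its comment describes is easiest to see scalar-wise -- sign copied to bit 63, bit 62 forced on so the double is normalized, and the original 32-bit pattern placed in the low mantissa bits. Two such keys compare, as doubles, in the same order the source floats do, even for the denormals that a MINPS/MAXPS-based path could flush. fakeDoubleKey below is an illustrative restatement under those assumptions, not code from this file:

    #include <stdint.h>
    #include <string.h>

    static double fakeDoubleKey(float f)
    {
        uint32_t bits;
        memcpy(&bits, &f, sizeof(bits));                       // raw IEEE-754 pattern
        uint64_t key = ((uint64_t)(bits & 0x80000000u) << 32)  // copy sign to bit 63
                     | (1ull << 62)                            // force a normalized exponent
                     | (uint64_t)bits;                         // original pattern -> low mantissa
        double d;
        memcpy(&d, &key, sizeof(d));
        return d;
    }
    // PMINPD/PMAXPD on two such keys pick the operand PS2 MAX/MINI would;
    // the PSHUFD 0x88 afterwards packs the surviving low 32 bits back into float lanes.
]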
-//------------------------------------------------------------------ - -void recVUMI_MAX(VURegs *VU, int info) -{ - if ( _Fd_ == 0 ) return; - //Console.WriteLn ("recVUMI_MAX"); - - if (MINMAXFIX) - MINMAXlogical(VU, info, 0, 0); - else - { - - if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping - if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); - - if( _X_Y_Z_W == 8 ) { - if (EEREC_D == EEREC_S) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T); - else if (EEREC_D == EEREC_T) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_S); - else { - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T); - } - } - else if (_X_Y_Z_W != 0xf) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_MAXPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else { - if( EEREC_D == EEREC_S ) SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_T); - else if( EEREC_D == EEREC_T ) SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_S); - else { - SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_T); - } - } - } -} - -void recVUMI_MAX_iq(VURegs *VU, uptr addr, int info) -{ - if ( _Fd_ == 0 ) return; - //Console.WriteLn ("recVUMI_MAX_iq"); - - if (MINMAXFIX) - MINMAXlogical(VU, info, 0, 1, addr); - else - { - if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping - vuFloat3(addr); - - if( _XYZW_SS ) { - if( EEREC_D == EEREC_TEMP ) { - _vuFlipRegSS(VU, EEREC_S); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_MAXSS_M32_to_XMM(EEREC_D, addr); - _vuFlipRegSS(VU, EEREC_S); - - // have to flip over EEREC_D if computing flags! - //if( (info & PROCESS_VU_UPDATEFLAGS) ) - _vuFlipRegSS(VU, EEREC_D); - } - else if( EEREC_D == EEREC_S ) { - _vuFlipRegSS(VU, EEREC_D); - SSE_MAXSS_M32_to_XMM(EEREC_D, addr); - _vuFlipRegSS(VU, EEREC_D); - } - else { - if( _X ) { - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_MAXSS_M32_to_XMM(EEREC_D, addr); - } - else { - _vuMoveSS(VU, EEREC_TEMP, EEREC_S); - _vuFlipRegSS(VU, EEREC_D); - SSE_MAXSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - _vuFlipRegSS(VU, EEREC_D); - } - } - } - else if (_X_Y_Z_W != 0xf) { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - SSE_MAXPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else { - if(EEREC_D == EEREC_S) { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - else { - SSE_MOVSS_M32_to_XMM(EEREC_D, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_D, EEREC_D, 0x00); - SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_S); - } - } - } -} - -void recVUMI_MAX_xyzw(VURegs *VU, int xyzw, int info) -{ - if ( _Fd_ == 0 ) return; - //Console.WriteLn ("recVUMI_MAX_xyzw"); - - if (_Fs_ == 0 && _Ft_ == 0) - { - if( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP)) { - if( xyzw < 3 ) { - SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - else { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)s_fones); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - } - else if (_X_Y_Z_W != 0xf) { - if( xyzw < 3 ) { - if( _X_Y_Z_W & 1 ) SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[0]); // w included, so insert the whole reg - else SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); // w not included, can zero out - } - else SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)s_fones); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else { - //If VF0.w isnt chosen 
as the constant, then its going to be MAX( 0, VF0 ), so the result is VF0 - if( xyzw < 3 ) { SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)&VU->VF[0].UL[0]); } - else SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)s_fones); - } - return; - } - - if (MINMAXFIX) - MINMAXlogical(VU, info, 0, 2, 0, xyzw); - else - { - if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping - if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); - - if( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP)) { - if( xyzw == 0 ) { - if( EEREC_D == EEREC_S ) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T); - else if( EEREC_D == EEREC_T ) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_S); - else { - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T); - } - } - else { - _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - } - else if (_X_Y_Z_W != 0xf) { - _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - SSE_MAXPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else { - if (EEREC_D == EEREC_S) { - _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - else { - _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw); - SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_S); - } - } - } -} - -void recVUMI_MAXi(VURegs *VU, int info) { recVUMI_MAX_iq(VU, VU_VI_ADDR(REG_I, 1), info); } -void recVUMI_MAXx(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 0, info); } -void recVUMI_MAXy(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 1, info); } -void recVUMI_MAXz(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 2, info); } -void recVUMI_MAXw(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 3, info); } -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// MINI -//------------------------------------------------------------------ -void recVUMI_MINI(VURegs *VU, int info) -{ - if ( _Fd_ == 0 ) return; - //Console.WriteLn ("recVUMI_MINI"); - - if (MINMAXFIX) - MINMAXlogical(VU, info, 1, 0); - else - { - - if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping - if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); - - if( _X_Y_Z_W == 8 ) { - if (EEREC_D == EEREC_S) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T); - else if (EEREC_D == EEREC_T) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_S); - else { - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T); - } - } - else if (_X_Y_Z_W != 0xf) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_MINPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else { - if( EEREC_D == EEREC_S ) { - //ClampUnordered(EEREC_T, EEREC_TEMP, 0); // need for GT4 vu0rec - SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_T); - } - else if( EEREC_D == EEREC_T ) { - //ClampUnordered(EEREC_S, EEREC_TEMP, 0); // need for GT4 vu0rec - SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_S); - } - else { - SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_T); - } - } - } -} - -void recVUMI_MINI_iq(VURegs *VU, uptr addr, int info) -{ - if ( _Fd_ == 0 ) return; - //Console.WriteLn ("recVUMI_MINI_iq"); - - if (MINMAXFIX) - MINMAXlogical(VU, info, 1, 1, addr); - else - { - - if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping - vuFloat3(addr); - - if( _XYZW_SS ) { - if( EEREC_D == EEREC_TEMP ) { - _vuFlipRegSS(VU, EEREC_S); - 
SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_MINSS_M32_to_XMM(EEREC_D, addr); - _vuFlipRegSS(VU, EEREC_S); - - // have to flip over EEREC_D if computing flags! - //if( (info & PROCESS_VU_UPDATEFLAGS) ) - _vuFlipRegSS(VU, EEREC_D); - } - else if( EEREC_D == EEREC_S ) { - _vuFlipRegSS(VU, EEREC_D); - SSE_MINSS_M32_to_XMM(EEREC_D, addr); - _vuFlipRegSS(VU, EEREC_D); - } - else { - if( _X ) { - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_MINSS_M32_to_XMM(EEREC_D, addr); - } - else { - _vuMoveSS(VU, EEREC_TEMP, EEREC_S); - _vuFlipRegSS(VU, EEREC_D); - SSE_MINSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - _vuFlipRegSS(VU, EEREC_D); - } - } - } - else if (_X_Y_Z_W != 0xf) { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - SSE_MINPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else { - if(EEREC_D == EEREC_S) { - SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); - SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - else { - SSE_MOVSS_M32_to_XMM(EEREC_D, addr); - SSE_SHUFPS_XMM_to_XMM(EEREC_D, EEREC_D, 0x00); - SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_S); - } - } - } -} - -void recVUMI_MINI_xyzw(VURegs *VU, int xyzw, int info) -{ - if ( _Fd_ == 0 ) return; - //Console.WriteLn ("recVUMI_MINI_xyzw"); - - if (_Fs_ == 0 && _Ft_ == 0) - { - if( _X_Y_Z_W == 0xf ) - { - //If VF0.w is the constant, the result will match VF0, else its all 0's - if(xyzw == 3) SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)&VU->VF[0].UL[0]); - else SSE_XORPS_XMM_to_XMM(EEREC_D, EEREC_D); - } - else - { - //If VF0.w is the constant, the result will match VF0, else its all 0's - if(xyzw == 3) SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[0]); - else SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - return; - } - if (MINMAXFIX) - MINMAXlogical(VU, info, 1, 2, 0, xyzw); - else - { - if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping - if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); - - if( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP)) { - if( xyzw == 0 ) { - if( EEREC_D == EEREC_S ) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T); - else if( EEREC_D == EEREC_T ) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_S); - else { - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T); - } - } - else { - _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - } - else if (_X_Y_Z_W != 0xf) { - _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - SSE_MINPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - VU_MERGE_REGS(EEREC_D, EEREC_TEMP); - } - else { - if (EEREC_D == EEREC_S) { - _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); - SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); - } - else { - _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw); - SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_S); - } - } - } -} - -void recVUMI_MINIi(VURegs *VU, int info) { recVUMI_MINI_iq(VU, VU_VI_ADDR(REG_I, 1), info); } -void recVUMI_MINIx(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 0, info); } -void recVUMI_MINIy(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 1, info); } -void recVUMI_MINIz(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 2, info); } -void recVUMI_MINIw(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 3, info); } -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ 
-// OPMULA -//------------------------------------------------------------------ -void recVUMI_OPMULA( VURegs *VU, int info ) -{ - //Console.WriteLn ("recVUMI_OPMULA"); - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, 0xE); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, 0xE); - } - - SSE_MOVAPS_XMM_to_XMM( EEREC_TEMP, EEREC_S ); - SSE_SHUFPS_XMM_to_XMM( EEREC_T, EEREC_T, 0xD2 ); // EEREC_T = WYXZ - SSE_SHUFPS_XMM_to_XMM( EEREC_TEMP, EEREC_TEMP, 0xC9 ); // EEREC_TEMP = WXZY - SSE_MULPS_XMM_to_XMM( EEREC_TEMP, EEREC_T ); - - VU_MERGE_REGS_CUSTOM(EEREC_ACC, EEREC_TEMP, 14); - - // revert EEREC_T - if( EEREC_T != EEREC_ACC ) - SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0xC9); - - recUpdateFlags(VU, EEREC_ACC, info); -} -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// OPMSUB -//------------------------------------------------------------------ -void recVUMI_OPMSUB( VURegs *VU, int info ) -{ - //Console.WriteLn ("recVUMI_OPMSUB"); - if (CHECK_VU_EXTRA_OVERFLOW) { - if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, 0xE); - if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, 0xE); - } - - if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0xD2); // EEREC_T = WYXZ - SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0xC9); // EEREC_TEMP = WXZY - SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); - - // negate and add - SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); - SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); - VU_MERGE_REGS_CUSTOM(EEREC_D, EEREC_TEMP, 14); - - // revert EEREC_T - if( EEREC_T != EEREC_D ) SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0xC9); - - recUpdateFlags(VU, EEREC_D, info); -} -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// NOP -//------------------------------------------------------------------ -void recVUMI_NOP( VURegs *VU, int info ) -{ - //Console.WriteLn ("recVUMI_NOP"); -} -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// recVUMI_FTOI_Saturate() - Saturates result from FTOI Instructions -//------------------------------------------------------------------ - -// unused, but leaving here for possible reference.. 
-//static const __aligned16 int rec_const_0x8000000[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; - -void recVUMI_FTOI_Saturate(int rec_s, int rec_t, int rec_tmp1, int rec_tmp2) -{ - //Console.WriteLn ("recVUMI_FTOI_Saturate"); - //Duplicate the xor'd sign bit to the whole value - //FFFF FFFF for positive, 0 for negative - SSE_MOVAPS_XMM_to_XMM(rec_tmp1, rec_s); - SSE2_PXOR_M128_to_XMM(rec_tmp1, (uptr)&const_clip[4]); - SSE2_PSRAD_I8_to_XMM(rec_tmp1, 31); - - //Create mask: 0 where !=8000 0000 - SSE_MOVAPS_XMM_to_XMM(rec_tmp2, rec_t); - SSE2_PCMPEQD_M128_to_XMM(rec_tmp2, (uptr)&const_clip[4]); - - //AND the mask w/ the edit values - SSE_ANDPS_XMM_to_XMM(rec_tmp1, rec_tmp2); - - //if v==8000 0000 && positive -> 8000 0000 + FFFF FFFF -> 7FFF FFFF - //if v==8000 0000 && negative -> 8000 0000 + 0 -> 8000 0000 - //if v!=8000 0000 -> v+0 (masked from the and) - - //Add the values as needed - SSE2_PADDD_XMM_to_XMM(rec_t, rec_tmp1); -} -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// FTOI 0/4/12/15 -//------------------------------------------------------------------ -static __aligned16 float FTIO_Temp1[4] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; -static __aligned16 float FTIO_Temp2[4] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; -void recVUMI_FTOI0(VURegs *VU, int info) -{ - int t1reg, t2reg; // Temp XMM regs - - if ( _Ft_ == 0 ) return; - - //Console.WriteLn ("recVUMI_FTOI0"); - - if (_X_Y_Z_W != 0xf) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax) - SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); - - t1reg = _vuGetTempXMMreg(info); - - if( t1reg >= 0 ) { // If theres a temp XMM reg available - for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++) - ; // Find unused reg (For second temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg - - recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result - - SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg - _freeXMMreg(t1reg); // Free temp reg - } - else { // No temp reg available - for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++) - ; // Find unused reg (For first temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg - - for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++) - ; // Find unused reg (For second temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg - - recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result - - SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg - SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg - } - - VU_MERGE_REGS(EEREC_T, EEREC_TEMP); - } - else { - if (EEREC_T != EEREC_S) { - SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_S); - vuFloat_useEAX( info, EEREC_T, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax) - SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_T, EEREC_T); - - t1reg = _vuGetTempXMMreg(info); - - if( t1reg >= 0 ) { // If theres a temp XMM reg available - recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // 
Saturate if Float->Int conversion returned illegal result - _freeXMMreg(t1reg); // Free temp reg - } - else { // No temp reg available - for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++) - ; // Find unused reg - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg - - recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result - - SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg - } - } - else { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax) - SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); - - t1reg = _vuGetTempXMMreg(info); - - if( t1reg >= 0 ) { // If theres a temp XMM reg available - for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg)); t2reg++) - ; // Find unused reg (For second temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg - - recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result - - SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg - _freeXMMreg(t1reg); // Free temp reg - } - else { // No temp reg available - for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++) - ; // Find unused reg (For first temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg - - for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++) - ; // Find unused reg (For second temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg - - recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result - - SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg - SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg - } - - SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_TEMP); - } - } -} - -void recVUMI_FTOIX(VURegs *VU, int addr, int info) -{ - int t1reg, t2reg; // Temp XMM regs - - if ( _Ft_ == 0 ) return; - - //Console.WriteLn ("recVUMI_FTOIX"); - if (_X_Y_Z_W != 0xf) { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_MULPS_M128_to_XMM(EEREC_TEMP, addr); - vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax) - SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); - - t1reg = _vuGetTempXMMreg(info); - - if( t1reg >= 0 ) { // If theres a temp XMM reg available - for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg)); t2reg++) - ; // Find unused reg (For second temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg - - recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result - - SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg - _freeXMMreg(t1reg); // Free temp reg - } - else { // No temp reg available - for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++) - ; // Find unused reg (For first temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg - - for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++) - ; // Find 
unused reg (For second temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg - - recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result - - SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg - SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg - } - - VU_MERGE_REGS(EEREC_T, EEREC_TEMP); - } - else { - if (EEREC_T != EEREC_S) { - SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_S); - SSE_MULPS_M128_to_XMM(EEREC_T, addr); - vuFloat_useEAX( info, EEREC_T, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax) - SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_T, EEREC_T); - - t1reg = _vuGetTempXMMreg(info); - - if( t1reg >= 0 ) { // If theres a temp XMM reg available - recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result - _freeXMMreg(t1reg); // Free temp reg - } - else { // No temp reg available - for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++) - ; // Find unused reg - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg - - recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result - - SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg - } - } - else { - SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_MULPS_M128_to_XMM(EEREC_TEMP, addr); - vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax) - SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); - - t1reg = _vuGetTempXMMreg(info); - - if( t1reg >= 0 ) { // If theres a temp XMM reg available - for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg)); t2reg++) - ; // Find unused reg (For second temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg - - recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result - - SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg - _freeXMMreg(t1reg); // Free temp reg - } - else { // No temp reg available - for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++) - ; // Find unused reg (For first temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg - - for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++) - ; // Find unused reg (For second temp reg) - SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg - - recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result - - SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg - SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg - } - - SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_TEMP); - } - } -} - -void recVUMI_FTOI4( VURegs *VU, int info ) { recVUMI_FTOIX(VU, (uptr)&recMult_float_to_int4[0], info); } -void recVUMI_FTOI12( VURegs *VU, int info ) { recVUMI_FTOIX(VU, (uptr)&recMult_float_to_int12[0], info); } -void recVUMI_FTOI15( VURegs *VU, int info ) { recVUMI_FTOIX(VU, (uptr)&recMult_float_to_int15[0], info); } -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// ITOF 
0/4/12/15 -//------------------------------------------------------------------ -void recVUMI_ITOF0( VURegs *VU, int info ) -{ - if ( _Ft_ == 0 ) return; - - //Console.WriteLn ("recVUMI_ITOF0"); - if (_X_Y_Z_W != 0xf) { - SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - vuFloat_useEAX( info, EEREC_TEMP, 15); // Clamp infinities - VU_MERGE_REGS(EEREC_T, EEREC_TEMP); - xmmregs[EEREC_T].mode |= MODE_WRITE; - } - else { - SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_T, EEREC_S); - vuFloat2(EEREC_T, EEREC_TEMP, 15); // Clamp infinities - } -} - -void recVUMI_ITOFX(VURegs *VU, int addr, int info) -{ - if ( _Ft_ == 0 ) return; - - //Console.WriteLn ("recVUMI_ITOFX"); - if (_X_Y_Z_W != 0xf) { - SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_TEMP, EEREC_S); - SSE_MULPS_M128_to_XMM(EEREC_TEMP, addr); - vuFloat_useEAX( info, EEREC_TEMP, 15); // Clamp infinities - VU_MERGE_REGS(EEREC_T, EEREC_TEMP); - xmmregs[EEREC_T].mode |= MODE_WRITE; - } - else { - SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_T, EEREC_S); - SSE_MULPS_M128_to_XMM(EEREC_T, addr); - vuFloat2(EEREC_T, EEREC_TEMP, 15); // Clamp infinities - } -} - -void recVUMI_ITOF4( VURegs *VU, int info ) { recVUMI_ITOFX(VU, (uptr)&recMult_int_to_float4[0], info); } -void recVUMI_ITOF12( VURegs *VU, int info ) { recVUMI_ITOFX(VU, (uptr)&recMult_int_to_float12[0], info); } -void recVUMI_ITOF15( VURegs *VU, int info ) { recVUMI_ITOFX(VU, (uptr)&recMult_int_to_float15[0], info); } -//------------------------------------------------------------------ - - -//------------------------------------------------------------------ -// CLIP -//------------------------------------------------------------------ -void recVUMI_CLIP(VURegs *VU, int info) -{ - int t1reg = EEREC_D; - int t2reg = EEREC_ACC; - int x86temp1, x86temp2; - - u32 clipaddr = VU_VI_ADDR(REG_CLIP_FLAG, 0); - u32 prevclipaddr = VU_VI_ADDR(REG_CLIP_FLAG, 2); - - if( clipaddr == 0 ) { // battle star has a clip right before fcset - Console.WriteLn("skipping vu clip"); - return; - } - - //Flush the clip flag before processing, incase of double clip commands (GoW) - - if( prevclipaddr != (uptr)&VU->VI[REG_CLIP_FLAG] ) { - MOV32MtoR(EAX, prevclipaddr); - MOV32RtoM((uptr)&VU->VI[REG_CLIP_FLAG], EAX); - } - - assert( clipaddr != 0 ); - assert( t1reg != t2reg && t1reg != EEREC_TEMP && t2reg != EEREC_TEMP ); - - x86temp1 = ALLOCTEMPX86(MODE_8BITREG); - x86temp2 = ALLOCTEMPX86(MODE_8BITREG); - - //if ( (x86temp1 == 0) || (x86temp2 == 0) ) Console.Error("VU CLIP Allocation Error: EAX being allocated!"); - - _freeXMMreg(t1reg); // These should have been freed at allocation in eeVURecompileCode() - _freeXMMreg(t2reg); // but if they've been used since then, then free them. 
(just doing this incase :p (cottonvibes)) - - if( _Ft_ == 0 ) { - SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)&s_fones[0]); // all 1s - SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)&s_fones[4]); - } - else { - _unpackVF_xyzw(EEREC_TEMP, EEREC_T, 3); - SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[0]); - SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_TEMP); - SSE_ORPS_M128_to_XMM(t1reg, (uptr)&const_clip[4]); - } - - MOV32MtoR(EAX, prevclipaddr); - - SSE_CMPNLEPS_XMM_to_XMM(t1reg, EEREC_S); //-w, -z, -y, -x - SSE_CMPLTPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); //+w, +z, +y, +x - - SHL32ItoR(EAX, 6); - - SSE_MOVAPS_XMM_to_XMM(t2reg, EEREC_TEMP); //t2 = +w, +z, +y, +x - SSE_UNPCKLPS_XMM_to_XMM(EEREC_TEMP, t1reg); //EEREC_TEMP = -y,+y,-x,+x - SSE_UNPCKHPS_XMM_to_XMM(t2reg, t1reg); //t2reg = -w,+w,-z,+z - SSE_MOVMSKPS_XMM_to_R32(x86temp2, EEREC_TEMP); // -y,+y,-x,+x - SSE_MOVMSKPS_XMM_to_R32(x86temp1, t2reg); // -w,+w,-z,+z - - AND8ItoR(x86temp1, 0x3); - SHL8ItoR(x86temp1, 4); - OR8RtoR(EAX, x86temp1); - AND8ItoR(x86temp2, 0xf); - OR8RtoR(EAX, x86temp2); - AND32ItoR(EAX, 0xffffff); - - MOV32RtoM(clipaddr, EAX); - - if (( !(info & (PROCESS_VU_SUPER|PROCESS_VU_COP2)) ) ) //Instantly update the flag if its called from elsewhere (unlikely, but ok) - MOV32RtoM((uptr)&VU->VI[REG_CLIP_FLAG], EAX); - - _freeX86reg(x86temp1); - _freeX86reg(x86temp2); -} +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see <http://www.gnu.org/licenses/>. + */ + +#include "PrecompiledHeader.h" + +#include "Common.h" +#include "GS.h" +#include "R5900OpcodeTables.h" +#include "iR5900.h" +#include "iMMI.h" +#include "iFPU.h" +#include "iCOP0.h" +#include "VUmicro.h" +#include "VUflags.h" +#include "sVU_Micro.h" +#include "sVU_Debug.h" +#include "sVU_zerorec.h" +//------------------------------------------------------------------ +#define MINMAXFIX 1 +//------------------------------------------------------------------ +// Helper Macros +//------------------------------------------------------------------ +#define _Ft_ (( VU->code >> 16) & 0x1F) // The rt part of the instruction register +#define _Fs_ (( VU->code >> 11) & 0x1F) // The rd part of the instruction register +#define _Fd_ (( VU->code >> 6) & 0x1F) // The sa part of the instruction register + +#define _X (( VU->code>>24) & 0x1) +#define _Y (( VU->code>>23) & 0x1) +#define _Z (( VU->code>>22) & 0x1) +#define _W (( VU->code>>21) & 0x1) + +#define _XYZW_SS (_X+_Y+_Z+_W==1) + +#define _Fsf_ (( VU->code >> 21) & 0x03) +#define _Ftf_ (( VU->code >> 23) & 0x03) + +#define _Imm11_ (s32)(VU->code & 0x400 ? 
0xfffffc00 | (VU->code & 0x3ff) : VU->code & 0x3ff) +#define _UImm11_ (s32)(VU->code & 0x7ff) + +#define VU_VFx_ADDR(x) (uptr)&VU->VF[x].UL[0] +#define VU_VFy_ADDR(x) (uptr)&VU->VF[x].UL[1] +#define VU_VFz_ADDR(x) (uptr)&VU->VF[x].UL[2] +#define VU_VFw_ADDR(x) (uptr)&VU->VF[x].UL[3] + +#define VU_REGR_ADDR (uptr)&VU->VI[REG_R] +#define VU_REGQ_ADDR (uptr)&VU->VI[REG_Q] +#define VU_REGMAC_ADDR (uptr)&VU->VI[REG_MAC_FLAG] + +#define VU_VI_ADDR(x, read) GetVIAddr(VU, x, read, info) + +#define VU_ACCx_ADDR (uptr)&VU->ACC.UL[0] +#define VU_ACCy_ADDR (uptr)&VU->ACC.UL[1] +#define VU_ACCz_ADDR (uptr)&VU->ACC.UL[2] +#define VU_ACCw_ADDR (uptr)&VU->ACC.UL[3] + +#define _X_Y_Z_W ((( VU->code >> 21 ) & 0xF ) ) +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// Global Variables +//------------------------------------------------------------------ +static const __aligned16 int SSEmovMask[ 16 ][ 4 ] = +{ + { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } +}; + +static const __aligned16 u32 const_abs_table[16][4] = +{ + { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //0000 + { 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff }, //0001 + { 0xffffffff, 0xffffffff, 0x7fffffff, 0xffffffff }, //0010 + { 0xffffffff, 0xffffffff, 0x7fffffff, 0x7fffffff }, //0011 + { 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff }, //0100 + { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }, //0101 + { 0xffffffff, 0x7fffffff, 0x7fffffff, 0xffffffff }, //0110 + { 0xffffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //0111 + { 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000 + { 0x7fffffff, 0xffffffff, 0xffffffff, 0x7fffffff }, //1001 + { 0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff }, //1010 + { 0x7fffffff, 0xffffffff, 0x7fffffff, 0x7fffffff }, //1011 + { 0x7fffffff, 0x7fffffff, 0xffffffff, 0xffffffff }, //1100 + { 0x7fffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }, //1101 + { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0xffffffff }, //1110 + { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1111 +}; + +static const __aligned16 float recMult_float_to_int4[4] = { 16.0, 16.0, 16.0, 16.0 }; +static const __aligned16 float recMult_float_to_int12[4] = { 4096.0, 4096.0, 4096.0, 4096.0 }; +static const __aligned16 float recMult_float_to_int15[4] = { 32768.0, 32768.0, 32768.0, 32768.0 }; + +static const __aligned16 float recMult_int_to_float4[4] = { 0.0625f, 0.0625f, 0.0625f, 0.0625f }; +static const __aligned16 float recMult_int_to_float12[4] = { 0.000244140625, 0.000244140625, 0.000244140625, 0.000244140625 }; +static const __aligned16 float recMult_int_to_float15[4] = { 0.000030517578125, 0.000030517578125, 0.000030517578125, 
0.000030517578125 }; + +static const __aligned16 u32 VU_Underflow_Mask1[4] = {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; +static const __aligned16 u32 VU_Underflow_Mask2[4] = {0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff}; +static const __aligned16 u32 VU_Zero_Mask[4] = {0x00000000, 0x00000000, 0x00000000, 0x00000000}; +static const __aligned16 u32 VU_Zero_Helper_Mask[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; +static const __aligned16 u32 VU_Signed_Zero_Mask[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; +static const __aligned16 u32 VU_Pos_Infinity[4] = {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; +static const __aligned16 u32 VU_Neg_Infinity[4] = {0xff800000, 0xff800000, 0xff800000, 0xff800000}; +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// recUpdateFlags() - Computes the flags for the Upper Opcodes +// +// Note: Computes under/overflow flags if CHECK_VU_EXTRA_FLAGS is 1 +//------------------------------------------------------------------ +static __aligned16 u64 TEMPXMMData[2]; +void recUpdateFlags(VURegs * VU, int reg, int info) +{ + static u8 *pjmp, *pjmp2; + static u32 *pjmp32; + static u32 macaddr, stataddr, prevstataddr; + static int x86macflag, x86statflag, x86temp; + static int t1reg, t1regBoolean; + static const int flipMask[16] = {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15}; + + if( !(info & PROCESS_VU_UPDATEFLAGS) ) { + if (CHECK_VU_EXTRA_OVERFLOW) { + if (reg != EEREC_TEMP) vuFloat2(reg, EEREC_TEMP, _X_Y_Z_W); + else vuFloat_useEAX(info, reg, _X_Y_Z_W); + } + return; + } + + //Console.WriteLn ("recUpdateFlags"); + + macaddr = VU_VI_ADDR(REG_MAC_FLAG, 0); + stataddr = VU_VI_ADDR(REG_STATUS_FLAG, 0); // write address + prevstataddr = VU_VI_ADDR(REG_STATUS_FLAG, 2); // previous address + + if( stataddr == 0 ) stataddr = prevstataddr; + if( macaddr == 0 ) { + Console.WriteLn( "VU ALLOCATION WARNING: Using Mac Flag Previous Address!" ); + macaddr = VU_VI_ADDR(REG_MAC_FLAG, 2); + } + + x86macflag = ALLOCTEMPX86(0); + x86statflag = ALLOCTEMPX86(0); + + if (reg == EEREC_TEMP) { + t1reg = _vuGetTempXMMreg(info); + if (t1reg < 0) { + //Console.WriteLn( "VU ALLOCATION ERROR: Temp reg can't be allocated!!!!" ); + t1reg = (reg == 0) ? 
1 : 0; // Make t1reg != reg + SSE_MOVAPS_XMM_to_M128( (uptr)TEMPXMMData, t1reg ); // Backup data to temp address + t1regBoolean = 1; + } + else t1regBoolean = 0; + } + else { + t1reg = EEREC_TEMP; + t1regBoolean = 2; + } + + SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip wzyx to xyzw + MOV32MtoR(x86statflag, prevstataddr); // Load the previous status into x86statflag + AND16ItoR(x86statflag, 0xff0); // Keep Sticky and D/I flags + + + if (CHECK_VU_EXTRA_FLAGS) { // Checks all flags + + x86temp = ALLOCTEMPX86(0); + + //-------------------------Check for Overflow flags------------------------------ + + //SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg + //SSE_CMPUNORDPS_XMM_to_XMM(t1reg, reg); // If reg == NaN then set Vector to 0xFFFFFFFF + + //SSE_MOVAPS_XMM_to_XMM(t1reg, reg); + //SSE_MINPS_M128_to_XMM(t1reg, (uptr)g_maxvals); + //SSE_MAXPS_M128_to_XMM(t1reg, (uptr)g_minvals); + //SSE_CMPNEPS_XMM_to_XMM(t1reg, reg); // If they're not equal, then overflow has occurred + + SSE_MOVAPS_XMM_to_XMM(t1reg, reg); + SSE_ANDPS_M128_to_XMM(t1reg, (uptr)VU_Zero_Helper_Mask); + SSE_CMPEQPS_M128_to_XMM(t1reg, (uptr)VU_Pos_Infinity); // If infinity, then overflow has occurred (NaNs don't report as overflow) + + SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the previous calculation + + AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being modified) + pjmp = JZ8(0); // Skip if none are + OR16ItoR(x86statflag, 0x208); // OS, O flags + SHL16ItoR(x86macflag, 12); + if (_XYZW_SS) pjmp32 = JMP32(0); // Skip Underflow Check + x86SetJ8(pjmp); + + //-------------------------Check for Underflow flags------------------------------ + + SSE_MOVAPS_XMM_to_XMM(t1reg, reg); // t1reg <- reg + + SSE_ANDPS_M128_to_XMM(t1reg, (uptr)&VU_Underflow_Mask1[ 0 ]); + SSE_CMPEQPS_M128_to_XMM(t1reg, (uptr)&VU_Zero_Mask[ 0 ]); // If (t1reg == zero exponent) then set Vector to 0xFFFFFFFF + + SSE_ANDPS_XMM_to_XMM(t1reg, reg); + SSE_ANDPS_M128_to_XMM(t1reg, (uptr)&VU_Underflow_Mask2[ 0 ]); + SSE_CMPNEPS_M128_to_XMM(t1reg, (uptr)&VU_Zero_Mask[ 0 ]); // If (t1reg != zero mantissa) then set Vector to 0xFFFFFFFF + + SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation + + AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Has Underflowed" bits from the previous calculation + pjmp = JZ8(0); // Skip if none are + OR16ItoR(x86statflag, 0x104); // US, U flags + SHL16ItoR(EAX, 8); + OR32RtoR(x86macflag, EAX); + x86SetJ8(pjmp); + + //-------------------------Optional Code: Denormals Are Zero------------------------------ + if (CHECK_VU_UNDERFLOW) { // Sets underflow/denormals to zero + SSE_ANDNPS_XMM_to_XMM(t1reg, reg); // t1reg = !t1reg & reg (t1reg = denormals are positive zero) + VU_MERGE_REGS_SAFE(t1reg, reg, (15 - flipMask[_X_Y_Z_W])); // Send t1reg the vectors that shouldn't be modified (since reg was flipped, we need a mask to get the unmodified vectors) + // Now we have Denormals are Positive Zero in t1reg; the next two lines take Signed Zero into account + SSE_ANDPS_M128_to_XMM(reg, (uptr)&VU_Signed_Zero_Mask[ 0 ]); // Only keep the sign bit for each vector + SSE_ORPS_XMM_to_XMM(reg, t1reg); // Denormals are Signed Zero, and unmodified vectors stay the same! 
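+			// Net effect of this Denormals-Are-Zero block, per component selected by _X_Y_Z_W:
+			//   denormal input (zero exponent, nonzero mantissa) -> flushed to +/-0.0 (sign preserved)
+			//   every other input                                -> left unchanged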
+ } + + if (_XYZW_SS) x86SetJ32(pjmp32); // If we skipped the Underflow Flag Checking (when we had an Overflow), return here + + vuFloat2(reg, t1reg, flipMask[_X_Y_Z_W]); // Clamp overflowed vectors that were modified (remember reg's vectors have been flipped, so have to use a flipmask) + + //-------------------------Check for Signed flags------------------------------ + + // The following code makes sure the Signed Bit isn't set with Negative Zero + SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg + SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero + SSE_MOVMSKPS_XMM_to_R32(x86temp, t1reg); // Used for Zero Flag Calculation + SSE_ANDNPS_XMM_to_XMM(t1reg, reg); + + SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg + + AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation + pjmp = JZ8(0); // Skip if none are + OR16ItoR(x86statflag, 0x82); // SS, S flags + SHL16ItoR(EAX, 4); + OR32RtoR(x86macflag, EAX); + if (_XYZW_SS) pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking + x86SetJ8(pjmp); + + //-------------------------Check for Zero flags------------------------------ + + AND16ItoR(x86temp, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation + pjmp = JZ8(0); // Skip if none are + OR16ItoR(x86statflag, 0x41); // ZS, Z flags + OR32RtoR(x86macflag, x86temp); + x86SetJ8(pjmp); + + _freeX86reg(x86temp); + } + else { // Only Checks for Sign and Zero Flags + + vuFloat2(reg, t1reg, flipMask[_X_Y_Z_W]); // Clamp overflowed vectors that were modified (remember reg's vectors have been flipped, so have to use a flipmask) + + //-------------------------Check for Signed flags------------------------------ + + // The following code makes sure the Signed Bit isn't set with Negative Zero + SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg + SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero + SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Used for Zero Flag Calculation + SSE_ANDNPS_XMM_to_XMM(t1reg, reg); + + SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the t1reg + + AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation + pjmp = JZ8(0); // Skip if none are + OR16ItoR(x86statflag, 0x82); // SS, S flags + SHL16ItoR(x86macflag, 4); + if (_XYZW_SS) pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking + x86SetJ8(pjmp); + + //-------------------------Check for Zero flags------------------------------ + + AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation + pjmp = JZ8(0); // Skip if none are + OR16ItoR(x86statflag, 0x41); // ZS, Z flags + OR32RtoR(x86macflag, EAX); + x86SetJ8(pjmp); + } + //-------------------------Finally: Send the Flags to the Mac Flag Address------------------------------ + + if (_XYZW_SS) x86SetJ8(pjmp2); // If we skipped the Zero Flag Checking, return here + + if (t1regBoolean == 2) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip back reg to wzyx (have to do this because reg != EEREC_TEMP) + else if (t1regBoolean == 1) SSE_MOVAPS_M128_to_XMM( t1reg, (uptr)TEMPXMMData ); // Restore data from temp address + else _freeXMMreg(t1reg); // Free temp reg + + MOV16RtoM(macaddr, x86macflag); + MOV16RtoM(stataddr, x86statflag); + + _freeX86reg(x86macflag); + _freeX86reg(x86statflag); +} +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// Custom VU ADD/SUB routines 
by Nneeve +// +// Note: See FPU_ADD_SUB() for more info on what this is doing. +//------------------------------------------------------------------ +static __aligned16 u32 VU_addsuband[2][4]; +static __aligned16 u32 VU_addsub_reg[2][4]; + +static u32 tempECX; + +void VU_ADD_SUB(u32 regd, u32 regt, int is_sub, int info) +{ + u8 *localptr[4][8]; + + MOV32RtoM((uptr)&tempECX, ECX); + + int temp1 = ECX; //receives regd + int temp2 = ALLOCTEMPX86(0); + + if (temp2 == ECX) + { + temp2 = ALLOCTEMPX86(0); + _freeX86reg(ECX); + } + + SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd); + SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt); + + SSE2_PCMPEQB_XMM_to_XMM(regd, regd); + SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[0][0], regd); + SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[1][0], regd); + SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); + + SSE2_PSLLD_I8_to_XMM(regd, 1); + SSE2_PSLLD_I8_to_XMM(regt, 1); + + SSE2_PSRLD_I8_to_XMM(regd, 24); + SSE2_PSRLD_I8_to_XMM(regt, 24); + + SSE2_PSUBD_XMM_to_XMM(regd, regt); + +#define PERFORM(i) \ + \ + SSE_PEXTRW_XMM_to_R32(temp1, regd, i*2); \ + MOVSX32R16toR(temp1, temp1); \ + CMP32ItoR(temp1, 25);\ + localptr[i][0] = JGE8(0);\ + CMP32ItoR(temp1, 0);\ + localptr[i][1] = JG8(0);\ + localptr[i][2] = JE8(0);\ + CMP32ItoR(temp1, -25);\ + localptr[i][3] = JLE8(0);\ + \ + NEG32R(temp1); \ + DEC32R(temp1);\ + MOV32ItoR(temp2, 0xffffffff); \ + SHL32CLtoR(temp2); \ + MOV32RtoM((uptr)&VU_addsuband[0][i], temp2);\ + localptr[i][4] = JMP8(0);\ + \ + x86SetJ8(localptr[i][0]);\ + MOV32ItoM((uptr)&VU_addsuband[1][i], 0x80000000);\ + localptr[i][5] = JMP8(0);\ + \ + x86SetJ8(localptr[i][1]);\ + DEC32R(temp1);\ + MOV32ItoR(temp2, 0xffffffff);\ + SHL32CLtoR(temp2); \ + MOV32RtoM((uptr)&VU_addsuband[1][i], temp2);\ + localptr[i][6] = JMP8(0);\ + \ + x86SetJ8(localptr[i][3]);\ + MOV32ItoM((uptr)&VU_addsuband[0][i], 0x80000000);\ + localptr[i][7] = JMP8(0);\ + \ + x86SetJ8(localptr[i][2]);\ + \ + x86SetJ8(localptr[i][4]);\ + x86SetJ8(localptr[i][5]);\ + x86SetJ8(localptr[i][6]);\ + x86SetJ8(localptr[i][7]); + + PERFORM(0); + PERFORM(1); + PERFORM(2); + PERFORM(3); +#undef PERFORM + + SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); + SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); + + SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsuband[0][0]); + SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsuband[1][0]); + + if (is_sub) SSE_SUBPS_XMM_to_XMM(regd, regt); + else SSE_ADDPS_XMM_to_XMM(regd, regt); + + SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); + + _freeX86reg(temp2); + + MOV32MtoR(ECX, (uptr)&tempECX); +} + +void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info) +{ + u8 *localptr[8]; + u32 addrt = regt; //for case is_mem + + MOV32RtoM((uptr)&tempECX, ECX); + + int temp1 = ECX; //receives regd + int temp2 = ALLOCTEMPX86(0); + + if (temp2 == ECX) + { + temp2 = ALLOCTEMPX86(0); + _freeX86reg(ECX); + } + + SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd); + if (!is_mem) SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt); + + SSE2_MOVD_XMM_to_R(temp1, regd); + SHR32ItoR(temp1, 23); + + if (is_mem) { + MOV32MtoR(temp2, addrt); + MOV32RtoM((uptr)&VU_addsub_reg[1][0], temp2); + SHR32ItoR(temp2, 23); + } + else { + SSE2_MOVD_XMM_to_R(temp2, regt); + SHR32ItoR(temp2, 23); + } + + AND32ItoR(temp1, 0xff); + AND32ItoR(temp2, 0xff); + + SUB32RtoR(temp1, temp2); //temp1 = exponent difference + + CMP32ItoR(temp1, 25); + localptr[0] = JGE8(0); + CMP32ItoR(temp1, 0); + localptr[1] = JG8(0); + localptr[2] = JE8(0); + 
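+	// Dispatch on the exponent difference (temp1 = exp(regd) - exp(regt)):
+	//   temp1 >= 25     : regt is entirely below regd's precision -- keep only regt's sign bit
+	//   0 < temp1 < 25  : mask off the low (temp1 - 1) mantissa bits of regt
+	//   temp1 == 0      : equal exponents, no masking needed
+	//   temp1 <= -25    : keep only regd's sign bit
+	//   -25 < temp1 < 0 : mask off the low (-temp1 - 1) bits of regd instead
+	// In short: bits of the smaller operand that fall below the larger operand's
+	// precision are discarded before the actual add/sub.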
CMP32ItoR(temp1, -25); + localptr[3] = JLE8(0); + + NEG32R(temp1); + DEC32R(temp1); + MOV32ItoR(temp2, 0xffffffff); + SHL32CLtoR(temp2); + SSE2_PCMPEQB_XMM_to_XMM(regd, regd); + if (is_mem) { + SSE_PINSRW_R32_to_XMM(regd, temp2, 0); + SHR32ItoR(temp2, 16); + SSE_PINSRW_R32_to_XMM(regd, temp2, 1); + } + else { + SSE2_MOVD_R_to_XMM(regt, temp2); + SSE_MOVSS_XMM_to_XMM(regd, regt); + SSE2_PCMPEQB_XMM_to_XMM(regt, regt); + } + localptr[4] = JMP8(0); + + x86SetJ8(localptr[0]); + MOV32ItoR(temp2, 0x80000000); + if (is_mem) + AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2); + else { + SSE2_PCMPEQB_XMM_to_XMM(regt, regt); + SSE2_MOVD_R_to_XMM(regd, temp2); + SSE_MOVSS_XMM_to_XMM(regt, regd); + } + SSE2_PCMPEQB_XMM_to_XMM(regd, regd); + localptr[5] = JMP8(0); + + x86SetJ8(localptr[1]); + DEC32R(temp1); + MOV32ItoR(temp2, 0xffffffff); + SHL32CLtoR(temp2); + if (is_mem) + AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2); + else { + SSE2_PCMPEQB_XMM_to_XMM(regt, regt); + SSE2_MOVD_R_to_XMM(regd, temp2); + SSE_MOVSS_XMM_to_XMM(regt, regd); + } + SSE2_PCMPEQB_XMM_to_XMM(regd, regd); + localptr[6] = JMP8(0); + + x86SetJ8(localptr[3]); + MOV32ItoR(temp2, 0x80000000); + SSE2_PCMPEQB_XMM_to_XMM(regd, regd); + if (is_mem) { + SSE_PINSRW_R32_to_XMM(regd, temp2, 0); + SHR32ItoR(temp2, 16); + SSE_PINSRW_R32_to_XMM(regd, temp2, 1); + } + else { + SSE2_MOVD_R_to_XMM(regt, temp2); + SSE_MOVSS_XMM_to_XMM(regd, regt); + SSE2_PCMPEQB_XMM_to_XMM(regt, regt); + } + localptr[7] = JMP8(0); + + x86SetJ8(localptr[2]); + x86SetJ8(localptr[4]); + x86SetJ8(localptr[5]); + x86SetJ8(localptr[6]); + x86SetJ8(localptr[7]); + + if (is_mem) + { + SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask + + if (is_sub) SSE_SUBSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]); + else SSE_ADDSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]); + } + else + { + SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask + SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); //regt contains mask + + if (is_sub) SSE_SUBSS_XMM_to_XMM(regd, regt); + else SSE_ADDSS_XMM_to_XMM(regd, regt); + + SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); + } + + _freeX86reg(temp2); + + MOV32MtoR(ECX, (uptr)&tempECX); +} + +void SSE_ADDPS_XMM_to_XMM_custom(int info, int regd, int regt) { + if (CHECK_VUADDSUBHACK) { + VU_ADD_SUB(regd, regt, 0, info); + } + else SSE_ADDPS_XMM_to_XMM(regd, regt); +} +void SSE_SUBPS_XMM_to_XMM_custom(int info, int regd, int regt) { + if (CHECK_VUADDSUBHACK) { + VU_ADD_SUB(regd, regt, 1, info); + } + else SSE_SUBPS_XMM_to_XMM(regd, regt); +} +void SSE_ADDSS_XMM_to_XMM_custom(int info, int regd, int regt) { + if (CHECK_VUADDSUBHACK) { + VU_ADD_SUB_SS(regd, regt, 0, 0, info); + } + else SSE_ADDSS_XMM_to_XMM(regd, regt); +} +void SSE_SUBSS_XMM_to_XMM_custom(int info, int regd, int regt) { + if (CHECK_VUADDSUBHACK) { + VU_ADD_SUB_SS(regd, regt, 1, 0, info); + } + else SSE_SUBSS_XMM_to_XMM(regd, regt); +} +void SSE_ADDSS_M32_to_XMM_custom(int info, int regd, int regt) { + if (CHECK_VUADDSUBHACK) { + VU_ADD_SUB_SS(regd, regt, 0, 1, info); + } + else SSE_ADDSS_M32_to_XMM(regd, regt); +} +void SSE_SUBSS_M32_to_XMM_custom(int info, int regd, int regt) { + if (CHECK_VUADDSUBHACK) { + VU_ADD_SUB_SS(regd, regt, 1, 1, info); + } + else SSE_SUBSS_M32_to_XMM(regd, regt); +} +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// *VU Upper Instructions!* +// +// Note: * = Checked for errors by cottonvibes 
+//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// ABS* +//------------------------------------------------------------------ +void recVUMI_ABS(VURegs *VU, int info) +{ + //Console.WriteLn("recVUMI_ABS()"); + if ( (_Ft_ == 0) || (_X_Y_Z_W == 0) ) return; + + if ((_X_Y_Z_W == 0x8) || (_X_Y_Z_W == 0xf)) { + VU_MERGE_REGS(EEREC_T, EEREC_S); + SSE_ANDPS_M128_to_XMM(EEREC_T, (uptr)&const_abs_table[ _X_Y_Z_W ][ 0 ] ); + } + else { // Use a temp reg because VU_MERGE_REGS() modifies source reg! + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_abs_table[ _X_Y_Z_W ][ 0 ] ); + VU_MERGE_REGS(EEREC_T, EEREC_TEMP); + } +} +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// ADD*, ADD_iq*, ADD_xyzw* +//------------------------------------------------------------------ +static const __aligned16 float s_two[4] = {0,0,0,2}; +void recVUMI_ADD(VURegs *VU, int info) +{ + //Console.WriteLn("recVUMI_ADD()"); + if ( _X_Y_Z_W == 0 ) goto flagUpdate; // Don't do anything and just clear flags + if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); + + if ( _Fs_ == 0 && _Ft_ == 0 ) { // if adding VF00 with VF00, then the result is always 0,0,0,2 + if ( _X_Y_Z_W != 0xf ) { + SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)s_two); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)s_two); + } + else { + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); + } + if( _X_Y_Z_W == 8 ) { // If only adding x, then we can do a Scalar Add + if (EEREC_D == EEREC_S) SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T); + else if (EEREC_D == EEREC_T) SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_S); + else { + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T); + } + } + else if (_X_Y_Z_W != 0xf) { // If xyzw != 1111, then we have to use a temp reg + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else { // All xyzw being modified (xyzw == 1111) + if (EEREC_D == EEREC_S) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_T); + else if (EEREC_D == EEREC_T) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); + else { + SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_T); + } + } + } +flagUpdate: + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_ADD_iq(VURegs *VU, uptr addr, int info) +{ + //Console.WriteLn("recVUMI_ADD_iq()"); + if ( _X_Y_Z_W == 0 ) goto flagUpdate; + if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); + if (CHECK_VU_EXTRA_OVERFLOW) { + vuFloat3(addr); + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + } + + if ( _XYZW_SS ) { + if ( EEREC_D == EEREC_TEMP ) { + _vuFlipRegSS(VU, EEREC_S); + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_ADDSS_M32_to_XMM(EEREC_D, addr); + _vuFlipRegSS(VU, EEREC_S); + _vuFlipRegSS(VU, EEREC_D); // have to flip over EEREC_D for computing flags! 
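+			// (_vuFlipRegSS shuffles the one selected component into the x slot so the
+			// scalar SS instructions can reach it; the second flip restores the
+			// original component order.)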
+ } + else if ( EEREC_D == EEREC_S ) { + _vuFlipRegSS(VU, EEREC_D); + SSE_ADDSS_M32_to_XMM(EEREC_D, addr); + _vuFlipRegSS(VU, EEREC_D); + } + else { + if ( _X ) { + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_ADDSS_M32_to_XMM_custom(info, EEREC_D, addr); + } + else { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + SSE_ADDPS_XMM_to_XMM_custom(info, EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + } + } + else { + if ( (_X_Y_Z_W != 0xf) || (EEREC_D == EEREC_S) || (EEREC_D == EEREC_TEMP) ) { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + } + + if (_X_Y_Z_W != 0xf) { + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else { + if ( EEREC_D == EEREC_TEMP ) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); + else if ( EEREC_D == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + else { + SSE_MOVSS_M32_to_XMM(EEREC_D, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_D, EEREC_D, 0x00); + SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); + } + } + } +flagUpdate: + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_ADD_xyzw(VURegs *VU, int xyzw, int info) +{ + //Console.WriteLn("recVUMI_ADD_xyzw()"); + if ( _X_Y_Z_W == 0 ) goto flagUpdate; + if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); + } + + if ( _Ft_ == 0 && xyzw < 3 ) { // just move since adding zero + if ( _X_Y_Z_W == 0x8 ) { VU_MERGE_REGS(EEREC_D, EEREC_S); } + else if ( _X_Y_Z_W != 0xf ) { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); + } + else if ( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP) ) { + if ( xyzw == 0 ) { + if ( EEREC_D == EEREC_T ) SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_S); + else { + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_T); + } + } + else { + _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + } + else if( _Fs_ == 0 && !_W ) { // just move + _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else { + if ( _X_Y_Z_W != 0xf ) { + _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else { + if( EEREC_D == EEREC_TEMP ) { _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); } + else if( EEREC_D == EEREC_S ) { _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); } + else { _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw); SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); } + } + } +flagUpdate: + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_ADDi(VURegs *VU, int info) { recVUMI_ADD_iq(VU, VU_VI_ADDR(REG_I, 1), info); } +void recVUMI_ADDq(VURegs *VU, int info) { recVUMI_ADD_iq(VU, VU_REGQ_ADDR, info); } +void recVUMI_ADDx(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 0, info); } +void recVUMI_ADDy(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 1, info); } +void recVUMI_ADDz(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 2, info); } +void recVUMI_ADDw(VURegs *VU, int info) { recVUMI_ADD_xyzw(VU, 3, info); } +//------------------------------------------------------------------ + + 
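+//------------------------------------------------------------------
+// Reference sketch of the dest-mask merge used throughout this file:
+// results are computed into a temp reg, then VU_MERGE_REGS() copies only
+// the components selected by _X_Y_Z_W (x = bit 3 ... w = bit 0) into the
+// destination. The hypothetical helper below is illustrative only and is
+// never called; the emitter does the equivalent with the SSEmovMask table
+// above.
+//------------------------------------------------------------------
+#if 0
+static void vuMergeRef(float dest[4], const float temp[4], int xyzw)
+{
+	for (int i = 0; i < 4; i++) {
+		if (xyzw & (8 >> i)) dest[i] = temp[i]; // copy x/y/z/w only when its mask bit is set
+	}
+}
+#endif
+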
+//------------------------------------------------------------------ +// ADDA*, ADDA_iq*, ADDA_xyzw* +//------------------------------------------------------------------ +void recVUMI_ADDA(VURegs *VU, int info) +{ + //Console.WriteLn("recVUMI_ADDA()"); + if ( _X_Y_Z_W == 0 ) goto flagUpdate; + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); + } + + if( _X_Y_Z_W == 8 ) { + if (EEREC_ACC == EEREC_S) SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_T); // Can this case happen? (cottonvibes) + else if (EEREC_ACC == EEREC_T) SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_S); // Can this case happen? + else { + SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); + SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_T); + } + } + else if (_X_Y_Z_W != 0xf) { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + + VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); + } + else { + if( EEREC_ACC == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_T); // Can this case happen? + else if( EEREC_ACC == EEREC_T ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_S); // Can this case happen? + else { + SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S); + SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_T); + } + } +flagUpdate: + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_ADDA_iq(VURegs *VU, uptr addr, int info) +{ + //Console.WriteLn("recVUMI_ADDA_iq()"); + if ( _X_Y_Z_W == 0 ) goto flagUpdate; + if (CHECK_VU_EXTRA_OVERFLOW) { + vuFloat3(addr); + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + } + + if( _XYZW_SS ) { + assert( EEREC_ACC != EEREC_TEMP ); + if( EEREC_ACC == EEREC_S ) { + _vuFlipRegSS(VU, EEREC_ACC); + SSE_ADDSS_M32_to_XMM(EEREC_ACC, addr); + _vuFlipRegSS(VU, EEREC_ACC); + } + else { + if( _X ) { + SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); + SSE_ADDSS_M32_to_XMM(EEREC_ACC, addr); + } + else { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); + } + } + } + else { + if( _X_Y_Z_W != 0xf || EEREC_ACC == EEREC_S ) { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + } + + if (_X_Y_Z_W != 0xf) { + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); + } + else { + if( EEREC_ACC == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); + else { + SSE_MOVSS_M32_to_XMM(EEREC_ACC, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_ACC, EEREC_ACC, 0x00); + SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_S); + } + } + } +flagUpdate: + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_ADDA_xyzw(VURegs *VU, int xyzw, int info) +{ + //Console.WriteLn("recVUMI_ADDA_xyzw()"); + if ( _X_Y_Z_W == 0 ) goto flagUpdate; + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); + } + + if( _X_Y_Z_W == 8 ) { + assert( EEREC_ACC != EEREC_T ); + if( xyzw == 0 ) { + SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); + SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_T); + } + else { + _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); + if( _Fs_ == 0 ) { + SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); + } + else { + SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); + SSE_ADDSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); + } + } + } + else { + if( _X_Y_Z_W != 0xf || EEREC_ACC == EEREC_S ) + _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); + + if (_X_Y_Z_W != 0xf) { + 
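+			// Partial dest mask: sum into EEREC_TEMP first, then merge only the
+			// selected components into the accumulator.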
SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); + } + else { + if( EEREC_ACC == EEREC_S ) SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); + else { + _unpackVF_xyzw(EEREC_ACC, EEREC_T, xyzw); + SSE_ADDPS_XMM_to_XMM(EEREC_ACC, EEREC_S); + } + } + } +flagUpdate: + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_ADDAi(VURegs *VU, int info) { recVUMI_ADDA_iq(VU, VU_VI_ADDR(REG_I, 1), info); } +void recVUMI_ADDAq(VURegs *VU, int info) { recVUMI_ADDA_iq(VU, VU_REGQ_ADDR, info); } +void recVUMI_ADDAx(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 0, info); } +void recVUMI_ADDAy(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 1, info); } +void recVUMI_ADDAz(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 2, info); } +void recVUMI_ADDAw(VURegs *VU, int info) { recVUMI_ADDA_xyzw(VU, 3, info); } +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// SUB*, SUB_iq*, SUB_xyzw* +//------------------------------------------------------------------ +void recVUMI_SUB(VURegs *VU, int info) +{ + //Console.WriteLn("recVUMI_SUB()"); + if ( _X_Y_Z_W == 0 ) goto flagUpdate; + if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); + + if( EEREC_S == EEREC_T ) { + if (_X_Y_Z_W != 0xf) SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)&SSEmovMask[15-_X_Y_Z_W][0]); + else SSE_XORPS_XMM_to_XMM(EEREC_D, EEREC_D); + } + else if( _X_Y_Z_W == 8 ) { + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); + } + if (EEREC_D == EEREC_S) { + if (_Ft_) SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_T); + } + else if (EEREC_D == EEREC_T) { + if (_Ft_) { + SSE_MOVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_SUBSS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + else SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + } + else { + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + if (_Ft_) SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_T); + } + } + else { + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); + } + if (_X_Y_Z_W != 0xf) { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + if( ( _Ft_ > 0 ) || _W ) SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else { + if (EEREC_D == EEREC_S) SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_T); + else if (EEREC_D == EEREC_T) { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + else { + SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_T); + } + } + } +flagUpdate: + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_SUB_iq(VURegs *VU, uptr addr, int info) +{ + //Console.WriteLn("recVUMI_SUB_iq()"); + if ( _X_Y_Z_W == 0 ) goto flagUpdate; + if (CHECK_VU_EXTRA_OVERFLOW) { + vuFloat3(addr); + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + } + if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); + + if( _XYZW_SS ) { + if( EEREC_D == EEREC_TEMP ) { + _vuFlipRegSS(VU, EEREC_S); + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_SUBSS_M32_to_XMM(EEREC_D, addr); + _vuFlipRegSS(VU, EEREC_S); + _vuFlipRegSS(VU, EEREC_D); + } + else if( EEREC_D == EEREC_S ) { + _vuFlipRegSS(VU, EEREC_D); + SSE_SUBSS_M32_to_XMM(EEREC_D, addr); + _vuFlipRegSS(VU, EEREC_D); + } + else { + if( 
_X ) { + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_SUBSS_M32_to_XMM(EEREC_D, addr); + } + else { + _vuMoveSS(VU, EEREC_TEMP, EEREC_S); + _vuFlipRegSS(VU, EEREC_D); + SSE_SUBSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + _vuFlipRegSS(VU, EEREC_D); + } + } + } + else { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + + if (_X_Y_Z_W != 0xf) { + int t1reg = _vuGetTempXMMreg(info); + + if( t1reg >= 0 ) { + SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S); + SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP); + + VU_MERGE_REGS(EEREC_D, t1reg); + _freeXMMreg(t1reg); + } + else { + // negate + SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + } + else { + if( EEREC_D == EEREC_TEMP ) { + SSE_XORPS_M128_to_XMM(EEREC_D, (uptr)&const_clip[4]); + SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); + } + else { + SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + } + } +flagUpdate: + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_SUB_xyzw(VURegs *VU, int xyzw, int info) +{ + //Console.WriteLn("recVUMI_SUB_xyzw()"); + if ( _X_Y_Z_W == 0 ) goto flagUpdate; + if ( !_Fd_ ) info = (info & ~PROCESS_EE_SET_D(0xf)) | PROCESS_EE_SET_D(EEREC_TEMP); + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); + } + + if ( _X_Y_Z_W == 8 ) { + if ( (xyzw == 0) && (_Ft_ == _Fs_) ) { + SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)&SSEmovMask[7][0]); + } + else if ( EEREC_D == EEREC_TEMP ) { + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + if ( (_Ft_ > 0) || (xyzw == 3) ) { + _vuFlipRegSS_xyzw(EEREC_T, xyzw); + SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_T); + _vuFlipRegSS_xyzw(EEREC_T, xyzw); + } + } + else { + if ( (_Ft_ > 0) || (xyzw == 3) ) { + _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_SUBSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + else SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + } + } + else { + _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); + + if (_X_Y_Z_W != 0xf) { + int t1reg = _vuGetTempXMMreg(info); + + if( t1reg >= 0 ) { + SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S); + SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP); + + VU_MERGE_REGS(EEREC_D, t1reg); + _freeXMMreg(t1reg); + } + else { + // negate + SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + } + else { + if( EEREC_D == EEREC_TEMP ) { + SSE_XORPS_M128_to_XMM(EEREC_D, (uptr)&const_clip[4]); + SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_S); + } + else { + SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_SUBPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + } + } +flagUpdate: + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_SUBi(VURegs *VU, int info) { recVUMI_SUB_iq(VU, VU_VI_ADDR(REG_I, 1), info); } +void recVUMI_SUBq(VURegs *VU, int info) { recVUMI_SUB_iq(VU, VU_REGQ_ADDR, info); } +void recVUMI_SUBx(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 0, info); } +void recVUMI_SUBy(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 1, info); } +void recVUMI_SUBz(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 2, info); } +void recVUMI_SUBw(VURegs *VU, int info) { recVUMI_SUB_xyzw(VU, 3, info); } +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// SUBA*, SUBA_iq, SUBA_xyzw 
+//------------------------------------------------------------------ +void recVUMI_SUBA(VURegs *VU, int info) +{ + //Console.WriteLn("recVUMI_SUBA()"); + if ( _X_Y_Z_W == 0 ) goto flagUpdate; + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); + } + + if( EEREC_S == EEREC_T ) { + if (_X_Y_Z_W != 0xf) SSE_ANDPS_M128_to_XMM(EEREC_ACC, (uptr)&SSEmovMask[15-_X_Y_Z_W][0]); + else SSE_XORPS_XMM_to_XMM(EEREC_ACC, EEREC_ACC); + } + else if( _X_Y_Z_W == 8 ) { + if (EEREC_ACC == EEREC_S) SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_T); + else if (EEREC_ACC == EEREC_T) { + SSE_MOVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_SUBSS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); + } + else { + SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); + SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_T); + } + } + else if (_X_Y_Z_W != 0xf) { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + + VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); + } + else { + if( EEREC_ACC == EEREC_S ) SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_T); + else if( EEREC_ACC == EEREC_T ) { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_SUBPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); + } + else { + SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S); + SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_T); + } + } +flagUpdate: + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_SUBA_iq(VURegs *VU, uptr addr, int info) +{ + //Console.WriteLn ("recVUMI_SUBA_iq"); + if (CHECK_VU_EXTRA_OVERFLOW) { + vuFloat3(addr); + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + } + + if( _XYZW_SS ) { + if( EEREC_ACC == EEREC_S ) { + _vuFlipRegSS(VU, EEREC_ACC); + SSE_SUBSS_M32_to_XMM(EEREC_ACC, addr); + _vuFlipRegSS(VU, EEREC_ACC); + } + else { + if( _X ) { + SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); + SSE_SUBSS_M32_to_XMM(EEREC_ACC, addr); + } + else { + _vuMoveSS(VU, EEREC_TEMP, EEREC_S); + _vuFlipRegSS(VU, EEREC_ACC); + SSE_SUBSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); + _vuFlipRegSS(VU, EEREC_ACC); + } + } + } + else { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + + if (_X_Y_Z_W != 0xf) { + int t1reg = _vuGetTempXMMreg(info); + + if( t1reg >= 0 ) { + SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S); + SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP); + + VU_MERGE_REGS(EEREC_ACC, t1reg); + _freeXMMreg(t1reg); + } + else { + // negate + SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); + } + } + else { + SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S); + SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); + } + } + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_SUBA_xyzw(VURegs *VU, int xyzw, int info) +{ + //Console.WriteLn ("recVUMI_SUBA_xyzw"); + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); + } + + if( _X_Y_Z_W == 8 ) { + if( xyzw == 0 ) { + SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); + SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_T); + } + else { + _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); + SSE_MOVSS_XMM_to_XMM(EEREC_ACC, EEREC_S); + SSE_SUBSS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); + } + } + else { + _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); + + if (_X_Y_Z_W != 0xf) { + int t1reg = _vuGetTempXMMreg(info); + + 
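+			// If a temp XMM reg is free, do a straight SUBPS through it; otherwise
+			// fall back to negate-and-add below: XORing EEREC_TEMP with const_clip[4]
+			// flips its sign bits, turning the subtraction into an ADDPS and leaving
+			// EEREC_S untouched.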
if( t1reg >= 0 ) { + SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_S); + SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP); + + VU_MERGE_REGS(EEREC_ACC, t1reg); + _freeXMMreg(t1reg); + } + else { + // negate + SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_ACC, EEREC_TEMP); + } + } + else { + SSE_MOVAPS_XMM_to_XMM(EEREC_ACC, EEREC_S); + SSE_SUBPS_XMM_to_XMM(EEREC_ACC, EEREC_TEMP); + } + } + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_SUBAi(VURegs *VU, int info) { recVUMI_SUBA_iq(VU, VU_VI_ADDR(REG_I, 1), info); } +void recVUMI_SUBAq(VURegs *VU, int info) { recVUMI_SUBA_iq(VU, VU_REGQ_ADDR, info); } +void recVUMI_SUBAx(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 0, info); } +void recVUMI_SUBAy(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 1, info); } +void recVUMI_SUBAz(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 2, info); } +void recVUMI_SUBAw(VURegs *VU, int info) { recVUMI_SUBA_xyzw(VU, 3, info); } +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// MUL +//------------------------------------------------------------------ +void recVUMI_MUL_toD(VURegs *VU, int regd, int info) +{ + //Console.WriteLn ("recVUMI_MUL_toD"); + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); + } + + if (_X_Y_Z_W == 1 && (_Ft_ == 0 || _Fs_==0) ) { // W + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, _Ft_ ? EEREC_T : EEREC_S); + VU_MERGE_REGS(regd, EEREC_TEMP); + } + else if( _Fd_ == _Fs_ && _Fs_ == _Ft_ && _XYZW_SS ) { + _vuFlipRegSS(VU, EEREC_D); + SSE_MULSS_XMM_to_XMM(EEREC_D, EEREC_D); + _vuFlipRegSS(VU, EEREC_D); + } + else if( _X_Y_Z_W == 8 ) { + if (regd == EEREC_S) SSE_MULSS_XMM_to_XMM(regd, EEREC_T); + else if (regd == EEREC_T) SSE_MULSS_XMM_to_XMM(regd, EEREC_S); + else { + SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); + SSE_MULSS_XMM_to_XMM(regd, EEREC_T); + } + } + else if (_X_Y_Z_W != 0xf) { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + + VU_MERGE_REGS(regd, EEREC_TEMP); + } + else { + if (regd == EEREC_S) SSE_MULPS_XMM_to_XMM(regd, EEREC_T); + else if (regd == EEREC_T) SSE_MULPS_XMM_to_XMM(regd, EEREC_S); + else { + SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S); + SSE_MULPS_XMM_to_XMM(regd, EEREC_T); + } + } +} + +void recVUMI_MUL_iq_toD(VURegs *VU, uptr addr, int regd, int info) +{ + //Console.WriteLn ("recVUMI_MUL_iq_toD"); + if (CHECK_VU_EXTRA_OVERFLOW) { + vuFloat3(addr); + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + } + + if( _XYZW_SS ) { + if( regd == EEREC_TEMP ) { + _vuFlipRegSS(VU, EEREC_S); + SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); + SSE_MULSS_M32_to_XMM(regd, addr); + _vuFlipRegSS(VU, EEREC_S); + _vuFlipRegSS(VU, regd); + } + else if( regd == EEREC_S ) { + _vuFlipRegSS(VU, regd); + SSE_MULSS_M32_to_XMM(regd, addr); + _vuFlipRegSS(VU, regd); + } + else { + if( _X ) { + SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); + SSE_MULSS_M32_to_XMM(regd, addr); + } + else { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(regd, EEREC_TEMP); + } + } + } + else { + if( _X_Y_Z_W != 0xf || regd == EEREC_TEMP || regd == EEREC_S ) { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + } + + if (_X_Y_Z_W != 0xf) { + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + 
VU_MERGE_REGS(regd, EEREC_TEMP); + } + else { + if( regd == EEREC_TEMP ) SSE_MULPS_XMM_to_XMM(regd, EEREC_S); + else if (regd == EEREC_S) SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP); + else { + SSE_MOVSS_M32_to_XMM(regd, addr); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x00); + SSE_MULPS_XMM_to_XMM(regd, EEREC_S); + } + } + } +} + +void recVUMI_MUL_xyzw_toD(VURegs *VU, int xyzw, int regd, int info) +{ + //Console.WriteLn ("recVUMI_MUL_xyzw_toD"); + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); + } + if (_Fs_) { // This is needed for alot of games; so always clamp this operand + if (CHECK_VU_SIGN_OVERFLOW) vFloats4_useEAX[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set + else vFloats2[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set + } + if( _Ft_ == 0 ) { + if( xyzw < 3 ) { + if (_X_Y_Z_W != 0xf) { + SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); + VU_MERGE_REGS(regd, EEREC_TEMP); + } + else SSE_XORPS_XMM_to_XMM(regd, regd); + } + else { + assert(xyzw==3); + if (_X_Y_Z_W != 0xf) { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(regd, EEREC_TEMP); + } + else SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S); + } + } + else if( _X_Y_Z_W == 8 ) { + if( regd == EEREC_TEMP ) { + _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); + SSE_MULSS_XMM_to_XMM(regd, EEREC_S); + } + else { + if( xyzw == 0 ) { + if( regd == EEREC_T ) { + SSE_MULSS_XMM_to_XMM(regd, EEREC_S); + } + else { + SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); + SSE_MULSS_XMM_to_XMM(regd, EEREC_T); + } + } + else { + _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); + SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); + SSE_MULSS_XMM_to_XMM(regd, EEREC_TEMP); + } + } + } + else { + if( _X_Y_Z_W != 0xf || regd == EEREC_TEMP || regd == EEREC_S ) + _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); + + if (_X_Y_Z_W != 0xf) { + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(regd, EEREC_TEMP); + } + else { + if( regd == EEREC_TEMP ) SSE_MULPS_XMM_to_XMM(regd, EEREC_S); + else if (regd == EEREC_S) SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP); + else { + _unpackVF_xyzw(regd, EEREC_T, xyzw); + SSE_MULPS_XMM_to_XMM(regd, EEREC_S); + } + } + } +} + +void recVUMI_MUL(VURegs *VU, int info) +{ + //Console.WriteLn ("recVUMI_MUL"); + if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); + recVUMI_MUL_toD(VU, EEREC_D, info); + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_MUL_iq(VURegs *VU, int addr, int info) +{ + //Console.WriteLn ("recVUMI_MUL_iq"); + if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); + recVUMI_MUL_iq_toD(VU, addr, EEREC_D, info); + recUpdateFlags(VU, EEREC_D, info); + // spacefisherman needs overflow checking on MULi.z +} + +void recVUMI_MUL_xyzw(VURegs *VU, int xyzw, int info) +{ + //Console.WriteLn ("recVUMI_MUL_xyzw"); + if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); + recVUMI_MUL_xyzw_toD(VU, xyzw, EEREC_D, info); + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_MULi(VURegs *VU, int info) { recVUMI_MUL_iq(VU, VU_VI_ADDR(REG_I, 1), info); } +void recVUMI_MULq(VURegs *VU, int info) { recVUMI_MUL_iq(VU, VU_REGQ_ADDR, info); } +void recVUMI_MULx(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 0, info); } +void recVUMI_MULy(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 1, info); } +void recVUMI_MULz(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 2, info); } +void recVUMI_MULw(VURegs *VU, int info) { recVUMI_MUL_xyzw(VU, 3, info); } +//------------------------------------------------------------------ + + 
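For reference, the _iq paths above all share one dataflow: broadcast the scalar I/Q operand with MOVSS + SHUFPS(reg, reg, 0x00), do a packed multiply, then let VU_MERGE_REGS write back only the fields named by the dest mask (bit 3 = x/lane 0 ... bit 0 = w/lane 3). A rough equivalent with SSE intrinsics, where mul_broadcast_masked is a hypothetical helper and the scalar merge loop merely stands in for the shuffle-based merge the emitter actually generates:

#include <xmmintrin.h>

// Splat 'scalar' across all four lanes (the MOVSS + SHUFPS 0x00 pair),
// multiply packed, then keep the product only in the masked fields.
static __m128 mul_broadcast_masked(__m128 dest, __m128 vf_s, float scalar, int xyzw)
{
    __m128 prod = _mm_mul_ps(vf_s, _mm_set1_ps(scalar));
    float d[4], p[4];
    _mm_storeu_ps(d, dest);
    _mm_storeu_ps(p, prod);
    for (int i = 0; i < 4; ++i)        // bit 3 selects x (lane 0) ... bit 0 selects w (lane 3)
        if (xyzw & (8 >> i))
            d[i] = p[i];
    return _mm_loadu_ps(d);
}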
+//------------------------------------------------------------------ +// MULA +//------------------------------------------------------------------ +void recVUMI_MULA( VURegs *VU, int info ) +{ + //Console.WriteLn ("recVUMI_MULA"); + recVUMI_MUL_toD(VU, EEREC_ACC, info); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MULA_iq(VURegs *VU, int addr, int info) +{ + //Console.WriteLn ("recVUMI_MULA_iq"); + recVUMI_MUL_iq_toD(VU, addr, EEREC_ACC, info); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MULA_xyzw(VURegs *VU, int xyzw, int info) +{ + //Console.WriteLn ("recVUMI_MULA_xyzw"); + recVUMI_MUL_xyzw_toD(VU, xyzw, EEREC_ACC, info); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MULAi(VURegs *VU, int info) { recVUMI_MULA_iq(VU, VU_VI_ADDR(REG_I, 1), info); } +void recVUMI_MULAq(VURegs *VU, int info) { recVUMI_MULA_iq(VU, VU_REGQ_ADDR, info); } +void recVUMI_MULAx(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 0, info); } +void recVUMI_MULAy(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 1, info); } +void recVUMI_MULAz(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 2, info); } +void recVUMI_MULAw(VURegs *VU, int info) { recVUMI_MULA_xyzw(VU, 3, info); } +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// MADD +//------------------------------------------------------------------ +void recVUMI_MADD_toD(VURegs *VU, int regd, int info) +{ + //Console.WriteLn ("recVUMI_MADD_toD"); + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); + vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W ); + } + + + if( _X_Y_Z_W == 8 ) { + if( regd == EEREC_ACC ) { + SSE_MOVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); } + SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP); + } + else if (regd == EEREC_T) { + SSE_MULSS_XMM_to_XMM(regd, EEREC_S); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); } + SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC); + } + else if (regd == EEREC_S) { + SSE_MULSS_XMM_to_XMM(regd, EEREC_T); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); } + SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC); + } + else { + SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); + SSE_MULSS_XMM_to_XMM(regd, EEREC_T); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); } + SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC); + } + } + else if (_X_Y_Z_W != 0xf) { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); + + VU_MERGE_REGS(regd, EEREC_TEMP); + } + else { + if( regd == EEREC_ACC ) { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } + SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP); + } + else if (regd == EEREC_T) { + SSE_MULPS_XMM_to_XMM(regd, EEREC_S); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } + SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); + } + else if (regd == EEREC_S) { + SSE_MULPS_XMM_to_XMM(regd, EEREC_T); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } + SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); + } + else { + SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S); + 
SSE_MULPS_XMM_to_XMM(regd, EEREC_T); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } + SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); + } + } +} + +void recVUMI_MADD_iq_toD(VURegs *VU, uptr addr, int regd, int info) +{ + //Console.WriteLn ("recVUMI_MADD_iq_toD"); + if (CHECK_VU_EXTRA_OVERFLOW) { + vuFloat3(addr); + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W ); + } + + if( _X_Y_Z_W == 8 ) { + if( _Fs_ == 0 ) { + // do nothing if regd == ACC (ACCx <= ACCx + 0.0 * *addr) + if( regd != EEREC_ACC ) { + SSE_MOVSS_XMM_to_XMM(regd, EEREC_ACC); + } + return; + } + + if( regd == EEREC_ACC ) { + assert( EEREC_TEMP < iREGCNT_XMM ); + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); } + SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP); + } + else if( regd == EEREC_S ) { + SSE_MULSS_M32_to_XMM(regd, addr); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } + SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC); + } + else { + SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); + SSE_MULSS_M32_to_XMM(regd, addr); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } + SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC); + } + } + else { + if( _Fs_ == 0 ) { + if( regd == EEREC_ACC ) { // ACCxyz is unchanged, ACCw <= ACCw + *addr + if( _W ) { // if _W is zero, do nothing + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); // { *addr, 0, 0, 0 } + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x27); // { 0, 0, 0, *addr } + SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP); // { ACCx, ACCy, ACCz, ACCw + *addr } + } + } + else { // DESTxyz <= ACCxyz, DESTw <= ACCw + *addr + if( _W ) { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); // { *addr, 0, 0, 0 } + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x27); // { 0, 0, 0, *addr } + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); // { ACCx, ACCy, ACCz, ACCw + *addr } + } + else SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); + VU_MERGE_REGS(regd, EEREC_TEMP); + } + + return; + } + + if( _X_Y_Z_W != 0xf || regd == EEREC_ACC || regd == EEREC_TEMP || regd == EEREC_S ) { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + } + + if (_X_Y_Z_W != 0xf) { + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); + + VU_MERGE_REGS(regd, EEREC_TEMP); + } + else { + if( regd == EEREC_ACC ) { + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } + SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP); + } + else if( regd == EEREC_S ) { + SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } + SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); + } + else if( regd == EEREC_TEMP ) { + SSE_MULPS_XMM_to_XMM(regd, EEREC_S); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } + SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); + } + else { + SSE_MOVSS_M32_to_XMM(regd, addr); + SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x00); + SSE_MULPS_XMM_to_XMM(regd, EEREC_S); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } + SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC); + } + } + } +} + +void recVUMI_MADD_xyzw_toD(VURegs *VU, int xyzw, int regd, int info) +{ + //Console.WriteLn ("recVUMI_MADD_xyzw_toD"); + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Ft_) 
vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
+		vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
+	}
+	if (_Fs_) { // This is needed for a lot of games, so always clamp this operand
+		if (CHECK_VU_SIGN_OVERFLOW) vFloats4_useEAX[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set
+		else vFloats2[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set
+	}
+	if( _Ft_ == 0 ) {
+
+		if( xyzw == 3 ) {
+			// just add
+			if( _X_Y_Z_W == 8 ) {
+				if( regd == EEREC_S ) SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
+				else {
+					SSE_MOVSS_XMM_to_XMM(regd, EEREC_ACC);
+					SSE_ADDSS_XMM_to_XMM(regd, EEREC_S);
+				}
+			}
+			else {
+				if( _X_Y_Z_W != 0xf ) {
+					SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+					SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
+
+					VU_MERGE_REGS(regd, EEREC_TEMP);
+				}
+				else {
+					if( regd == EEREC_S ) SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+					else {
+						SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC);
+						SSE_ADDPS_XMM_to_XMM(regd, EEREC_S);
+					}
+				}
+			}
+		}
+		else {
+			// just move acc to regd
+			if( _X_Y_Z_W != 0xf ) {
+				SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
+				VU_MERGE_REGS(regd, EEREC_TEMP);
+			}
+			else SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC);
+		}
+
+		return;
+	}
+
+	if( _X_Y_Z_W == 8 ) {
+		_unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+
+		if( regd == EEREC_ACC ) {
+			SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+			if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); }
+			SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP);
+		}
+		else if( regd == EEREC_S ) {
+			SSE_MULSS_XMM_to_XMM(regd, EEREC_TEMP);
+			if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); }
+			SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
+		}
+		else if( regd == EEREC_TEMP ) {
+			SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
+			if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, 8); }
+			SSE_ADDSS_XMM_to_XMM(regd, EEREC_ACC);
+		}
+		else {
+			SSE_MOVSS_XMM_to_XMM(regd, EEREC_ACC);
+			SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+			if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, 8); }
+			SSE_ADDSS_XMM_to_XMM(regd, EEREC_TEMP);
+		}
+	}
+	else {
+		if( _X_Y_Z_W != 0xf || regd == EEREC_ACC || regd == EEREC_TEMP || regd == EEREC_S ) {
+			_unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw);
+		}
+
+		if (_X_Y_Z_W != 0xf) {
+			SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+			if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
+			SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC);
+
+			VU_MERGE_REGS(regd, EEREC_TEMP);
+		}
+		else {
+			if( regd == EEREC_ACC ) {
+				SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+				if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); }
+				SSE_ADDPS_XMM_to_XMM(regd, EEREC_TEMP);
+			}
+			else if( regd == EEREC_S ) {
+				SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP);
+				if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+				SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+			}
+			else if( regd == EEREC_TEMP ) {
+				SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+				if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+				SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+			}
+			else {
+				_unpackVF_xyzw(regd, EEREC_T, xyzw);
+				SSE_MULPS_XMM_to_XMM(regd, EEREC_S);
+				if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); }
+				SSE_ADDPS_XMM_to_XMM(regd, EEREC_ACC);
+			}
+		}
+	}
+}
+
+void recVUMI_MADD(VURegs *VU, int info)
+{
+	//Console.WriteLn ("recVUMI_MADD");
+	if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP);
+	recVUMI_MADD_toD(VU, EEREC_D, info);
+	recUpdateFlags(VU, EEREC_D, info);
+}
+
+void
recVUMI_MADD_iq(VURegs *VU, int addr, int info) +{ + //Console.WriteLn ("recVUMI_MADD_iq"); + if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); + recVUMI_MADD_iq_toD(VU, addr, EEREC_D, info); + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_MADD_xyzw(VURegs *VU, int xyzw, int info) +{ + //Console.WriteLn ("recVUMI_MADD_xyzw"); + if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); + recVUMI_MADD_xyzw_toD(VU, xyzw, EEREC_D, info); + recUpdateFlags(VU, EEREC_D, info); + // super bust-a-move arrows needs overflow clamping +} + +void recVUMI_MADDi(VURegs *VU, int info) { recVUMI_MADD_iq(VU, VU_VI_ADDR(REG_I, 1), info); } +void recVUMI_MADDq(VURegs *VU, int info) { recVUMI_MADD_iq(VU, VU_REGQ_ADDR, info); } +void recVUMI_MADDx(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 0, info); } +void recVUMI_MADDy(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 1, info); } +void recVUMI_MADDz(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 2, info); } +void recVUMI_MADDw(VURegs *VU, int info) { recVUMI_MADD_xyzw(VU, 3, info); } +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// MADDA +//------------------------------------------------------------------ +void recVUMI_MADDA( VURegs *VU, int info ) +{ + //Console.WriteLn ("recVUMI_MADDA"); + recVUMI_MADD_toD(VU, EEREC_ACC, info); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MADDAi( VURegs *VU , int info) +{ + //Console.WriteLn ("recVUMI_MADDAi"); + recVUMI_MADD_iq_toD( VU, VU_VI_ADDR(REG_I, 1), EEREC_ACC, info); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MADDAq( VURegs *VU , int info) +{ + //Console.WriteLn ("recVUMI_MADDAq "); + recVUMI_MADD_iq_toD( VU, VU_REGQ_ADDR, EEREC_ACC, info); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MADDAx( VURegs *VU , int info) +{ + //Console.WriteLn ("recVUMI_MADDAx"); + recVUMI_MADD_xyzw_toD(VU, 0, EEREC_ACC, info); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MADDAy( VURegs *VU , int info) +{ + //Console.WriteLn ("recVUMI_MADDAy"); + recVUMI_MADD_xyzw_toD(VU, 1, EEREC_ACC, info); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MADDAz( VURegs *VU , int info) +{ + //Console.WriteLn ("recVUMI_MADDAz"); + recVUMI_MADD_xyzw_toD(VU, 2, EEREC_ACC, info); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MADDAw( VURegs *VU , int info) +{ + //Console.WriteLn ("recVUMI_MADDAw"); + recVUMI_MADD_xyzw_toD(VU, 3, EEREC_ACC, info); + recUpdateFlags(VU, EEREC_ACC, info); +} +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// MSUB +//------------------------------------------------------------------ +void recVUMI_MSUB_toD(VURegs *VU, int regd, int info) +{ + //Console.WriteLn ("recVUMI_MSUB_toD"); + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); + vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W ); + } + + if (_X_Y_Z_W != 0xf) { + int t1reg = _vuGetTempXMMreg(info); + + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } + + if( t1reg >= 0 ) { + SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_ACC); + SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP); + + VU_MERGE_REGS(regd, t1reg); + _freeXMMreg(t1reg); + } + else { + SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); + 
SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); + VU_MERGE_REGS(regd, EEREC_TEMP); + } + } + else { + if( regd == EEREC_S ) { + assert( regd != EEREC_ACC ); + SSE_MULPS_XMM_to_XMM(regd, EEREC_T); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } + SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC); + SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]); + } + else if( regd == EEREC_T ) { + assert( regd != EEREC_ACC ); + SSE_MULPS_XMM_to_XMM(regd, EEREC_S); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } + SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC); + SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]); + } + else if( regd == EEREC_TEMP ) { + SSE_MOVAPS_XMM_to_XMM(regd, EEREC_S); + SSE_MULPS_XMM_to_XMM(regd, EEREC_T); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } + SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC); + SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]); + } + else { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC); + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } + SSE_SUBPS_XMM_to_XMM(regd, EEREC_TEMP); + } + } +} + +void recVUMI_MSUB_temp_toD(VURegs *VU, int regd, int info) +{ + //Console.WriteLn ("recVUMI_MSUB_temp_toD"); + + if (_X_Y_Z_W != 0xf) { + int t1reg = _vuGetTempXMMreg(info); + + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } + + if( t1reg >= 0 ) { + SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_ACC); + SSE_SUBPS_XMM_to_XMM(t1reg, EEREC_TEMP); + + if ( regd != EEREC_TEMP ) { VU_MERGE_REGS(regd, t1reg); } + else SSE_MOVAPS_XMM_to_XMM(regd, t1reg); + + _freeXMMreg(t1reg); + } + else { + SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); + VU_MERGE_REGS(regd, EEREC_TEMP); + } + } + else { + if( regd == EEREC_ACC ) { + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } + SSE_SUBPS_XMM_to_XMM(regd, EEREC_TEMP); + } + else if( regd == EEREC_S ) { + SSE_MULPS_XMM_to_XMM(regd, EEREC_TEMP); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } + SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC); + SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]); + } + else if( regd == EEREC_TEMP ) { + SSE_MULPS_XMM_to_XMM(regd, EEREC_S); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, regd, _X_Y_Z_W ); } + SSE_SUBPS_XMM_to_XMM(regd, EEREC_ACC); + SSE_XORPS_M128_to_XMM(regd, (uptr)&const_clip[4]); + } + else { + SSE_MOVAPS_XMM_to_XMM(regd, EEREC_ACC); + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + if (CHECK_VU_EXTRA_OVERFLOW) { vuFloat_useEAX( info, EEREC_TEMP, _X_Y_Z_W ); } + SSE_SUBPS_XMM_to_XMM(regd, EEREC_TEMP); + } + } +} + +void recVUMI_MSUB_iq_toD(VURegs *VU, int regd, int addr, int info) +{ + //Console.WriteLn ("recVUMI_MSUB_iq_toD"); + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W ); + vuFloat3(addr); + } + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + recVUMI_MSUB_temp_toD(VU, regd, info); +} + +void recVUMI_MSUB_xyzw_toD(VURegs *VU, int regd, int xyzw, int info) +{ + //Console.WriteLn ("recVUMI_MSUB_xyzw_toD"); + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, 1 << (3 - xyzw)); + 
vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W ); + } + _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); + recVUMI_MSUB_temp_toD(VU, regd, info); +} + +void recVUMI_MSUB(VURegs *VU, int info) +{ + //Console.WriteLn ("recVUMI_MSUB"); + if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); + recVUMI_MSUB_toD(VU, EEREC_D, info); + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_MSUB_iq(VURegs *VU, int addr, int info) +{ + //Console.WriteLn ("recVUMI_MSUB_iq"); + if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); + recVUMI_MSUB_iq_toD(VU, EEREC_D, addr, info); + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_MSUBi(VURegs *VU, int info) { recVUMI_MSUB_iq(VU, VU_VI_ADDR(REG_I, 1), info); } +void recVUMI_MSUBq(VURegs *VU, int info) { recVUMI_MSUB_iq(VU, VU_REGQ_ADDR, info); } +void recVUMI_MSUBx(VURegs *VU, int info) +{ + //Console.WriteLn ("recVUMI_MSUBx"); + if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); + recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 0, info); + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_MSUBy(VURegs *VU, int info) +{ + //Console.WriteLn ("recVUMI_MSUBy"); + if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); + recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 1, info); + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_MSUBz(VURegs *VU, int info) +{ + //Console.WriteLn ("recVUMI_MSUBz"); + if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); + recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 2, info); + recUpdateFlags(VU, EEREC_D, info); +} + +void recVUMI_MSUBw(VURegs *VU, int info) +{ + //Console.WriteLn ("recVUMI_MSUBw"); + if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); + recVUMI_MSUB_xyzw_toD(VU, EEREC_D, 3, info); + recUpdateFlags(VU, EEREC_D, info); +} +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// MSUBA +//------------------------------------------------------------------ +void recVUMI_MSUBA( VURegs *VU, int info ) +{ + //Console.WriteLn ("recVUMI_MSUBA"); + recVUMI_MSUB_toD(VU, EEREC_ACC, info); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MSUBAi( VURegs *VU, int info ) +{ + //Console.WriteLn ("recVUMI_MSUBAi "); + recVUMI_MSUB_iq_toD( VU, EEREC_ACC, VU_VI_ADDR(REG_I, 1), info ); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MSUBAq( VURegs *VU, int info ) +{ + //Console.WriteLn ("recVUMI_MSUBAq"); + recVUMI_MSUB_iq_toD( VU, EEREC_ACC, VU_REGQ_ADDR, info ); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MSUBAx( VURegs *VU, int info ) +{ + //Console.WriteLn ("recVUMI_MSUBAx"); + recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 0, info); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MSUBAy( VURegs *VU, int info ) +{ + //Console.WriteLn ("recVUMI_MSUBAy"); + recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 1, info); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MSUBAz( VURegs *VU, int info ) +{ + //Console.WriteLn ("recVUMI_MSUBAz "); + recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 2, info); + recUpdateFlags(VU, EEREC_ACC, info); +} + +void recVUMI_MSUBAw( VURegs *VU, int info ) +{ + //Console.WriteLn ("recVUMI_MSUBAw"); + recVUMI_MSUB_xyzw_toD(VU, EEREC_ACC, 3, info); + recUpdateFlags(VU, EEREC_ACC, info); +} +//------------------------------------------------------------------ + + +static const __aligned16 u32 special_mask[4] = {0xffffffff, 0x80000000, 0xffffffff, 0x80000000}; +static const __aligned16 u32 special_mask2[4] = {0, 0x40000000, 0, 0x40000000}; + +__aligned16 u32 temp_loc[4]; +__aligned16 u32 temp_loc2[4]; + +//MAX/MINI are non-arithmetic operations 
that implicitly support numbers with the EXP field being 0 ("denormals"). +// +//As such, they are sometimes used for integer move and (positive!) integer max/min, knowing that integers that +//represent denormals will not be flushed to 0. +// +//As such, this implementation performs a non-arithmetic operation that supports "denormals" and "infs/nans". +//There might be an easier way to do it but here, MAX/MIN is performed with PMAXPD/PMINPD. +//Fake double-precision numbers are constructed by copying the sign of the original numbers, clearing the upper 32 bits, +//setting the 62nd bit to 1 (to ensure double-precision number is "normalized") and having the lower 32bits +//being the same as the original number. + +void MINMAXlogical(VURegs *VU, int info, int min, int mode, uptr addr = 0, int xyzw = 0) +//mode1 = iq, mode2 = xyzw, mode0 = normal +{ + int t1regbool = 0; + int t1reg = _vuGetTempXMMreg(info); + if (t1reg < 0) + { + t1regbool = 1; + for (t1reg = 0; ( (t1reg == EEREC_D) || (t1reg == EEREC_S) || (mode != 1 && t1reg == EEREC_T) + || (t1reg == EEREC_TEMP) ); t1reg++); // Find unused reg (For first temp reg) + SSE_MOVAPS_XMM_to_M128((uptr)temp_loc, t1reg); // Backup t1reg XMM reg + } + int t2regbool = -1; + int t2reg = EEREC_TEMP; + if (EEREC_TEMP == EEREC_D || EEREC_TEMP == EEREC_S || (mode != 1 && EEREC_TEMP == EEREC_T)) + { + t2regbool = 0; + t2reg = _vuGetTempXMMreg(info); + if (t2reg < 0) + { + t2regbool = 1; + for (t2reg = 0; ( (t2reg == EEREC_D) || (t2reg == EEREC_S) || (mode != 1 && t2reg == EEREC_T) || + (t2reg == t1reg) || (t2reg == EEREC_TEMP) ); t2reg++); // Find unused reg (For second temp reg) + SSE_MOVAPS_XMM_to_M128((uptr)temp_loc2, t2reg); // Backup t2reg XMM reg + } + } + + if (_X || _Y) + { + SSE2_PSHUFD_XMM_to_XMM(t1reg, EEREC_S, 0x50); + SSE2_PAND_M128_to_XMM(t1reg, (uptr)special_mask); + SSE2_POR_M128_to_XMM(t1reg, (uptr)special_mask2); + if (mode == 0) + SSE2_PSHUFD_XMM_to_XMM(t2reg, EEREC_T, 0x50); + else if (mode == 1) + { + SSE2_MOVD_M32_to_XMM(t2reg, addr); + SSE2_PSHUFD_XMM_to_XMM(t2reg, t2reg, 0x00); + } + else if (mode == 2) + _unpackVF_xyzw(t2reg, EEREC_T, xyzw); + SSE2_PAND_M128_to_XMM(t2reg, (uptr)special_mask); + SSE2_POR_M128_to_XMM(t2reg, (uptr)special_mask2); + if (min) + SSE2_MINPD_XMM_to_XMM(t1reg, t2reg); + else + SSE2_MAXPD_XMM_to_XMM(t1reg, t2reg); + SSE2_PSHUFD_XMM_to_XMM(t1reg, t1reg, 0x88); + VU_MERGE_REGS_CUSTOM(EEREC_D, t1reg, 0xc & _X_Y_Z_W); + } + + if (_Z || _W) + { + SSE2_PSHUFD_XMM_to_XMM(t1reg, EEREC_S, 0xfa); + SSE2_PAND_M128_to_XMM(t1reg, (uptr)special_mask); + SSE2_POR_M128_to_XMM(t1reg, (uptr)special_mask2); + if (mode == 0) + SSE2_PSHUFD_XMM_to_XMM(t2reg, EEREC_T, 0xfa); + else if (mode == 1) + { + SSE2_MOVD_M32_to_XMM(t2reg, addr); + SSE2_PSHUFD_XMM_to_XMM(t2reg, t2reg, 0x00); + } + else if (mode == 2) + _unpackVF_xyzw(t2reg, EEREC_T, xyzw); + SSE2_PAND_M128_to_XMM(t2reg, (uptr)special_mask); + SSE2_POR_M128_to_XMM(t2reg, (uptr)special_mask2); + if (min) + SSE2_MINPD_XMM_to_XMM(t1reg, t2reg); + else + SSE2_MAXPD_XMM_to_XMM(t1reg, t2reg); + SSE2_PSHUFD_XMM_to_XMM(t1reg, t1reg, 0x88); + VU_MERGE_REGS_CUSTOM(EEREC_D, t1reg, 0x3 & _X_Y_Z_W); + } + + if (t1regbool == 0) + _freeXMMreg(t1reg); + else if (t1regbool == 1) + SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)temp_loc); // Restore t1reg XMM reg + if (t2regbool == 0) + _freeXMMreg(t2reg); + else if (t2regbool == 1) + SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)temp_loc2); // Restore t2reg XMM reg +} + +//------------------------------------------------------------------ +// MAX 
+//------------------------------------------------------------------ + +void recVUMI_MAX(VURegs *VU, int info) +{ + if ( _Fd_ == 0 ) return; + //Console.WriteLn ("recVUMI_MAX"); + + if (MINMAXFIX) + MINMAXlogical(VU, info, 0, 0); + else + { + + if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping + if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); + + if( _X_Y_Z_W == 8 ) { + if (EEREC_D == EEREC_S) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T); + else if (EEREC_D == EEREC_T) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_S); + else { + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T); + } + } + else if (_X_Y_Z_W != 0xf) { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_MAXPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else { + if( EEREC_D == EEREC_S ) SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_T); + else if( EEREC_D == EEREC_T ) SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_S); + else { + SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_T); + } + } + } +} + +void recVUMI_MAX_iq(VURegs *VU, uptr addr, int info) +{ + if ( _Fd_ == 0 ) return; + //Console.WriteLn ("recVUMI_MAX_iq"); + + if (MINMAXFIX) + MINMAXlogical(VU, info, 0, 1, addr); + else + { + if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping + vuFloat3(addr); + + if( _XYZW_SS ) { + if( EEREC_D == EEREC_TEMP ) { + _vuFlipRegSS(VU, EEREC_S); + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_MAXSS_M32_to_XMM(EEREC_D, addr); + _vuFlipRegSS(VU, EEREC_S); + + // have to flip over EEREC_D if computing flags! + //if( (info & PROCESS_VU_UPDATEFLAGS) ) + _vuFlipRegSS(VU, EEREC_D); + } + else if( EEREC_D == EEREC_S ) { + _vuFlipRegSS(VU, EEREC_D); + SSE_MAXSS_M32_to_XMM(EEREC_D, addr); + _vuFlipRegSS(VU, EEREC_D); + } + else { + if( _X ) { + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_MAXSS_M32_to_XMM(EEREC_D, addr); + } + else { + _vuMoveSS(VU, EEREC_TEMP, EEREC_S); + _vuFlipRegSS(VU, EEREC_D); + SSE_MAXSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + _vuFlipRegSS(VU, EEREC_D); + } + } + } + else if (_X_Y_Z_W != 0xf) { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + SSE_MAXPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else { + if(EEREC_D == EEREC_S) { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + else { + SSE_MOVSS_M32_to_XMM(EEREC_D, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_D, EEREC_D, 0x00); + SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_S); + } + } + } +} + +void recVUMI_MAX_xyzw(VURegs *VU, int xyzw, int info) +{ + if ( _Fd_ == 0 ) return; + //Console.WriteLn ("recVUMI_MAX_xyzw"); + + if (_Fs_ == 0 && _Ft_ == 0) + { + if( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP)) { + if( xyzw < 3 ) { + SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + else { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)s_fones); + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + } + else if (_X_Y_Z_W != 0xf) { + if( xyzw < 3 ) { + if( _X_Y_Z_W & 1 ) SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[0]); // w included, so insert the whole reg + else SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); // w not included, can zero out + } + else SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)s_fones); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else { + //If VF0.w isnt chosen 
as the constant, then its going to be MAX( 0, VF0 ), so the result is VF0 + if( xyzw < 3 ) { SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)&VU->VF[0].UL[0]); } + else SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)s_fones); + } + return; + } + + if (MINMAXFIX) + MINMAXlogical(VU, info, 0, 2, 0, xyzw); + else + { + if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping + if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); + + if( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP)) { + if( xyzw == 0 ) { + if( EEREC_D == EEREC_S ) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T); + else if( EEREC_D == EEREC_T ) SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_S); + else { + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_T); + } + } + else { + _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_MAXSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + } + else if (_X_Y_Z_W != 0xf) { + _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); + SSE_MAXPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else { + if (EEREC_D == EEREC_S) { + _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); + SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + else { + _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw); + SSE_MAXPS_XMM_to_XMM(EEREC_D, EEREC_S); + } + } + } +} + +void recVUMI_MAXi(VURegs *VU, int info) { recVUMI_MAX_iq(VU, VU_VI_ADDR(REG_I, 1), info); } +void recVUMI_MAXx(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 0, info); } +void recVUMI_MAXy(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 1, info); } +void recVUMI_MAXz(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 2, info); } +void recVUMI_MAXw(VURegs *VU, int info) { recVUMI_MAX_xyzw(VU, 3, info); } +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// MINI +//------------------------------------------------------------------ +void recVUMI_MINI(VURegs *VU, int info) +{ + if ( _Fd_ == 0 ) return; + //Console.WriteLn ("recVUMI_MINI"); + + if (MINMAXFIX) + MINMAXlogical(VU, info, 1, 0); + else + { + + if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping + if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W ); + + if( _X_Y_Z_W == 8 ) { + if (EEREC_D == EEREC_S) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T); + else if (EEREC_D == EEREC_T) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_S); + else { + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T); + } + } + else if (_X_Y_Z_W != 0xf) { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_MINPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else { + if( EEREC_D == EEREC_S ) { + //ClampUnordered(EEREC_T, EEREC_TEMP, 0); // need for GT4 vu0rec + SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_T); + } + else if( EEREC_D == EEREC_T ) { + //ClampUnordered(EEREC_S, EEREC_TEMP, 0); // need for GT4 vu0rec + SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_S); + } + else { + SSE_MOVAPS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_T); + } + } + } +} + +void recVUMI_MINI_iq(VURegs *VU, uptr addr, int info) +{ + if ( _Fd_ == 0 ) return; + //Console.WriteLn ("recVUMI_MINI_iq"); + + if (MINMAXFIX) + MINMAXlogical(VU, info, 1, 1, addr); + else + { + + if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping + vuFloat3(addr); + + if( _XYZW_SS ) { + if( EEREC_D == EEREC_TEMP ) { + _vuFlipRegSS(VU, EEREC_S); + 
SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_MINSS_M32_to_XMM(EEREC_D, addr); + _vuFlipRegSS(VU, EEREC_S); + + // have to flip over EEREC_D if computing flags! + //if( (info & PROCESS_VU_UPDATEFLAGS) ) + _vuFlipRegSS(VU, EEREC_D); + } + else if( EEREC_D == EEREC_S ) { + _vuFlipRegSS(VU, EEREC_D); + SSE_MINSS_M32_to_XMM(EEREC_D, addr); + _vuFlipRegSS(VU, EEREC_D); + } + else { + if( _X ) { + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_MINSS_M32_to_XMM(EEREC_D, addr); + } + else { + _vuMoveSS(VU, EEREC_TEMP, EEREC_S); + _vuFlipRegSS(VU, EEREC_D); + SSE_MINSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + _vuFlipRegSS(VU, EEREC_D); + } + } + } + else if (_X_Y_Z_W != 0xf) { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + SSE_MINPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else { + if(EEREC_D == EEREC_S) { + SSE_MOVSS_M32_to_XMM(EEREC_TEMP, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x00); + SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + else { + SSE_MOVSS_M32_to_XMM(EEREC_D, addr); + SSE_SHUFPS_XMM_to_XMM(EEREC_D, EEREC_D, 0x00); + SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_S); + } + } + } +} + +void recVUMI_MINI_xyzw(VURegs *VU, int xyzw, int info) +{ + if ( _Fd_ == 0 ) return; + //Console.WriteLn ("recVUMI_MINI_xyzw"); + + if (_Fs_ == 0 && _Ft_ == 0) + { + if( _X_Y_Z_W == 0xf ) + { + //If VF0.w is the constant, the result will match VF0, else its all 0's + if(xyzw == 3) SSE_MOVAPS_M128_to_XMM(EEREC_D, (uptr)&VU->VF[0].UL[0]); + else SSE_XORPS_XMM_to_XMM(EEREC_D, EEREC_D); + } + else + { + //If VF0.w is the constant, the result will match VF0, else its all 0's + if(xyzw == 3) SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[0]); + else SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + return; + } + if (MINMAXFIX) + MINMAXlogical(VU, info, 1, 2, 0, xyzw); + else + { + if (_Fs_) vuFloat4_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W ); // Always do Preserved Sign Clamping + if (_Ft_) vuFloat4_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) ); + + if( _X_Y_Z_W == 8 && (EEREC_D != EEREC_TEMP)) { + if( xyzw == 0 ) { + if( EEREC_D == EEREC_S ) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T); + else if( EEREC_D == EEREC_T ) SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_S); + else { + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_T); + } + } + else { + _unpackVFSS_xyzw(EEREC_TEMP, EEREC_T, xyzw); + SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_MINSS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + } + else if (_X_Y_Z_W != 0xf) { + _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); + SSE_MINPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + VU_MERGE_REGS(EEREC_D, EEREC_TEMP); + } + else { + if (EEREC_D == EEREC_S) { + _unpackVF_xyzw(EEREC_TEMP, EEREC_T, xyzw); + SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); + } + else { + _unpackVF_xyzw(EEREC_D, EEREC_T, xyzw); + SSE_MINPS_XMM_to_XMM(EEREC_D, EEREC_S); + } + } + } +} + +void recVUMI_MINIi(VURegs *VU, int info) { recVUMI_MINI_iq(VU, VU_VI_ADDR(REG_I, 1), info); } +void recVUMI_MINIx(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 0, info); } +void recVUMI_MINIy(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 1, info); } +void recVUMI_MINIz(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 2, info); } +void recVUMI_MINIw(VURegs *VU, int info) { recVUMI_MINI_xyzw(VU, 3, info); } +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ 
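Before the OPMULA section, the widening trick in MINMAXlogical above is easiest to see with intrinsics: each float becomes a fake double that keeps the original sign, forces exponent bit 62 on (so the double is normalized), and carries the original 32 bits as the low mantissa bits; MINPD/MAXPD then order denormals correctly where MINPS/MAXPS would not. A hedged SSE2 sketch of that idea, assuming the special_mask/special_mask2 constants above; widen_pair and minmax_logical are hypothetical names, and the register backup/restore bookkeeping of the real routine is omitted:

#include <emmintrin.h>

// PSHUFD 0x50 (low pair) / 0xfa (high pair): duplicate each float into a
// 64-bit slot, then force sign | bit 62 | original bits -> two fake doubles.
static __m128d widen_pair(__m128 v, int hi_pair)
{
    __m128i iv = _mm_castps_si128(v);
    iv = hi_pair ? _mm_shuffle_epi32(iv, 0xfa) : _mm_shuffle_epi32(iv, 0x50);
    const __m128i keep = _mm_set_epi32((int)0x80000000, -1, (int)0x80000000, -1); // special_mask
    const __m128i norm = _mm_set_epi32(0x40000000, 0, 0x40000000, 0);             // special_mask2
    iv = _mm_or_si128(_mm_and_si128(iv, keep), norm);
    return _mm_castsi128_pd(iv);
}

// Denormal-safe min/max of two float vectors via double-precision compares.
static __m128 minmax_logical(__m128 a, __m128 b, bool is_min)
{
    __m128d lo = is_min ? _mm_min_pd(widen_pair(a, 0), widen_pair(b, 0))
                        : _mm_max_pd(widen_pair(a, 0), widen_pair(b, 0));
    __m128d hi = is_min ? _mm_min_pd(widen_pair(a, 1), widen_pair(b, 1))
                        : _mm_max_pd(widen_pair(a, 1), widen_pair(b, 1));
    // PSHUFD 0x88: pull the low 32 bits of each double back out.
    __m128i rlo = _mm_shuffle_epi32(_mm_castpd_si128(lo), 0x88); // results for x, y
    __m128i rhi = _mm_shuffle_epi32(_mm_castpd_si128(hi), 0x88); // results for z, w
    return _mm_castsi128_ps(_mm_unpacklo_epi64(rlo, rhi));
}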
+// OPMULA +//------------------------------------------------------------------ +void recVUMI_OPMULA( VURegs *VU, int info ) +{ + //Console.WriteLn ("recVUMI_OPMULA"); + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, 0xE); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, 0xE); + } + + SSE_MOVAPS_XMM_to_XMM( EEREC_TEMP, EEREC_S ); + SSE_SHUFPS_XMM_to_XMM( EEREC_T, EEREC_T, 0xD2 ); // EEREC_T = WYXZ + SSE_SHUFPS_XMM_to_XMM( EEREC_TEMP, EEREC_TEMP, 0xC9 ); // EEREC_TEMP = WXZY + SSE_MULPS_XMM_to_XMM( EEREC_TEMP, EEREC_T ); + + VU_MERGE_REGS_CUSTOM(EEREC_ACC, EEREC_TEMP, 14); + + // revert EEREC_T + if( EEREC_T != EEREC_ACC ) + SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0xC9); + + recUpdateFlags(VU, EEREC_ACC, info); +} +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// OPMSUB +//------------------------------------------------------------------ +void recVUMI_OPMSUB( VURegs *VU, int info ) +{ + //Console.WriteLn ("recVUMI_OPMSUB"); + if (CHECK_VU_EXTRA_OVERFLOW) { + if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, 0xE); + if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, 0xE); + } + + if( !_Fd_ ) info |= PROCESS_EE_SET_D(EEREC_TEMP); + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0xD2); // EEREC_T = WYXZ + SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0xC9); // EEREC_TEMP = WXZY + SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_T); + + // negate and add + SSE_XORPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[4]); + SSE_ADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_ACC); + VU_MERGE_REGS_CUSTOM(EEREC_D, EEREC_TEMP, 14); + + // revert EEREC_T + if( EEREC_T != EEREC_D ) SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0xC9); + + recUpdateFlags(VU, EEREC_D, info); +} +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// NOP +//------------------------------------------------------------------ +void recVUMI_NOP( VURegs *VU, int info ) +{ + //Console.WriteLn ("recVUMI_NOP"); +} +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// recVUMI_FTOI_Saturate() - Saturates result from FTOI Instructions +//------------------------------------------------------------------ + +// unused, but leaving here for possible reference.. 
+//static const __aligned16 int rec_const_0x8000000[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+
+void recVUMI_FTOI_Saturate(int rec_s, int rec_t, int rec_tmp1, int rec_tmp2)
+{
+	//Console.WriteLn ("recVUMI_FTOI_Saturate");
+	//Duplicate the xor'd sign bit to the whole value
+	//FFFF FFFF for positive, 0 for negative
+	SSE_MOVAPS_XMM_to_XMM(rec_tmp1, rec_s);
+	SSE2_PXOR_M128_to_XMM(rec_tmp1, (uptr)&const_clip[4]);
+	SSE2_PSRAD_I8_to_XMM(rec_tmp1, 31);
+
+	//Create mask: 0 where !=8000 0000
+	SSE_MOVAPS_XMM_to_XMM(rec_tmp2, rec_t);
+	SSE2_PCMPEQD_M128_to_XMM(rec_tmp2, (uptr)&const_clip[4]);
+
+	//AND the mask with the edit values
+	SSE_ANDPS_XMM_to_XMM(rec_tmp1, rec_tmp2);
+
+	//if v==8000 0000 && positive -> 8000 0000 + FFFF FFFF -> 7FFF FFFF
+	//if v==8000 0000 && negative -> 8000 0000 + 0 -> 8000 0000
+	//if v!=8000 0000 -> v+0 (masked from the and)
+
+	//Add the values as needed
+	SSE2_PADDD_XMM_to_XMM(rec_t, rec_tmp1);
+}
+//------------------------------------------------------------------
+
+
+//------------------------------------------------------------------
+// FTOI 0/4/12/15
+//------------------------------------------------------------------
+static __aligned16 float FTIO_Temp1[4] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000 };
+static __aligned16 float FTIO_Temp2[4] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000 };
+void recVUMI_FTOI0(VURegs *VU, int info)
+{
+	int t1reg, t2reg; // Temp XMM regs
+
+	if ( _Ft_ == 0 ) return;
+
+	//Console.WriteLn ("recVUMI_FTOI0");
+
+	if (_X_Y_Z_W != 0xf) {
+		SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
+		vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
+		SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
+
+		t1reg = _vuGetTempXMMreg(info);
+
+		if( t1reg >= 0 ) { // If there's a temp XMM reg available
+			for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++)
+				; // Find unused reg (For second temp reg)
+			SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg
+
+			recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
+
+			SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg
+			_freeXMMreg(t1reg); // Free temp reg
+		}
+		else { // No temp reg available
+			for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++)
+				; // Find unused reg (For first temp reg)
+			SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg
+
+			for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++)
+				; // Find unused reg (For second temp reg)
+			SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg
+
+			recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result
+
+			SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg
+			SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg
+		}
+
+		VU_MERGE_REGS(EEREC_T, EEREC_TEMP);
+	}
+	else {
+		if (EEREC_T != EEREC_S) {
+			SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_S);
+			vuFloat_useEAX( info, EEREC_T, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax)
+			SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_T, EEREC_T);
+
+			t1reg = _vuGetTempXMMreg(info);
+
+			if( t1reg >= 0 ) { // If there's a temp XMM reg available
+				recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); //
Saturate if Float->Int conversion returned illegal result + _freeXMMreg(t1reg); // Free temp reg + } + else { // No temp reg available + for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++) + ; // Find unused reg + SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg + + recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result + + SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg + } + } + else { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax) + SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); + + t1reg = _vuGetTempXMMreg(info); + + if( t1reg >= 0 ) { // If theres a temp XMM reg available + for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg)); t2reg++) + ; // Find unused reg (For second temp reg) + SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg + + recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result + + SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg + _freeXMMreg(t1reg); // Free temp reg + } + else { // No temp reg available + for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++) + ; // Find unused reg (For first temp reg) + SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg + + for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++) + ; // Find unused reg (For second temp reg) + SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg + + recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result + + SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg + SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg + } + + SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_TEMP); + } + } +} + +void recVUMI_FTOIX(VURegs *VU, int addr, int info) +{ + int t1reg, t2reg; // Temp XMM regs + + if ( _Ft_ == 0 ) return; + + //Console.WriteLn ("recVUMI_FTOIX"); + if (_X_Y_Z_W != 0xf) { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_MULPS_M128_to_XMM(EEREC_TEMP, addr); + vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax) + SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); + + t1reg = _vuGetTempXMMreg(info); + + if( t1reg >= 0 ) { // If theres a temp XMM reg available + for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg)); t2reg++) + ; // Find unused reg (For second temp reg) + SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg + + recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result + + SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg + _freeXMMreg(t1reg); // Free temp reg + } + else { // No temp reg available + for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++) + ; // Find unused reg (For first temp reg) + SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg + + for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++) + ; // Find 
unused reg (For second temp reg) + SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg + + recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result + + SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg + SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg + } + + VU_MERGE_REGS(EEREC_T, EEREC_TEMP); + } + else { + if (EEREC_T != EEREC_S) { + SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_S); + SSE_MULPS_M128_to_XMM(EEREC_T, addr); + vuFloat_useEAX( info, EEREC_T, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax) + SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_T, EEREC_T); + + t1reg = _vuGetTempXMMreg(info); + + if( t1reg >= 0 ) { // If theres a temp XMM reg available + recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result + _freeXMMreg(t1reg); // Free temp reg + } + else { // No temp reg available + for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++) + ; // Find unused reg + SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg + + recVUMI_FTOI_Saturate(EEREC_S, EEREC_T, EEREC_TEMP, t1reg); // Saturate if Float->Int conversion returned illegal result + + SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg + } + } + else { + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_MULPS_M128_to_XMM(EEREC_TEMP, addr); + vuFloat_useEAX( info, EEREC_TEMP, 0xf ); // Clamp Infs and NaNs to pos/neg fmax (NaNs always to positive fmax) + SSE2_CVTTPS2DQ_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); + + t1reg = _vuGetTempXMMreg(info); + + if( t1reg >= 0 ) { // If theres a temp XMM reg available + for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg)); t2reg++) + ; // Find unused reg (For second temp reg) + SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t2reg); // Backup XMM reg + + recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result + + SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp1); // Restore XMM reg + _freeXMMreg(t1reg); // Free temp reg + } + else { // No temp reg available + for (t1reg = 0; ( (t1reg == EEREC_S) || (t1reg == EEREC_T) || (t1reg == EEREC_TEMP) ); t1reg++) + ; // Find unused reg (For first temp reg) + SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp1, t1reg); // Backup t1reg XMM reg + + for (t2reg = 0; ( (t2reg == EEREC_S) || (t2reg == EEREC_T) || (t2reg == EEREC_TEMP) || (t2reg == t1reg) ); t2reg++) + ; // Find unused reg (For second temp reg) + SSE_MOVAPS_XMM_to_M128((uptr)FTIO_Temp2, t2reg); // Backup t2reg XMM reg + + recVUMI_FTOI_Saturate(EEREC_S, EEREC_TEMP, t1reg, t2reg); // Saturate if Float->Int conversion returned illegal result + + SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)FTIO_Temp1); // Restore t1reg XMM reg + SSE_MOVAPS_M128_to_XMM(t2reg, (uptr)FTIO_Temp2); // Restore t2reg XMM reg + } + + SSE_MOVAPS_XMM_to_XMM(EEREC_T, EEREC_TEMP); + } + } +} + +void recVUMI_FTOI4( VURegs *VU, int info ) { recVUMI_FTOIX(VU, (uptr)&recMult_float_to_int4[0], info); } +void recVUMI_FTOI12( VURegs *VU, int info ) { recVUMI_FTOIX(VU, (uptr)&recMult_float_to_int12[0], info); } +void recVUMI_FTOI15( VURegs *VU, int info ) { recVUMI_FTOIX(VU, (uptr)&recMult_float_to_int15[0], info); } +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// ITOF 
0/4/12/15 +//------------------------------------------------------------------ +void recVUMI_ITOF0( VURegs *VU, int info ) +{ + if ( _Ft_ == 0 ) return; + + //Console.WriteLn ("recVUMI_ITOF0"); + if (_X_Y_Z_W != 0xf) { + SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + vuFloat_useEAX( info, EEREC_TEMP, 15); // Clamp infinities + VU_MERGE_REGS(EEREC_T, EEREC_TEMP); + xmmregs[EEREC_T].mode |= MODE_WRITE; + } + else { + SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_T, EEREC_S); + vuFloat2(EEREC_T, EEREC_TEMP, 15); // Clamp infinities + } +} + +void recVUMI_ITOFX(VURegs *VU, int addr, int info) +{ + if ( _Ft_ == 0 ) return; + + //Console.WriteLn ("recVUMI_ITOFX"); + if (_X_Y_Z_W != 0xf) { + SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_TEMP, EEREC_S); + SSE_MULPS_M128_to_XMM(EEREC_TEMP, addr); + vuFloat_useEAX( info, EEREC_TEMP, 15); // Clamp infinities + VU_MERGE_REGS(EEREC_T, EEREC_TEMP); + xmmregs[EEREC_T].mode |= MODE_WRITE; + } + else { + SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_T, EEREC_S); + SSE_MULPS_M128_to_XMM(EEREC_T, addr); + vuFloat2(EEREC_T, EEREC_TEMP, 15); // Clamp infinities + } +} + +void recVUMI_ITOF4( VURegs *VU, int info ) { recVUMI_ITOFX(VU, (uptr)&recMult_int_to_float4[0], info); } +void recVUMI_ITOF12( VURegs *VU, int info ) { recVUMI_ITOFX(VU, (uptr)&recMult_int_to_float12[0], info); } +void recVUMI_ITOF15( VURegs *VU, int info ) { recVUMI_ITOFX(VU, (uptr)&recMult_int_to_float15[0], info); } +//------------------------------------------------------------------ + + +//------------------------------------------------------------------ +// CLIP +//------------------------------------------------------------------ +void recVUMI_CLIP(VURegs *VU, int info) +{ + int t1reg = EEREC_D; + int t2reg = EEREC_ACC; + int x86temp1, x86temp2; + + u32 clipaddr = VU_VI_ADDR(REG_CLIP_FLAG, 0); + u32 prevclipaddr = VU_VI_ADDR(REG_CLIP_FLAG, 2); + + if( clipaddr == 0 ) { // battle star has a clip right before fcset + Console.WriteLn("skipping vu clip"); + return; + } + + //Flush the clip flag before processing, incase of double clip commands (GoW) + + if( prevclipaddr != (uptr)&VU->VI[REG_CLIP_FLAG] ) { + MOV32MtoR(EAX, prevclipaddr); + MOV32RtoM((uptr)&VU->VI[REG_CLIP_FLAG], EAX); + } + + assert( clipaddr != 0 ); + assert( t1reg != t2reg && t1reg != EEREC_TEMP && t2reg != EEREC_TEMP ); + + x86temp1 = ALLOCTEMPX86(MODE_8BITREG); + x86temp2 = ALLOCTEMPX86(MODE_8BITREG); + + //if ( (x86temp1 == 0) || (x86temp2 == 0) ) Console.Error("VU CLIP Allocation Error: EAX being allocated!"); + + _freeXMMreg(t1reg); // These should have been freed at allocation in eeVURecompileCode() + _freeXMMreg(t2reg); // but if they've been used since then, then free them. 
(just doing this in case :p (cottonvibes))
+
+	if( _Ft_ == 0 ) {
+		SSE_MOVAPS_M128_to_XMM(EEREC_TEMP, (uptr)&s_fones[0]); // all 1s
+		SSE_MOVAPS_M128_to_XMM(t1reg, (uptr)&s_fones[4]);
+	}
+	else {
+		_unpackVF_xyzw(EEREC_TEMP, EEREC_T, 3);
+		SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (uptr)&const_clip[0]);
+		SSE_MOVAPS_XMM_to_XMM(t1reg, EEREC_TEMP);
+		SSE_ORPS_M128_to_XMM(t1reg, (uptr)&const_clip[4]);
+	}
+
+	MOV32MtoR(EAX, prevclipaddr);
+
+	SSE_CMPNLEPS_XMM_to_XMM(t1reg, EEREC_S);  //-w, -z, -y, -x
+	SSE_CMPLTPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); //+w, +z, +y, +x
+
+	SHL32ItoR(EAX, 6);
+
+	SSE_MOVAPS_XMM_to_XMM(t2reg, EEREC_TEMP); //t2 = +w, +z, +y, +x
+	SSE_UNPCKLPS_XMM_to_XMM(EEREC_TEMP, t1reg); //EEREC_TEMP = -y,+y,-x,+x
+	SSE_UNPCKHPS_XMM_to_XMM(t2reg, t1reg); //t2reg = -w,+w,-z,+z
+	SSE_MOVMSKPS_XMM_to_R32(x86temp2, EEREC_TEMP); // -y,+y,-x,+x
+	SSE_MOVMSKPS_XMM_to_R32(x86temp1, t2reg); // -w,+w,-z,+z
+
+	AND8ItoR(x86temp1, 0x3);
+	SHL8ItoR(x86temp1, 4);
+	OR8RtoR(EAX, x86temp1);
+	AND8ItoR(x86temp2, 0xf);
+	OR8RtoR(EAX, x86temp2);
+	AND32ItoR(EAX, 0xffffff);
+
+	MOV32RtoM(clipaddr, EAX);
+
+	if (( !(info & (PROCESS_VU_SUPER|PROCESS_VU_COP2)) ) ) //Instantly update the flag if it's called from elsewhere (unlikely, but ok)
+		MOV32RtoM((uptr)&VU->VI[REG_CLIP_FLAG], EAX);
+
+	_freeX86reg(x86temp1);
+	_freeX86reg(x86temp2);
+}
diff --git a/pcsx2/x86/sVU_zerorec.h b/pcsx2/x86/sVU_zerorec.h
index 9582c7981c..5e252e5d0c 100644
--- a/pcsx2/x86/sVU_zerorec.h
+++ b/pcsx2/x86/sVU_zerorec.h
@@ -1,73 +1,73 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2009 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-// Super VU recompiler - author: zerofrog(@gmail.com)
-
-#pragma once
-
-#include "sVU_Micro.h"
-
-//Using assembly code from an external file.
-#ifdef __LINUX__
-extern "C" {
-#endif
-extern void SuperVUExecuteProgram(u32 startpc, int vuindex);
-extern void SuperVUEndProgram();
-extern void svudispfntemp();
-#ifdef __LINUX__
-}
-#endif
-
-extern void SuperVUDestroy(int vuindex);
-extern void SuperVUReset(int vuindex);
-
-// read = 0, will write to reg
-// read = 1, will read from reg
-// read = 2, addr of previously written reg (used for status and clip flags)
-extern u32 SuperVUGetVIAddr(int reg, int read);
-
-// if p == 0, flush q else flush p; if wait is != 0, waits for p/q
-extern void SuperVUFlush(int p, int wait);
-
-
-class recSuperVU0 : public BaseVUmicroCPU
-{
-public:
-	recSuperVU0();
-
-	const char* GetShortName() const { return "sVU0"; }
-	wxString GetLongName() const { return L"SuperVU0 Recompiler"; }
-
-	void Allocate();
-	void Shutdown() throw();
-	void Reset();
-	void ExecuteBlock();
-	void Clear(u32 Addr, u32 Size);
-};
-
-class recSuperVU1 : public BaseVUmicroCPU
-{
-public:
-	recSuperVU1();
-
-	const char* GetShortName() const { return "sVU1"; }
-	wxString GetLongName() const { return L"SuperVU1 Recompiler"; }
-
-	void Allocate();
-	void Shutdown() throw();
-	void Reset();
-	void ExecuteBlock();
-	void Clear(u32 Addr, u32 Size);
-};
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2009 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// Super VU recompiler - author: zerofrog(@gmail.com)
+
+#pragma once
+
+#include "sVU_Micro.h"
+
+//Using assembly code from an external file.
+#ifdef __LINUX__
+extern "C" {
+#endif
+extern void SuperVUExecuteProgram(u32 startpc, int vuindex);
+extern void SuperVUEndProgram();
+extern void svudispfntemp();
+#ifdef __LINUX__
+}
+#endif
+
+extern void SuperVUDestroy(int vuindex);
+extern void SuperVUReset(int vuindex);
+
+// read = 0, will write to reg
+// read = 1, will read from reg
+// read = 2, addr of previously written reg (used for status and clip flags)
+extern u32 SuperVUGetVIAddr(int reg, int read);
+
+// if p == 0, flush q else flush p; if wait is != 0, waits for p/q
+extern void SuperVUFlush(int p, int wait);
+
+
+class recSuperVU0 : public BaseVUmicroCPU
+{
+public:
+	recSuperVU0();
+
+	const char* GetShortName() const { return "sVU0"; }
+	wxString GetLongName() const { return L"SuperVU0 Recompiler"; }
+
+	void Allocate();
+	void Shutdown() throw();
+	void Reset();
+	void ExecuteBlock();
+	void Clear(u32 Addr, u32 Size);
+};
+
+class recSuperVU1 : public BaseVUmicroCPU
+{
+public:
+	recSuperVU1();
+
+	const char* GetShortName() const { return "sVU1"; }
+	wxString GetLongName() const { return L"SuperVU1 Recompiler"; }
+
+	void Allocate();
+	void Shutdown() throw();
+	void Reset();
+	void ExecuteBlock();
+	void Clear(u32 Addr, u32 Size);
+};
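A closing note on recVUMI_FTOI_Saturate in sVU_Upper.inl above: CVTTPS2DQ returns the "integer indefinite" value 0x80000000 for any input it cannot represent, so a positive overflow must be bumped to 0x7fffffff while a negative overflow is already correct. A minimal sketch of the same PXOR/PSRAD/PCMPEQD/PADDD sequence with SSE2 intrinsics; ftoi_saturate is a hypothetical name:

#include <emmintrin.h>

// src = the floats that were converted, converted = the CVTTPS2DQ result.
static __m128i ftoi_saturate(__m128 src, __m128i converted)
{
    const __m128i indefinite = _mm_set1_epi32((int)0x80000000);
    // 0xffffffff where the source was positive (sign bit clear), 0 otherwise.
    __m128i pos = _mm_srai_epi32(_mm_xor_si128(_mm_castps_si128(src), indefinite), 31);
    // 0xffffffff where the conversion produced 0x80000000.
    __m128i ovf = _mm_cmpeq_epi32(converted, indefinite);
    // 0x80000000 + 0xffffffff -> 0x7fffffff, applied only on positive overflow.
    return _mm_add_epi32(converted, _mm_and_si128(pos, ovf));
}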
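Likewise, the SHUFPS 0xC9/0xD2 pair in OPMULA/OPMSUB above: 0xC9 rotates xyzw to yzxw and 0xD2 rotates xyzw to zxyw, so one packed multiply yields (Sy*Tz, Sz*Tx, Sx*Ty) in the xyz lanes, and pairing OPMULA with OPMSUB, with the operand registers swapped between the two instructions, composes a cross product. A hedged sketch; vu_cross and the rotation helpers are hypothetical names, and the clamping and flag updates of the real routines are omitted:

#include <xmmintrin.h>

static __m128 rot_yzxw(__m128 v) { return _mm_shuffle_ps(v, v, 0xC9); } // xyzw -> yzxw
static __m128 rot_zxyw(__m128 v) { return _mm_shuffle_ps(v, v, 0xD2); } // xyzw -> zxyw

// cross(s, t) in the xyz lanes; the w lane holds Sw*Tw - Tw*Sw = 0.
static __m128 vu_cross(__m128 s, __m128 t)
{
    __m128 acc = _mm_mul_ps(rot_yzxw(s), rot_zxyw(t));            // OPMULA: ACC = (Sy*Tz, Sz*Tx, Sx*Ty, ...)
    return _mm_sub_ps(acc, _mm_mul_ps(rot_zxyw(s), rot_yzxw(t))); // OPMSUB with swapped operands
}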