Linux: Fix bugs in _aligned_realloc and newVif's inlined SSE HashBucket finder.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2395 96395faa-99c1-11dd-bbfe-3dabce05a288
commit 3d9bb25505
parent 4b0b270776
Author: Jake.Stine
Date:   2009-12-24 10:04:03 +00:00

3 changed files with 390 additions and 375 deletions


@@ -28,7 +28,7 @@ static const uint headsize = sizeof(AlignedMallocHeader);
 void* __fastcall pcsx2_aligned_malloc(size_t size, size_t align)
 {
-    jASSUME( align < 0x10000 );
+    pxAssume( align < 0x10000 );

     u8* p = (u8*)malloc(size+align+headsize);
@@ -47,15 +47,16 @@ void* __fastcall pcsx2_aligned_malloc(size_t size, size_t align)
 void* __fastcall pcsx2_aligned_realloc(void* handle, size_t size, size_t align)
 {
-    if( handle == NULL ) return NULL;
-    jASSUME( align < 0x10000 );
-    AlignedMallocHeader* header = (AlignedMallocHeader*)((uptr)handle - headsize);
+    pxAssume( align < 0x10000 );

     void* newbuf = pcsx2_aligned_malloc( size, align );

-    memcpy_fast( newbuf, handle, std::min( size, header->size ) );
-    free( header->baseptr );
+    if( handle != NULL )
+    {
+        AlignedMallocHeader* header = (AlignedMallocHeader*)((uptr)handle - headsize);
+        memcpy_fast( newbuf, handle, std::min( size, header->size ) );
+        free( header->baseptr );
+    }

     return newbuf;
 }
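
The behavioral fix in the hunk above: pcsx2_aligned_realloc used to bail out with NULL when passed a NULL handle, so it could never create a buffer from scratch the way standard realloc can. The new code allocates first and only touches the old block's header when one actually exists. A minimal usage sketch of the fixed semantics (hypothetical caller, not part of the commit):

    void* buf = NULL;
    buf = pcsx2_aligned_realloc( buf, 64,  16 );  // NULL handle: now acts like pcsx2_aligned_malloc(64, 16)
    buf = pcsx2_aligned_realloc( buf, 128, 16 );  // copies min(new, old) bytes, then frees the old block
    pcsx2_aligned_free( buf );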
@@ -74,7 +74,7 @@ __forceinline void pcsx2_aligned_free(void* pmem)
 // memzero_obj and stuff).
 __forceinline void _memset16_unaligned( void* dest, u16 data, size_t size )
 {
-    jASSUME( (size & 0x1) == 0 );
+    pxAssume( (size & 0x1) == 0 );

     u16* dst = (u16*)dest;
     for(int i=size; i; --i, ++dst )


@@ -1,282 +1,282 @@
/* PCSX2 - PS2 Emulator for PCs
 * Copyright (C) 2002-2009 PCSX2 Dev Team
 *
 * PCSX2 is free software: you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Found-
 * ation, either version 3 of the License, or (at your option) any later version.
 *
 * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with PCSX2.
 * If not, see <http://www.gnu.org/licenses/>.
 */

// newVif Dynarec - Dynamically Recompiles Vif 'unpack' Packets
// authors: cottonvibes(@gmail.com)
//          Jake.Stine (@gmail.com)

#include "PrecompiledHeader.h"
#include "VifUnpackSSE.h"

#if newVif

static __aligned16 nVifBlock _vBlock = {0};
static __pagealigned u8 nVifMemCmp[__pagesize];

static void emitCustomCompare() {
    HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadWrite, false);
    memset8<0xcc>(nVifMemCmp);
    xSetPtr(nVifMemCmp);
    xMOVAPS  (xmm0, ptr32[ecx]);
    xPCMP.EQD(xmm0, ptr32[edx]);
    xMOVMSKPS(eax, xmm0);
    xAND     (eax, 0x7); // ignore top 4 bytes (recBlock pointer)
    xRET();
    HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadOnly, true);
}

void dVifInit(int idx) {
    nVif[idx].idx        = idx;
    nVif[idx].VU         = idx ? &VU1 : &VU0;
    nVif[idx].vif        = idx ? &vif1 : &vif0;
    nVif[idx].vifRegs    = idx ? vif1Regs : vif0Regs;
    nVif[idx].vuMemEnd   = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
    nVif[idx].vuMemLimit = idx ? 0x3ff0 : 0xff0;
    nVif[idx].vifCache   = new BlockBuffer(_1mb*4); // 4mb Rec Cache
    nVif[idx].vifBlocks  = new HashBucket<_tParams>();
    nVif[idx].recPtr     = nVif[idx].vifCache->getBlock();
    nVif[idx].recEnd     = &nVif[idx].recPtr[nVif[idx].vifCache->getSize()-(_1mb/4)]; // .25mb Safe Zone
    //emitCustomCompare();
}

// Loads Row/Col Data from vifRegs instead of g_vifmask
// Useful for testing vifReg and g_vifmask inconsistency.
static void loadRowCol(nVifStruct& v) {
    xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]);
    xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]);
    xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]);
    xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]);
    xPSHUF.D(xmm0, xmm0, _v0);
    xPSHUF.D(xmm1, xmm1, _v0);
    xPSHUF.D(xmm2, xmm2, _v0);
    xPSHUF.D(xmm6, xmm6, _v0);
    mVUmergeRegs(XMM6, XMM0, 8);
    mVUmergeRegs(XMM6, XMM1, 4);
    mVUmergeRegs(XMM6, XMM2, 2);
    xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]);
    xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]);
    xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]);
    xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]);
    xPSHUF.D(xmm2, xmm2, _v0);
    xPSHUF.D(xmm3, xmm3, _v0);
    xPSHUF.D(xmm4, xmm4, _v0);
    xPSHUF.D(xmm5, xmm5, _v0);
}

VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_)
    : v(vif_)
    , vB(vifBlock_)
{
    isFill = (vB.cl < vB.wl);
    usn    = (vB.upkType>>5) & 1;
    doMask = (vB.upkType>>4) & 1;
    doMode = vB.mode & 3;
}

#define makeMergeMask(x) { \
    x = ((x&0x40)>>6) | ((x&0x10)>>3) | (x&4) | ((x&1)<<3); \
}

_f void VifUnpackSSE_Dynarec::SetMasks(int cS) const {
    u32 m0 = vB.mask;
    u32 m1 = m0 & 0xaaaaaaaa;
    u32 m2 =(~m1>>1) &  m0;
    u32 m3 = (m1>>1) & ~m0;
    u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
    u32* col = (v.idx) ? g_vifmask.Col1 : g_vifmask.Col0;
    if((m2&&doMask) || doMode) { xMOVAPS(xmmRow, ptr32[row]); }
    if (m3&&doMask) {
        xMOVAPS(xmmCol0, ptr32[col]);
        if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
        if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
        if ((cS>=4) && (m3&0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3);
        if ((cS>=1) && (m3&0x000000ff)) xPSHUF.D(xmmCol0, xmmCol0, _v0);
    }
    //if (mask||mode) loadRowCol(v);
}

void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
    pxAssumeDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking.");
    int cc = aMin(vCL, 3);
    u32 m0 = (vB.mask >> (cc * 8)) & 0xff;
    u32 m1 = m0 & 0xaaaa;
    u32 m2 =(~m1>>1) &  m0;
    u32 m3 = (m1>>1) & ~m0;
    u32 m4 = (m1>>1) &  m0;
    makeMergeMask(m2);
    makeMergeMask(m3);
    makeMergeMask(m4);
    if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]);       } // Load Write Protect
    if (doMask&&m2) { mVUmergeRegs(regX.Id, xmmRow.Id,     m2); } // Merge Row
    if (doMask&&m3) { mVUmergeRegs(regX.Id, xmmCol0.Id+cc, m3); } // Merge Col
    if (doMask&&m4) { mVUmergeRegs(regX.Id, xmmTemp.Id,    m4); } // Merge Write Protect
    if (doMode) {
        u32 m5 = (~m1>>1) & ~m0;
        if (!doMask) m5 = 0xf;
        else         makeMergeMask(m5);
        if (m5 < 0xf) {
            xPXOR(xmmTemp, xmmTemp);
            mVUmergeRegs(xmmTemp.Id, xmmRow.Id, m5);
            xPADD.D(regX, xmmTemp);
            if (doMode==2) mVUmergeRegs(xmmRow.Id, regX.Id, m5);
        }
        else if (m5 == 0xf) {
            xPADD.D(regX, xmmRow);
            if (doMode==2) xMOVAPS(xmmRow, regX);
        }
    }
    xMOVAPS(ptr32[dstIndirect], regX);
}

void VifUnpackSSE_Dynarec::writeBackRow() const {
    u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
    xMOVAPS(ptr32[row], xmmRow);
    DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
    // ToDo: Do we need to write back to vifregs.rX too!? :/
}

static void ShiftDisplacementWindow( xAddressInfo& addr, const xRegister32& modReg )
{
    // Shifts the displacement factor of a given indirect address, so that the address
    // remains in the optimal 0xf0 range (which allows for byte-form displacements when
    // generating instructions).
    int addImm = 0;
    while( addr.Displacement >= 0x80 )
    {
        addImm += 0xf0;
        addr   -= 0xf0;
    }
    if(addImm) xADD(modReg, addImm);
}
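
A quick worked trace of the displacement window above (starting offset hypothetical):

    // Suppose dstIndirect has drifted to [ecx+0x1f0]:
    //   pass 1: addImm = 0x0f0, Displacement = 0x100   (still >= 0x80, loop again)
    //   pass 2: addImm = 0x1e0, Displacement = 0x010   (done)
    // A single xADD(ecx, 0x1e0) is emitted, and later accesses encode as
    // [ecx+0x10], which fits x86's one-byte sign-extended displacement form.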
void VifUnpackSSE_Dynarec::CompileRoutine() {
    const int upkNum    = vB.upkType & 0xf;
    const u8& vift      = nVifT[upkNum];
    const int cycleSize = isFill ? vB.cl : vB.wl;
    const int blockSize = isFill ? vB.wl : vB.cl;
    const int skipSize  = blockSize - cycleSize;
    int vNum = vifRegs->num;
    vCL      = vif->cl;
    SetMasks(cycleSize);
    while (vNum) {
        ShiftDisplacementWindow( srcIndirect, edx );
        ShiftDisplacementWindow( dstIndirect, ecx );
        if (vCL < cycleSize) {
            xUnpack(upkNum);
            srcIndirect += vift;
            dstIndirect += 16;
            vNum--;
            if (++vCL == blockSize) vCL = 0;
        }
        else if (isFill) {
            DevCon.WriteLn("filling mode!");
            VifUnpackSSE_Dynarec::FillingWrite( *this ).xUnpack(upkNum);
            dstIndirect += 16;
            vNum--;
            if (++vCL == blockSize) vCL = 0;
        }
        else {
            dstIndirect += (16 * skipSize);
            vCL = 0;
        }
    }
    if (doMode==2) writeBackRow();
    xMOV(ptr32[&vif->cl],      vCL);
    xMOV(ptr32[&vifRegs->num], vNum);
    xRET();
}

static _f u8* dVifsetVUptr(const nVifStruct& v, int offset) {
    u8* ptr    = (u8*)(v.VU->Mem + (offset & v.vuMemLimit));
    u8* endPtr = ptr + _vBlock.num * 16;
    if (endPtr > v.vuMemEnd) {
        DevCon.WriteLn("nVif - VU Mem Ptr Overflow; falling back to interpreter.");
        ptr = NULL; // Fall Back to Interpreters which have wrap-around logic
    }
    return ptr;
}

static _f void dVifRecLimit(int idx) {
    if (nVif[idx].recPtr > nVif[idx].recEnd) {
        DevCon.WriteLn("nVif Rec - Out of Rec Cache! [%x > %x]", nVif[idx].recPtr, nVif[idx].recEnd);
        nVif[idx].vifBlocks->clear();
        nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
    }
}

_f void dVifUnpack(int idx, u8 *data, u32 size, bool isFill) {
    const nVifStruct& v = nVif[idx];
    const u8  upkType   = vif->cmd & 0x1f | ((!!vif->usn) << 5);
    const int doMask    = (upkType>>4) & 1;
    const int cycle_cl  = vifRegs->cycle.cl;
    const int cycle_wl  = vifRegs->cycle.wl;
    const int cycleSize = isFill ? cycle_cl : cycle_wl;
    const int blockSize = isFill ? cycle_wl : cycle_cl;

    if (vif->cl >= blockSize) vif->cl = 0;

    _vBlock.upkType = upkType;
    _vBlock.num     = *(u8*)&vifRegs->num;
    _vBlock.mode    = *(u8*)&vifRegs->mode;
    _vBlock.scl     = vif->cl;
    _vBlock.cl      = cycle_cl;
    _vBlock.wl      = cycle_wl;

    // Zero out the mask parameter if it's unused -- games leave random junk
    // values here which cause false recblock cache misses.
    _vBlock.mask = doMask ? vifRegs->mask : 0x00;

    if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) {
        if( u8* dest = dVifsetVUptr(v, vif->tag.addr) ) {
            //DevCon.WriteLn("Running Recompiled Block!");
            ((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
        }
        else {
            //DevCon.WriteLn("Running Interpreter Block");
            _nVifUnpack(idx, data, size, isFill);
        }
        return;
    }

    static int recBlockNum = 0;
    DevCon.WriteLn("nVif: Recompiled Block! [%d]", recBlockNum++);
    DevCon.WriteLn(L"\t(num=0x%02x, upkType=0x%02x, mode=0x%02x, scl=0x%02x, cl/wl=0x%x/0x%x, mask=%s)",
        _vBlock.num, _vBlock.upkType, _vBlock.mode, _vBlock.scl, _vBlock.cl, _vBlock.wl,
        doMask ? wxsFormat( L"0x%08x", _vBlock.mask ).c_str() : L"ignored"
    );

    xSetPtr(v.recPtr);
    _vBlock.startPtr = (uptr)xGetAlignedCallTarget();
    v.vifBlocks->add(_vBlock);
    VifUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine();
    nVif[idx].recPtr = xGetPtr();

    dVifRecLimit(idx);

    // Run the block we just compiled. Various conditions may force us to still use
    // the interpreter unpacker though, so a recursive call is the safest way here...
    dVifUnpack(idx, data, size, isFill);
}

#endif
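
dVifUnpack above is a find-or-compile dispatcher: it builds a normalized lookup key in _vBlock (note the mask zeroing, which keeps the byte-wise key compare from missing on leftover junk), probes the HashBucket, runs the cached block on a hit, and on a miss compiles a new block and recurses so the fresh block is dispatched through the same safety-checked path. The control flow in outline (a sketch of the code above, not a new API):

    // 1. fill _vBlock; zero _vBlock.mask whenever doMask is false
    // 2. b = vifBlocks->find(&_vBlock)
    // 3. hit : dest = dVifsetVUptr(...); call b->startPtr, else _nVifUnpack fallback
    // 4. miss: CompileRoutine() at recPtr, vifBlocks->add(_vBlock), then
    //          re-enter dVifUnpack so step 2 now finds the new block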


@@ -1,86 +1,100 @@
 /* PCSX2 - PS2 Emulator for PCs
  * Copyright (C) 2002-2009 PCSX2 Dev Team
  *
  * PCSX2 is free software: you can redistribute it and/or modify it under the terms
  * of the GNU Lesser General Public License as published by the Free Software Found-
  * ation, either version 3 of the License, or (at your option) any later version.
  *
  * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  * PURPOSE. See the GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License along with PCSX2.
  * If not, see <http://www.gnu.org/licenses/>.
  */

 #include "xmmintrin.h"

 #pragma once

+// Create some typecast operators for SIMD operations. For some reason MSVC needs a
+// handle/reference typecast to avoid error. GCC (and presumably other compilers)
+// generate an error if the handle/ref is used. Honestly neither makes sense, since
+// both typecasts should be perfectly valid >_<. --air
+#ifdef _MSC_VER
+#   define cast_m128    __m128&
+#   define cast_m128i   __m128i&
+#   define cast_m128d   __m128d&
+#else // defined(__GNUC__)
+#   define cast_m128    __m128
+#   define cast_m128i   __m128i
+#   define cast_m128d   __m128d
+#endif
+
 template< typename T >
 struct SizeChain
 {
     int Size;
     T*  Chain;
 };

 // HashBucket is a container which uses a built-in hash function
 // to perform quick searches.
 // T is a struct data type (note: size must be in multiples of 16 bytes!)
 // hSize determines the number of buckets HashBucket will use for sorting.
 // cmpSize is the size of data to consider 2 structs equal (see find())
 // The hash function is determined by taking the first bytes of data and
 // performing a modulus the size of hSize. So the most diverse-data should
 // be in the first bytes of the struct. (hence why nVifBlock is specifically sorted)
 template<typename T, int hSize, int cmpSize>
 class HashBucket {
 protected:
     SizeChain<T> mBucket[hSize];
 public:
     HashBucket() {
         for (int i = 0; i < hSize; i++) {
             mBucket[i].Chain = NULL;
             mBucket[i].Size  = 0;
         }
     }
     ~HashBucket() { clear(); }
     int quickFind(u32 data) {
         return mBucket[data % hSize].Size;
     }
     __forceinline T* find(T* dataPtr) {
         u32 d = *((u32*)dataPtr);
         const SizeChain<T>& bucket( mBucket[d % hSize] );
         for (int i=bucket.Size; i; --i) {
             // This inline version seems about 1-2% faster in tests of games that average 1
             // program per bucket. Games that average more should see a bigger improvement --air
-            int result = _mm_movemask_ps( (__m128&) _mm_cmpeq_epi32( _mm_load_si128((__m128i*)&bucket.Chain[i]), _mm_load_si128((__m128i*)dataPtr) ) ) & 0x7;
+            int result = _mm_movemask_ps( (cast_m128) _mm_cmpeq_epi32( _mm_load_si128((__m128i*)&bucket.Chain[i]), _mm_load_si128((__m128i*)dataPtr) ) ) & 0x7;
             if( result == 0x7 ) return &bucket.Chain[i];

             // Dynamically generated function version, can't be inlined. :(
             //if ((((nVifCall)((void*)nVifMemCmp))(&bucket.Chain[i], dataPtr))==7) return &bucket.Chain[i];

             //if (!memcmp(&bucket.Chain[i], dataPtr, sizeof(T)-4)) return &c[i]; // old school version! >_<
         }
         if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size );
         return NULL;
     }
     __forceinline void add(const T& dataPtr) {
         u32 d = (u32&)dataPtr;
         SizeChain<T>& bucket( mBucket[d % hSize] );

         if( bucket.Chain = (T*)_aligned_realloc( bucket.Chain, sizeof(T)*(bucket.Size+1), 16), bucket.Chain==NULL ) {
             throw Exception::OutOfMemory(
                 wxsFormat(L"Out of memory re-allocating hash bucket (bucket size=%d)", bucket.Size+1),
                 wxEmptyString
             );
         }
         memcpy_fast(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
     }
     void clear() {
         for (int i = 0; i < hSize; i++) {
             safe_aligned_free(mBucket[i].Chain);
             mBucket[i].Size = 0;
         }
     }
 };
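
The (cast_m128) change above is the "inlined SSE HashBucket finder" fix from the commit message: GCC rejects MSVC's reference-style (__m128&) typecast, so the macro selects a per-compiler cast. For reference, the same compare-12-of-16-bytes trick can also be written with the standard SSE2 cast intrinsic, which both compilers accept; a self-contained sketch (standalone toy, not PCSX2 code):

    #include <emmintrin.h>  // SSE2: _mm_load_si128, _mm_cmpeq_epi32, _mm_castsi128_ps
    #include <xmmintrin.h>  // SSE : _mm_movemask_ps

    // Compare the first 12 bytes of two 16-byte-aligned, 16-byte structs,
    // ignoring the last 4 (the slot reserved for the recBlock pointer).
    static bool keyEquals12(const void* a, const void* b)
    {
        __m128i eq = _mm_cmpeq_epi32(_mm_load_si128((const __m128i*)a),
                                     _mm_load_si128((const __m128i*)b)); // per-dword equality
        int mask   = _mm_movemask_ps(_mm_castsi128_ps(eq));              // one sign bit per dword
        return (mask & 0x7) == 0x7;                                      // dwords 0-2 equal, dword 3 ignored
    }

_mm_castsi128_ps compiles to no instructions, so this form costs the same as the macro typecast.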