newVif: I'm a terrible person. What started out as an optimization turned into this.

* Optimized codegen of the VPU recompiler using displaced memory offsets (1-2% speedup; see the sketch below)
* Undid a lot of the inl stuff in favor of a more traditional cpp code layout (explained below)
* Removed some redundant code and turned some macros into functions.
* Renamed a few things to VPU (Vector Processing Unit, the specific name of the logic core that performs VIF command processing and unpacking on the PS2)
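For context on the first bullet: x86 memory operands encode small displacements (-128..+127) in a single byte, while anything larger costs four bytes per instruction. The recompiler now periodically folds accumulated offsets back into the base register so the stream of unpack loads/stores keeps using byte-form displacements; the `ShiftDisplacementWindow()` helper in `VpuUnpackSSE_Dynarec.cpp` below does exactly this. A minimal standalone sketch of the idea (names hypothetical, not the shipped code):

```cpp
// Illustrative only: keep an indirect address inside the byte-form
// displacement window by folding the excess into the base register.
struct Addr { int disp; };              // stand-in for xAddressInfo

void shiftWindow(Addr& addr, int& base) // 'base' models ecx/edx
{
    int addImm = 0;
    while (addr.disp >= 0x80) {         // too big for a 1-byte displacement
        addImm    += 0xf0;              // slide the window forward...
        addr.disp -= 0xf0;              // ...keeping the effective address equal
    }
    if (addImm) base += addImm;         // one ADD amortized over many accesses
}
```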

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2387 96395faa-99c1-11dd-bbfe-3dabce05a288
Jake.Stine 2009-12-23 13:22:30 +00:00
parent 5c8f4ded22
commit b27b89b162
16 changed files with 827 additions and 806 deletions

View File

@@ -99,15 +99,16 @@ extern pxDoAssertFnType* pxDoAssert;
// rear ugly heads in optimized builds, this is one of the few tools we have.
#define pxAssertRel(cond, msg) ( (likely(cond)) || (pxOnAssert(pxAssertSpot(cond), msg), false) )
-#define pxAssumeMsg(cond, msg) ((void) ( (!likely(cond)) && (pxOnAssert(pxAssertSpot(cond), msg), false) ))
+#define pxAssumeRel(cond, msg) ((void) ( (!likely(cond)) && (pxOnAssert(pxAssertSpot(cond), msg), false) ))
+#define pxFailRel(msg) pxAssumeRel(false, msg)

#if defined(PCSX2_DEBUG)
# define pxAssertMsg(cond, msg) pxAssertRel(cond, msg)
# define pxAssertDev(cond, msg) pxAssertMsg(cond, msg)
-# define pxAssume(cond) pxAssumeMsg(cond, wxNullChar)
-# define pxAssumeDev(cond, msg) pxAssumeMsg(cond, msg)
+# define pxAssumeMsg(cond, msg) pxAssumeRel(cond, msg)
+# define pxAssumeDev(cond, msg) pxAssumeRel(cond, msg)
# define pxFail(msg) pxAssumeMsg(false, msg)
# define pxFailDev(msg) pxAssumeDev(false, msg)

@@ -120,7 +121,7 @@ extern pxDoAssertFnType* pxDoAssert;
# define pxAssertMsg(cond, msg) (likely(cond))
# define pxAssertDev(cond, msg) pxAssertRel(cond, msg)
-# define pxAssume(cond) (__assume(cond))
+# define pxAssumeMsg(cond, msg) (__assume(cond))
# define pxAssumeDev(cond, msg) pxAssumeMsg(cond, msg)
# define pxFail(msg) (__assume(false))

@@ -134,7 +135,7 @@ extern pxDoAssertFnType* pxDoAssert;
# define pxAssertMsg(cond, msg) (likely(cond))
# define pxAssertDev(cond, msg) (likely(cond))
-# define pxAssume(cond) (__assume(cond))
+# define pxAssumeMsg(cond, msg) (__assume(cond))
# define pxAssumeDev(cond, msg) (__assume(cond))
# define pxFail(msg) (__assume(false))

@@ -143,6 +144,7 @@ extern pxDoAssertFnType* pxDoAssert;
#endif

#define pxAssert(cond) pxAssertMsg(cond, wxNullChar)
+#define pxAssume(cond) pxAssumeMsg(cond, wxNullChar)
#define pxAssertRelease( cond, msg )

View File

@@ -490,6 +490,11 @@ template< typename T > void xWrite( T val );
__forceinline xAddressInfo operator+( s32 imm ) const { return xAddressInfo( *this ).Add( imm ); }
__forceinline xAddressInfo operator-( s32 imm ) const { return xAddressInfo( *this ).Add( -imm ); }
__forceinline xAddressInfo operator+( const void* addr ) const { return xAddressInfo( *this ).Add( (uptr)addr ); }
+__forceinline void operator+=( const xAddressReg& right ) { Add( right ); }
+__forceinline void operator+=( const xAddressInfo& right ) { Add( right ); }
+__forceinline void operator+=( s32 imm ) { Add( imm ); }
+__forceinline void operator-=( s32 imm ) { Add( -imm ); }
};

extern const xRegisterSSE

View File

@@ -515,6 +515,10 @@
<Unit filename="../x86/ix86-32/iR5900Shift.cpp" />
<Unit filename="../x86/ix86-32/iR5900Templates.cpp" />
<Unit filename="../x86/ix86-32/recVTLB.cpp" />
+<Unit filename="../x86/VpuUnpackSSE.cpp" />
+<Unit filename="../x86/VpuUnpackSSE.h" />
+<Unit filename="../x86/VpuUnpackSSE_Dynarec.cpp" />
+<Unit filename="../x86/newVif_Unpack.cpp" />
<Unit filename="../x86/microVU.cpp" />
<Unit filename="../x86/microVU.h" />
<Unit filename="../x86/microVU_Alloc.inl" />

@@ -534,12 +538,8 @@
<Unit filename="../x86/microVU_Upper.inl" />
<Unit filename="../x86/newVif.h" />
<Unit filename="../x86/newVif_BlockBuffer.h" />
-<Unit filename="../x86/newVif_Dynarec.inl" />
<Unit filename="../x86/newVif_HashBucket.h" />
<Unit filename="../x86/newVif_OldUnpack.inl" />
-<Unit filename="../x86/newVif_Tables.inl" />
-<Unit filename="../x86/newVif_Unpack.inl" />
-<Unit filename="../x86/newVif_UnpackGen.inl" />
<Unit filename="../x86/sVU_Debug.h" />
<Unit filename="../x86/sVU_Lower.cpp" />
<Unit filename="../x86/sVU_Micro.cpp" />

View File

@@ -320,7 +320,6 @@ static int __fastcall Vif1TransDirectHL(u32 *data)
static int __fastcall Vif1TransUnpack(u32 *data)
{
#ifdef newVif1
-	extern int nVifUnpack(int idx, u8 *data);
	return nVifUnpack(1, (u8*)data);
#endif

View File

@@ -75,19 +75,21 @@ template<const u32 VIFdmanum> void ProcessMemSkip(u32 size, u32 unpackType);
template<const u32 VIFdmanum> u32 VIFalign(u32 *data, vifCode *v, u32 size);
template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size);
template<const u32 VIFdmanum> void vuExecMicro(u32 addr);
-extern __forceinline void vif0FLUSH();
-extern __forceinline void vif1FLUSH();
+extern void vif0FLUSH();
+extern void vif1FLUSH();

static __forceinline u32 vif_size(u8 num)
{
	return (num == 0) ? 0x1000 : 0x4000;
}

-//#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
-//#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined)
+#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
+#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined)
//#define newVif0 // Use New Code for Vif0 Unpacks (not implemented)

-#ifndef newVif
+#ifdef newVif
+extern int nVifUnpack(int idx, u8 *data);
+#else
//# define NON_SSE_UNPACKS // Turns off SSE Unpacks (slower)
#endif

View File

@@ -860,7 +860,7 @@
	>
	</File>
	<File
-		RelativePath="..\..\x86\newVif_Unpack.inl"
+		RelativePath="..\..\x86\newVif_Unpack.cpp"
	>
	</File>
	<File

@@ -871,11 +871,19 @@
	Name="Dynarec"
	>
	<File
-		RelativePath="..\..\x86\newVif_Dynarec.inl"
+		RelativePath="..\..\x86\newVif_Tables.inl"
	>
	</File>
	<File
-		RelativePath="..\..\x86\newVif_Tables.inl"
+		RelativePath="..\..\x86\VpuUnpackSSE.cpp"
+	>
+	</File>
+	<File
+		RelativePath="..\..\x86\VpuUnpackSSE.h"
+	>
+	</File>
+	<File
+		RelativePath="..\..\x86\VpuUnpackSSE_Dynarec.cpp"
	>
	</File>
</Filter>

pcsx2/x86/VpuUnpackSSE.cpp Normal file (285 lines)
View File

@@ -0,0 +1,285 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "VpuUnpackSSE.h"
// Width-based move helpers. The 8/16/32-bit forms all use MOVSS (a scalar
// load that zeroes the upper lanes) and still fetch a full 32 bits; callers
// shift/unpack the excess away. 64/128-bit forms use unaligned MOVUPS.
#define xMOV8(regX, loc) xMOVSSZX(regX, loc)
#define xMOV16(regX, loc) xMOVSSZX(regX, loc)
#define xMOV32(regX, loc) xMOVSSZX(regX, loc)
#define xMOV64(regX, loc) xMOVUPS(regX, loc)
#define xMOV128(regX, loc) xMOVUPS(regX, loc)
static __pagealigned u8 nVifUpkExec[__pagesize*4];
// =====================================================================================================
// VpuUnpackSSE_Base Section
// =====================================================================================================
VpuUnpackSSE_Base::VpuUnpackSSE_Base()
: dstIndirect(ecx) // parameter 1 of __fastcall
, srcIndirect(edx) // parameter 2 of __fastcall
{
}
void VpuUnpackSSE_Base::xMovDest(const xRegisterSSE& srcReg) const {
if (!doMode && !doMask) { xMOVAPS (ptr[dstIndirect], srcReg); }
else { doMaskWrite(srcReg); }
}
void VpuUnpackSSE_Base::xShiftR(const xRegisterSSE& regX, int n) const {
if (usn) { xPSRL.D(regX, n); }
else { xPSRA.D(regX, n); }
}
void VpuUnpackSSE_Base::xPMOVXX8(const xRegisterSSE& regX) const {
if (usn) xPMOVZX.BD(regX, ptr32[srcIndirect]);
else xPMOVSX.BD(regX, ptr32[srcIndirect]);
}
void VpuUnpackSSE_Base::xPMOVXX16(const xRegisterSSE& regX) const {
if (usn) xPMOVZX.WD(regX, ptr64[srcIndirect]);
else xPMOVSX.WD(regX, ptr64[srcIndirect]);
}
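These two helpers wrap SSE4.1's PMOVSX/PMOVZX family, which widens packed 8- or 16-bit lanes to 32 bits in a single instruction; the non-SSE4 fallbacks below emulate that with a load, punpck, and an arithmetic or logical shift. A rough intrinsics equivalent of the 8-bit case (illustrative only, not part of this commit):

```cpp
#include <smmintrin.h> // SSE4.1

// Widen the low four bytes of 'src' to four 32-bit lanes, choosing
// zero- or sign-extension the same way xPMOVXX8 does via 'usn'.
__m128i widen8to32(__m128i src, bool usn)
{
    return usn ? _mm_cvtepu8_epi32(src)   // PMOVZXBD (unsigned)
               : _mm_cvtepi8_epi32(src);  // PMOVSXBD (signed)
}
```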
void VpuUnpackSSE_Base::xUPK_S_32() const {
xMOV32 (xmm0, ptr32[srcIndirect]);
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
void VpuUnpackSSE_Base::xUPK_S_16() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0);
}
else {
xMOV16 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
void VpuUnpackSSE_Base::xUPK_S_8() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0);
}
else {
xMOV8 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
void VpuUnpackSSE_Base::xUPK_V2_32() const {
xMOV64 (xmm0, ptr32[srcIndirect]);
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V2_16() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0);
}
else {
xMOV32 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V2_8() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0);
}
else {
xMOV16 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V3_32() const {
xMOV128 (xmm0, ptr32[srcIndirect]);
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V3_16() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0);
}
else {
xMOV64 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V3_8() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0);
}
else {
xMOV32 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V4_32() const {
xMOV128 (xmm0, ptr32[srcIndirect]);
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V4_16() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0);
}
else {
xMOV64 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V4_8() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0);
}
else {
xMOV32 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V4_5() const {
xMOV16 (xmm0, ptr32[srcIndirect]);
xPSHUF.D (xmm0, xmm0, _v0);
xPSLL.D (xmm0, 3); // ABG|R5.000
xMOVAPS (xmm1, xmm0); // x|x|x|R
xPSRL.D (xmm0, 8); // ABG
xPSLL.D (xmm0, 3); // AB|G5.000
mVUmergeRegs(XMM1, XMM0, 0x4); // x|x|G|R
xPSRL.D (xmm0, 8); // AB
xPSLL.D (xmm0, 3); // A|B5.000
mVUmergeRegs(XMM1, XMM0, 0x2); // x|B|G|R
xPSRL.D (xmm0, 8); // A
xPSLL.D (xmm0, 7); // A.0000000
mVUmergeRegs(XMM1, XMM0, 0x1); // A|B|G|R
xPSLL.D (xmm1, 24); // can optimize to
xPSRL.D (xmm1, 24); // single AND...
xMovDest (xmm1);
}
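What that shift/merge chain computes, written out as scalar C for reference (derived from the comments above; V4-5 is the 16-bit RGBA 5:5:5:1 format):

```cpp
#include <cstdint>

// Scalar model of xUPK_V4_5: expand a 16-bit A1|B5|G5|R5 value into four
// 32-bit fields. The final shl/shr-by-24 pair above is what clamps each
// lane to its low byte, matching the masks applied here.
void unpackV4_5(uint16_t src, uint32_t dst[4])
{
    dst[0] = ( src        & 0x1f) << 3; // R: RRRRR000
    dst[1] = ((src >>  5) & 0x1f) << 3; // G: GGGGG000
    dst[2] = ((src >> 10) & 0x1f) << 3; // B: BBBBB000
    dst[3] = ((src >> 15) & 0x01) << 7; // A: A0000000
}
```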
void VpuUnpackSSE_Base::xUnpack( int upknum )
{
switch( upknum )
{
case 0: xUPK_S_32(); break;
case 1: xUPK_S_16(); break;
case 2: xUPK_S_8(); break;
case 4: xUPK_V2_32(); break;
case 5: xUPK_V2_16(); break;
case 6: xUPK_V2_8(); break;
case 8: xUPK_V3_32(); break;
case 9: xUPK_V3_16(); break;
case 10: xUPK_V3_8(); break;
case 12: xUPK_V4_32(); break;
case 13: xUPK_V4_16(); break;
case 14: xUPK_V4_8(); break;
case 15: xUPK_V4_5(); break;
case 3:
case 7:
case 11:
pxFailRel( wxsFormat( L"Vpu/Vif - Invalid Unpack! [%d]", upknum ) );
break;
}
}
// =====================================================================================================
// VpuUnpackSSE_Simple
// =====================================================================================================
VpuUnpackSSE_Simple::VpuUnpackSSE_Simple(bool usn_, bool domask_, int curCycle_)
{
curCycle = curCycle_;
usn = usn_;
doMask = domask_;
}
void VpuUnpackSSE_Simple::doMaskWrite(const xRegisterSSE& regX) const {
xMOVAPS(xmm7, ptr[dstIndirect]);
int offX = aMin(curCycle, 3);
xPAND(regX, ptr32[nVifMask[0][offX]]);
xPAND(xmm7, ptr32[nVifMask[1][offX]]);
xPOR (regX, ptr32[nVifMask[2][offX]]);
xPOR (regX, xmm7);
xMOVAPS(ptr[dstIndirect], regX);
}
// ecx = dest, edx = src
static void nVifGen(int usn, int mask, int curCycle) {
int usnpart = usn*2*16;
int maskpart = mask*16;
int curpart = curCycle;
VpuUnpackSSE_Simple vpugen( !!usn, !!mask, curCycle );
for( int i=0; i<16; ++i )
{
nVifCall& ucall( nVifUpk[((usnpart+maskpart+i) * 4) + (curpart)] );
ucall = NULL;
if( nVifT[i] == 0 ) continue;
ucall = (nVifCall)xGetAlignedCallTarget();
vpugen.xUnpack(i);
xRET();
pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );
}
}
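The call-table addressing in nVifGen mirrors the declared shape of nVifUpk, ([USN][Masking][Unpack Type])[curCycle]; spelled out as a helper (illustrative restatement of the arithmetic above):

```cpp
// Flat index into nVifUpk[(2*2*16) * 4], matching nVifGen's
// ((usnpart + maskpart + i) * 4) + curpart computation.
int nVifUpkIndex(int usn, int mask, int upkType, int curCycle)
{
    return ((usn * 2 * 16) + (mask * 16) + upkType) * 4 + curCycle;
}
```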
void VpuUnpackSSE_Init()
{
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false);
memset8<0xcc>( nVifUpkExec );
xSetPtr( nVifUpkExec );
for (int a = 0; a < 2; a++) {
for (int b = 0; b < 2; b++) {
for (int c = 0; c < 4; c++) {
nVifGen(a, b, c);
}}}
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
}

pcsx2/x86/VpuUnpackSSE.h Normal file (134 lines)
View File

@@ -0,0 +1,134 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "Common.h"
#include "VifDma_internal.h"
#include "newVif.h"
#include <xmmintrin.h>
#include <emmintrin.h>
using namespace x86Emitter;
// --------------------------------------------------------------------------------------
// VpuUnpackSSE_Base
// --------------------------------------------------------------------------------------
class VpuUnpackSSE_Base
{
public:
bool usn; // unsigned flag
bool doMask; // masking write enable flag
int doMode; // two bit value representing... something!
protected:
xAddressInfo dstIndirect;
xAddressInfo srcIndirect;
public:
VpuUnpackSSE_Base();
virtual ~VpuUnpackSSE_Base() throw() {}
void xUnpack( int upktype );
protected:
virtual void doMaskWrite(const xRegisterSSE& regX ) const=0;
virtual void xMovDest(const xRegisterSSE& srcReg) const;
virtual void xShiftR(const xRegisterSSE& regX, int n) const;
virtual void xPMOVXX8(const xRegisterSSE& regX) const;
virtual void xPMOVXX16(const xRegisterSSE& regX) const;
virtual void xUPK_S_32() const;
virtual void xUPK_S_16() const;
virtual void xUPK_S_8() const;
virtual void xUPK_V2_32() const;
virtual void xUPK_V2_16() const;
virtual void xUPK_V2_8() const;
virtual void xUPK_V3_32() const;
virtual void xUPK_V3_16() const;
virtual void xUPK_V3_8() const;
virtual void xUPK_V4_32() const;
virtual void xUPK_V4_16() const;
virtual void xUPK_V4_8() const;
virtual void xUPK_V4_5() const;
};
// --------------------------------------------------------------------------------------
// VpuUnpackSSE_Simple
// --------------------------------------------------------------------------------------
class VpuUnpackSSE_Simple : public VpuUnpackSSE_Base
{
typedef VpuUnpackSSE_Base _parent;
public:
int curCycle;
public:
VpuUnpackSSE_Simple(bool usn_, bool domask_, int curCycle_);
virtual ~VpuUnpackSSE_Simple() throw() {}
protected:
virtual void doMaskWrite(const xRegisterSSE& regX ) const;
};
// --------------------------------------------------------------------------------------
// VpuUnpackSSE_Dynarec
// --------------------------------------------------------------------------------------
class VpuUnpackSSE_Dynarec : public VpuUnpackSSE_Base
{
typedef VpuUnpackSSE_Base _parent;
public:
bool isFill;
protected:
const nVifStruct& v; // vif0 or vif1
const nVifBlock& vB; // some pre-collected data from VifStruct
int vCL; // internal copy of vif->cl
public:
VpuUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_);
VpuUnpackSSE_Dynarec(const VpuUnpackSSE_Dynarec& src) // copy constructor
: _parent(src)
, v(src.v)
, vB(src.vB)
{
isFill = src.isFill;
vCL = src.vCL;
}
virtual ~VpuUnpackSSE_Dynarec() throw() {}
void CompileRoutine();
protected:
virtual void doMaskWrite(const xRegisterSSE& regX) const;
void SetMasks(int cS) const;
void writeBackRow() const;
static VpuUnpackSSE_Dynarec FillingWrite( const VpuUnpackSSE_Dynarec& src )
{
VpuUnpackSSE_Dynarec fillingWrite( src );
fillingWrite.doMask = true;
fillingWrite.doMode = 0;
return fillingWrite;
}
};
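Usage note: VpuUnpackSSE_Simple is instantiated once per (usn, mask, cycle) combination to populate the nVifUpk call table, while VpuUnpackSSE_Dynarec is constructed per cached nVifBlock. The two call sites, as they appear elsewhere in this commit:

```cpp
// From nVifGen() in VpuUnpackSSE.cpp:
VpuUnpackSSE_Simple vpugen( !!usn, !!mask, curCycle );
vpugen.xUnpack(i);

// From dVifUnpack() in VpuUnpackSSE_Dynarec.cpp:
VpuUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine();
```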

View File

@@ -0,0 +1,278 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
// newVif Dynarec - Dynamically Recompiles Vif 'unpack' Packets
// authors: cottonvibes(@gmail.com)
// Jake.Stine (@gmail.com)
#include "PrecompiledHeader.h"
#include "VpuUnpackSSE.h"
static __aligned16 nVifBlock _vBlock = {0};
static __pagealigned u8 nVifMemCmp[__pagesize];
static void emitCustomCompare() {
HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadWrite, false);
memset8<0xcc>(nVifMemCmp);
xSetPtr(nVifMemCmp);
xMOVAPS (xmm0, ptr32[ecx]);
xPCMP.EQD(xmm0, ptr32[edx]);
xMOVMSKPS(eax, xmm0);
xAND (eax, 0x7); // ignore top 4 bytes (recBlock pointer)
xRET();
HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadOnly, true);
}
void dVifInit(int idx) {
nVif[idx].idx = idx;
nVif[idx].VU = idx ? &VU1 : &VU0;
nVif[idx].vif = idx ? &vif1 : &vif0;
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
nVif[idx].vifCache = new BlockBuffer(_1mb*4); // 4mb Rec Cache
nVif[idx].vifBlocks = new HashBucket<_tParams>();
nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
nVif[idx].recEnd = &nVif[idx].recPtr[nVif[idx].vifCache->getSize()-(_1mb/4)]; // .25mb Safe Zone
//emitCustomCompare();
}
// Loads Row/Col Data from vifRegs instead of g_vifmask
// Useful for testing vifReg and g_vifmask inconsistency.
static void loadRowCol(nVifStruct& v) {
xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]);
xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]);
xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]);
xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]);
xPSHUF.D(xmm0, xmm0, _v0);
xPSHUF.D(xmm1, xmm1, _v0);
xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm6, xmm6, _v0);
mVUmergeRegs(XMM6, XMM0, 8);
mVUmergeRegs(XMM6, XMM1, 4);
mVUmergeRegs(XMM6, XMM2, 2);
xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]);
xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]);
xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]);
xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]);
xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm3, xmm3, _v0);
xPSHUF.D(xmm4, xmm4, _v0);
xPSHUF.D(xmm5, xmm5, _v0);
}
VpuUnpackSSE_Dynarec::VpuUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_)
: v(vif_)
, vB(vifBlock_)
{
isFill = (vB.cl < vB.wl);
usn = (vB.upkType>>5) & 1;
doMask = (vB.upkType>>4) & 1;
doMode = vB.mode & 3;
}
// Rearranges the per-field bits of a vif mask byte (x,y,z,w at bits 0,2,4,6)
// into the xyzw nibble order used by mVUmergeRegs (x=8, y=4, z=2, w=1).
#define makeMergeMask(x) { \
	x = ((x&0x40)>>6) | ((x&0x10)>>3) | (x&4) | ((x&1)<<3); \
}
_f void VpuUnpackSSE_Dynarec::SetMasks(int cS) const {
u32 m0 = vB.mask;
u32 m1 = m0 & 0xaaaaaaaa;
u32 m2 =(~m1>>1) & m0;
u32 m3 = (m1>>1) & ~m0;
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
u32* col = (v.idx) ? g_vifmask.Col1 : g_vifmask.Col0;
if((m2&&doMask) || doMode) { xMOVAPS(xmmRow, ptr32[row]); }
if (m3&&doMask) {
xMOVAPS(xmmCol0, ptr32[col]);
if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
if ((cS>=4) && (m3&0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3);
if ((cS>=1) && (m3&0x000000ff)) xPSHUF.D(xmmCol0, xmmCol0, _v0);
}
//if (mask||mode) loadRowCol(v);
}
void VpuUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
pxAssumeDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking.");
int cc = aMin(vCL, 3);
u32 m0 = (vB.mask >> (cc * 8)) & 0xff;
u32 m1 = m0 & 0xaaaa;
u32 m2 =(~m1>>1) & m0;
u32 m3 = (m1>>1) & ~m0;
u32 m4 = (m1>>1) & m0;
makeMergeMask(m2);
makeMergeMask(m3);
makeMergeMask(m4);
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect
if (doMask&&m2) { mVUmergeRegs(regX.Id, xmmRow.Id, m2); } // Merge Row
if (doMask&&m3) { mVUmergeRegs(regX.Id, xmmCol0.Id+cc, m3); } // Merge Col
if (doMask&&m4) { mVUmergeRegs(regX.Id, xmmTemp.Id, m4); } // Merge Write Protect
if (doMode) {
u32 m5 = (~m1>>1) & ~m0;
if (!doMask) m5 = 0xf;
else makeMergeMask(m5);
if (m5 < 0xf) {
xPXOR(xmmTemp, xmmTemp);
mVUmergeRegs(xmmTemp.Id, xmmRow.Id, m5);
xPADD.D(regX, xmmTemp);
if (doMode==2) mVUmergeRegs(xmmRow.Id, regX.Id, m5);
}
else if (m5 == 0xf) {
xPADD.D(regX, xmmRow);
if (doMode==2) xMOVAPS(xmmRow, regX);
}
}
xMOVAPS(ptr32[dstIndirect], regX);
}
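The m0..m5 bit algebra above decodes the VIF's 2-bit per-field write mask (0 = write unpacked data, 1 = row register, 2 = col register, 3 = write-protect). A standalone restatement of the decode (illustrative sketch, not the shipped code):

```cpp
#include <cstdint>

// m1 isolates the high bit of each 2-bit mask field, so the four products
// below select the fields whose value is exactly 1, 2, 3, or 0 respectively.
void decodeMaskByte(uint32_t m0, uint32_t& row, uint32_t& col,
                    uint32_t& wp, uint32_t& data)
{
    const uint32_t m1 = m0 & 0xaa;  // high bits of each 2-bit field
    row  = (~m1 >> 1) &  m0;        // field == 1 -> merge row reg   (m2)
    col  =  (m1 >> 1) & ~m0;        // field == 2 -> merge col reg   (m3)
    wp   =  (m1 >> 1) &  m0;        // field == 3 -> keep old data   (m4)
    data = (~m1 >> 1) & ~m0;        // field == 0 -> write unpacked  (m5)
}
```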
void VpuUnpackSSE_Dynarec::writeBackRow() const {
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
xMOVAPS(ptr32[row], xmmRow);
DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
// ToDo: Do we need to write back to vifregs.rX too!? :/
}
static void ShiftDisplacementWindow( xAddressInfo& addr, const xRegister32& modReg )
{
// Shifts the displacement factor of a given indirect address, so that the address
// remains in the optimal 0xf0 range (which allows for byte-form displacements when
// generating instructions).
int addImm = 0;
while( addr.Displacement >= 0x80 )
{
addImm += 0xf0;
addr -= 0xf0;
}
if(addImm) xADD(modReg, addImm);
}
void VpuUnpackSSE_Dynarec::CompileRoutine() {
const int upkNum = vB.upkType & 0xf;
const u8& vift = nVifT[upkNum];
const int cycleSize = isFill ? vB.cl : vB.wl;
const int blockSize = isFill ? vB.wl : vB.cl;
const int skipSize = blockSize - cycleSize;
int vNum = vifRegs->num;
vCL = vif->cl;
SetMasks(cycleSize);
while (vNum) {
ShiftDisplacementWindow( srcIndirect, edx );
ShiftDisplacementWindow( dstIndirect, ecx );
if (vCL < cycleSize) {
xUnpack(upkNum);
srcIndirect += vift;
dstIndirect += 16;
vNum--;
if (++vCL == blockSize) vCL = 0;
}
else if (isFill) {
DevCon.WriteLn("filling mode!");
VpuUnpackSSE_Dynarec::FillingWrite( *this ).xUnpack(upkNum);
dstIndirect += 16;
vNum--;
if (++vCL == blockSize) vCL = 0;
}
else {
dstIndirect += (16 * skipSize);
vCL = 0;
}
}
if (doMode==2) writeBackRow();
xMOV(ptr32[&vif->cl], vCL);
xMOV(ptr32[&vifRegs->num], vNum);
xRET();
}
static _f u8* dVifsetVUptr(const nVifStruct& v, int offset) {
u8* ptr = (u8*)(v.VU->Mem + (offset & v.vuMemLimit));
u8* endPtr = ptr + _vBlock.num * 16;
if (endPtr > v.vuMemEnd) {
DevCon.WriteLn("nVif - VU Mem Ptr Overflow; falling back to interpreter.");
ptr = NULL; // Fall Back to Interpreters which have wrap-around logic
}
return ptr;
}
static _f void dVifRecLimit(int idx) {
if (nVif[idx].recPtr > nVif[idx].recEnd) {
DevCon.WriteLn("nVif Rec - Out of Rec Cache! [%x > %x]", nVif[idx].recPtr, nVif[idx].recEnd);
nVif[idx].vifBlocks->clear();
nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
}
}
_f void dVifUnpack(int idx, u8 *data, u32 size, bool isFill) {
const nVifStruct& v = nVif[idx];
const u8 upkType = vif->cmd & 0x1f | ((!!vif->usn) << 5);
const int doMask = (upkType>>4) & 1;
const int cycle_cl = vifRegs->cycle.cl;
const int cycle_wl = vifRegs->cycle.wl;
const int cycleSize = isFill ? cycle_cl : cycle_wl;
const int blockSize = isFill ? cycle_wl : cycle_cl;
if (vif->cl >= blockSize) vif->cl = 0;
_vBlock.upkType = upkType;
_vBlock.num = *(u8*)&vifRegs->num;
_vBlock.mode = *(u8*)&vifRegs->mode;
_vBlock.scl = vif->cl;
_vBlock.cl = cycle_cl;
_vBlock.wl = cycle_wl;
// Zero out the mask parameter if it's unused -- games leave random junk
// values here which cause false recblock cache misses.
_vBlock.mask = doMask ? vifRegs->mask : 0x00;
if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) {
if( u8* dest = dVifsetVUptr(v, vif->tag.addr) ) {
//DevCon.WriteLn("Running Recompiled Block!");
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
}
else {
//DevCon.WriteLn("Running Interpreter Block");
_nVifUnpack(idx, data, size, isFill);
}
return;
}
static int recBlockNum = 0;
DevCon.WriteLn("nVif: Recompiled Block! [%d]", recBlockNum++);
DevCon.WriteLn(L"\t(num=0x%02x, upkType=0x%02x, mode=0x%02x, scl=0x%02x, cl/wl=0x%x/0x%x, mask=%s)",
_vBlock.num, _vBlock.upkType, _vBlock.mode, _vBlock.scl, _vBlock.cl, _vBlock.wl,
doMask ? wxsFormat( L"0x%08x", _vBlock.mask ).c_str() : L"ignored"
);
xSetPtr(v.recPtr);
_vBlock.startPtr = (uptr)xGetAlignedCallTarget();
v.vifBlocks->add(_vBlock);
VpuUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine();
nVif[idx].recPtr = xGetPtr();
dVifRecLimit(idx);
// Run the block we just compiled. Various conditions may force us to still use
// the interpreter unpacker though, so a recursive call is the safest way here...
dVifUnpack(idx, data, size, isFill);
}

View File

@@ -15,6 +15,12 @@
#pragma once

+#include "Vif.h"
+#include "VU.h"
+#include "x86emitter/x86emitter.h"
+
+using namespace x86Emitter;
+
#ifdef newVif

// newVif_HashBucket.h uses this typedef, so it has to be declared first.

@@ -23,17 +29,12 @@ typedef void (__fastcall *nVifrecCall)(uptr dest, uptr src);
#include "newVif_BlockBuffer.h"
#include "newVif_HashBucket.h"

-#include "x86emitter/x86emitter.h"
-using namespace x86Emitter;
-
-extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0);
-extern void nVifGen (int usn, int mask, int curCycle);
-extern void _nVifUnpack (int idx, u8 *data, u32 size);
-extern void dVifUnpack (int idx, u8 *data, u32 size);
-extern void dVifInit (int idx);
-
-static __pagealigned u8 nVifUpkExec[__pagesize*4];
-static __aligned16 nVifCall nVifUpk[(2*2*16) *4]; // ([USN][Masking][Unpack Type]) [curCycle]
-static __aligned16 u32 nVifMask[3][4][4] = {0};   // [MaskNumber][CycleNumber][Vector]
+extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0);
+extern void _nVifUnpack (int idx, u8 *data, u32 size, bool isFill);
+extern void dVifUnpack (int idx, u8 *data, u32 size, bool isFill);
+extern void dVifInit (int idx);
+extern void VpuUnpackSSE_Init();

#define VUFT VIFUnpackFuncTable
#define _1mb (0x100000)

@@ -56,7 +57,10 @@ static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][
# pragma warning(disable:4996) // 'function': was declared deprecated
#endif

-struct __aligned16 nVifBlock { // Ordered for Hashing
+// nVifBlock - Ordered for Hashing; the 'num' field and the lower 6 bits of upkType are
+// used as the hash bucket selector.
+//
+struct __aligned16 nVifBlock {
	u8 num;     // [00] Num Field
	u8 upkType; // [01] Unpack Type [usn*1:mask*1:upk*4]
	u8 mode;    // [02] Mode Field

@@ -88,63 +92,14 @@ struct nVifStruct {
	u8* recEnd;                      // End of Rec Cache
	BlockBuffer* vifCache;           // Block Buffer
	HashBucket<_tParams>* vifBlocks; // Vif Blocks
-	nVifBlock* vifBlock;             // Current Vif Block Ptr
};

-// Contents of this table are doubled up for doMask(false) and doMask(true) lookups.
-// (note: currently unused, I'm using gsize in the interp tables instead since it
-// seems to be faster for now, which may change when nVif isn't reliant on interpreted
-// unpackers anymore --air)
-static const u32 nVifT[32] = {
-	4, // S-32
-	2, // S-16
-	1, // S-8
-	0, // ----
-	8, // V2-32
-	4, // V2-16
-	2, // V2-8
-	0, // ----
-	12,// V3-32
-	6, // V3-16
-	3, // V3-8
-	0, // ----
-	16,// V4-32
-	8, // V4-16
-	4, // V4-8
-	2, // V4-5
-	// Second verse, same as the first!
-	4,2,1,0,8,4,2,0,12,6,3,0,16,8,4,2
-};
-
-template< int idx, bool doMode, bool isFill, bool singleUnpack >
-__releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size);
-
-typedef void (__fastcall* Fnptr_VifUnpackLoop)(u8 *data, u32 size);
-
-// Unpacks Until 'Num' is 0
-static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = {
-	{{ _nVifUnpackLoop<0,0,0,0>, _nVifUnpackLoop<0,0,1,0> },
-	{  _nVifUnpackLoop<0,1,0,0>, _nVifUnpackLoop<0,1,1,0> },},
-	{{ _nVifUnpackLoop<1,0,0,0>, _nVifUnpackLoop<1,0,1,0> },
-	{  _nVifUnpackLoop<1,1,0,0>, _nVifUnpackLoop<1,1,1,0> },},
-};
-
-// Unpacks until 1 normal write cycle unpack has been written to VU mem
-static const __aligned16 Fnptr_VifUnpackLoop UnpackSingleTable[2][2][2] = {
-	{{ _nVifUnpackLoop<0,0,0,1>, _nVifUnpackLoop<0,0,1,1> },
-	{  _nVifUnpackLoop<0,1,0,1>, _nVifUnpackLoop<0,1,1,1> },},
-	{{ _nVifUnpackLoop<1,0,0,1>, _nVifUnpackLoop<1,0,1,1> },
-	{  _nVifUnpackLoop<1,1,0,1>, _nVifUnpackLoop<1,1,1,1> },},
-};
-
-#define useOldUnpack 0 // Use code in newVif_OldUnpack.inl
-#define newVifDynaRec 1 // Use code in newVif_Dynarec.inl
-
-#include "newVif_OldUnpack.inl"
-#include "newVif_Unpack.inl"
-#include "newVif_UnpackGen.inl"
-#include "newVif_Tables.inl"
-#include "newVif_Dynarec.inl"
+extern __aligned16 nVifStruct nVif[2];
+extern __aligned16 const u8 nVifT[32];
+extern __aligned16 nVifCall nVifUpk[(2*2*16) *4]; // ([USN][Masking][Unpack Type]) [curCycle]
+extern __aligned16 u32 nVifMask[3][4][4];         // [MaskNumber][CycleNumber][Vector]
+
+static const bool useOldUnpack  = false; // Use code in newVif_OldUnpack.inl
+static const bool newVifDynaRec = true;  // Use code in newVif_Dynarec.inl

#endif
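As the new nVifBlock comment notes, the hash bucket selector comes straight from the struct's leading bytes: HashBucket::find/add (diffed below) reinterpret the first 32 bits of a block as the key. A sketch of how the index falls out (hSize is hypothetical here; the real bucket count lives in newVif_HashBucket.h):

```cpp
// Why field order matters ("Ordered for Hashing"): the first four bytes of
// nVifBlock double as the hash key, so num/upkType land in the low bits.
u32 bucketIndex(const nVifBlock& block, u32 hSize)
{
    const u32 key = (u32&)block;   // num | upkType | mode | ... packed little-endian
    return key % hSize;
}
```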

View File

@@ -25,7 +25,7 @@
// just use 'new' and 'delete' for initialization and
// deletion/cleanup respectfully...
class BlockBuffer {
-private:
+protected:
	u32 mSize;  // Cur Size
	u32 mSizeT; // Total Size
	u8* mData;  // Data Ptr

View File

@@ -1,163 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
// newVif Dynarec - Dynamically Recompiles Vif 'unpack' Packets
// authors: cottonvibes(@gmail.com)
// Jake.Stine (@gmail.com)
#pragma once
void dVifInit(int idx) {
nVif[idx].idx = idx;
nVif[idx].VU = idx ? &VU1 : &VU0;
nVif[idx].vif = idx ? &vif1 : &vif0;
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
nVif[idx].vifCache = new BlockBuffer(_1mb*4); // 4mb Rec Cache
nVif[idx].vifBlocks = new HashBucket<_tParams>();
nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
nVif[idx].recEnd = &nVif[idx].recPtr[nVif[idx].vifCache->getSize()-(_1mb/4)]; // .25mb Safe Zone
emitCustomCompare();
}
_f void dVifRecLimit(int idx) {
if (nVif[idx].recPtr > nVif[idx].recEnd) {
DevCon.WriteLn("nVif Rec - Out of Rec Cache! [%x > %x]", nVif[idx].recPtr, nVif[idx].recEnd);
nVif[idx].vifBlocks->clear();
nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
}
}
_f void dVifSetMasks(nVifStruct& v, int mask, int mode, int cS) {
u32 m0 = v.vifBlock->mask;
u32 m1 = m0 & 0xaaaaaaaa;
u32 m2 =(~m1>>1) & m0;
u32 m3 = (m1>>1) & ~m0;
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
u32* col = (v.idx) ? g_vifmask.Col1 : g_vifmask.Col0;
if((m2&&mask) || mode) { xMOVAPS(xmmRow, ptr32[row]); }
if (m3&&mask) {
xMOVAPS(xmmCol0, ptr32[col]);
if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
if ((cS>=4) && (m3&0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3);
if ((cS>=1) && (m3&0x000000ff)) xPSHUF.D(xmmCol0, xmmCol0, _v0);
}
//if (mask||mode) loadRowCol(v);
}
void dVifRecompile(nVifStruct& v, nVifBlock* vB) {
const bool isFill = (vB->cl < vB->wl);
const int usn = (vB->upkType>>5)&1;
const int doMask = (vB->upkType>>4)&1;
const int upkNum = vB->upkType & 0xf;
const u32& vift = nVifT[upkNum];
const int doMode = vifRegs->mode & 3;
const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
const int skipSize = blockSize - cycleSize;
const bool simpleBlock = (vifRegs->num == 1);
const int backupCL = vif->cl;
const int backupNum = vifRegs->num;
if (vif->cl >= blockSize) vif->cl = 0;
v.vifBlock = vB;
xSetPtr(v.recPtr);
xAlignPtr(16);
vB->startPtr = (uptr)xGetPtr();
dVifSetMasks(v, doMask, doMode, cycleSize);
while (vifRegs->num) {
if (vif->cl < cycleSize) {
xUnpack[upkNum](&v, doMode<<1 | doMask);
if (!simpleBlock) xADD(edx, vift);
if (!simpleBlock) xADD(ecx, 16);
vifRegs->num--;
if (++vif->cl == blockSize) vif->cl = 0;
}
else if (isFill) {
DevCon.WriteLn("filling mode!");
xUnpack[upkNum](&v, 1);
xADD(ecx, 16);
vifRegs->num--;
if (++vif->cl == blockSize) vif->cl = 0;
}
else {
xADD(ecx, 16 * skipSize);
vif->cl = 0;
}
}
if (doMode==2) writeBackRow(v);
xMOV(ptr32[&vif->cl], vif->cl);
xMOV(ptr32[&vifRegs->num], vifRegs->num);
xRET();
v.recPtr = xGetPtr();
vif->cl = backupCL;
vifRegs->num = backupNum;
}
static __aligned16 nVifBlock _vBlock = {0};
_f u8* dVifsetVUptr(nVifStruct& v, int offset) {
u8* ptr = (u8*)(v.VU->Mem + (offset & v.vuMemLimit));
u8* endPtr = ptr + _vBlock.num * 16;
if (endPtr > v.vuMemEnd) {
DevCon.WriteLn("nVif - VU Mem Ptr Overflow!");
ptr = NULL; // Fall Back to Interpreters which have wrap-around logic
}
return ptr;
}
void dVifUnpack(int idx, u8 *data, u32 size) {
nVifStruct& v = nVif[idx];
const u8 upkType = vif->cmd & 0x1f | ((!!(vif->usn)) << 5);
const int doMask = (upkType>>4)&1;
_vBlock.upkType = upkType;
_vBlock.num = *(u8*)&vifRegs->num;
_vBlock.mode = *(u8*)&vifRegs->mode;
_vBlock.scl = vif->cl;
_vBlock.cl = vifRegs->cycle.cl;
_vBlock.wl = vifRegs->cycle.wl;
// Zero out the mask parameter if it's unused -- games leave random junk
// values here which cause false recblock cache misses.
_vBlock.mask = doMask ? vifRegs->mask : 0x00;
if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) {
if( u8* dest = dVifsetVUptr(v, vif->tag.addr) ) {
//DevCon.WriteLn("Running Recompiled Block!");
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
}
else {
//DevCon.WriteLn("Running Interpreter Block");
_nVifUnpack(idx, data, size);
}
return;
}
static int recBlockNum = 0;
DevCon.WriteLn("nVif: Recompiled Block! [%d]", recBlockNum++);
DevCon.WriteLn(L"\t(num=0x%02x, upkType=0x%02x, mode=0x%02x, scl=0x%02x, cl=0x%x, wl=0x%x, mask=%s)",
_vBlock.num, _vBlock.upkType, _vBlock.mode, _vBlock.scl, _vBlock.cl, _vBlock.wl,
doMask ? wxsFormat( L"0x%08x", _vBlock.mask ).c_str() : L"ignored"
);
dVifRecompile(v, &_vBlock);
v.vifBlocks->add(&_vBlock);
dVifRecLimit(idx);
dVifUnpack(idx, data, size);
}

View File

@@ -15,8 +15,6 @@
#pragma once

-static __pagealigned u8 nVifMemCmp[__pagesize];
-
template< typename T >
struct SizeChain
{

@@ -66,8 +64,8 @@ public:
		if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size );
		return NULL;
	}

-	__forceinline void add(T* dataPtr) {
-		u32 d = *(u32*)dataPtr;
+	__forceinline void add(const T& dataPtr) {
+		u32 d = (u32&)dataPtr;
		SizeChain<T>& bucket( mBucket[d % hSize] );
		if( bucket.Chain = (T*)_aligned_realloc( bucket.Chain, sizeof(T)*(bucket.Size+1), 16), bucket.Chain==NULL ) {

@@ -76,7 +74,7 @@ public:
			wxEmptyString
		);
	}
-	memcpy_fast(&bucket.Chain[bucket.Size++], dataPtr, sizeof(T));
+	memcpy_fast(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
	}
	void clear() {
		for (int i = 0; i < hSize; i++) {

View File

@@ -1,287 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#define vUPK(x) void x(nVifStruct* v, int doMask)
#define _doUSN (v->vifBlock->upkType & 0x20)
#undef xMovDest
#undef xShiftR
#undef xPMOVXX8
#undef xPMOVXX16
#undef xMaskWrite
#define makeMergeMask(x) { \
x = ((x&0x40)>>6) | ((x&0x10)>>3) | (x&4) | ((x&1)<<3); \
}
void doMaskWrite(const xRegisterSSE& regX, nVifStruct* v, int doMask) {
if (regX.Id > 1) DevCon.WriteLn("Reg Overflow!!!");
int doMode = doMask>>1; doMask &= 1;
int cc = aMin(v->vif->cl, 3);
u32 m0 = (v->vifBlock->mask >> (cc * 8)) & 0xff;
u32 m1 = m0 & 0xaaaa;
u32 m2 =(~m1>>1) & m0;
u32 m3 = (m1>>1) & ~m0;
u32 m4 = (m1>>1) & m0;
makeMergeMask(m2);
makeMergeMask(m3);
makeMergeMask(m4);
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr32[ecx]); } // Load Write Protect
if (doMask&&m2) { mVUmergeRegs(regX.Id, xmmRow.Id, m2); } // Merge Row
if (doMask&&m3) { mVUmergeRegs(regX.Id, xmmCol0.Id+cc, m3); } // Merge Col
if (doMask&&m4) { mVUmergeRegs(regX.Id, xmmTemp.Id, m4); } // Merge Write Protect
if (doMode) {
u32 m5 = (~m1>>1) & ~m0;
if (!doMask) m5 = 0xf;
else makeMergeMask(m5);
if (m5 < 0xf) {
xPXOR(xmmTemp, xmmTemp);
mVUmergeRegs(xmmTemp.Id, xmmRow.Id, m5);
xPADD.D(regX, xmmTemp);
if (doMode==2) mVUmergeRegs(xmmRow.Id, regX.Id, m5);
}
else if (m5 == 0xf) {
xPADD.D(regX, xmmRow);
if (doMode==2) xMOVAPS(xmmRow, regX);
}
}
xMOVAPS(ptr32[ecx], regX);
}
#define xMovDest(regX) { \
if (!doMask){ xMOVAPS (ptr32[ecx], regX); } \
else { doMaskWrite(regX, v, doMask); } \
}
#define xShiftR(regX, n) { \
if (_doUSN) { xPSRL.D(regX, n); } \
else { xPSRA.D(regX, n); } \
}
#define xPMOVXX8(regX, src) { \
if (_doUSN) xPMOVZX.BD(regX, src); \
else xPMOVSX.BD(regX, src); \
}
#define xPMOVXX16(regX, src) { \
if (_doUSN) xPMOVZX.WD(regX, src); \
else xPMOVSX.WD(regX, src); \
}
// ecx = dest, edx = src
vUPK(nVif_S_32) {
xMOV32 (xmm0, ptr32[edx]);
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
vUPK(nVif_S_16) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
vUPK(nVif_S_8) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV8 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
vUPK(nVif_V2_32) {
xMOV64 (xmm0, ptr32[edx]);
xMovDest (xmm0);
}
vUPK(nVif_V2_16) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
vUPK(nVif_V2_8) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
vUPK(nVif_V3_32) {
xMOV128 (xmm0, ptr32[edx]);
xMovDest (xmm0);
}
vUPK(nVif_V3_16) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
vUPK(nVif_V3_8) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
vUPK(nVif_V4_32) {
xMOV128 (xmm0, ptr32[edx]);
xMovDest (xmm0);
}
vUPK(nVif_V4_16) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
vUPK(nVif_V4_8) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
vUPK(nVif_V4_5) {
xMOV16 (xmm0, ptr32[edx]);
xPSHUF.D (xmm0, xmm0, _v0);
xPSLL.D (xmm0, 3); // ABG|R5.000
xMOVAPS (xmm1, xmm0); // x|x|x|R
xPSRL.D (xmm0, 8); // ABG
xPSLL.D (xmm0, 3); // AB|G5.000
mVUmergeRegs(XMM1, XMM0, 0x4); // x|x|G|R
xPSRL.D (xmm0, 8); // AB
xPSLL.D (xmm0, 3); // A|B5.000
mVUmergeRegs(XMM1, XMM0, 0x2); // x|B|G|R
xPSRL.D (xmm0, 8); // A
xPSLL.D (xmm0, 7); // A.0000000
mVUmergeRegs(XMM1, XMM0, 0x1); // A|B|G|R
xPSLL.D (xmm1, 24); // can optimize to
xPSRL.D (xmm1, 24); // single AND...
xMovDest (xmm1);
}
vUPK(nVif_unkown) {
Console.Error("nVif%d - Invalid Unpack! [%d]", v->idx, v->vif->tag.cmd & 0xf);
}
void (*xUnpack[16])(nVifStruct* v, int doMask) = {
nVif_S_32,
nVif_S_16,
nVif_S_8,
nVif_unkown,
nVif_V2_32,
nVif_V2_16,
nVif_V2_8,
nVif_unkown,
nVif_V3_32,
nVif_V3_16,
nVif_V3_8,
nVif_unkown,
nVif_V4_32,
nVif_V4_16,
nVif_V4_8,
nVif_V4_5,
};
// Loads Row/Col Data from vifRegs instead of g_vifmask
// Useful for testing vifReg and g_vifmask inconsistency.
void loadRowCol(nVifStruct& v) {
xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]);
xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]);
xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]);
xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]);
xPSHUF.D(xmm0, xmm0, _v0);
xPSHUF.D(xmm1, xmm1, _v0);
xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm6, xmm6, _v0);
mVUmergeRegs(XMM6, XMM0, 8);
mVUmergeRegs(XMM6, XMM1, 4);
mVUmergeRegs(XMM6, XMM2, 2);
xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]);
xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]);
xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]);
xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]);
xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm3, xmm3, _v0);
xPSHUF.D(xmm4, xmm4, _v0);
xPSHUF.D(xmm5, xmm5, _v0);
}
void writeBackRow(nVifStruct& v) {
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
xMOVAPS(ptr32[row], xmmRow);
DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
// ToDo: Do we need to write back to vifregs.rX too!? :/
}
void emitCustomCompare() {
HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadWrite, false);
memset8<0xcc>(nVifMemCmp);
xSetPtr(nVifMemCmp);
xMOVAPS (xmm0, ptr32[ecx]);
xPCMP.EQD(xmm0, ptr32[edx]);
xMOVMSKPS(eax, xmm0);
xAND (eax, 0x7); // ignore top 4 bytes (recBlock pointer)
xRET();
HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadOnly, true);
}

View File

@@ -17,9 +17,66 @@
// authors: cottonvibes(@gmail.com)
//          Jake.Stine (@gmail.com)

-#pragma once
+#include "PrecompiledHeader.h"
+#include "Common.h"
+#include "VifDma_internal.h"
+#include "newVif.h"

-static __aligned16 nVifStruct nVif[2];
+#ifdef newVif
+#include "newVif_OldUnpack.inl"
+
+__aligned16 nVifStruct nVif[2];
+__aligned16 nVifCall nVifUpk[(2*2*16) *4]; // ([USN][Masking][Unpack Type]) [curCycle]
+__aligned16 u32 nVifMask[3][4][4] = {0};   // [MaskNumber][CycleNumber][Vector]
+
+// Contents of this table are doubled up for doMask(false) and doMask(true) lookups.
+// (note: currently unused, I'm using gsize in the interp tables instead since it
+// seems to be faster for now, which may change when nVif isn't reliant on interpreted
+// unpackers anymore --air)
+__aligned16 const u8 nVifT[32] = {
+	4, // S-32
+	2, // S-16
+	1, // S-8
+	0, // ----
+	8, // V2-32
+	4, // V2-16
+	2, // V2-8
+	0, // ----
+	12,// V3-32
+	6, // V3-16
+	3, // V3-8
+	0, // ----
+	16,// V4-32
+	8, // V4-16
+	4, // V4-8
+	2, // V4-5
+	// Second verse, same as the first!
+	4,2,1,0,8,4,2,0,12,6,3,0,16,8,4,2
+};
+
+// ----------------------------------------------------------------------------
+template< int idx, bool doMode, bool isFill, bool singleUnpack >
+__releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size);
+
+typedef void (__fastcall* Fnptr_VifUnpackLoop)(u8 *data, u32 size);
+
+// Unpacks Until 'Num' is 0
+static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = {
+	{{ _nVifUnpackLoop<0,0,0,0>, _nVifUnpackLoop<0,0,1,0> },
+	{  _nVifUnpackLoop<0,1,0,0>, _nVifUnpackLoop<0,1,1,0> },},
+	{{ _nVifUnpackLoop<1,0,0,0>, _nVifUnpackLoop<1,0,1,0> },
+	{  _nVifUnpackLoop<1,1,0,0>, _nVifUnpackLoop<1,1,1,0> },},
+};
+
+// Unpacks until 1 normal write cycle unpack has been written to VU mem
+static const __aligned16 Fnptr_VifUnpackLoop UnpackSingleTable[2][2][2] = {
+	{{ _nVifUnpackLoop<0,0,0,1>, _nVifUnpackLoop<0,0,1,1> },
+	{  _nVifUnpackLoop<0,1,0,1>, _nVifUnpackLoop<0,1,1,1> },},
+	{{ _nVifUnpackLoop<1,0,0,1>, _nVifUnpackLoop<1,0,1,1> },
+	{  _nVifUnpackLoop<1,1,0,1>, _nVifUnpackLoop<1,1,1,1> },},
+};
+// ----------------------------------------------------------------------------

void initNewVif(int idx) {
	nVif[idx].idx = idx;

@@ -31,26 +88,15 @@ void initNewVif(int idx) {
	nVif[idx].vifCache     = NULL;
	nVif[idx].partTransfer = 0;

-	HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false);
-	memset8<0xcc>( nVifUpkExec );
-	xSetPtr( nVifUpkExec );
-
-	for (int a = 0; a < 2; a++) {
-	for (int b = 0; b < 2; b++) {
-	for (int c = 0; c < 4; c++) {
-		nVifGen(a, b, c);
-	}}}
-
-	HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
+	VpuUnpackSSE_Init();

	if (newVifDynaRec) dVifInit(idx);
}

-_f u8* setVUptr(int vuidx, const u8* vuMemBase, int offset) {
+static _f u8* setVUptr(int vuidx, const u8* vuMemBase, int offset) {
	return (u8*)(vuMemBase + ( offset & (vuidx ? 0x3ff0 : 0xff0) ));
}

-_f void incVUptr(int vuidx, u8* &ptr, const u8* vuMemBase, int amount) {
+static _f void incVUptr(int vuidx, u8* &ptr, const u8* vuMemBase, int amount) {
	pxAssert( ((uptr)ptr & 0xf) == 0 ); // alignment check
	ptr += amount;
	int diff = ptr - (vuMemBase + (vuidx ? 0x4000 : 0x1000));

@@ -59,7 +105,7 @@ _f void incVUptr(int vuidx, u8* &ptr, const u8* vuMemBase, int amount) {
	}
}

-_f void incVUptrBy16(int vuidx, u8* &ptr, const u8* vuMemBase) {
+static _f void incVUptrBy16(int vuidx, u8* &ptr, const u8* vuMemBase) {
	pxAssert( ((uptr)ptr & 0xf) == 0 ); // alignment check
	ptr += 16;
	if( ptr == (vuMemBase + (vuidx ? 0x4000 : 0x1000)) )

@@ -73,16 +119,16 @@ int nVifUnpack(int idx, u8* data) {
	vifRegs  = v.vifRegs;
	int ret  = aMin(vif->vifpacketsize, vif->tag.size);
	s32 size = ret << 2;
-	u32 vifT = nVifT[vif->cmd & 0xf];
+	const u8& vifT = nVifT[vif->cmd & 0xf];

	vif->tag.size -= ret;
+	const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);

	if (v.partTransfer) { // Last transfer was a partial vector transfer...
		const bool doMode = vifRegs->mode && !(vif->tag.cmd & 0x10);
-		const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
		const u8  upkNum  = vif->cmd & 0x1f;
-		const VUFT& ft    = VIFfuncTable[upkNum];
-		const int diff    = ft.gsize - v.partTransfer;
+		const int diff    = vifT - v.partTransfer;
		memcpy(&v.partBuffer[v.partTransfer], data, diff);
		UnpackSingleTable[idx][doMode][isFill]( v.partBuffer, size );
		data += diff;

@@ -95,8 +141,8 @@ int nVifUnpack(int idx, u8* data) {
	u32 oldNum = vifRegs->num;

	if (size > 0) {
-		if (newVifDynaRec) dVifUnpack(idx, data, size);
-		else               _nVifUnpack(idx, data, size);
+		if (newVifDynaRec) dVifUnpack(idx, data, size, isFill);
+		else               _nVifUnpack(idx, data, size, isFill);
	}

	u32 s =(size/vifT) * vifT;

@@ -230,7 +276,7 @@ __releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) {
	}
}

-_f void _nVifUnpack(int idx, u8 *data, u32 size) {
+_f void _nVifUnpack(int idx, u8 *data, u32 size, bool isFill) {
	if (useOldUnpack) {
		if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2);

@@ -239,7 +285,6 @@ _f void _nVifUnpack(int idx, u8 *data, u32 size) {
	}

	const bool doMode = vifRegs->mode && !(vif->tag.cmd & 0x10);
-	const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
	UnpackLoopTable[idx][doMode][isFill]( data, size );
}
+#endif

View File

@@ -1,240 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#define xMaskWrite(regX) { \
xMOVAPS(xmm7, ptr32[ecx]); \
int offX = aMin(curCycle, 3); \
xPAND(regX, ptr32[nVifMask[0][offX]]); \
xPAND(xmm7, ptr32[nVifMask[1][offX]]); \
xPOR (regX, ptr32[nVifMask[2][offX]]); \
xPOR (regX, xmm7); \
xMOVAPS(ptr32[ecx], regX); \
}
#define xMovDest(regX) { \
if (!mask) { xMOVAPS (ptr32[ecx], regX); } \
else { xMaskWrite(regX); } \
}
#define xShiftR(regX, n) { \
if (usn) { xPSRL.D(regX, n); } \
else { xPSRA.D(regX, n); } \
}
#define xPMOVXX8(regX, src) { \
if (usn) xPMOVZX.BD(regX, src); \
else xPMOVSX.BD(regX, src); \
}
#define xPMOVXX16(regX, src) { \
if (usn) xPMOVZX.WD(regX, src); \
else xPMOVSX.WD(regX, src); \
}
struct VifUnpackIndexer {
int usn, mask;
int curCycle, cyclesToWrite;
nVifCall& GetCall(int packType) const {
int usnpart = usn*2*16;
int maskpart = mask*16;
int packpart = packType;
int curpart = curCycle;
return nVifUpk[((usnpart+maskpart+packpart) * 4) + (curpart)];
}
void xSetCall(int packType) const {
GetCall( packType ) = (nVifCall)xGetAlignedCallTarget();
}
void xSetNullCall(int packType) const {
GetCall( packType ) = NULL;
}
};
// xMOVSS doesn't seem to have all overloads defined with new emitter
#define xMOVSSS(regX, loc) SSE_MOVSS_Rm_to_XMM(0, 2, 0)
#define xMOV8(regX, loc) xMOVSSS(regX, loc)
#define xMOV16(regX, loc) xMOVSSS(regX, loc)
#define xMOV32(regX, loc) xMOVSSS(regX, loc)
#define xMOV64(regX, loc) xMOVUPS(regX, loc)
#define xMOV128(regX, loc) xMOVUPS(regX, loc)
// ecx = dest, edx = src
void nVifGen(int usn, int mask, int curCycle) {
const VifUnpackIndexer indexer = { usn, mask, curCycle, 0 };
indexer.xSetCall(0x0); // S-32
xMOV32 (xmm0, ptr32[edx]);
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
xRET();
indexer.xSetCall(0x1); // S-16
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
xRET();
indexer.xSetCall(0x2); // S-8
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV8 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
xRET();
indexer.xSetNullCall(0x3); // ----
indexer.xSetCall(0x4); // V2-32
xMOV64 (xmm0, ptr32[edx]);
xMovDest (xmm0);
xRET();
indexer.xSetCall(0x5); // V2-16
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
xRET();
indexer.xSetCall(0x6); // V2-8
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
xRET();
indexer.xSetNullCall(0x7); // ----
indexer.xSetCall(0x8); // V3-32
xMOV128 (xmm0, ptr32[edx]);
xMovDest (xmm0);
xRET();
indexer.xSetCall(0x9); // V3-16
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
xRET();
indexer.xSetCall(0xa); // V3-8
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
xRET();
indexer.xSetNullCall(0xb); // ----
indexer.xSetCall(0xc); // V4-32
xMOV128 (xmm0, ptr32[edx]);
xMovDest (xmm0);
xRET();
indexer.xSetCall(0xd); // V4-16
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
xRET();
indexer.xSetCall(0xe); // V4-8
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
xRET();
// A | B5 | G5 | R5
// ..0.. A 0000000 | ..0.. B 000 | ..0.. G 000 | ..0.. R 000
// Optimization: This function has a *really* long dependency chain.
// It would be better if the [edx] is loaded into multiple regs and
// then the regs are shifted each independently, instead of using the
// progressive shift->move pattern below. --air
indexer.xSetCall(0xf); // V4-5
xMOV16 (xmm0, ptr32[edx]);
xMOVAPS (xmm1, xmm0);
xPSLL.D (xmm1, 3); // ABG|R5.000
xMOVAPS (xmm2, xmm1);// R5.000 (garbage upper bits)
xPSRL.D (xmm1, 8); // ABG
xPSLL.D (xmm1, 3); // AB|G5.000
xMOVAPS (xmm3, xmm1);// G5.000 (garbage upper bits)
xPSRL.D (xmm1, 8); // AB
xPSLL.D (xmm1, 3); // A|B5.000
xMOVAPS (xmm4, xmm1);// B5.000 (garbage upper bits)
xPSRL.D (xmm1, 8); // A
xPSLL.D (xmm1, 7); // A.0000000
xPSHUF.D (xmm1, xmm1, _v0); // A|A|A|A
xPSHUF.D (xmm3, xmm3, _v0); // G|G|G|G
xPSHUF.D (xmm4, xmm4, _v0); // B|B|B|B
mVUmergeRegs(XMM2, XMM1, 0x3); // A|x|x|R
mVUmergeRegs(XMM2, XMM3, 0x4); // A|x|G|R
mVUmergeRegs(XMM2, XMM4, 0x2); // A|B|G|R
xPSLL.D (xmm2, 24); // can optimize to
xPSRL.D (xmm2, 24); // single AND...
xMovDest (xmm2);
xRET();
pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );
}