* Significant optimizations to the VIFunpack interpreter (employs templated maskmode and cyclesize constants).

* Minor optimizations to newVifUnpackSSE, and more optimization notes.
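
A minimal sketch of the templating idea behind the interpreter change (names and types mirror the fUNPACK_* wrappers and u32 typedef in the new VIFunpack.cpp below; the body is illustrative only, not the real unpacker):

// Baking the mask flag and the per-command element count into template
// parameters lets the compiler strip the mask test and fully unroll the
// loop, leaving one pre-specialized body per (maskmode, format) pair.
template< bool doMask, int size, class T >
static void fUNPACK_sketch(u32 *dest, T *data)
{
	for (int i = 0; i < size; ++i)   // 'size' is a compile-time constant
	{
		u32 val = (u32)(*data++);
		if (doMask) { /* apply the VIF write mask; folds away when doMask==false */ }
		*dest++ = val;
	}
}

The 32-entry VIFfuncTable introduced below then stores one pre-instantiated pointer per combination, so the mask test never runs at unpack time.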


git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2352 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2009-12-17 13:59:01 +00:00
parent 2b3b60511b
commit f34f3ac0c4
14 changed files with 1156 additions and 1177 deletions

View File

@ -618,7 +618,6 @@ TraceLogFilters& SetTraceConfig();
#endif
#define EE_CONST_PROP // rec2 - enables constant propagation (faster)
//#define NON_SSE_UNPACKS // Turns off SSE Unpacks (slower)
// Uncomment this when working on getting PS1 emulation up and running.
// This disables the exception normally caused by trying to load PS1

File diff suppressed because it is too large.

pcsx2/VIFunpack.cpp (new file, 385 lines)
View File

@ -0,0 +1,385 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "Common.h"
#include <cmath>
#include "Vif.h"
#include "VifDma_internal.h"
enum UnpackOffset
{
OFFSET_X = 0,
OFFSET_Y = 1,
OFFSET_Z = 2,
OFFSET_W = 3
};
static __forceinline u32 setVifRowRegs(u32 reg, u32 data)
{
switch (reg)
{
case 0:
vifRegs->r0 = data;
break;
case 1:
vifRegs->r1 = data;
break;
case 2:
vifRegs->r2 = data;
break;
case 3:
vifRegs->r3 = data;
break;
jNO_DEFAULT;
}
return data;
}
static __forceinline u32 getVifRowRegs(u32 reg)
{
switch (reg)
{
case 0:
return vifRegs->r0;
break;
case 1:
return vifRegs->r1;
break;
case 2:
return vifRegs->r2;
break;
case 3:
return vifRegs->r3;
break;
jNO_DEFAULT;
}
return 0; // unreachable...
}
static __forceinline u32 setVifColRegs(u32 reg, u32 data)
{
switch (reg)
{
case 0:
vifRegs->c0 = data;
break;
case 1:
vifRegs->c1 = data;
break;
case 2:
vifRegs->c2 = data;
break;
case 3:
vifRegs->c3 = data;
break;
jNO_DEFAULT;
}
return data;
}
static __forceinline u32 getVifColRegs(u32 reg)
{
switch (reg)
{
case 0:
return vifRegs->c0;
break;
case 1:
return vifRegs->c1;
break;
case 2:
return vifRegs->c2;
break;
case 3:
return vifRegs->c3;
break;
jNO_DEFAULT;
}
return 0; // unreachable...
}
template< bool doMask >
static __releaseinline void writeXYZW(u32 offnum, u32 &dest, u32 data)
{
int n;
u32 vifRowReg = getVifRowRegs(offnum);
if (doMask)
{
switch (vif->cl)
{
case 0:
if (offnum == OFFSET_X)
n = (vifRegs->mask) & 0x3;
else
n = (vifRegs->mask >> (offnum * 2)) & 0x3;
break;
case 1:
n = (vifRegs->mask >> ( 8 + (offnum * 2))) & 0x3;
break;
case 2:
n = (vifRegs->mask >> (16 + (offnum * 2))) & 0x3;
break;
default:
n = (vifRegs->mask >> (24 + (offnum * 2))) & 0x3;
break;
}
}
else n = 0;
switch (n)
{
case 0:
if ((vif->cmd & 0x6F) == 0x6f)
{
dest = data;
}
else switch (vifRegs->mode)
{
case 1:
dest = data + vifRowReg;
break;
case 2:
// vifRowReg isn't used after this, or I would make it equal to dest here.
dest = setVifRowRegs(offnum, vifRowReg + data);
break;
default:
dest = data;
break;
}
break;
case 1:
dest = vifRowReg;
break;
case 2:
dest = getVifColRegs((vif->cl > 2) ? 3 : vif->cl);
break;
case 3:
break;
}
// VIF_LOG("writeX %8.8x : Mode %d, r0 = %x, data %8.8x", *dest,vifRegs->mode,vifRegs->r0,data);
}
template < bool doMask, class T >
static __forceinline void __fastcall UNPACK_S(u32 *dest, T *data, int size)
{
//S-# will always be a complete packet, no matter what. So we can skip the offset bits
writeXYZW<doMask>(OFFSET_X, *dest++, *data);
writeXYZW<doMask>(OFFSET_Y, *dest++, *data);
writeXYZW<doMask>(OFFSET_Z, *dest++, *data);
writeXYZW<doMask>(OFFSET_W, *dest , *data);
}
template <bool doMask, class T>
static __forceinline void __fastcall UNPACK_V2(u32 *dest, T *data, int size)
{
if (vifRegs->offset == OFFSET_X)
{
if (size > 0)
{
writeXYZW<doMask>(vifRegs->offset, *dest++, *data++);
vifRegs->offset = OFFSET_Y;
size--;
}
}
if (vifRegs->offset == OFFSET_Y)
{
if (size > 0)
{
writeXYZW<doMask>(vifRegs->offset, *dest++, *data);
vifRegs->offset = OFFSET_Z;
size--;
}
}
if (vifRegs->offset == OFFSET_Z)
{
writeXYZW<doMask>(vifRegs->offset, *dest++, *dest-2);
vifRegs->offset = OFFSET_W;
}
if (vifRegs->offset == OFFSET_W)
{
writeXYZW<doMask>(vifRegs->offset, *dest, *data);
vifRegs->offset = OFFSET_X;
}
}
template <bool doMask, class T>
static __forceinline void __fastcall UNPACK_V3(u32 *dest, T *data, int size)
{
if(vifRegs->offset == OFFSET_X)
{
if (size > 0)
{
writeXYZW<doMask>(vifRegs->offset, *dest++, *data++);
vifRegs->offset = OFFSET_Y;
size--;
}
}
if(vifRegs->offset == OFFSET_Y)
{
if (size > 0)
{
writeXYZW<doMask>(vifRegs->offset, *dest++, *data++);
vifRegs->offset = OFFSET_Z;
size--;
}
}
if(vifRegs->offset == OFFSET_Z)
{
if (size > 0)
{
writeXYZW<doMask>(vifRegs->offset, *dest++, *data++);
vifRegs->offset = OFFSET_W;
size--;
}
}
if(vifRegs->offset == OFFSET_W)
{
//V3-# does some bizarre thing with alignment; every 6qw of data the W becomes 0 (strange console!)
//Ape Escape doesn't seem to like it though (what the hell?); gonna have to investigate
writeXYZW<doMask>(vifRegs->offset, *dest, *data);
vifRegs->offset = OFFSET_X;
}
}
template <bool doMask, class T>
static __forceinline void __fastcall UNPACK_V4(u32 *dest, T *data , int size)
{
while (size > 0)
{
writeXYZW<doMask>(vifRegs->offset, *dest++, *data++);
vifRegs->offset++;
size--;
}
if (vifRegs->offset > OFFSET_W) vifRegs->offset = OFFSET_X;
}
template< bool doMask >
static __releaseinline void __fastcall UNPACK_V4_5(u32 *dest, u32 *data, int size)
{
//As with S-#, this will always be a complete packet
writeXYZW<doMask>(OFFSET_X, *dest++, ((*data & 0x001f) << 3));
writeXYZW<doMask>(OFFSET_Y, *dest++, ((*data & 0x03e0) >> 2));
writeXYZW<doMask>(OFFSET_Z, *dest++, ((*data & 0x7c00) >> 7));
writeXYZW<doMask>(OFFSET_W, *dest, ((*data & 0x8000) >> 8));
}
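// Worked example for the V4-5 decode above (illustration only, not part of the
// original source): input word 0xFFFF unpacks to X=0xF8, Y=0xF8, Z=0xF8, W=0x80.
// Each 5-bit color channel is shifted up toward the top of an 8-bit range, and
// the single alpha bit lands in bit 7.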
// =====================================================================================================
template < bool doMask, int size, class T >
static void __fastcall fUNPACK_S(u32 *dest, T *data)
{
UNPACK_S<doMask>( dest, data, size );
}
template <bool doMask, int size, class T>
static void __fastcall fUNPACK_V2(u32 *dest, T *data)
{
UNPACK_V2<doMask>( dest, data, size );
}
template <bool doMask, int size, class T>
static void __fastcall fUNPACK_V3(u32 *dest, T *data)
{
UNPACK_V3<doMask>( dest, data, size );
}
template <bool doMask, int size, class T>
static void __fastcall fUNPACK_V4(u32 *dest, T *data)
{
UNPACK_V4<doMask>( dest, data, size );
}
template< bool doMask >
static void __fastcall fUNPACK_V4_5(u32 *dest, u32 *data)
{
UNPACK_V4_5<doMask>(dest, data, 0); // size is ignored.
}
#define _upk (UNPACKFUNCTYPE)
#define _odd (UNPACKFUNCTYPE_ODD)
// --------------------------------------------------------------------------------------
// Main table for function unpacking.
// --------------------------------------------------------------------------------------
// The extra data bsize/dsize/etc are all duplicated between the doMask enabled and
// disabled versions. This is probably simpler and more efficient than bothering
// to generate separate tables.
// 32-bit versions are unsigned-only!!
#define UnpackFuncPair32( sizefac, vt, doMask ) \
_upk fUNPACK_##vt<doMask, sizefac, u32>, \
_upk fUNPACK_##vt<doMask, sizefac, u32>, \
_odd UNPACK_##vt<doMask, u32>, \
_odd UNPACK_##vt<doMask, u32>,
#define UnpackFuncPair( sizefac, vt, bits, doMask ) \
_upk fUNPACK_##vt<doMask, sizefac, u##bits>, \
_upk fUNPACK_##vt<doMask, sizefac, s##bits>, \
_odd UNPACK_##vt<doMask, u##bits>, \
_odd UNPACK_##vt<doMask, s##bits>,
#define UnpackFuncSet( doMask ) \
{ UnpackFuncPair32( 4, S, doMask ) /* 0x0 - S-32 */ \
1, 4, 4, 4 }, \
{ UnpackFuncPair ( 4, S, 16, doMask ) /* 0x1 - S-16 */ \
2, 2, 2, 4 }, \
{ UnpackFuncPair ( 4, S, 8, doMask ) /* 0x2 - S-8 */ \
4, 1, 1, 4 }, \
{ NULL, NULL, NULL, NULL, 0, 0, 0, 0 }, /* 0x3 (NULL) */ \
\
{ UnpackFuncPair32( 2, V2, doMask ) /* 0x4 - V2-32 */ \
24, 4, 8, 2 }, \
{ UnpackFuncPair ( 2, V2, 16, doMask ) /* 0x5 - V2-16 */ \
12, 2, 4, 2 }, \
{ UnpackFuncPair ( 2, V2, 8, doMask ) /* 0x6 - V2-8 */ \
6, 1, 2, 2 }, \
{ NULL, NULL, NULL, NULL, 0, 0, 0, 0 }, /* 0x7 (NULL) */ \
\
{ UnpackFuncPair32( 3, V3, doMask ) /* 0x8 - V3-32 */ \
36, 4, 12, 3 }, \
{ UnpackFuncPair ( 3, V3, 16, doMask ) /* 0x9 - V3-16 */ \
18, 2, 6, 3 }, \
{ UnpackFuncPair ( 3, V3, 8, doMask ) /* 0xA - V3-8 */ \
9, 1, 3, 3 }, \
{ NULL, NULL, NULL, NULL, 0, 0, 0, 0 }, /* 0xB (NULL) */ \
\
{ UnpackFuncPair32( 4, V4, doMask ) /* 0xC - V4-32 */ \
48, 4, 16, 4 }, \
{ UnpackFuncPair ( 4, V4, 16, doMask ) /* 0xD - V4-16 */ \
24, 2, 8, 4 }, \
{ UnpackFuncPair ( 4, V4, 8, doMask ) /* 0xE - V4-8 */ \
12, 1, 4, 4 }, \
{ /* 0xF - V4-5 */ \
_upk fUNPACK_V4_5<doMask>, _upk fUNPACK_V4_5<doMask>, \
_odd UNPACK_V4_5<doMask>, _odd UNPACK_V4_5<doMask>, \
6, 2, 2, 4 },
const __aligned16 VIFUnpackFuncTable VIFfuncTable[32] =
{
UnpackFuncSet( false )
UnpackFuncSet( true )
};
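
The two UnpackFuncSet expansions put the unmasked variants in entries 0x00-0x0F and the masked variants in 0x10-0x1F, so the dispatch sites later in this commit only need the low five bits of the VIFcode command (bit 4 of an UNPACK cmd is the m/mask flag, as the doMask checks elsewhere in the diff confirm). A sketch of that lookup, as a fragment using the names from the diff:

// (cmd & 0x1f) lands in the doMask=true half of the table whenever masking is
// enabled, so the call site never has to branch on the mask flag itself.
const VIFUnpackFuncTable& ft( VIFfuncTable[ v->cmd & 0x1f ] );
UNPACKFUNCTYPE     func = vif->usn ? ft.funcU : ft.funcS;   // whole write-group, size baked in
UNPACKFUNCTYPE_ODD oddf = vif->usn ? ft.oddU  : ft.oddS;    // leftover elements, explicit count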

View File

@ -18,7 +18,6 @@
#include "Common.h"
#include <cmath>
#include <assert.h>
#include "Vif.h"
#include "VifDma.h"
@ -33,386 +32,6 @@ __aligned16 VifMaskTypes g_vifmask;
extern int g_vifCycles;
enum UnpackOffset
{
OFFSET_X = 0,
OFFSET_Y = 1,
OFFSET_Z = 2,
OFFSET_W = 3
};
static __forceinline u32 setVifRowRegs(u32 reg, u32 data)
{
switch (reg)
{
case 0:
vifRegs->r0 = data;
break;
case 1:
vifRegs->r1 = data;
break;
case 2:
vifRegs->r2 = data;
break;
case 3:
vifRegs->r3 = data;
break;
jNO_DEFAULT;
}
return data;
}
static __forceinline u32 getVifRowRegs(u32 reg)
{
switch (reg)
{
case 0:
return vifRegs->r0;
break;
case 1:
return vifRegs->r1;
break;
case 2:
return vifRegs->r2;
break;
case 3:
return vifRegs->r3;
break;
jNO_DEFAULT;
}
return 0; // unreachable...
}
static __forceinline u32 setVifColRegs(u32 reg, u32 data)
{
switch (reg)
{
case 0:
vifRegs->c0 = data;
break;
case 1:
vifRegs->c1 = data;
break;
case 2:
vifRegs->c2 = data;
break;
case 3:
vifRegs->c3 = data;
break;
jNO_DEFAULT;
}
return data;
}
static __forceinline u32 getVifColRegs(u32 reg)
{
switch (reg)
{
case 0:
return vifRegs->c0;
break;
case 1:
return vifRegs->c1;
break;
case 2:
return vifRegs->c2;
break;
case 3:
return vifRegs->c3;
break;
jNO_DEFAULT;
}
return 0; // unreachable...
}
static __releaseinline void writeXYZW(u32 offnum, u32 &dest, u32 data)
{
int n;
u32 vifRowReg = getVifRowRegs(offnum);
if (vifRegs->code & 0x10000000)
{
switch (vif->cl)
{
case 0:
if (offnum == OFFSET_X)
n = (vifRegs->mask) & 0x3;
else
n = (vifRegs->mask >> (offnum * 2)) & 0x3;
break;
case 1:
n = (vifRegs->mask >> ( 8 + (offnum * 2))) & 0x3;
break;
case 2:
n = (vifRegs->mask >> (16 + (offnum * 2))) & 0x3;
break;
default:
n = (vifRegs->mask >> (24 + (offnum * 2))) & 0x3;
break;
}
}
else n = 0;
switch (n)
{
case 0:
if ((vif->cmd & 0x6F) == 0x6f)
{
dest = data;
}
else switch (vifRegs->mode)
{
case 1:
dest = data + vifRowReg;
break;
case 2:
// vifRowReg isn't used after this, or I would make it equal to dest here.
dest = setVifRowRegs(offnum, vifRowReg + data);
break;
default:
dest = data;
break;
}
break;
case 1:
dest = vifRowReg;
break;
case 2:
dest = getVifColRegs((vif->cl > 2) ? 3 : vif->cl);
break;
case 3:
break;
}
// VIF_LOG("writeX %8.8x : Mode %d, r0 = %x, data %8.8x", *dest,vifRegs->mode,vifRegs->r0,data);
}
template <class T>
void __fastcall UNPACK_S(u32 *dest, T *data, int size)
{
//S-# will always be a complete packet, no matter what. So we can skip the offset bits
writeXYZW(OFFSET_X, *dest++, *data);
writeXYZW(OFFSET_Y, *dest++, *data);
writeXYZW(OFFSET_Z, *dest++, *data);
writeXYZW(OFFSET_W, *dest , *data);
}
template <class T>
void __fastcall UNPACK_V2(u32 *dest, T *data, int size)
{
if (vifRegs->offset == OFFSET_X)
{
if (size > 0)
{
writeXYZW(vifRegs->offset, *dest++, *data++);
vifRegs->offset = OFFSET_Y;
size--;
}
}
if (vifRegs->offset == OFFSET_Y)
{
if (size > 0)
{
writeXYZW(vifRegs->offset, *dest++, *data);
vifRegs->offset = OFFSET_Z;
size--;
}
}
if (vifRegs->offset == OFFSET_Z)
{
writeXYZW(vifRegs->offset, *dest++, *dest-2);
vifRegs->offset = OFFSET_W;
}
if (vifRegs->offset == OFFSET_W)
{
writeXYZW(vifRegs->offset, *dest, *data);
vifRegs->offset = OFFSET_X;
}
}
template <class T>
void __fastcall UNPACK_V3(u32 *dest, T *data, int size)
{
if(vifRegs->offset == OFFSET_X)
{
if (size > 0)
{
writeXYZW(vifRegs->offset, *dest++, *data++);
vifRegs->offset = OFFSET_Y;
size--;
}
}
if(vifRegs->offset == OFFSET_Y)
{
if (size > 0)
{
writeXYZW(vifRegs->offset, *dest++, *data++);
vifRegs->offset = OFFSET_Z;
size--;
}
}
if(vifRegs->offset == OFFSET_Z)
{
if (size > 0)
{
writeXYZW(vifRegs->offset, *dest++, *data++);
vifRegs->offset = OFFSET_W;
size--;
}
}
if(vifRegs->offset == OFFSET_W)
{
//V3-# does some bizarre thing with alignment; every 6qw of data the W becomes 0 (strange console!)
//Ape Escape doesn't seem to like it though (what the hell?); gonna have to investigate
writeXYZW(vifRegs->offset, *dest, *data);
vifRegs->offset = OFFSET_X;
}
}
template <class T>
void __fastcall UNPACK_V4(u32 *dest, T *data , int size)
{
while (size > 0)
{
writeXYZW(vifRegs->offset, *dest++, *data++);
vifRegs->offset++;
size--;
}
if (vifRegs->offset > OFFSET_W) vifRegs->offset = OFFSET_X;
}
void __fastcall UNPACK_V4_5(u32 *dest, u32 *data, int size)
{
//As with S-#, this will always be a complete packet
writeXYZW(OFFSET_X, *dest++, ((*data & 0x001f) << 3));
writeXYZW(OFFSET_Y, *dest++, ((*data & 0x03e0) >> 2));
writeXYZW(OFFSET_Z, *dest++, ((*data & 0x7c00) >> 7));
writeXYZW(OFFSET_W, *dest, ((*data & 0x8000) >> 8));
}
void __fastcall UNPACK_S_32(u32 *dest, u32 *data, int size)
{
UNPACK_S(dest, data, size);
}
void __fastcall UNPACK_S_16s(u32 *dest, u32 *data, int size)
{
s16 *sdata = (s16*)data;
UNPACK_S(dest, sdata, size);
}
void __fastcall UNPACK_S_16u(u32 *dest, u32 *data, int size)
{
u16 *sdata = (u16*)data;
UNPACK_S(dest, sdata, size);
}
void __fastcall UNPACK_S_8s(u32 *dest, u32 *data, int size)
{
s8 *cdata = (s8*)data;
UNPACK_S(dest, cdata, size);
}
void __fastcall UNPACK_S_8u(u32 *dest, u32 *data, int size)
{
u8 *cdata = (u8*)data;
UNPACK_S(dest, cdata, size);
}
void __fastcall UNPACK_V2_32(u32 *dest, u32 *data, int size)
{
UNPACK_V2(dest, data, size);
}
void __fastcall UNPACK_V2_16s(u32 *dest, u32 *data, int size)
{
s16 *sdata = (s16*)data;
UNPACK_V2(dest, sdata, size);
}
void __fastcall UNPACK_V2_16u(u32 *dest, u32 *data, int size)
{
u16 *sdata = (u16*)data;
UNPACK_V2(dest, sdata, size);
}
void __fastcall UNPACK_V2_8s(u32 *dest, u32 *data, int size)
{
s8 *cdata = (s8*)data;
UNPACK_V2(dest, cdata, size);
}
void __fastcall UNPACK_V2_8u(u32 *dest, u32 *data, int size)
{
u8 *cdata = (u8*)data;
UNPACK_V2(dest, cdata, size);
}
void __fastcall UNPACK_V3_32(u32 *dest, u32 *data, int size)
{
UNPACK_V3(dest, data, size);
}
void __fastcall UNPACK_V3_16s(u32 *dest, u32 *data, int size)
{
s16 *sdata = (s16*)data;
UNPACK_V3(dest, sdata, size);
}
void __fastcall UNPACK_V3_16u(u32 *dest, u32 *data, int size)
{
u16 *sdata = (u16*)data;
UNPACK_V3(dest, sdata, size);
}
void __fastcall UNPACK_V3_8s(u32 *dest, u32 *data, int size)
{
s8 *cdata = (s8*)data;
UNPACK_V3(dest, cdata, size);
}
void __fastcall UNPACK_V3_8u(u32 *dest, u32 *data, int size)
{
u8 *cdata = (u8*)data;
UNPACK_V3(dest, cdata, size);
}
void __fastcall UNPACK_V4_32(u32 *dest, u32 *data , int size)
{
UNPACK_V4(dest, data, size);
}
void __fastcall UNPACK_V4_16s(u32 *dest, u32 *data, int size)
{
s16 *sdata = (s16*)data;
UNPACK_V4(dest, sdata, size);
}
void __fastcall UNPACK_V4_16u(u32 *dest, u32 *data, int size)
{
u16 *sdata = (u16*)data;
UNPACK_V4(dest, sdata, size);
}
void __fastcall UNPACK_V4_8s(u32 *dest, u32 *data, int size)
{
s8 *cdata = (s8*)data;
UNPACK_V4(dest, cdata, size);
}
void __fastcall UNPACK_V4_8u(u32 *dest, u32 *data, int size)
{
u8 *cdata = (u8*)data;
UNPACK_V4(dest, cdata, size);
}
static __forceinline bool mfifoVIF1rbTransfer()
{
u32 maddr = dmacRegs->rbor.ADDR;

View File

@ -17,7 +17,6 @@
#include "PrecompiledHeader.h"
#include "Common.h"
#include "VifDma.h"
#include "VifDma_internal.h"
#include "VUmicro.h"

View File

@ -17,7 +17,6 @@
#include "PrecompiledHeader.h"
#include "Common.h"
#include "VifDma.h"
#include "VifDma_internal.h"
#include "GS.h"

View File

@ -16,7 +16,6 @@
#include "PrecompiledHeader.h"
#include "Common.h"
#include "VifDma.h"
#include "VifDma_internal.h"
#include "VUmicro.h"
@ -32,53 +31,9 @@ extern "C"
extern u32* vifRow;
}
extern vifStruct *vif;
int g_vifCycles = 0;
u8 s_maskwrite[256];
/* block size; data size; group size; qword size; */
#define _UNPACK_TABLE32(name, bsize, dsize, gsize, qsize) \
{ UNPACK_##name, UNPACK_##name, \
bsize, dsize, gsize, qsize },
#define _UNPACK_TABLE(name, bsize, dsize, gsize, qsize) \
{ UNPACK_##name##u, UNPACK_##name##s, \
bsize, dsize, gsize, qsize },
// Main table for function unpacking
const VIFUnpackFuncTable VIFfuncTable[16] =
{
_UNPACK_TABLE32(S_32, 1, 4, 4, 4) // 0x0 - S-32
_UNPACK_TABLE(S_16, 2, 2, 2, 4) // 0x1 - S-16
_UNPACK_TABLE(S_8, 4, 1, 1, 4) // 0x2 - S-8
{
NULL, NULL, 0, 0, 0, 0
}
, // 0x3
_UNPACK_TABLE32(V2_32, 24, 4, 8, 2) // 0x4 - V2-32
_UNPACK_TABLE(V2_16, 12, 2, 4, 2) // 0x5 - V2-16
_UNPACK_TABLE(V2_8, 6, 1, 2, 2) // 0x6 - V2-8
{
NULL, NULL, 0, 0, 0, 0
}
, // 0x7
_UNPACK_TABLE32(V3_32, 36, 4, 12, 3) // 0x8 - V3-32
_UNPACK_TABLE(V3_16, 18, 2, 6, 3) // 0x9 - V3-16
_UNPACK_TABLE(V3_8, 9, 1, 3, 3) // 0xA - V3-8
{
NULL, NULL, 0, 0, 0, 0
}
, // 0xB
_UNPACK_TABLE32(V4_32, 48, 4, 16, 4) // 0xC - V4-32
_UNPACK_TABLE(V4_16, 24, 2, 8, 4) // 0xD - V4-16
_UNPACK_TABLE(V4_8, 12, 1, 4, 4) // 0xE - V4-8
_UNPACK_TABLE32(V4_5, 6, 2, 2, 4) // 0xF - V4-5
};
struct VIFSSEUnpackTable
{
// regular 0, 1, 2; mask 0, 1, 2
@ -171,6 +126,9 @@ template<const u32 VIFdmanum> void ProcessMemSkip(u32 size, u32 unpackType)
{
const VIFUnpackFuncTable *unpack;
// unpackType is only 0->0xf but that's ok, because the data we're using here is
// just duplicated in 0x10->0x1f.
unpack = &VIFfuncTable[ unpackType ];
switch (unpackType)
@ -259,9 +217,6 @@ template u32 VIFalign<1>(u32 *data, vifCode *v, u32 size);
template<const u32 VIFdmanum> u32 VIFalign(u32 *data, vifCode *v, u32 size)
{
u32 *dest;
u32 unpackType;
UNPACKFUNCTYPE func;
const VIFUnpackFuncTable *ft;
VURegs * VU;
u8 *cdata = (u8*)data;
@ -290,11 +245,8 @@ template<const u32 VIFdmanum> u32 VIFalign(u32 *data, vifCode *v, u32 size)
VIF_LOG("VIF%d UNPACK Align: Mode=%x, v->size=%d, size=%d, v->addr=%x v->num=%x",
VIFdmanum, v->cmd & 0xf, v->size, size, v->addr, vifRegs->num);
// The unpack type
unpackType = v->cmd & 0xf;
ft = &VIFfuncTable[ unpackType ];
func = vif->usn ? ft->funcU : ft->funcS;
const VIFUnpackFuncTable& ft( VIFfuncTable[ v->cmd & 0x1f ] );
UNPACKFUNCTYPE func = vif->usn ? ft.funcU : ft.funcS;
size <<= 2;
memsize = size;
@ -311,17 +263,17 @@ template<const u32 VIFdmanum> u32 VIFalign(u32 *data, vifCode *v, u32 size)
VIFUNPACK_LOG("Aligning packet size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr);
if (((u32)size / (u32)ft->dsize) < ((u32)ft->qsize - vifRegs->offset))
if (((u32)size / (u32)ft.dsize) < ((u32)ft.qsize - vifRegs->offset))
{
DevCon.Error("Wasn't enough left size/dsize = %x left to write %x", (size / ft->dsize), (ft->qsize - vifRegs->offset));
DevCon.Error("Wasn't enough left size/dsize = %x left to write %x", (size / ft.dsize), (ft.qsize - vifRegs->offset));
}
unpacksize = min((size / ft->dsize), (ft->qsize - vifRegs->offset));
unpacksize = min((size / ft.dsize), (ft.qsize - vifRegs->offset));
VIFUNPACK_LOG("Increasing dest by %x from offset %x", (4 - ft->qsize) + unpacksize, vifRegs->offset);
VIFUNPACK_LOG("Increasing dest by %x from offset %x", (4 - ft.qsize) + unpacksize, vifRegs->offset);
func(dest, (u32*)cdata, unpacksize);
size -= unpacksize * ft->dsize;
(vif->usn ? ft.oddU : ft.oddS)(dest, (u32*)cdata, unpacksize);
size -= unpacksize * ft.dsize;
if(vifRegs->offset == 0)
{
@ -339,13 +291,13 @@ template<const u32 VIFdmanum> u32 VIFalign(u32 *data, vifCode *v, u32 size)
{
if (vifRegs->cycle.cl != vifRegs->cycle.wl)
{
vif->tag.addr += (((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + ((4 - ft->qsize) + unpacksize)) * 4;
dest += ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + (4 - ft->qsize) + unpacksize;
vif->tag.addr += (((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + ((4 - ft.qsize) + unpacksize)) * 4;
dest += ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + (4 - ft.qsize) + unpacksize;
}
else
{
vif->tag.addr += ((4 - ft->qsize) + unpacksize) * 4;
dest += (4 - ft->qsize) + unpacksize;
vif->tag.addr += ((4 - ft.qsize) + unpacksize) * 4;
dest += (4 - ft.qsize) + unpacksize;
}
if (vif->tag.addr >= (u32)vif_size(VIFdmanum))
@ -354,7 +306,7 @@ template<const u32 VIFdmanum> u32 VIFalign(u32 *data, vifCode *v, u32 size)
dest = (u32*)(VU->Mem + v->addr);
}
cdata += unpacksize * ft->dsize;
cdata += unpacksize * ft.dsize;
vif->cl = 0;
VIFUNPACK_LOG("Aligning packet done size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr);
if ((size & 0xf) == 0) return size >> 2;
@ -362,8 +314,8 @@ template<const u32 VIFdmanum> u32 VIFalign(u32 *data, vifCode *v, u32 size)
}
else
{
vif->tag.addr += ((4 - ft->qsize) + unpacksize) * 4;
dest += (4 - ft->qsize) + unpacksize;
vif->tag.addr += ((4 - ft.qsize) + unpacksize) * 4;
dest += (4 - ft.qsize) + unpacksize;
if (vif->tag.addr >= (u32)vif_size(VIFdmanum))
{
@ -371,7 +323,7 @@ template<const u32 VIFdmanum> u32 VIFalign(u32 *data, vifCode *v, u32 size)
dest = (u32*)(VU->Mem + v->addr);
}
cdata += unpacksize * ft->dsize;
cdata += unpacksize * ft.dsize;
VIFUNPACK_LOG("Aligning packet done size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr);
}
}
@ -391,11 +343,11 @@ template<const u32 VIFdmanum> u32 VIFalign(u32 *data, vifCode *v, u32 size)
VIFUNPACK_LOG("Continuing last stream size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr);
incdest = ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + 4;
while ((size >= ft->gsize) && (vifRegs->num > 0))
while ((size >= ft.gsize) && (vifRegs->num > 0))
{
func(dest, (u32*)cdata, ft->qsize);
cdata += ft->gsize;
size -= ft->gsize;
func(dest, (u32*)cdata);
cdata += ft.gsize;
size -= ft.gsize;
vifRegs->num--;
++vif->cl;
@ -431,20 +383,20 @@ template<const u32 VIFdmanum> u32 VIFalign(u32 *data, vifCode *v, u32 size)
}
}
if (size >= ft->dsize && vifRegs->num > 0 && ((size & 0xf) != 0 || vif->cl != 0))
if (size >= ft.dsize && vifRegs->num > 0 && ((size & 0xf) != 0 || vif->cl != 0))
{
//VIF_LOG("warning, end with size = %d", size);
/* unpack one qword */
if(vif->tag.addr + ((size / ft->dsize) * 4) >= (u32)vif_size(VIFdmanum))
if(vif->tag.addr + ((size / ft.dsize) * 4) >= (u32)vif_size(VIFdmanum))
{
//DevCon.Warning("Overflow");
vif->tag.addr &= (u32)(vif_size(VIFdmanum) - 1);
dest = (u32*)(VU->Mem + v->addr);
}
vif->tag.addr += (size / ft->dsize) * 4;
vif->tag.addr += (size / ft.dsize) * 4;
func(dest, (u32*)cdata, size / ft->dsize);
(vif->usn ? ft.oddU : ft.oddS)(dest, (u32*)cdata, size / ft.dsize);
size = 0;
if(vifRegs->mode == 2)
@ -468,9 +420,6 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size)
{
//DevCon.WriteLn("vif#%d, size = %d [%x]", VIFdmanum, size, data);
u32 *dest;
u32 unpackType;
UNPACKFUNCTYPE func;
const VIFUnpackFuncTable *ft;
VURegs * VU;
u8 *cdata = (u8*)data;
u32 tempsize = 0;
@ -507,13 +456,10 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size)
VIFUNPACK_LOG("USN %x Masking %x Mask %x Mode %x CL %x WL %x Offset %x", vif->usn, (vifRegs->code & 0x10000000) >> 28, vifRegs->mask, vifRegs->mode, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->offset);
// The unpack type
unpackType = v->cmd & 0xf;
_mm_prefetch((char*)data + 128, _MM_HINT_NTA);
ft = &VIFfuncTable[ unpackType ];
func = vif->usn ? ft->funcU : ft->funcS;
const VIFUnpackFuncTable& ft( VIFfuncTable[ v->cmd & 0x1f ] );
UNPACKFUNCTYPE func = vif->usn ? ft.funcU : ft.funcS;
size <<= 2;
@ -528,12 +474,12 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size)
dest = (u32*)(VU->Mem + v->addr);
}
size = min(size, (int)vifRegs->num * ft->gsize); //size will always be the same or smaller
size = std::min<u32>(size, vifRegs->num * ft.gsize); //size will always be the same or smaller
tempsize = vif->tag.addr + ((((vifRegs->num-1) / vifRegs->cycle.wl) *
(vifRegs->cycle.cl - vifRegs->cycle.wl)) * 16) + (vifRegs->num * 16);
/*tempsize = vif->tag.addr + (((size / (ft->gsize * vifRegs->cycle.wl)) *
/*tempsize = vif->tag.addr + (((size / (ft.gsize * vifRegs->cycle.wl)) *
(vifRegs->cycle.cl - vifRegs->cycle.wl)) * 16) + (vifRegs->num * 16);*/
//Sanity Check (memory overflow)
@ -562,7 +508,7 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size)
#endif
}
if (size >= ft->gsize)
if (size >= ft.gsize)
{
const UNPACKPARTFUNCTYPESSE* pfn;
int writemask;
@ -612,7 +558,7 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size)
vifRegs->cycle.cl = vifRegs->cycle.wl = 1;
}
pfn = vif->usn ? VIFfuncTableSSE[unpackType].funcU : VIFfuncTableSSE[unpackType].funcS;
pfn = vif->usn ? VIFfuncTableSSE[v->cmd & 0xf].funcU : VIFfuncTableSSE[v->cmd & 0xf].funcS;
writemask = VIFdmanum ? g_vif1HasMask3[min(vifRegs->cycle.wl,(u8)3)] : g_vif0HasMask3[min(vifRegs->cycle.wl,(u8)3)];
writemask = pfn[(((vifRegs->code & 0x10000000)>>28)<<writemask)*3+vifRegs->mode](dest, (u32*)cdata, size);
@ -630,20 +576,20 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size)
// if size is left over, update the src,dst pointers
if (writemask > 0)
{
int left = (size - writemask) / ft->gsize;
cdata += left * ft->gsize;
int left = (size - writemask) / ft.gsize;
cdata += left * ft.gsize;
dest = (u32*)((u8*)dest + ((left / vifRegs->cycle.wl) * vifRegs->cycle.cl + left % vifRegs->cycle.wl) * 16);
vifRegs->num -= left;
vif->cl = (size % (ft->gsize * vifRegs->cycle.wl)) / ft->gsize;
vif->cl = (size % (ft.gsize * vifRegs->cycle.wl)) / ft.gsize;
size = writemask;
if (size >= ft->dsize && vifRegs->num > 0)
if (size >= ft.dsize && vifRegs->num > 0)
{
VIF_LOG("warning, end with size = %d", size);
/* unpack one qword */
//vif->tag.addr += (size / ft->dsize) * 4;
func(dest, (u32*)cdata, size / ft->dsize);
//vif->tag.addr += (size / ft.dsize) * 4;
(vif->usn ? ft.oddU : ft.oddS)(dest, (u32*)cdata, size / ft.dsize);
size = 0;
if(vifRegs->mode == 2)
@ -659,8 +605,8 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size)
}
else
{
vifRegs->num -= size / ft->gsize;
if (vifRegs->num > 0) vif->cl = (size % (ft->gsize * vifRegs->cycle.wl)) / ft->gsize;
vifRegs->num -= size / ft.gsize;
if (vifRegs->num > 0) vif->cl = (size % (ft.gsize * vifRegs->cycle.wl)) / ft.gsize;
size = 0;
}
}
@ -669,11 +615,14 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size)
int incdest = ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + 4;
size = 0;
int addrstart = v->addr;
if((tempsize >> 2) != vif->tag.size) DevCon.Warning("split when size != tagsize");
#ifndef NON_SSE_UNPACKS // spams pointlessly when SSE unpacks are disabled
//if((tempsize >> 2) != vif->tag.size) DevCon.Warning("split when size != tagsize");
#endif
VIFUNPACK_LOG("sorting tempsize :p, size %d, vifnum %d, addr %x", tempsize, vifRegs->num, vif->tag.addr);
while ((tempsize >= ft->gsize) && (vifRegs->num > 0))
while ((tempsize >= ft.gsize) && (vifRegs->num > 0))
{
if(v->addr >= memlimit)
{
@ -682,9 +631,9 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size)
dest = (u32*)(VU->Mem + v->addr);
}
func(dest, (u32*)cdata, ft->qsize);
cdata += ft->gsize;
tempsize -= ft->gsize;
func(dest, (u32*)cdata);
cdata += ft.gsize;
tempsize -= ft.gsize;
vifRegs->num--;
++vif->cl;
@ -721,13 +670,13 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size)
if(tempsize > 0) size = tempsize;
}
if (size >= ft->dsize && vifRegs->num > 0) //Else write what we do have
if (size >= ft.dsize && vifRegs->num > 0) //Else write what we do have
{
VIF_LOG("warning, end with size = %d", size);
/* unpack one qword */
//vif->tag.addr += (size / ft->dsize) * 4;
func(dest, (u32*)cdata, size / ft->dsize);
//vif->tag.addr += (size / ft.dsize) * 4;
(vif->usn ? ft.oddU : ft.oddS)(dest, (u32*)cdata, size / ft.dsize);
size = 0;
if(vifRegs->mode == 2)
@ -745,8 +694,8 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size)
{
if(vifRegs->cycle.cl > 0) // Quicker and avoids zero division :P
if((u32)(((size / ft->gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num)
DevCon.Warning("Filling write warning! %x < %x and CL = %x WL = %x", (size / ft->gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl);
if((u32)(((size / ft.gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num)
DevCon.Warning("Filling write warning! %x < %x and CL = %x WL = %x", (size / ft.gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl);
//DevCon.Warning("filling write %d cl %d, wl %d mask %x mode %x unpacktype %x addr %x", vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mask, vifRegs->mode, unpackType, vif->tag.addr);
while (vifRegs->num > 0)
@ -758,15 +707,16 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size)
if (vif->cl < vifRegs->cycle.cl) /* unpack one qword */
{
if(size < ft->gsize)
if(size < ft.gsize)
{
VIF_LOG("Out of Filling write data");
break;
}
func(dest, (u32*)cdata, ft->qsize);
cdata += ft->gsize;
size -= ft->gsize;
func(dest, (u32*)cdata);
cdata += ft.gsize;
size -= ft.gsize;
vif->cl++;
vifRegs->num--;
@ -777,7 +727,7 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size)
}
else
{
func(dest, (u32*)cdata, ft->qsize);
func(dest, (u32*)cdata);
vif->tag.addr += 16;
vifRegs->num--;
++vif->cl;

View File

@ -47,40 +47,6 @@ extern vifStruct vif0, vif1;
extern u8 schedulepath3msk;
static const int VifCycleVoodoo = 4;
void __fastcall UNPACK_S_32( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_S_16u( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_S_16s( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_S_8u( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_S_8s( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V2_32( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V2_16u( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V2_16s( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V2_8u( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V2_8s( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V3_32( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V3_16u( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V3_16s( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V3_8u( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V3_8s( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V4_32( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V4_16u( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V4_16s( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V4_8u( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V4_8s( u32 *dest, u32 *data, int size );
void __fastcall UNPACK_V4_5( u32 *dest, u32 *data, int size );
extern void vifDmaInit();
extern void vif0Init();

View File

@ -16,6 +16,8 @@
#ifndef __VIFDMA_INTERNAL_H__
#define __VIFDMA_INTERNAL_H__
#include "VifDma.h"
enum VifModes
{
VIF_NORMAL_TO_MEM_MODE = 0,
@ -27,7 +29,8 @@ enum VifModes
static const unsigned int VIF0intc = 4;
static const unsigned int VIF1intc = 5;
typedef void (__fastcall *UNPACKFUNCTYPE)(u32 *dest, u32 *data, int size);
typedef void (__fastcall *UNPACKFUNCTYPE)(u32 *dest, u32 *data);
typedef void (__fastcall *UNPACKFUNCTYPE_ODD)(u32 *dest, u32 *data, int size);
typedef int (*UNPACKPARTFUNCTYPESSE)(u32 *dest, u32 *data, int size);
struct VIFUnpackFuncTable
@ -35,18 +38,23 @@ struct VIFUnpackFuncTable
UNPACKFUNCTYPE funcU;
UNPACKFUNCTYPE funcS;
u32 bsize; // currently unused
u32 dsize; // byte size of one channel
u32 gsize; // size of data in bytes used for each write cycle
u32 qsize; // used for unpack parts, num of vectors that
UNPACKFUNCTYPE_ODD oddU; // needed for old-style vif only, remove when old vif is removed.
UNPACKFUNCTYPE_ODD oddS; // needed for old-style vif only, remove when old vif is removed.
u8 bsize; // currently unused
u8 dsize; // byte size of one channel
u8 gsize; // size of data in bytes used for each write cycle
u8 qsize; // used for unpack parts, num of vectors that
// will be decompressed from data for 1 cycle
};
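// Usage note (illustration, not part of this header): funcU/funcS now unpack one
// full write-group with the element count baked into their template instantiation,
// which is why they lost the 'size' parameter; oddU/oddS keep the old
// (dest, data, size) signature so the legacy VIF path can still flush a partial
// group, e.g.  (vif->usn ? ft.oddU : ft.oddS)(dest, (u32*)cdata, size / ft.dsize);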
extern const VIFUnpackFuncTable VIFfuncTable[16];
extern const __aligned16 VIFUnpackFuncTable VIFfuncTable[32];
extern __aligned16 u32 g_vif0Masks[64], g_vif1Masks[64];
extern u32 g_vif0HasMask3[4], g_vif1HasMask3[4];
extern int g_vifCycles;
extern u8 s_maskwrite[256];
extern vifStruct *vif;
template<const u32 VIFdmanum> void ProcessMemSkip(u32 size, u32 unpackType);
template<const u32 VIFdmanum> u32 VIFalign(u32 *data, vifCode *v, u32 size);
@ -63,4 +71,9 @@ static __forceinline u32 vif_size(u8 num)
//#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
//#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined)
//#define newVif0 // Use New Code for Vif0 Unpacks (not implemented)
#ifndef newVif
//# define NON_SSE_UNPACKS // Turns off SSE Unpacks (slower)
#endif
#endif

View File

@ -824,6 +824,10 @@
RelativePath="..\..\VifDma_internal.h"
>
</File>
<File
RelativePath="..\..\VIFunpack.cpp"
>
</File>
<Filter
Name="Dynarec"
>

View File

@ -24,8 +24,8 @@ extern void _nVifUnpack(int idx, u8 *data, u32 size);
typedef u32 (__fastcall *nVifCall)(void*, void*);
static __pagealigned u8 nVifUpkExec[__pagesize*16];
static __aligned16 nVifCall nVifUpk[(2*2*16)*4]; // ([USN][Masking][Unpack Type]) [curCycle]
static __pagealigned u8 nVifUpkExec[__pagesize*4];
static __aligned16 nVifCall nVifUpk[(2*2*16) *4 ]; // ([USN][Masking][Unpack Type]) [curCycle]
static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
#define _1mb (0x100000)
@ -57,7 +57,30 @@ struct nVifStruct {
BlockBuffer* vifCache; // Block Buffer
};
static const u32 nVifT[16] = {
// Contents of this table are doubled up for doMask(false) and doMask(true) lookups.
// (note: currently unused, I'm using gsize in the interp tables instead since it
// seems to be faster for now, which may change when nVif isn't reliant on interpreted
// unpackers anymore --air)
static const u32 nVifT[32] = {
4, // S-32
2, // S-16
1, // S-8
0, // ----
8, // V2-32
4, // V2-16
2, // V2-8
0, // ----
12,// V3-32
6, // V3-16
3, // V3-8
0, // ----
16,// V4-32
8, // V4-16
4, // V4-8
2, // V4-5
// Second verse, same as the first!
4, // S-32
2, // S-16
1, // S-8
@ -77,8 +100,8 @@ static const u32 nVifT[16] = {
};
#include "newVif_OldUnpack.inl"
#include "newVif_UnpackGen.inl"
#include "newVif_Unpack.inl"
#include "newVif_UnpackGen.inl"
//#include "newVif_Dynarec.inl"

View File

@ -21,8 +21,6 @@ template void VIFunpack<0>(u32 *data, vifCode *v, u32 size);
template void VIFunpack<1>(u32 *data, vifCode *v, u32 size);
template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size) {
//if (!VIFdmanum) DevCon.WriteLn("vif#%d, size = %d [%x]", VIFdmanum, size, data);
UNPACKFUNCTYPE func;
const VIFUnpackFuncTable *ft;
VURegs * VU;
u8 *cdata = (u8*)data;
u32 tempsize = 0;
@ -44,10 +42,10 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size) {
}
u32 *dest = (u32*)(VU->Mem + v->addr);
u32 unpackType = v->cmd & 0xf;
ft = &VIFfuncTable[ unpackType ];
func = vif->usn ? ft->funcU : ft->funcS;
const VIFUnpackFuncTable& ft( VIFfuncTable[ v->cmd & 0x1f ] );
UNPACKFUNCTYPE func = vif->usn ? ft.funcU : ft.funcS;
size <<= 2;
if (vifRegs->cycle.cl >= vifRegs->cycle.wl) { // skipping write
@ -57,7 +55,7 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size) {
dest = (u32*)(VU->Mem + v->addr);
}
size = min(size, (int)vifRegs->num * ft->gsize); //size will always be the same or smaller
size = std::min<u32>(size, vifRegs->num * ft.gsize); //size will always be the same or smaller
tempsize = v->addr + ((((vifRegs->num-1) / vifRegs->cycle.wl) *
(vifRegs->cycle.cl - vifRegs->cycle.wl)) * 16) + (vifRegs->num * 16);
@ -90,16 +88,16 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size) {
VIFUNPACK_LOG("sorting tempsize :p, size %d, vifnum %d, addr %x", tempsize, vifRegs->num, v->addr);
while ((tempsize >= ft->gsize) && (vifRegs->num > 0)) {
while ((tempsize >= ft.gsize) && (vifRegs->num > 0)) {
if(v->addr >= memlimit) {
DevCon.Warning("Mem limit overflow");
v->addr &= (memlimit - 1);
dest = (u32*)(VU->Mem + v->addr);
}
func(dest, (u32*)cdata, ft->qsize);
cdata += ft->gsize;
tempsize -= ft->gsize;
func(dest, (u32*)cdata);
cdata += ft.gsize;
tempsize -= ft.gsize;
vifRegs->num--;
vif->cl++;
@ -122,32 +120,32 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size) {
if(tempsize > 0) size = tempsize;
}
if (size >= ft->dsize && vifRegs->num > 0) { //Else write what we do have
if (size >= ft.dsize && vifRegs->num > 0) { //Else write what we do have
DevCon.Warning("huh!!!!!!!!!!!!!!!!!!!!!!");
VIF_LOG("warning, end with size = %d", size);
// unpack one qword
//v->addr += (size / ft->dsize) * 4;
func(dest, (u32*)cdata, size / ft->dsize);
//v->addr += (size / ft.dsize) * 4;
(vif->usn ? ft.oddU : ft.oddS)(dest, (u32*)cdata, size / ft.dsize);
size = 0;
VIFUNPACK_LOG("leftover done, size %d, vifnum %d, addr %x", size, vifRegs->num, v->addr);
}
}
else { // filling write
if(vifRegs->cycle.cl > 0) // Quicker and avoids zero division :P
if((u32)(((size / ft->gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num)
DevCon.Warning("Filling write warning! %x < %x and CL = %x WL = %x", (size / ft->gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl);
if((u32)(((size / ft.gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num)
DevCon.Warning("Filling write warning! %x < %x and CL = %x WL = %x", (size / ft.gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl);
DevCon.Warning("filling write %d cl %d, wl %d mask %x mode %x unpacktype %x addr %x", vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mask, vifRegs->mode, unpackType, vif->tag.addr);
DevCon.Warning("filling write %d cl %d, wl %d mask %x mode %x unpacktype %x addr %x", vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mask, vifRegs->mode, v->cmd & 0xf, vif->tag.addr);
while (vifRegs->num > 0) {
if (vif->cl == vifRegs->cycle.wl) {
vif->cl = 0;
}
// unpack one qword
if (vif->cl < vifRegs->cycle.cl) {
if(size < ft->gsize) { DevCon.WriteLn("Out of Filling write data!"); break; }
func(dest, (u32*)cdata, ft->qsize);
cdata += ft->gsize;
size -= ft->gsize;
if(size < ft.gsize) { DevCon.WriteLn("Out of Filling write data!"); break; }
func(dest, (u32*)cdata);
cdata += ft.gsize;
size -= ft.gsize;
vif->cl++;
vifRegs->num--;
if (vif->cl == vifRegs->cycle.wl) {
@ -155,7 +153,7 @@ template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size) {
}
}
else {
func(dest, (u32*)cdata, ft->qsize);
func(dest, (u32*)cdata);
v->addr += 16;
vifRegs->num--;
vif->cl++;

View File

@ -21,29 +21,6 @@
static __aligned16 nVifStruct nVif[2];
void initNewVif(int idx) {
nVif[idx].idx = idx;
nVif[idx].VU = idx ? &VU1 : &VU0;
nVif[idx].vif = idx ? &vif1 : &vif0;
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
nVif[idx].vifCache = NULL;
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false);
memset8<0xcc>( nVifUpkExec );
xSetPtr( nVifUpkExec );
for (int a = 0; a < 2; a++) {
for (int b = 0; b < 2; b++) {
for (int c = 0; c < 4; c++) {
nVifGen(a, b, c);
}}}
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
}
int nVifUnpack(int idx, u32 *data) {
XMMRegisters::Freeze();
int ret = aMin(vif1.vifpacketsize, vif1.tag.size);
@ -108,65 +85,76 @@ static void setMasks(int idx, const VIFregisters& v) {
// ----------------------------------------------------------------------------
// Unpacking Optimization notes:
// ----------------------------------------------------------------------------
// Some games send a LOT of small packets. This is a problem because the new VIF unpacker
// has a lot of setup code to establish which unpack function to call. The best way to
// optimize this is to cache the unpack function's base (see fnbase below) and update it
// when the variables it depends on are modified: writes to vif->tag.cmd and vif->usn.
// Problem: vif->tag.cmd is modified a lot. Like, constantly. So won't work.
// Some games send a LOT of single-cycle packets (God of War, SotC, TriAce games, etc),
// so we always need to be wary of keeping loop setup code optimized. It's not always
// a "win" to move code outside the loop, as it normally would be in most other loop scenarios.
//
// A secondary optimization would be adding special handlers for packets where vifRegs->num==1.
// (which would remove the loop, simplify the incVUptr code, etc). But checking for it has
// to be simple enough that it doesn't offset the benefits (which I'm not sure is possible).
// -- air
// The biggest bottleneck of the current code is the call/ret needed to invoke the SSE
// unpackers. A better option is to generate the entire vifRegs->num loop code as part
// of the SSE template, and inline the SSE code into the heart of it. This both avoids
// the call/ret and opens the door for resolving some register dependency chains in the
// current emitted functions. (This is what zero's SSE does to get its final bit of
// speed advantage over the new vif). --air
//
// As a secondary optimization to the above, special handlers could be generated for the
// cycleSize==1 case, which is used frequently enough, and results in enough code
// elimination that it would probably be a win in most cases (and for sure in many
// "slow" games that need it most). --air
template< int idx, bool doMode, bool isFill >
__releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) {
const int usn = !!(vif->usn);
const int doMask = !!(vif->tag.cmd & 0x10);
const int upkNum = vif->tag.cmd & 0xf;
const u32& vift = nVifT[upkNum];
u8* dest = setVUptr(idx, vif->tag.addr);
const VIFUnpackFuncTable& ft = VIFfuncTable[upkNum];
UNPACKFUNCTYPE func = usn ? ft.funcU : ft.funcS;
// Did a bunch of work to make it so I could optimize this index lookup to outside
// the main loop but it was for naught -- too often the loop is only 1-2 iterations,
// so this setup code ends up being slower (1 iter) or same speed (2 iters).
const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + (doMask*16) + (upkNum)) * (4*1) ];
const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
const int skipSize = blockSize - cycleSize;
//if (skipSize > 2)
//DevCon.WriteLn("[num = %d][cl = %d][bl = %d][diff = %d]", vifRegs->num, vif->cl, blockSize, skipSize);
// This condition doesn't appear to ever occur, and really it never should.
// Normally it wouldn't matter, but even simple setup code matters here (see
// optimization notes above) >_<
if (vif->cmd & 0x10) setMasks(idx, *vifRegs);
const int usn = !!(vif->usn);
const int upkNum = vif->cmd & 0x1f;
//const s8& vift = nVifT[upkNum]; // might be useful later when other SSE paths are finished.
// Recompiled Unpacker, used when doMode is false.
// Did a bunch of work to make it so I could optimize this index lookup to outside
// the main loop but it was for naught -- too often the loop is only 1-2 iterations,
// so this setup code ends up being slower (1 iter) or same speed (2 iters).
const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + upkNum) * (4*1) ];
// Interpreted Unpacker, used if doMode is true OR if isFill is true. Lookup is
// always performed for now, due to ft.gsize reference (seems faster than using
// nVifT for now)
const VIFUnpackFuncTable& ft = VIFfuncTable[upkNum];
UNPACKFUNCTYPE func = usn ? ft.funcU : ft.funcS;
u8* dest = setVUptr(idx, vif->tag.addr);
if (vif->cl >= blockSize) vif->cl = 0;
if (doMask) setMasks(idx, *vifRegs);
while (vifRegs->num /*&& size*/) {
if (vif->cl < cycleSize) {
if (doMode /*|| doMask*/) {
//if (doMask)
//DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum);
func((u32*)dest, (u32*)data, ft.qsize);
func((u32*)dest, (u32*)data);
}
else {
//DevCon.WriteLn("SSE Unpack!");
fnbase[aMin(vif->cl, 4)](dest, data);
// Opt note: removing this min check (which isn't needed right now?) is +1%
// or more. Just something to keep in mind. :) --air
fnbase[0/*aMin(vif->cl, 4)*/](dest, data);
}
data += vift;
size -= vift;
data += ft.gsize;
size -= ft.gsize;
vifRegs->num--;
incVUptr(idx, dest, 16);
if (++vif->cl == blockSize) vif->cl = 0;
}
else if (isFill) {
func((u32*)dest, (u32*)data, ft.qsize);
func((u32*)dest, (u32*)data);
vifRegs->num--;
incVUptr(idx, dest, 16);
if (++vif->cl == blockSize) vif->cl = 0;
@ -179,7 +167,24 @@ __releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) {
//if (size > 0) DevCon.WriteLn("size = %d", size);
}
void _nVifUnpack(int idx, u8 *data, u32 size) {
typedef void (__fastcall* Fnptr_VifUnpackLoop)(u8 *data, u32 size);
static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] =
{
{
{ _nVifUnpackLoop<0,false,false>, _nVifUnpackLoop<0,false,true> },
{ _nVifUnpackLoop<0,true,false>, _nVifUnpackLoop<0,true,true> },
},
{
{ _nVifUnpackLoop<1,false,false>, _nVifUnpackLoop<1,false,true> },
{ _nVifUnpackLoop<1,true,false>, _nVifUnpackLoop<1,true,true> },
},
};
static _f void _nVifUnpack(int idx, u8 *data, u32 size) {
/*if (nVif[idx].vifRegs->cycle.cl >= nVif[idx].vifRegs->cycle.wl) { // skipping write
if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2);
else VIFunpack<1>((u32*)data, &vif1.tag, size>>2);
@ -192,19 +197,7 @@ void _nVifUnpack(int idx, u8 *data, u32 size) {
const bool doMode = vifRegs->mode && !(vif->tag.cmd & 0x10);
const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
//UnpackLoopTable[idx][doMode][isFill]( data, size );
if (idx) {
if (doMode) {
if (isFill) _nVifUnpackLoop<1,true,true> (data, size);
else _nVifUnpackLoop<1,true,false> (data, size);
}
else {
if (isFill) _nVifUnpackLoop<1,false,true> (data, size);
else _nVifUnpackLoop<1,false,false>(data, size);
}
}
else pxFailDev( "No VIF0 support yet, sorry!" );
UnpackLoopTable[idx][doMode][isFill]( data, size );
//if (isFill)
//DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok"));

View File

@ -43,7 +43,7 @@ struct VifUnpackIndexer {
int packpart = packType;
int curpart = curCycle;
return nVifUpk[((usnpart+maskpart+packpart)*4) + (curpart)];
return nVifUpk[((usnpart+maskpart+packpart) * 4) + (curpart)];
}
void xSetCall(int packType) const {
@ -158,6 +158,12 @@ void nVifGen(int usn, int mask, int curCycle) {
// A | B5 | G5 | R5
// ..0.. A 0000000 | ..0.. B 000 | ..0.. G 000 | ..0.. R 000
// Optimization: This function has a *really* long dependency chain.
// It would be better if the value at [edx] were loaded into multiple regs and
// then the regs are shifted each independently, instead of using the
// progressive shift->move pattern below. --air
indexer.xSetCall(0xf); // V4-5
xMOV16 (xmm0, ptr32[edx]);
xMOVAPS (xmm1, xmm0);
@ -184,3 +190,27 @@ void nVifGen(int usn, int mask, int curCycle) {
pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );
}
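// Scalar illustration of the independent-shift suggestion in the V4-5 note above
// (this is not the emitter code; the field layout is taken from UNPACK_V4_5 in
// VIFunpack.cpp earlier in this commit). Each channel depends only on the source
// word, so the four extractions can run in parallel rather than through one long
// progressive shift->move chain:
static __forceinline void unpackV4_5_scalar(u32* dest, u32 rgba16)
{
	dest[0] = (rgba16 & 0x001f) << 3;   // R5 -> bits 3..7
	dest[1] = (rgba16 & 0x03e0) >> 2;   // G5 -> bits 3..7
	dest[2] = (rgba16 & 0x7c00) >> 7;   // B5 -> bits 3..7
	dest[3] = (rgba16 & 0x8000) >> 8;   // A1 -> bit 7
}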
void initNewVif(int idx) {
nVif[idx].idx = idx;
nVif[idx].VU = idx ? &VU1 : &VU0;
nVif[idx].vif = idx ? &vif1 : &vif0;
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
nVif[idx].vifCache = NULL;
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false);
memset8<0xcc>( nVifUpkExec );
xSetPtr( nVifUpkExec );
for (int a = 0; a < 2; a++) {
for (int b = 0; b < 2; b++) {
for (int c = 0; c < 4; c++) {
nVifGen(a, b, c);
}
}}
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
}