--updated to r956

--uses test8 instead of test32 when possible
--exception handling checks are a bit more strict

git-svn-id: http://pcsx2.googlecode.com/svn/branches/vtlb-exp@957 96395faa-99c1-11dd-bbfe-3dabce05a288
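A quick sketch of the size win behind "uses test8 instead of test32": on a register operand, TEST r/m8, imm8 (F6 /0 ib) encodes in 3 bytes where TEST r/m32, imm32 (F7 /0 id) takes 6, so flag masks that fit in the low byte shrink the recompiled blocks. The helper below is a hypothetical illustration, not the emitter's actual API:

#include <cstdint>
#include <vector>

void emitTestEAX(std::vector<uint8_t>& code, uint32_t mask)
{
    if (mask <= 0xFF)
    {
        code.push_back(0xF6); // TEST r/m8, imm8
        code.push_back(0xC0); // ModRM: register-direct, AL
        code.push_back(static_cast<uint8_t>(mask));
    }
    else
    {
        code.push_back(0xF7); // TEST r/m32, imm32
        code.push_back(0xC0); // ModRM: register-direct, EAX
        for (int i = 0; i < 4; ++i)
            code.push_back(static_cast<uint8_t>(mask >> (8 * i)));
    }
}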
drkiiraziel 2009-04-12 02:49:23 +00:00
commit f3370ce28f
79 changed files with 8768 additions and 7640 deletions

View File

@ -55,8 +55,6 @@
// disable the default case in a switch
#define jNO_DEFAULT \
{ \
break; \
\
default: \
jASSUME(0); \
break; \
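Usage sketch: jNO_DEFAULT stands in for the default arm of a switch whose operand is already range-checked, so an "impossible" value asserts in devel builds instead of falling through silently:

switch (reg & 3)
{
    case 0: /* ... */ break;
    case 1: /* ... */ break;
    case 2: /* ... */ break;
    case 3: /* ... */ break;
    jNO_DEFAULT;
}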

View File

@ -20,11 +20,11 @@
#define __PCSX2CONFIG_H__
// Hack so that you can still use this file from C (not C++), or from a plugin without access to Paths.h.
#ifdef PLUGIN_ONLY
// .. and removed in favor of a less hackish approach (air)
#ifndef g_MaxPath
#define g_MaxPath 255
#else
#include "Paths.h"
#endif
#endif
/////////////////////////////////////////////////////////////////////////
// Session Configuration Override Flags
@ -45,6 +45,7 @@ extern SessionOverrideFlags g_Session;
// Pcsx2 User Configuration Options!
//#define PCSX2_MICROVU // Use Micro VU recs instead of Zero VU Recs
//#define PCSX2_MICROVU_ // Fully enable Micro VU recs (temporary option for now)
#define PCSX2_GSMULTITHREAD 1 // uses multi-threaded gs
#define PCSX2_EEREC 0x10
#define PCSX2_VU0REC 0x20

View File

@ -29,6 +29,8 @@
#define PCSX2_VERSION "(beta)"
#include "System.h"
#include "Plugins.h"
#include "SaveState.h"
@ -40,7 +42,4 @@
#include "Elfheader.h"
#include "Patch.h"
#include "System.h"
#include "Pcsx2Config.h"
#endif /* __COMMON_H__ */

View File

@ -164,7 +164,7 @@ struct vSyncTimingInfo
static vSyncTimingInfo vSyncInfo;
static __forceinline void vSyncInfoCalc( vSyncTimingInfo* info, u32 framesPerSecond, u32 scansPerFrame )
static void vSyncInfoCalc( vSyncTimingInfo* info, u32 framesPerSecond, u32 scansPerFrame )
{
// Important: Cannot use floats or doubles here. The emulator changes rounding modes
// depending on user-set speedhack options, and it can break float/double code
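A minimal sketch (invented numbers, not this function's real body) of the integer-only style the comment calls for. Everything stays in 64-bit tick units and rounds with integer math, so the result cannot drift with whatever rounding mode a speedhack last selected:

#include <cstdint>

// ticks per frame, rounded to nearest, without touching FPU/SSE state
uint64_t ticksPerFrame(uint64_t tickRate, uint32_t framesPerSecond)
{
    return (tickRate + framesPerSecond / 2) / framesPerSecond;
}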
@ -270,8 +270,6 @@ u32 UpdateVSyncRate()
return (u32)m_iTicks;
}
extern u32 vu0time;
void frameLimitReset()
{
m_iStart = GetCPUTicks();
@ -282,13 +280,13 @@ void frameLimitReset()
// See the GS FrameSkip function for details on why this is here and not in the GS.
static __forceinline void frameLimit()
{
if( CHECK_FRAMELIMIT == PCSX2_FRAMELIMIT_NORMAL ) return;
if( Config.CustomFps >= 999 ) return; // means the user would rather just have framelimiting turned off...
s64 sDeltaTime;
u64 uExpectedEnd;
u64 iEnd;
if( CHECK_FRAMELIMIT == PCSX2_FRAMELIMIT_NORMAL ) return;
if( Config.CustomFps >= 999 ) return; // means the user would rather just have framelimiting turned off...
uExpectedEnd = m_iStart + m_iTicks;
iEnd = GetCPUTicks();
@ -465,7 +463,7 @@ __forceinline bool rcntUpdate_vSync()
return false;
}
static __forceinline void __fastcall _cpuTestTarget( int i )
static __forceinline void _cpuTestTarget( int i )
{
if (counters[i].count < counters[i].target) return;
@ -538,7 +536,7 @@ __forceinline bool rcntUpdate()
return retval;
}
static void _rcntSetGate( int index )
static __forceinline void _rcntSetGate( int index )
{
if (counters[index].mode.EnableGate)
{
@ -563,7 +561,7 @@ static void _rcntSetGate( int index )
}
// mode - 0 means hblank source, 8 means vblank source.
void __fastcall rcntStartGate(bool isVblank, u32 sCycle)
__forceinline void rcntStartGate(bool isVblank, u32 sCycle)
{
int i;
@ -624,7 +622,7 @@ void __fastcall rcntStartGate(bool isVblank, u32 sCycle)
}
// mode - 0 means hblank signal, 8 means vblank signal.
void __fastcall rcntEndGate(bool isVblank , u32 sCycle)
__forceinline void rcntEndGate(bool isVblank , u32 sCycle)
{
int i;
@ -665,7 +663,7 @@ void __fastcall rcntEndGate(bool isVblank , u32 sCycle)
// rcntUpdate, since we're being called from there anyway.
}
void __fastcall rcntWmode(int index, u32 value)
__forceinline void rcntWmode(int index, u32 value)
{
if(counters[index].mode.IsCounting) {
if(counters[index].mode.ClockSource != 0x3) {
@ -696,7 +694,7 @@ void __fastcall rcntWmode(int index, u32 value)
_rcntSet( index );
}
void __fastcall rcntWcount(int index, u32 value)
__forceinline void rcntWcount(int index, u32 value)
{
EECNT_LOG("EE Counter[%d] writeCount = %x, oldcount=%x, target=%x", index, value, counters[index].count, counters[index].target );
@ -722,7 +720,7 @@ void __fastcall rcntWcount(int index, u32 value)
_rcntSet( index );
}
void __fastcall rcntWtarget(int index, u32 value)
__forceinline void rcntWtarget(int index, u32 value)
{
EECNT_LOG("EE Counter[%d] writeTarget = %x", index, value);
@ -738,13 +736,13 @@ void __fastcall rcntWtarget(int index, u32 value)
_rcntSet( index );
}
void __fastcall rcntWhold(int index, u32 value)
__forceinline void rcntWhold(int index, u32 value)
{
EECNT_LOG("EE Counter[%d] Hold Write = %x", index, value);
counters[index].hold = value;
}
u32 __fastcall rcntRcount(int index)
__forceinline u32 rcntRcount(int index)
{
u32 ret;
@ -759,7 +757,7 @@ u32 __fastcall rcntRcount(int index)
return ret;
}
u32 __fastcall rcntCycle(int index)
__forceinline u32 rcntCycle(int index)
{
if (counters[index].mode.IsCounting && (counters[index].mode.ClockSource != 0x3))
return counters[index].count + ((cpuRegs.cycle - counters[index].sCycleT) / counters[index].rate);

View File

@ -139,14 +139,14 @@ extern bool rcntUpdate_vSync();
extern bool rcntUpdate();
extern void rcntInit();
extern void __fastcall rcntStartGate(bool mode, u32 sCycle);
extern void __fastcall rcntEndGate(bool mode, u32 sCycle);
extern void __fastcall rcntWcount(int index, u32 value);
extern void __fastcall rcntWmode(int index, u32 value);
extern void __fastcall rcntWtarget(int index, u32 value);
extern void __fastcall rcntWhold(int index, u32 value);
extern u32 __fastcall rcntRcount(int index);
extern u32 __fastcall rcntCycle(int index);
extern void rcntStartGate(bool mode, u32 sCycle);
extern void rcntEndGate(bool mode, u32 sCycle);
extern void rcntWcount(int index, u32 value);
extern void rcntWmode(int index, u32 value);
extern void rcntWtarget(int index, u32 value);
extern void rcntWhold(int index, u32 value);
extern u32 rcntRcount(int index);
extern u32 rcntCycle(int index);
u32 UpdateVSyncRate();
void frameLimitReset();

View File

@ -190,6 +190,8 @@ extern bool SrcLog_GPU( const char* fmt, ... );
#define MEMCARDS_LOG 0&&
#endif
//#define VIFUNPACKDEBUG //enable unpack debugging output
#ifdef VIFUNPACKDEBUG
#define VIFUNPACK_LOG VIF_LOG
#else

View File

@ -16,11 +16,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#ifndef _PCSX2_EXCEPTIONS_H_
#define _PCSX2_EXCEPTIONS_H_
#include <stdexcept>
#include "StringUtils.h"
#pragma once
// This class provides an easy and clean method for ensuring objects are not copyable.
class NoncopyableObject
@ -380,5 +376,3 @@ namespace Exception
{}
};
}
#endif

View File

@ -33,6 +33,7 @@
//////////////////////////////////////////////////////////////////////////////////////////
// Include the STL junk that's actually handy.
#include <stdexcept>
#include <algorithm>
#include <vector>
#include <string>
@ -69,7 +70,9 @@ typedef int BOOL;
#include "zlib/zlib.h"
#include "PS2Etypes.h"
#include "MemcpyFast.h"
#include "StringUtils.h"
#include "Exceptions.h"
////////////////////////////////////////////////////////////////////
// Compiler/OS specific macros and defines -- Begin Section
@ -155,24 +158,3 @@ static __forceinline u32 timeGetTime()
# define __releaseinline __forceinline
#endif
//////////////////////////////////////////////////////////////////////////////////////////
// Emitter Instance Identifiers. If you add a new emitter, do it here also.
// Note: Currently most of the instances map back to 0, since existing dynarec code all
// shares iCore and must therefore all share the same emitter instance.
// (note: these don't really belong here per se, but it's an easy spot to use for now)
enum
{
EmitterId_R5900 = 0,
EmitterId_R3000a = EmitterId_R5900,
EmitterId_VU0micro = EmitterId_R5900,
EmitterId_VU1micro = EmitterId_R5900,
// Cotton's new microVU, which is iCore-free
EmitterId_microVU0,
EmitterId_microVU1,
// Air's eventual IopRec, which will also be iCore-free
EmitterId_R3000air,
EmitterId_Count // must always be last!
};

View File

@ -224,7 +224,7 @@ static __forceinline void _psxTestInterrupts()
}
}
void psxBranchTest()
__releaseinline void psxBranchTest()
{
if( psxTestCycle( psxNextsCounter, psxNextCounter ) )
{

View File

@ -200,7 +200,7 @@ extern R3000Acpu psxRec;
void psxReset();
void psxShutdown();
void psxException(u32 code, u32 step);
void psxBranchTest();
extern void psxBranchTest();
void psxExecuteBios();
void psxMemReset();

View File

@ -106,7 +106,7 @@ void cpuShutdown()
disR5900FreeSyms();
}
__releaseinline void __fastcall cpuException(u32 code, u32 bd)
__releaseinline void cpuException(u32 code, u32 bd)
{
cpuRegs.branch = 0; // Tells the interpreter that an exception occurred during a branch.
bool errLevel2, checkStatus;
@ -244,7 +244,7 @@ void cpuTestMissingHwInts() {
}
// sets a branch test to occur some time from an arbitrary starting point.
__forceinline int __fastcall cpuSetNextBranch( u32 startCycle, s32 delta )
__forceinline void cpuSetNextBranch( u32 startCycle, s32 delta )
{
// typecast the conditional to signed so that things don't blow up
// if startCycle is greater than our next branch cycle.
@ -252,20 +252,18 @@ __forceinline int __fastcall cpuSetNextBranch( u32 startCycle, s32 delta )
if( (int)(g_nextBranchCycle - startCycle) > delta )
{
g_nextBranchCycle = startCycle + delta;
return 1;
}
return 0;
}
// sets a branch to occur some time from the current cycle
__forceinline int __fastcall cpuSetNextBranchDelta( s32 delta )
__forceinline void cpuSetNextBranchDelta( s32 delta )
{
return cpuSetNextBranch( cpuRegs.cycle, delta );
cpuSetNextBranch( cpuRegs.cycle, delta );
}
// tests the cpu cycle against the given start and delta values.
// Returns true if the delta time has passed.
__forceinline int __fastcall cpuTestCycle( u32 startCycle, s32 delta )
__forceinline int cpuTestCycle( u32 startCycle, s32 delta )
{
// typecast the conditional to signed so that things don't explode
// if the startCycle is ahead of our current cpu cycle.
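Why the signed casts in cpuSetNextBranch/cpuTestCycle survive counter wrap, as a standalone sketch: subtracting two wrapped u32 cycle values and casting to s32 recovers the true small delta, where a raw unsigned compare would not:

#include <cassert>
#include <cstdint>

bool cycleElapsed(uint32_t startCycle, uint32_t current, int32_t delta)
{
    return static_cast<int32_t>(current - startCycle) >= delta;
}

int main()
{
    // start just below the wrap, current just past it: 0x20 cycles elapsed
    assert(cycleElapsed(0xFFFFFFF0u, 0x00000010u, 0x20));
    // startCycle ahead of the current cycle: not elapsed yet
    assert(!cycleElapsed(0x00000010u, 0xFFFFFFF0u, 0x20));
}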
@ -279,7 +277,7 @@ __forceinline void cpuSetBranch()
g_nextBranchCycle = cpuRegs.cycle;
}
void cpuClearInt( uint i )
__forceinline void cpuClearInt( uint i )
{
jASSUME( i < 32 );
cpuRegs.interrupt &= ~(1 << i);

View File

@ -257,14 +257,14 @@ extern void cpuInit();
extern void cpuReset(); // can throw Exception::FileNotFound.
extern void cpuShutdown();
extern void cpuExecuteBios();
extern void __fastcall cpuException(u32 code, u32 bd);
extern void cpuException(u32 code, u32 bd);
extern void cpuTlbMissR(u32 addr, u32 bd);
extern void cpuTlbMissW(u32 addr, u32 bd);
extern void cpuTestHwInts();
extern int __fastcall cpuSetNextBranch( u32 startCycle, s32 delta );
extern int __fastcall cpuSetNextBranchDelta( s32 delta );
extern int __fastcall cpuTestCycle( u32 startCycle, s32 delta );
extern void cpuSetNextBranch( u32 startCycle, s32 delta );
extern void cpuSetNextBranchDelta( s32 delta );
extern int cpuTestCycle( u32 startCycle, s32 delta );
extern void cpuSetBranch();
extern bool _cpuBranchTest_Shared(); // for internal use by the Dynarecs and Ints inside R5900:

View File

@ -18,8 +18,6 @@
#ifndef _R5900_OPCODETABLES_H
#define _R5900_OPCODETABLES_H
#include <string>
#include "PS2Etypes.h"
// TODO : Move these into the OpcodeTables namespace

View File

@ -451,16 +451,16 @@ void SIO_CommandWrite(u8 value,int way) {
break;
case 0x21:
// Set pad slot.
sio.mtapst = 0x21;
sio.mtapst = value;
sio.bufcount = 6; // No idea why this is 6, saved from old code.
break;
case 0x22:
// Set memcard slot.
sio.mtapst = 0x22;
sio.mtapst = value;
sio.bufcount = 6; // No idea why this is 6, saved from old code.
break;
}
// Commented out values are from original code. Break multitap in bios.
// Commented out values are from original code. They break multitap in bios.
sio.buf[sio.bufcount-1]=0;//'+';
sio.buf[sio.bufcount]=0;//'Z';
return;
@ -554,6 +554,7 @@ void InitializeSIO(u8 value)
int port = sio.GetMultitapPort();
if (!IsMtapPresent(port))
{
// If "unplug" multitap mid game, set active slots to 0.
sio.activePadSlot[port] = 0;
sio.activeMemcardSlot[port] = 0;
}

View File

@ -20,9 +20,9 @@
#define __SYSTEM_H__
#include "PS2Etypes.h"
#include "Paths.h"
#include "Pcsx2Config.h"
#include "Exceptions.h"
#include "Paths.h"
#include "MemcpyFast.h"
#include "SafeArray.h"
#include "Misc.h"

View File

@ -2508,13 +2508,23 @@ void _vuRegsMTIR(VURegs * VU, _VURegsNum *VUregsn) {
VUregsn->pipe = VUPIPE_FMAC;
VUregsn->VFwrite = 0;
VUregsn->VFread0 = _Fs_;
VUregsn->VFr0xyzw= _XYZW;
VUregsn->VFr0xyzw= 1 << (3-_Fsf_);
VUregsn->VFread1 = 0;
VUregsn->VIwrite = 1 << _Ft_;
VUregsn->VIread = GET_VF0_FLAG(_Fs_);
}
VUREGS_FTFS(MR32);
void _vuRegsMR32(VURegs * VU, _VURegsNum *VUregsn) {
VUregsn->pipe = VUPIPE_FMAC;
VUregsn->VFwrite = _Ft_;
VUregsn->VFwxyzw = _XYZW;
VUregsn->VFread0 = _Fs_;
VUregsn->VFr0xyzw= (_XYZW >> 1) | ((_XYZW << 3) & 0xf); //rotate
VUregsn->VFread1 = 0;
VUregsn->VFr1xyzw = 0xff;
VUregsn->VIwrite = 0;
VUregsn->VIread = (_Ft_ ? GET_VF0_FLAG(_Fs_) : 0);
}
void _vuRegsLQ(VURegs * VU, _VURegsNum *VUregsn) {
VUregsn->pipe = VUPIPE_FMAC;

View File

@ -25,10 +25,10 @@
#include "Vif.h"
#include "VifDma.h"
VIFregisters *_vifRegs;
u32* _vifRow = NULL, *_vifCol = NULL;
u32* _vifMaskRegs = NULL;
vifStruct *_vif;
VIFregisters *vifRegs;
u32* vifRow = NULL, *vifCol = NULL;
u32* vifMaskRegs = NULL;
vifStruct *vif;
PCSX2_ALIGNED16(u32 g_vifRow0[4]);
PCSX2_ALIGNED16(u32 g_vifCol0[4]);
@ -44,35 +44,37 @@ enum UnpackOffset
OFFSET_X = 0,
OFFSET_Y = 1,
OFFSET_Z = 2,
OFFSET_W =3
OFFSET_W = 3
};
#define spr0 ((DMACh*)&PS2MEM_HW[0xD000])
__forceinline static int _limit(int a, int max)
{
return (a > max) ? max : a;
}
static __releaseinline void writeX(u32 &dest, u32 data)
static __releaseinline void writeXYZW(u32 offnum, u32 &dest, u32 data)
{
int n;
u32 vifRowReg = getVifRowRegs(offnum);
if (_vifRegs->code & 0x10000000)
if (vifRegs->code & 0x10000000)
{
switch (_vif->cl)
switch (vif->cl)
{
case 0:
n = (_vifRegs->mask) & 0x3;
if (offnum == OFFSET_X)
n = (vifRegs->mask) & 0x3;
else
n = (vifRegs->mask >> (offnum * 2)) & 0x3;
break;
case 1:
n = (_vifRegs->mask >> 8) & 0x3;
n = (vifRegs->mask >> ( 8 + (offnum * 2))) & 0x3;
break;
case 2:
n = (_vifRegs->mask >> 16) & 0x3;
n = (vifRegs->mask >> (16 + (offnum * 2))) & 0x3;
break;
default:
n = (_vifRegs->mask >> 24) & 0x3;
n = (vifRegs->mask >> (24 + (offnum * 2))) & 0x3;
break;
}
}
@ -81,355 +83,144 @@ static __releaseinline void writeX(u32 &dest, u32 data)
switch (n)
{
case 0:
if ((_vif->cmd & 0x6F) == 0x6f)
if ((vif->cmd & 0x6F) == 0x6f)
{
dest = data;
}
else if (_vifRegs->mode == 1)
{
dest = data + _vifRegs->r0;
}
else if (_vifRegs->mode == 2)
{
_vifRegs->r0 += data;
dest = _vifRegs->r0;
}
else
else switch (vifRegs->mode)
{
case 1:
dest = data + vifRowReg;
break;
case 2:
// vifRowReg isn't used after this, or I would make it equal to dest here.
dest = setVifRowRegs(offnum, vifRowReg + data);
break;
default:
dest = data;
break;
}
break;
case 1:
dest = _vifRegs->r0;
dest = vifRowReg;
break;
case 2:
switch (_vif->cl)
{
case 0:
dest = _vifRegs->c0;
dest = getVifColRegs((vif->cl > 2) ? 3 : vif->cl);
break;
case 1:
dest = _vifRegs->c1;
break;
case 2:
dest = _vifRegs->c2;
break;
default:
dest = _vifRegs->c3;
case 3:
break;
}
break;
}
// VIF_LOG("writeX %8.8x : Mode %d, r0 = %x, data %8.8x", *dest,_vifRegs->mode,_vifRegs->r0,data);
}
static __releaseinline void writeY(u32 &dest, u32 data)
{
int n;
if (_vifRegs->code & 0x10000000)
{
switch (_vif->cl)
{
case 0:
n = (_vifRegs->mask >> 2) & 0x3;
break;
case 1:
n = (_vifRegs->mask >> 10) & 0x3;
break;
case 2:
n = (_vifRegs->mask >> 18) & 0x3;
break;
default:
n = (_vifRegs->mask >> 26) & 0x3;
break;
}
}
else n = 0;
switch (n)
{
case 0:
if ((_vif->cmd & 0x6F) == 0x6f)
{
dest = data;
}
else if (_vifRegs->mode == 1)
{
dest = data + _vifRegs->r1;
}
else if (_vifRegs->mode == 2)
{
_vifRegs->r1 += data;
dest = _vifRegs->r1;
}
else
{
dest = data;
}
break;
case 1:
dest = _vifRegs->r1;
break;
case 2:
switch (_vif->cl)
{
case 0:
dest = _vifRegs->c0;
break;
case 1:
dest = _vifRegs->c1;
break;
case 2:
dest = _vifRegs->c2;
break;
default:
dest = _vifRegs->c3;
break;
}
break;
}
// VIF_LOG("writeY %8.8x : Mode %d, r1 = %x, data %8.8x", *dest,_vifRegs->mode,_vifRegs->r1,data);
}
static __releaseinline void writeZ(u32 &dest, u32 data)
{
int n;
if (_vifRegs->code & 0x10000000)
{
switch (_vif->cl)
{
case 0:
n = (_vifRegs->mask >> 4) & 0x3;
break;
case 1:
n = (_vifRegs->mask >> 12) & 0x3;
break;
case 2:
n = (_vifRegs->mask >> 20) & 0x3;
break;
default:
n = (_vifRegs->mask >> 28) & 0x3;
break;
}
}
else n = 0;
switch (n)
{
case 0:
if ((_vif->cmd & 0x6F) == 0x6f)
{
dest = data;
}
else if (_vifRegs->mode == 1)
{
dest = data + _vifRegs->r2;
}
else if (_vifRegs->mode == 2)
{
_vifRegs->r2 += data;
dest = _vifRegs->r2;
}
else
{
dest = data;
}
break;
case 1:
dest = _vifRegs->r2;
break;
case 2:
switch (_vif->cl)
{
case 0:
dest = _vifRegs->c0;
break;
case 1:
dest = _vifRegs->c1;
break;
case 2:
dest = _vifRegs->c2;
break;
default:
dest = _vifRegs->c3;
break;
}
break;
}
// VIF_LOG("writeZ %8.8x : Mode %d, r2 = %x, data %8.8x", *dest,_vifRegs->mode,_vifRegs->r2,data);
}
static __releaseinline void writeW(u32 &dest, u32 data)
{
int n;
if (_vifRegs->code & 0x10000000)
{
switch (_vif->cl)
{
case 0:
n = (_vifRegs->mask >> 6) & 0x3;
break;
case 1:
n = (_vifRegs->mask >> 14) & 0x3;
break;
case 2:
n = (_vifRegs->mask >> 22) & 0x3;
break;
default:
n = (_vifRegs->mask >> 30) & 0x3;
break;
}
}
else n = 0;
switch (n)
{
case 0:
if ((_vif->cmd & 0x6F) == 0x6f)
{
dest = data;
}
else if (_vifRegs->mode == 1)
{
dest = data + _vifRegs->r3;
}
else if (_vifRegs->mode == 2)
{
_vifRegs->r3 += data;
dest = _vifRegs->r3;
}
else
{
dest = data;
}
break;
case 1:
dest = _vifRegs->r3;
break;
case 2:
switch (_vif->cl)
{
case 0:
dest = _vifRegs->c0;
break;
case 1:
dest = _vifRegs->c1;
break;
case 2:
dest = _vifRegs->c2;
break;
default:
dest = _vifRegs->c3;
break;
}
break;
}
// VIF_LOG("writeW %8.8x : Mode %d, r3 = %x, data %8.8x", *dest,_vifRegs->mode,_vifRegs->r3,data);
}
template <class T>
static void _UNPACKpart(u32 offnum, u32 &x, T y)
{
if (_vifRegs->offset == offnum)
{
switch (offnum)
{
case OFFSET_X:
writeX(x,y);
break;
case OFFSET_Y:
writeY(x,y);
break;
case OFFSET_Z:
writeZ(x,y);
break;
case OFFSET_W:
writeW(x,y);
break;
default:
break;
}
_vifRegs->offset++;
}
}
template <class T>
static void _UNPACKpart(u32 offnum, u32 &x, T y, int &size)
{
if (_vifRegs->offset == offnum)
{
switch (offnum)
{
case OFFSET_X:
writeX(x,y);
break;
case OFFSET_Y:
writeY(x,y);
break;
case OFFSET_Z:
writeZ(x,y);
break;
case OFFSET_W:
writeW(x,y);
break;
default:
break;
}
size--;
_vifRegs->offset++;
}
// VIF_LOG("writeX %8.8x : Mode %d, r0 = %x, data %8.8x", *dest,vifRegs->mode,vifRegs->r0,data);
}
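How the unified writeXYZW indexes the mask, as a standalone sketch: MASK holds one 2-bit code per vector element, eight bits per cycle row, and rows past CL 3 reuse row 3, which is exactly the (cl, offnum) arithmetic in the switch above. Codes: 0 = write unpacked data (subject to mode), 1 = row register, 2 = column register, 3 = write-protected.

#include <algorithm>
#include <cstdint>

uint32_t maskCode(uint32_t mask, uint32_t cl, uint32_t offnum)
{
    const uint32_t row = std::min(cl, 3u); // CL 0, 1, 2; 3 and up share a row
    return (mask >> (row * 8 + offnum * 2)) & 3;
}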
template <class T>
void __fastcall UNPACK_S(u32 *dest, T *data, int size)
{
_UNPACKpart(OFFSET_X, *dest++, *data, size);
_UNPACKpart(OFFSET_Y, *dest++, *data, size);
_UNPACKpart(OFFSET_Z, *dest++, *data, size);
_UNPACKpart(OFFSET_W, *dest , *data, size);
if (_vifRegs->offset == 4) _vifRegs->offset = 0;
//S-# will always be a complete packet, no matter what. So we can skip the offset bits
writeXYZW(OFFSET_X, *dest++, *data);
writeXYZW(OFFSET_Y, *dest++, *data);
writeXYZW(OFFSET_Z, *dest++, *data);
writeXYZW(OFFSET_W, *dest , *data);
}
template <class T>
void __fastcall UNPACK_V2(u32 *dest, T *data, int size)
{
_UNPACKpart(OFFSET_X, *dest++, *data++, size);
_UNPACKpart(OFFSET_Y, *dest++, *data--, size);
_UNPACKpart(OFFSET_Z, *dest++, *data++);
_UNPACKpart(OFFSET_W, *dest , *data);
if (_vifRegs->offset == 4) _vifRegs->offset = 0;
if (vifRegs->offset == OFFSET_X)
{
if (size > 0)
{
writeXYZW(vifRegs->offset, *dest++, *data++);
vifRegs->offset = OFFSET_Y;
size--;
}
}
if (vifRegs->offset == OFFSET_Y)
{
if (size > 0)
{
writeXYZW(vifRegs->offset, *dest++, *data);
vifRegs->offset = OFFSET_Z;
size--;
}
}
if (vifRegs->offset == OFFSET_Z)
{
writeXYZW(vifRegs->offset, *dest++, *dest-2);
vifRegs->offset = OFFSET_W;
}
if (vifRegs->offset == OFFSET_W)
{
writeXYZW(vifRegs->offset, *dest, *data);
vifRegs->offset = OFFSET_X;
}
}
template <class T>
void __fastcall UNPACK_V3(u32 *dest, T *data, int size)
{
_UNPACKpart(OFFSET_X, *dest++, *data++, size);
_UNPACKpart(OFFSET_Y, *dest++, *data++, size);
_UNPACKpart(OFFSET_Z, *dest++, *data++, size);
_UNPACKpart(OFFSET_W, *dest, *data);
if (_vifRegs->offset == 4) _vifRegs->offset = 0;
if(vifRegs->offset == OFFSET_X)
{
if (size > 0)
{
writeXYZW(vifRegs->offset, *dest++, *data++);
vifRegs->offset = OFFSET_Y;
size--;
}
}
if(vifRegs->offset == OFFSET_Y)
{
if (size > 0)
{
writeXYZW(vifRegs->offset, *dest++, *data++);
vifRegs->offset = OFFSET_Z;
size--;
}
}
if(vifRegs->offset == OFFSET_Z)
{
if (size > 0)
{
writeXYZW(vifRegs->offset, *dest++, *data++);
vifRegs->offset = OFFSET_W;
size--;
}
}
if(vifRegs->offset == OFFSET_W)
{
//V3-# does some bizarre thing with alignment: every 6 qw of data, the W becomes 0 (strange console!)
//Ape Escape doesn't seem to like it though (what the hell?), gonna have to investigate
writeXYZW(vifRegs->offset, *dest, *data);
vifRegs->offset = OFFSET_X;
}
}
template <class T>
void __fastcall UNPACK_V4(u32 *dest, T *data , int size)
{
_UNPACKpart(OFFSET_X, *dest++, *data++, size);
_UNPACKpart(OFFSET_Y, *dest++, *data++, size);
_UNPACKpart(OFFSET_Z, *dest++, *data++, size);
_UNPACKpart(OFFSET_W, *dest , *data, size);
if (_vifRegs->offset == 4) _vifRegs->offset = 0;
while (size > 0)
{
writeXYZW(vifRegs->offset, *dest++, *data++);
vifRegs->offset++;
size--;
}
if (vifRegs->offset > OFFSET_W) vifRegs->offset = OFFSET_X;
}
void __fastcall UNPACK_V4_5(u32 *dest, u32 *data, int size)
{
_UNPACKpart(OFFSET_X, *dest++, ((*data & 0x001f) << 3), size);
_UNPACKpart(OFFSET_Y, *dest++, ((*data & 0x03e0) >> 2), size);
_UNPACKpart(OFFSET_Z, *dest++, ((*data & 0x7c00) >> 7), size);
_UNPACKpart(OFFSET_W, *dest, ((*data & 0x8000) >> 8), size);
if (_vifRegs->offset == 4) _vifRegs->offset = 0;
//As with S-#, this will always be a complete packet
writeXYZW(OFFSET_X, *dest++, ((*data & 0x001f) << 3));
writeXYZW(OFFSET_Y, *dest++, ((*data & 0x03e0) >> 2));
writeXYZW(OFFSET_Z, *dest++, ((*data & 0x7c00) >> 7));
writeXYZW(OFFSET_W, *dest, ((*data & 0x8000) >> 8));
}
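A worked expansion of the V4-5 shifts above: each 5-bit channel of the RGBA5551 word lands in bits 3-7 of its output element and the alpha bit in bit 7, so data = 0xFFFF unpacks to R = G = B = 0xF8 and A = 0x80:

#include <cstdint>

void unpackV4_5(uint32_t d, uint32_t out[4])
{
    out[0] = (d & 0x001f) << 3; // R: bits 0-4   -> 0x00..0xF8
    out[1] = (d & 0x03e0) >> 2; // G: bits 5-9   -> 0x00..0xF8
    out[2] = (d & 0x7c00) >> 7; // B: bits 10-14 -> 0x00..0xF8
    out[3] = (d & 0x8000) >> 8; // A: bit 15     -> 0x00 or 0x80
}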
void __fastcall UNPACK_S_32(u32 *dest, u32 *data, int size)
@ -599,7 +390,7 @@ static __forceinline int mfifoVIF1rbTransfer()
return ret;
}
static __forceinline int mfifoVIF1chain()
static __forceinline int mfifo_VIF1chain()
{
int ret;
@ -739,7 +530,7 @@ void vifMFIFOInterrupt()
{
g_vifCycles = 0;
if (vif1.inprogress == 1) mfifoVIF1chain();
if (vif1.inprogress == 1) mfifo_VIF1chain();
if (vif1.irq && vif1.tag.size == 0)
{

View File

@ -24,6 +24,7 @@ struct vifCycle {
u8 pad[2];
};
// r0-r3 and c0-c3 would be more manageable as arrays.
struct VIFregisters {
u32 stat;
u32 pad0[3];
@ -80,14 +81,97 @@ struct VIFregisters {
extern "C"
{
// these use cdecl for Asm code references.
extern VIFregisters *_vifRegs;
extern u32* _vifMaskRegs;
extern u32* _vifRow;
extern VIFregisters *vifRegs;
extern u32* vifMaskRegs;
extern u32* vifRow;
extern u32* _vifCol;
}
static __forceinline u32 setVifRowRegs(u32 reg, u32 data)
{
switch (reg)
{
case 0:
vifRegs->r0 = data;
break;
case 1:
vifRegs->r1 = data;
break;
case 2:
vifRegs->r2 = data;
break;
case 3:
vifRegs->r3 = data;
break;
jNO_DEFAULT;
}
return data;
}
static __forceinline u32 getVifRowRegs(u32 reg)
{
switch (reg)
{
case 0:
return vifRegs->r0;
break;
case 1:
return vifRegs->r1;
break;
case 2:
return vifRegs->r2;
break;
case 3:
return vifRegs->r3;
break;
jNO_DEFAULT;
}
}
static __forceinline u32 setVifColRegs(u32 reg, u32 data)
{
switch (reg)
{
case 0:
vifRegs->c0 = data;
break;
case 1:
vifRegs->c1 = data;
break;
case 2:
vifRegs->c2 = data;
break;
case 3:
vifRegs->c3 = data;
break;
jNO_DEFAULT;
}
return data;
}
static __forceinline u32 getVifColRegs(u32 reg)
{
switch (reg)
{
case 0:
return vifRegs->c0;
break;
case 1:
return vifRegs->c1;
break;
case 2:
return vifRegs->c2;
break;
case 3:
return vifRegs->c3;
break;
jNO_DEFAULT;
}
}
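The array form the "more manageable as arrays" comment gestures at, sketched with hypothetical types (this is not the current VIFregisters layout). Each register is memory-mapped with 12 bytes of padding after it (the SSE asm elsewhere in this commit reads r0 at +0x100 and r1 at +0x110), so the element type has to carry that padding:

#include <cstdint>

struct VifPaddedReg { uint32_t val; uint32_t pad[3]; };

static inline uint32_t getRowReg(const VifPaddedReg rows[4], uint32_t reg)
{
    return rows[reg & 3].val; // replaces the four-way switch above
}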
#define vif0Regs ((VIFregisters*)&PS2MEM_HW[0x3800])
#define vif1Regs ((VIFregisters*)&PS2MEM_HW[0x3c00])
#define spr0 ((DMACh*)&PS2MEM_HW[0xD000])
void dmaVIF0();
void dmaVIF1();

View File

@ -29,7 +29,7 @@
using namespace std; // for min / max
//#define VIFUNPACKDEBUG //enable unpack debugging output
#define gif ((DMACh*)&PS2MEM_HW[0xA000])
@ -37,10 +37,10 @@ using namespace std; // for min / max
extern "C"
{
// Need cdecl on these for ASM references.
extern VIFregisters *_vifRegs;
extern u32* _vifMaskRegs;
extern u32* _vifRow;
extern u32* _vifCol;
extern VIFregisters *vifRegs;
extern u32* vifMaskRegs;
extern u32* vifRow;
extern u32* vifCol;
}
PCSX2_ALIGNED16_EXTERN(u32 g_vifRow0[4]);
@ -48,7 +48,7 @@ PCSX2_ALIGNED16_EXTERN(u32 g_vifCol0[4]);
PCSX2_ALIGNED16_EXTERN(u32 g_vifRow1[4]);
PCSX2_ALIGNED16_EXTERN(u32 g_vifCol1[4]);
extern vifStruct *_vif;
extern vifStruct *vif;
vifStruct vif0, vif1;
@ -254,57 +254,45 @@ __forceinline static int _limit(int a, int max)
static void ProcessMemSkip(int size, unsigned int unpackType, const unsigned int VIFdmanum)
{
const VIFUnpackFuncTable *unpack;
vifStruct *vif;
VIFregisters *vifRegs;
unpack = &VIFfuncTable[ unpackType ];
if (VIFdmanum == 0)
{
vif = &vif0;
vifRegs = vif0Regs;
}
else
{
vif = &vif1;
vifRegs = vif1Regs;
}
unpack = &VIFfuncTable[ unpackType ];
switch (unpackType)
{
case 0x0:
vif->tag.addr += size * 4;
vif->tag.addr += (size / unpack->gsize) * 16;
VIFUNPACK_LOG("Processing S-32 skip, size = %d", size);
break;
case 0x1:
vif->tag.addr += size * 8;
vif->tag.addr += (size / unpack->gsize) * 16;
VIFUNPACK_LOG("Processing S-16 skip, size = %d", size);
break;
case 0x2:
vif->tag.addr += size * 16;
vif->tag.addr += (size / unpack->gsize) * 16;
VIFUNPACK_LOG("Processing S-8 skip, size = %d", size);
break;
case 0x4:
vif->tag.addr += size + ((size / unpack->gsize) * 8);
vif->tag.addr += (size / unpack->gsize) * 16;
VIFUNPACK_LOG("Processing V2-32 skip, size = %d", size);
break;
case 0x5:
vif->tag.addr += (size * 2) + ((size / unpack->gsize) * 8);
vif->tag.addr += (size / unpack->gsize) * 16;
VIFUNPACK_LOG("Processing V2-16 skip, size = %d", size);
break;
case 0x6:
vif->tag.addr += (size * 4) + ((size / unpack->gsize) * 8);
vif->tag.addr += (size / unpack->gsize) * 16;
VIFUNPACK_LOG("Processing V2-8 skip, size = %d", size);
break;
case 0x8:
vif->tag.addr += size + ((size / unpack->gsize) * 4);
vif->tag.addr += (size / unpack->gsize) * 16;
VIFUNPACK_LOG("Processing V3-32 skip, size = %d", size);
break;
case 0x9:
vif->tag.addr += (size * 2) + ((size / unpack->gsize) * 4);
vif->tag.addr += (size / unpack->gsize) * 16;
VIFUNPACK_LOG("Processing V3-16 skip, size = %d", size);
break;
case 0xA:
vif->tag.addr += (size * 4) + ((size / unpack->gsize) * 4);
vif->tag.addr += (size / unpack->gsize) * 16;
VIFUNPACK_LOG("Processing V3-8 skip, size = %d", size);
break;
case 0xC:
@ -312,15 +300,15 @@ static void ProcessMemSkip(int size, unsigned int unpackType, const unsigned int
VIFUNPACK_LOG("Processing V4-32 skip, size = %d, CL = %d, WL = %d", size, vif1Regs->cycle.cl, vif1Regs->cycle.wl);
break;
case 0xD:
vif->tag.addr += size * 2;
vif->tag.addr += (size / unpack->gsize) * 16;
VIFUNPACK_LOG("Processing V4-16 skip, size = %d", size);
break;
case 0xE:
vif->tag.addr += size * 4;
vif->tag.addr += (size / unpack->gsize) * 16;
VIFUNPACK_LOG("Processing V4-8 skip, size = %d", size);
break;
case 0xF:
vif->tag.addr += size * 8;
vif->tag.addr += (size / unpack->gsize) * 16;
VIFUNPACK_LOG("Processing V4-5 skip, size = %d", size);
break;
default:
@ -328,87 +316,59 @@ static void ProcessMemSkip(int size, unsigned int unpackType, const unsigned int
break;
}
if ((vif->tag.addr & 0xf) == unpack->gsize)
//Append any skips into the equation
if (vifRegs->cycle.cl > vifRegs->cycle.wl)
{
vif->tag.addr += 16 - unpack->gsize;
VIFUNPACK_LOG("Old addr %x CL %x WL %x", vif->tag.addr, vifRegs->cycle.cl, vifRegs->cycle.wl);
vif->tag.addr += (size / (unpack->gsize*vifRegs->cycle.wl)) * ((vifRegs->cycle.cl - vifRegs->cycle.wl)*16);
VIFUNPACK_LOG("New addr %x CL %x WL %x", vif->tag.addr, vifRegs->cycle.cl, vifRegs->cycle.wl);
}
//This is sorted out later
if((vif->tag.addr & 0xf) != (vifRegs->offset * 4))
{
VIFUNPACK_LOG("addr aligned to %x", vif->tag.addr);
vif->tag.addr = (vif->tag.addr & ~0xf) + (vifRegs->offset * 4);
}
}
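Worked numbers for the skip-write advance above (invented values): take V2-32, where one write consumes gsize = 8 bytes, with size = 64, CL = 4, WL = 2. The base advance is (64 / 8) * 16 = 128 bytes (one quadword per write), and the skip term adds (64 / (8 * 2)) * ((4 - 2) * 16) = 128 bytes, because after every WL writes the address jumps CL - WL quadwords:

#include <cstdint>

uint32_t skipAdvance(uint32_t size, uint32_t gsize, uint32_t cl, uint32_t wl)
{
    uint32_t adv = (size / gsize) * 16;                  // quadword per write
    if (cl > wl)
        adv += (size / (gsize * wl)) * ((cl - wl) * 16); // skipped quadwords
    return adv;
}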
static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdmanum)
static int VIFalign(u32 *data, vifCode *v, int size, const unsigned int VIFdmanum)
{
u32 *dest;
u32 unpackType;
UNPACKFUNCTYPE func;
const VIFUnpackFuncTable *ft;
vifStruct *vif;
VIFregisters *vifRegs;
VURegs * VU;
u8 *cdata = (u8*)data;
#ifdef _DEBUG
u32 memsize = VIFdmanum ? 0x4000 : 0x1000;
#endif
_mm_prefetch((char*)data, _MM_HINT_NTA);
if (VIFdmanum == 0)
{
VU = &VU0;
vif = &vif0;
vifRegs = vif0Regs;
assert(v->addr < memsize);
}
else
{
VU = &VU1;
vif = &vif1;
vifRegs = vif1Regs;
assert(v->addr < memsize);
if (vu1MicroIsSkipping())
{
// don't process since the frame is dummy
vif->tag.addr += (size / (VIFfuncTable[ vif->cmd & 0xf ].gsize * vifRegs->cycle.wl)) * ((vifRegs->cycle.cl - vifRegs->cycle.wl) * 16);
return;
}
}
dest = (u32*)(VU->Mem + v->addr);
VIF_LOG("VIF%d UNPACK: Mode=%x, v->size=%d, size=%d, v->addr=%x",
VIFdmanum, v->cmd & 0xf, v->size, size, v->addr);
#ifdef _DEBUG
if (v->size != size)
{
VIF_LOG("*PCSX2*: warning v->size != size");
}
if ((v->addr + size*4) > memsize)
{
Console::Notice("*PCSX2*: fixme unpack overflow");
Console::WriteLn("VIF%d UNPACK: Mode=%x, v->size=%d, size=%d, v->addr=%x",
params VIFdmanum, v->cmd & 0xf, v->size, size, v->addr);
}
#endif
VIF_LOG("VIF%d UNPACK Align: Mode=%x, v->size=%d, size=%d, v->addr=%x v->num=%x",
VIFdmanum, v->cmd & 0xf, v->size, size, v->addr, vifRegs->num);
// The unpack type
unpackType = v->cmd & 0xf;
if (size == 0)
{
VIFUNPACK_LOG("*PCSX2*: Unpack %x with size 0!! v->size = %d cl = %d, wl = %d, mode %d mask %x", v->cmd, v->size, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mode, vifRegs->mask);
}
_mm_prefetch((char*)data + 128, _MM_HINT_NTA);
_vifRegs = (VIFregisters*)vifRegs;
_vifMaskRegs = VIFdmanum ? g_vif1Masks : g_vif0Masks;
_vif = vif;
_vifRow = VIFdmanum ? g_vifRow1 : g_vifRow0;
ft = &VIFfuncTable[ unpackType ];
func = _vif->usn ? ft->funcU : ft->funcS;
func = vif->usn ? ft->funcU : ft->funcS;
size <<= 2;
@ -416,56 +376,64 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
memsize = size;
#endif
if (_vifRegs->offset > 0)
if(vif1Regs->offset != 0)
{
int destinc, unpacksize;
int unpacksize;
//This is just to make sure the alignment isn't loopy on a split packet
if(vifRegs->offset != ((vif->tag.addr & 0xf) >> 2))
{
DevCon::Error("Warning: Unpack alignment error");
}
VIFUNPACK_LOG("Aligning packet size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr);
// SSE doesn't handle such small data
if (v->size != (size >> 2))
ProcessMemSkip(size, unpackType, VIFdmanum);
if(((size / ft->dsize) + vifRegs->offset) < (u32)ft->qsize)
VIFUNPACK_LOG("Warning! Size needed to align %x size chunks available %x offset %x", ft->qsize - ((size / ft->dsize) + vifRegs->offset), vifRegs->offset);
if (vifRegs->offset < (u32)ft->qsize)
{
if (((u32)size / (u32)ft->dsize) < ((u32)ft->qsize - vifRegs->offset))
{
Console::WriteLn("Wasn't enough left size/dsize = %x left to write %x", params(size / ft->dsize), (ft->qsize - vifRegs->offset));
DevCon::Error("Wasn't enough left size/dsize = %x left to write %x", params(size / ft->dsize), (ft->qsize - vifRegs->offset));
}
unpacksize = min(((u32)size / (u32)ft->dsize), ((u32)ft->qsize - vifRegs->offset));
}
else
{
unpacksize = 0;
Console::WriteLn("Unpack align offset = 0");
}
destinc = (4 - ft->qsize) + unpacksize;
VIFUNPACK_LOG("Increasing dest by %x from offset %x", (4 - ft->qsize) + unpacksize, vifRegs->offset);
func(dest, (u32*)cdata, unpacksize);
size -= unpacksize * ft->dsize;
cdata += unpacksize * ft->dsize;
vifRegs->num--;
++vif->cl;
if (vif->cl == vifRegs->cycle.wl)
{
if (vifRegs->cycle.cl != vifRegs->cycle.wl)
dest += ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + destinc;
else
dest += destinc;
vif->cl = 0;
{
vif->tag.addr += (((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + ((4 - ft->qsize) + unpacksize)) * 4;
//dest += ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + destinc;
}
else
{
dest += destinc;
vif->tag.addr += ((4 - ft->qsize) + unpacksize) * 4;
//dest += destinc;
}
vif->cl = 0;
VIFUNPACK_LOG("Aligning packet done size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr);
return size >> 2;
}
else if (v->size != (size >> 2))
ProcessMemSkip(size, unpackType, VIFdmanum);
else
{
vif->tag.addr += ((4 - ft->qsize) + unpacksize) * 4;
dest += (4 - ft->qsize) + unpacksize;
cdata += unpacksize * ft->dsize;
VIFUNPACK_LOG("Aligning packet done size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr);
}
}
if (vifRegs->cycle.cl >= vifRegs->cycle.wl) // skipping write
if (vif->cl != 0) //Check alignment for SSE unpacks
{
#ifdef _DEBUG
@ -474,7 +442,7 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
int incdest;
if (vif->cl != 0)
if (vifRegs->cycle.cl >= vifRegs->cycle.wl) // skipping write
{
// continuation from last stream
@ -491,22 +459,109 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
if (vif->cl == vifRegs->cycle.wl)
{
dest += incdest;
vif->tag.addr += incdest * 4;
vif->cl = 0;
break;
}
dest += 4;
vif->tag.addr += 16;
}
// have to update
_vifRow[0] = _vifRegs->r0;
_vifRow[1] = _vifRegs->r1;
_vifRow[2] = _vifRegs->r2;
_vifRow[3] = _vifRegs->r3;
if(vifRegs->mode == 2)
{
//Update the reg rows for SSE
vifRow = VIFdmanum ? g_vifRow1 : g_vifRow0;
vifRow[0] = vifRegs->r0;
vifRow[1] = vifRegs->r1;
vifRow[2] = vifRegs->r2;
vifRow[3] = vifRegs->r3;
}
if ((size >= ft->gsize) && !(v->addr&0xf))
}
}
return size>>2;
}
static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdmanum)
{
u32 *dest;
u32 unpackType;
UNPACKFUNCTYPE func;
const VIFUnpackFuncTable *ft;
VURegs * VU;
u8 *cdata = (u8*)data;
#ifdef _DEBUG
u32 memsize = VIFdmanum ? 0x4000 : 0x1000;
#endif
_mm_prefetch((char*)data, _MM_HINT_NTA);
if (VIFdmanum == 0)
{
VU = &VU0;
//vifRegs = vif0Regs;
assert(v->addr < memsize);
}
else
{
VU = &VU1;
//vifRegs = vif1Regs;
assert(v->addr < memsize);
if (vu1MicroIsSkipping())
{
// don't process since the frame is dummy
vif->tag.addr += (size / (VIFfuncTable[ vif->cmd & 0xf ].gsize * vifRegs->cycle.wl)) * ((vifRegs->cycle.cl - vifRegs->cycle.wl) * 16);
return;
}
}
dest = (u32*)(VU->Mem + v->addr);
VIF_LOG("VIF%d UNPACK: Mode=%x, v->size=%d, size=%d, v->addr=%x v->num=%x",
VIFdmanum, v->cmd & 0xf, v->size, size, v->addr, vifRegs->num);
VIFUNPACK_LOG("USN %x Masking %x Mask %x Mode %x CL %x WL %x Offset %x", vif->usn, (vifRegs->code & 0x10000000) >> 28, vifRegs->mask, vifRegs->mode, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->offset);
// The unpack type
unpackType = v->cmd & 0xf;
_mm_prefetch((char*)data + 128, _MM_HINT_NTA);
ft = &VIFfuncTable[ unpackType ];
func = vif->usn ? ft->funcU : ft->funcS;
size <<= 2;
#ifdef _DEBUG
memsize = size;
#endif
#ifdef VIFUNPACKDEBUG
if((vif->tag.addr + (size / (VIFfuncTable[ vif->cmd & 0xf ].gsize * vifRegs->cycle.wl)) *
((vifRegs->cycle.cl - vifRegs->cycle.wl) * 16)) > (u32)(VIFdmanum ? 0x4000 : 0x1000))
{
//Sanity Check (memory overflow)
DevCon::Notice("VIF%x Unpack ending %x > %x", params VIFdmanum, vif->tag.addr, VIFdmanum ? 0x4000 : 0x1000);
}
#endif
if (vifRegs->cycle.cl >= vifRegs->cycle.wl) // skipping write
{
#ifdef _DEBUG
static int s_count = 0;
#endif
if (size >= ft->gsize)
{
const UNPACKPARTFUNCTYPESSE* pfn;
int writemask;
@ -554,6 +609,16 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
if (oldcycle != -1) *(u32*)&vifRegs->cycle = oldcycle;
if(vifRegs->mode == 2)
{
//Update the reg rows for non SSE
vifRegs->r0 = vifRow[0];
vifRegs->r1 = vifRow[1];
vifRegs->r2 = vifRow[2];
vifRegs->r3 = vifRow[3];
}
// if size is left over, update the src,dst pointers
if (writemask > 0)
{
@ -561,107 +626,65 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
cdata += left * ft->gsize;
dest = (u32*)((u8*)dest + ((left / vifRegs->cycle.wl) * vifRegs->cycle.cl + left % vifRegs->cycle.wl) * 16);
vifRegs->num -= left;
_vif->cl = (size % (ft->gsize * vifRegs->cycle.wl)) / ft->gsize;
}
else
{
vifRegs->num -= size / ft->gsize;
if (vifRegs->num > 0) _vif->cl = (size % (ft->gsize * vifRegs->cycle.wl)) / ft->gsize;
}
vif->cl = (size % (ft->gsize * vifRegs->cycle.wl)) / ft->gsize;
size = writemask;
_vifRegs->r0 = _vifRow[0];
_vifRegs->r1 = _vifRow[1];
_vifRegs->r2 = _vifRow[2];
_vifRegs->r3 = _vifRow[3];
}
else
{
if ((unpackType == 0xC) && (vifRegs->cycle.cl == vifRegs->cycle.wl)) //No use when SSE is available
{
// v4-32
if ((vifRegs->mode == 0) && !(vifRegs->code & 0x10000000) && (vif->usn == 0))
{
vifRegs->num -= size >> 4;
memcpy_fast((u8*)dest, cdata, size);
size = 0;
return;
}
}
incdest = ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + 4;
while ((size >= ft->gsize) && (vifRegs->num > 0))
{
func(dest, (u32*)cdata, ft->qsize);
cdata += ft->gsize;
size -= ft->gsize;
vifRegs->num--;
++vif->cl;
if (vif->cl == vifRegs->cycle.wl)
{
dest += incdest;
vif->cl = 0;
}
else
{
dest += 4;
}
}
// have to update
_vifRow[0] = _vifRegs->r0;
_vifRow[1] = _vifRegs->r1;
_vifRow[2] = _vifRegs->r2;
_vifRow[3] = _vifRegs->r3;
}
// used for debugging vif
// {
// int i, j, k;
// u32* curdest = olddest;
// FILE* ftemp = fopen("temp.txt", s_count?"a+":"w");
// fprintf(ftemp, "%x %x %x\n", s_count, size, vif->tag.addr);
// fprintf(ftemp, "%x %x %x\n", vifRegs->code>>24, vifRegs->mode, *(u32*)&vifRegs->cycle);
// fprintf(ftemp, "row: %x %x %x %x\n", _vifRow[0], _vifRow[1], _vifRow[2], _vifRow[3]);
// //fprintf(ftemp, "row2: %x %x %x %x\n", _vifRegs->r0, _vifRegs->r1, _vifRegs->r2, _vifRegs->r3);
//
// for(i = 0; i < memsize; ) {
// for(k = 0; k < vifRegs->cycle.wl; ++k) {
// for(j = 0; j <= ((vifRegs->code>>26)&3); ++j) {
// fprintf(ftemp, "%x ", curdest[4*k+j]);
// }
// }
//
// fprintf(ftemp, "\n");
// curdest += 4*vifRegs->cycle.cl;
// i += (((vifRegs->code>>26)&3)+1)*ft->dsize*vifRegs->cycle.wl;
// }
// fclose(ftemp);
// }
// s_count++;
if (size >= ft->dsize && vifRegs->num > 0)
{
//VIF_LOG("warning, end with size = %d", size);
/* unpack one qword */
vif->tag.addr += (size / ft->dsize) * 4;
func(dest, (u32*)cdata, size / ft->dsize);
size = 0;
if(vifRegs->mode == 2)
{
//Update the reg rows for SSE
vifRow[0] = vifRegs->r0;
vifRow[1] = vifRegs->r1;
vifRow[2] = vifRegs->r2;
vifRow[3] = vifRegs->r3;
}
VIFUNPACK_LOG("leftover done, size %d, vifnum %d, addr %x", size, vifRegs->num, vif->tag.addr);
}
}
else
{
vifRegs->num -= size / ft->gsize;
if (vifRegs->num > 0) vif->cl = (size % (ft->gsize * vifRegs->cycle.wl)) / ft->gsize;
size = 0;
}
}
else if (size >= ft->dsize && vifRegs->num > 0) //Else write what we do have
{
//VIF_LOG("warning, end with size = %d", size);
/* unpack one qword */
vif->tag.addr += (size / ft->dsize) * 4;
func(dest, (u32*)cdata, size / ft->dsize);
size = 0;
if(vifRegs->mode == 2)
{
//Update the reg rows for SSE
vifRow[0] = vifRegs->r0;
vifRow[1] = vifRegs->r1;
vifRow[2] = vifRegs->r2;
vifRow[3] = vifRegs->r3;
}
VIFUNPACK_LOG("leftover done, size %d, vifnum %d, addr %x", size, vifRegs->num, vif->tag.addr);
}
}
else /* filling write */
{
VIF_LOG("VIFunpack - filling write");
if((u32)(size / ft->gsize) < vifRegs->num && vifRegs->cycle.cl != 0)
DevCon::Notice("Filling write warning! Size < packet size and CL != 0");
VIFUNPACK_LOG("filling write %d cl %d, wl %d mask %x mode %x unpacktype %x", vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mask, vifRegs->mode, unpackType);
while (size >= ft->gsize || vifRegs->num > 0)
while (vifRegs->num > 0)
{
if (vif->cl == vifRegs->cycle.wl)
{
@ -679,6 +702,11 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
{
vif->cl = 0;
}
if(size < ft->gsize)
{
VIF_LOG("Out of Filling write data");
break;
}
}
else
{
@ -786,17 +814,21 @@ static __forceinline void vif0UNPACK(u32 *data)
len = ((((32 >> vl) * (vn + 1)) * n) + 31) >> 5;
}
vif0.wl = 0;
vif0.cl = 0;
vif0.tag.cmd = vif0.cmd;
vif0.tag.addr &= 0xfff;
vif0.tag.size = len;
vif0Regs->offset = 0;
vifRegs = (VIFregisters*)vif0Regs;
vifMaskRegs = g_vif0Masks;
vif = &vif0;
vifRow = g_vifRow0;
}
static __forceinline void _vif0mpgTransfer(u32 addr, u32 *data, int size)
static __forceinline void vif0mpgTransfer(u32 addr, u32 *data, int size)
{
/* Console::WriteLn("_vif0mpgTransfer addr=%x; size=%x", params addr, size);
/* Console::WriteLn("vif0mpgTransfer addr=%x; size=%x", params addr, size);
{
FILE *f = fopen("vu1.raw", "wb");
fwrite(data, 1, size*4, f);
@ -900,7 +932,7 @@ static int __fastcall Vif0TransMPG(u32 *data) // MPG
{
if (vif0.vifpacketsize < vif0.tag.size)
{
_vif0mpgTransfer(vif0.tag.addr, data, vif0.vifpacketsize);
vif0mpgTransfer(vif0.tag.addr, data, vif0.vifpacketsize);
vif0.tag.addr += vif0.vifpacketsize << 2;
vif0.tag.size -= vif0.vifpacketsize;
return vif0.vifpacketsize;
@ -909,7 +941,7 @@ static int __fastcall Vif0TransMPG(u32 *data) // MPG
{
int ret;
_vif0mpgTransfer(vif0.tag.addr, data, vif0.tag.size);
vif0mpgTransfer(vif0.tag.addr, data, vif0.tag.size);
ret = vif0.tag.size;
vif0.tag.size = 0;
vif0.cmd = 0;
@ -924,6 +956,9 @@ static int __fastcall Vif0TransUnpack(u32 *data) // UNPACK
{
/* size is less than the total size, transfer is 'in pieces' */
VIFunpack(data, &vif0.tag, vif0.vifpacketsize, VIF0dmanum);
ProcessMemSkip(vif0.vifpacketsize << 2, (vif0.cmd & 0xf), VIF0dmanum);
vif0.tag.size -= vif0.vifpacketsize;
FreezeXMMRegs(0);
return vif0.vifpacketsize;
@ -931,15 +966,28 @@ static int __fastcall Vif0TransUnpack(u32 *data) // UNPACK
else
{
/* we got all the data, transfer it fully */
int ret;
int ret = vif0.tag.size;
VIFunpack(data, &vif0.tag, vif0.tag.size, VIF0dmanum);
ret = vif0.tag.size;
//Align data after a split transfer first
if(vif0Regs->offset != 0 || vif0.cl != 0)
{
vif0.tag.size = VIFalign(data, &vif0.tag, vif0.tag.size, VIF0dmanum);
data += ret - vif0.tag.size;
if(vif0.tag.size > 0) VIFunpack(data, &vif0.tag, vif0.tag.size, VIF0dmanum);
vif0.tag.size = 0;
vif0.cmd = 0;
FreezeXMMRegs(0);
return ret;
}
else
{
VIFunpack(data, &vif0.tag, vif0.tag.size, VIF0dmanum);
vif0.tag.size = 0;
vif0.cmd = 0;
FreezeXMMRegs(0);
return ret;
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -1516,15 +1564,20 @@ static __forceinline void vif1UNPACK(u32 *data)
else
vif1.tag.addr = vif1Regs->code & 0x3ff;
vif1Regs->offset = 0;
vif1.cl = 0;
vif1.tag.addr <<= 4;
vif1.tag.cmd = vif1.cmd;
vifRegs = (VIFregisters*)vif1Regs;
vifMaskRegs = g_vif1Masks;
vif = &vif1;
vifRow = g_vifRow1;
}
static __forceinline void _vif1mpgTransfer(u32 addr, u32 *data, int size)
static __forceinline void vif1mpgTransfer(u32 addr, u32 *data, int size)
{
/* Console::WriteLn("_vif1mpgTransfer addr=%x; size=%x", params addr, size);
/* Console::WriteLn("vif1mpgTransfer addr=%x; size=%x", params addr, size);
{
FILE *f = fopen("vu1.raw", "wb");
fwrite(data, 1, size*4, f);
@ -1626,7 +1679,7 @@ static int __fastcall Vif1TransMPG(u32 *data)
{
if (vif1.vifpacketsize < vif1.tag.size)
{
_vif1mpgTransfer(vif1.tag.addr, data, vif1.vifpacketsize);
vif1mpgTransfer(vif1.tag.addr, data, vif1.vifpacketsize);
vif1.tag.addr += vif1.vifpacketsize << 2;
vif1.tag.size -= vif1.vifpacketsize;
return vif1.vifpacketsize;
@ -1634,7 +1687,7 @@ static int __fastcall Vif1TransMPG(u32 *data)
else
{
int ret;
_vif1mpgTransfer(vif1.tag.addr, data, vif1.tag.size);
vif1mpgTransfer(vif1.tag.addr, data, vif1.tag.size);
ret = vif1.tag.size;
vif1.tag.size = 0;
vif1.cmd = 0;
@ -1735,21 +1788,36 @@ static int __fastcall Vif1TransUnpack(u32 *data)
/* size is less than the total size, transfer is
'in pieces' */
VIFunpack(data, &vif1.tag, vif1.vifpacketsize, VIF1dmanum);
ProcessMemSkip(vif1.vifpacketsize << 2, (vif1.cmd & 0xf), VIF1dmanum);
vif1.tag.size -= vif1.vifpacketsize;
FreezeXMMRegs(0);
return vif1.vifpacketsize;
}
else
{
int ret;
/* we got all the data, transfer it fully */
VIFunpack(data, &vif1.tag, vif1.tag.size, VIF1dmanum);
ret = vif1.tag.size;
int ret = vif1.tag.size;
if(vif1Regs->offset != 0 || vif1.cl != 0)
{
vif1.tag.size = VIFalign(data, &vif1.tag, vif1.tag.size, VIF1dmanum);
data += ret - vif1.tag.size;
if(vif1.tag.size > 0) VIFunpack(data, &vif1.tag, vif1.tag.size, VIF1dmanum);
vif1.tag.size = 0;
vif1.cmd = 0;
FreezeXMMRegs(0);
return ret;
}
else
{
/* we got all the data, transfer it fully */
VIFunpack(data, &vif1.tag, vif1.tag.size, VIF1dmanum);
vif1.tag.size = 0;
vif1.cmd = 0;
FreezeXMMRegs(0);
return ret;
}
}
}

View File

@ -32,7 +32,7 @@ struct vifStruct {
int cmd;
int irq;
int cl;
int wl;
int qwcalign;
u8 usn;
// The next three should be boolean, and will be next time I break savestate compatibility. --arcum42

View File

@ -947,7 +947,6 @@
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="1"
PrecompiledHeaderFile="$(IntDir)\$(TargetName).pch"
/>
</FileConfiguration>
<FileConfiguration
@ -2507,6 +2506,10 @@
RelativePath="..\..\x86\microVU_Compile.inl"
>
</File>
<File
RelativePath="..\..\x86\microVU_Execute.inl"
>
</File>
<File
RelativePath="..\..\x86\microVU_Lower.inl"
>
@ -2912,149 +2915,36 @@
>
</File>
<File
RelativePath="..\..\x86\ix86\ix86.inl"
RelativePath="..\..\x86\ix86\ix86_3dnow.cpp"
>
</File>
<File
RelativePath="..\..\x86\ix86\ix86_3dnow.inl"
>
<FileConfiguration
Name="Devel vm|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="0"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug vm|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="0"
/>
</FileConfiguration>
<FileConfiguration
Name="Release vm|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="0"
/>
</FileConfiguration>
</File>
<File
RelativePath="..\..\x86\ix86\ix86_cpudetect.cpp"
>
</File>
<File
RelativePath="..\..\x86\ix86\ix86_fpu.inl"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCustomBuildTool"
/>
</FileConfiguration>
<FileConfiguration
Name="Devel vm|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="0"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug vm|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="0"
/>
</FileConfiguration>
<FileConfiguration
Name="Release vm|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="0"
/>
</FileConfiguration>
</File>
<File
RelativePath="..\..\x86\ix86\ix86_macros.h"
RelativePath="..\..\x86\ix86\ix86_fpu.cpp"
>
</File>
<File
RelativePath="..\..\x86\ix86\ix86_mmx.inl"
RelativePath="..\..\x86\ix86\ix86_group1.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCustomBuildTool"
/>
</FileConfiguration>
<FileConfiguration
Name="Devel vm|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="0"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug vm|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="0"
/>
</FileConfiguration>
<FileConfiguration
Name="Release vm|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="0"
/>
</FileConfiguration>
</File>
<File
RelativePath="..\..\x86\ix86\ix86_sse.inl"
RelativePath="..\..\x86\ix86\ix86_internal.h"
>
<FileConfiguration
Name="Debug|Win32"
</File>
<File
RelativePath="..\..\x86\ix86\ix86_legacy.cpp"
>
<Tool
Name="VCCustomBuildTool"
/>
</FileConfiguration>
<FileConfiguration
Name="Devel vm|Win32"
</File>
<File
RelativePath="..\..\x86\ix86\ix86_mmx.cpp"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="0"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug vm|Win32"
</File>
<File
RelativePath="..\..\x86\ix86\ix86_sse.cpp"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="0"
/>
</FileConfiguration>
<FileConfiguration
Name="Release vm|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="0"
/>
</FileConfiguration>
</File>
<File
RelativePath="..\..\x86\ix86\ix86_sse_helpers.h"

View File

@ -39,6 +39,8 @@ const char* g_pRunGSState = NULL;
#define CmdSwitchIs( text ) ( stricmp( command, text ) == 0 )
extern u8 *recMem;
int SysPageFaultExceptionFilter( EXCEPTION_POINTERS* eps )
{
const _EXCEPTION_RECORD& ExceptionRecord = *eps->ExceptionRecord;
@ -50,6 +52,7 @@ int SysPageFaultExceptionFilter( EXCEPTION_POINTERS* eps )
// get bad virtual address
uptr addr=ExceptionRecord.ExceptionInformation[1];
u8* pcode=(u8*)ExceptionRecord.ExceptionAddress;
//this is a *hackfix* for a bug on x64 windows kernels. They do not give the correct address
//if the error is a misaligned access (they return 0)
@ -60,16 +63,17 @@ int SysPageFaultExceptionFilter( EXCEPTION_POINTERS* eps )
}
u32 offset = addr-(uptr)psM;
if (addr&0x80000000)
if (addr&0x80000000 && ((pcode-recMem)<(16*1024*1024)) )
{
uptr _vtlb_HandleRewrite(u32 info,u8* ra);
u8* pcode=(u8*)ExceptionRecord.ExceptionAddress;
u32 patch_point=1;
s32 patch_point=1;
//01 C1
while(pcode[-patch_point]!=0x81 || pcode[-patch_point-1]!=0xC1 || pcode[-patch_point-2]!=0x01)
{
patch_point++;
if (patch_point>0x100)
return EXCEPTION_CONTINUE_SEARCH;
}
assert(pcode[-patch_point]==0x81);
pcode[-patch_point]=0xF;//js32, 0x81 is add32
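The stricter check this commit mentions, as a standalone sketch of the scan above: walk backwards from the faulting instruction looking for the 01 C1 / 81 byte pattern the VTLB emitter leaves behind, and hand the fault to the next handler (instead of asserting) if the marker isn't found within 0x100 bytes, meaning the fault did not come from our generated code:

#include <cstdint>

int findPatchPoint(const uint8_t* pcode)
{
    int patch_point = 1;
    while (pcode[-patch_point] != 0x81 ||
           pcode[-patch_point - 1] != 0xC1 ||
           pcode[-patch_point - 2] != 0x01)
    {
        if (++patch_point > 0x100)
            return -1; // not our code: EXCEPTION_CONTINUE_SEARCH
    }
    // pcode[-patch_point] is the 0x81 (add32) opcode byte that gets
    // rewritten to 0x0F (the first byte of js32)
    return patch_point;
}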

View File

@ -70,10 +70,27 @@ int BaseBlocks::LastIndex(u32 startpc) const
return imin;
}
BASEBLOCKEX* BaseBlocks::GetByX86(uptr ip) const
BASEBLOCKEX* BaseBlocks::GetByX86(uptr ip)
{
// TODO
if (0 == blocks.size())
return 0;
int imin = 0, imax = blocks.size() - 1, imid;
while(imin != imax) {
imid = (imin+imax+1)>>1;
if (blocks[imid].fnptr > ip)
imax = imid - 1;
else
imin = imid;
}
if (ip < blocks[imin].fnptr ||
ip >= blocks[imin].fnptr + blocks[imin].x86size)
return 0;
return &blocks[imin];
}
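The new GetByX86 is an upper-bound binary search over blocks kept sorted by emitted start address: rounding imid up lets imin converge on the last block whose fnptr <= ip, and the final range check rejects addresses that fall in the gap past a block's x86size. A self-contained sketch:

#include <cstdint>
#include <vector>

struct Block { uintptr_t fnptr; uint32_t x86size; };

const Block* byX86(const std::vector<Block>& blocks, uintptr_t ip)
{
    if (blocks.empty()) return nullptr;
    size_t imin = 0, imax = blocks.size() - 1;
    while (imin != imax)
    {
        const size_t imid = (imin + imax + 1) / 2; // round up so imin advances
        if (blocks[imid].fnptr > ip) imax = imid - 1;
        else                         imin = imid;
    }
    const Block& b = blocks[imin];
    return (ip < b.fnptr || ip >= b.fnptr + b.x86size) ? nullptr : &b;
}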
void BaseBlocks::Link(u32 pc, uptr jumpptr)

View File

@ -18,14 +18,9 @@
#pragma once
#include "PrecompiledHeader.h"
#include <vector>
#include <map>
#include <map> // used by BaseBlockEx
#include <utility>
// used to keep block information
#define BLOCKTYPE_DELAYSLOT 1 // if bit set, delay slot
// Every potential jump point in the PS2's addressable memory has a BASEBLOCK
// associated with it. So that means a BASEBLOCK for every 4 bytes of PS2
// addressable memory. Yay!
@ -73,7 +68,7 @@ public:
BASEBLOCKEX* New(u32 startpc, uptr fnptr);
int LastIndex (u32 startpc) const;
BASEBLOCKEX* GetByX86(uptr ip) const;
BASEBLOCKEX* GetByX86(uptr ip);
inline int Index (u32 startpc) const
{
@ -119,7 +114,6 @@ public:
}
};
#define GET_BLOCKTYPE(b) ((b)->Type)
#define PC_GETBLOCK_(x, reclut) ((BASEBLOCK*)(reclut[((u32)(x)) >> 16] + (x)*(sizeof(BASEBLOCK)/4)))
static void recLUT_SetPage(uptr reclut[0x10000], uptr hwlut[0x10000],

View File

@ -18,9 +18,9 @@
*/
.intel_syntax noprefix
.extern _vifRegs
.extern _vifMaskRegs
.extern _vifRow
.extern vifRegs
.extern vifMaskRegs
.extern vifRow
#define VIF_ESP esp
#define VIF_SRC esi
@ -108,7 +108,7 @@
// setting up masks
#define UNPACK_Setup_Mask_SSE(CL) \
mov VIF_TMPADDR, _vifMaskRegs; \
mov VIF_TMPADDR, vifMaskRegs; \
movdqa XMM_ROWMASK, xmmword ptr [VIF_TMPADDR + 64*(CL) + 16]; \
movdqa XMM_ROWCOLMASK, xmmword ptr [VIF_TMPADDR + 64*(CL) + 32]; \
movdqa XMM_WRITEMASK, xmmword ptr [VIF_TMPADDR + 64*(CL)]; \
@ -118,7 +118,7 @@
#define UNPACK_Start_Setup_Mask_SSE_0(CL) UNPACK_Setup_Mask_SSE(CL)
#define UNPACK_Start_Setup_Mask_SSE_1(CL) \
mov VIF_TMPADDR, _vifMaskRegs; \
mov VIF_TMPADDR, vifMaskRegs; \
movdqa XMM_ROWMASK, xmmword ptr [VIF_TMPADDR + 64*(CL) + 16]; \
movdqa XMM_ROWCOLMASK, xmmword ptr [VIF_TMPADDR + 64*(CL) + 32]; \
pand XMM_ROWMASK, XMM_ROW; \
@ -129,12 +129,12 @@
#define UNPACK_Setup_Mask_SSE_0_1(CL)
#define UNPACK_Setup_Mask_SSE_1_1(CL) \
mov VIF_TMPADDR, _vifMaskRegs; \
mov VIF_TMPADDR, vifMaskRegs; \
movdqa XMM_WRITEMASK, xmmword ptr [VIF_TMPADDR + 64*(0)]; \
// ignore CL, since vif.cycle.wl == 1
#define UNPACK_Setup_Mask_SSE_2_1(CL) \
mov VIF_TMPADDR, _vifMaskRegs; \
mov VIF_TMPADDR, vifMaskRegs; \
movdqa XMM_ROWMASK, xmmword ptr [VIF_TMPADDR + 64*(0) + 16]; \
movdqa XMM_ROWCOLMASK, xmmword ptr [VIF_TMPADDR + 64*(0) + 32]; \
movdqa XMM_WRITEMASK, xmmword ptr [VIF_TMPADDR + 64*(0)]; \
@ -1312,9 +1312,9 @@
#pragma warning(disable:4731)
#define SAVE_ROW_REG_BASE \
mov VIF_TMPADDR, _vifRow; \
mov VIF_TMPADDR, vifRow; \
movdqa xmmword ptr [VIF_TMPADDR], XMM_ROW; \
mov VIF_TMPADDR, _vifRegs; \
mov VIF_TMPADDR, vifRegs; \
movss dword ptr [VIF_TMPADDR+0x100], XMM_ROW; \
psrldq XMM_ROW, 4; \
movss dword ptr [VIF_TMPADDR+0x110], XMM_ROW; \
@ -1349,7 +1349,7 @@
.globl UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType; \
UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType: \
INIT_ARGS(); \
mov VIF_TMPADDR, _vifRegs; \
mov VIF_TMPADDR, vifRegs; \
movzx VIF_INC, byte ptr [VIF_TMPADDR + 0x40]; \
movzx VIF_SAVEEBX, byte ptr [VIF_TMPADDR + 0x41]; \
sub VIF_INC, VIF_SAVEEBX; \

View File

@ -18,7 +18,7 @@
#include "PrecompiledHeader.h"
#include "Misc.h"
#include "System.h"
#include "iR5900.h"
#include "Vif.h"
#include "VU.h"

View File

@ -1956,14 +1956,14 @@ CPU_SSE_XMMCACHE_END
// Both Macros are 16 bytes so we can use a shift instead of a Mul instruction
#define QFSRVhelper0() { \
ajmp[0] = JMP32(0); \
x86Ptr[0] += 11; \
x86Ptr += 11; \
}
#define QFSRVhelper(shift1, shift2) { \
SSE2_PSRLDQ_I8_to_XMM(EEREC_D, shift1); \
SSE2_PSLLDQ_I8_to_XMM(t0reg, shift2); \
ajmp[shift1] = JMP32(0); \
x86Ptr[0] += 1; \
x86Ptr += 1; \
}
void recQFSRV()
@ -1982,8 +1982,8 @@ void recQFSRV()
MOV32MtoR(EAX, (uptr)&cpuRegs.sa);
SHL32ItoR(EAX, 4); // Multiply SA bytes by 16 bytes (the amount of bytes in QFSRVhelper() macros)
AND32I8toR(EAX, 0xf0); // This can possibly be removed but keeping it in case there's garbage in SA (cottonvibes)
ADD32ItoEAX((uptr)x86Ptr[0] + 7); // ADD32 = 5 bytes, JMPR = 2 bytes
AND32ItoR(EAX, 0xf0); // This can possibly be removed but keeping it in case there's garbage in SA (cottonvibes)
ADD32ItoR(EAX, (uptr)x86Ptr + 7); // ADD32 = 5 bytes, JMPR = 2 bytes
JMPR(EAX); // Jumps to a QFSRVhelper() case below (a total of 16 different cases)
// Case 0:
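The dispatch trick in C terms: every QFSRVhelper() expansion is padded to exactly 16 bytes of emitted code, so the shift amount indexes the case stubs with a shift rather than a multiply. A sketch, with caseBase standing in for the address of case 0:

#include <cstdint>

uintptr_t qfsrvTarget(uintptr_t caseBase, uint32_t sa)
{
    // sa = funnel-shift amount in bytes (0-15); each stub is 16 bytes,
    // mirroring the SHL 4 / AND 0xf0 / ADD / JMPR sequence above
    return caseBase + ((sa << 4) & 0xF0u);
}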
@ -2676,9 +2676,6 @@ CPU_SSE_XMMCACHE_END
recCall( Interp::PHMADH, _Rd_ );
}
////////////////////////////////////////////////////
//upper word of each doubleword in LO and HI is undocumented/undefined
//contains the NOT of the upper multiplication result (before the subtraction of the lower multiplication result)
void recPMSUBH()
{
CPU_SSE2_XMMCACHE_START((_Rd_?XMMINFO_WRITED:0)|XMMINFO_READS|XMMINFO_READT|XMMINFO_READLO|XMMINFO_READHI|XMMINFO_WRITELO|XMMINFO_WRITEHI)
@ -2740,12 +2737,8 @@ CPU_SSE_XMMCACHE_END
}
////////////////////////////////////////////////////
// rs = ... a1 a0
// rt = ... b1 b0
// rd = ... a1*b1 - a0*b0
// hi = ...
// lo = ... (undefined by doc)NOT(a1*b1), a1*b1 - a0*b0
//upper word of each doubleword in LO and HI is undocumented/undefined
//it contains the NOT of the upper multiplication result (before the subtraction of the lower multiplication result)
void recPHMSBH()
{
CPU_SSE2_XMMCACHE_START((_Rd_?XMMINFO_WRITED:0)|XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITELO|XMMINFO_WRITEHI)
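A scalar sketch of the behavior the comment above describes for one 64-bit lane (hypothetical helper; the real op repeats this across the 128-bit register):

#include <cstdint>

void phmsbhLane(int16_t a0, int16_t a1, int16_t b0, int16_t b1,
                int32_t& loLower, int32_t& loUpper)
{
    const int32_t hiProd = a1 * b1;
    loLower = hiProd - a0 * b0; // documented: a1*b1 - a0*b0
    loUpper = ~hiProd;          // undocumented: NOT of the upper product,
                                // taken before the subtraction
}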

View File

@ -24,6 +24,8 @@
#include "PrecompiledHeader.h"
#include "iR3000A.h"
#include "BaseblockEx.h"
#include <time.h>
#ifndef _WIN32
@ -171,7 +173,7 @@ static void iIopDumpBlock( int startpc, u8 * ptr )
#ifdef __LINUX__
// dump the asm
f = fopen( "mydump1", "wb" );
fwrite( ptr, 1, (uptr)x86Ptr[0] - (uptr)ptr, f );
fwrite( ptr, 1, (uptr)x86Ptr - (uptr)ptr, f );
fclose( f );
sprintf( command, "objdump -D --target=binary --architecture=i386 -M intel mydump1 | cat %s - > tempdump", filename );
system( command );
@ -316,7 +318,7 @@ void _psxMoveGPRtoM(u32 to, int fromgpr)
void _psxMoveGPRtoRm(x86IntRegType to, int fromgpr)
{
if( PSX_IS_CONST1(fromgpr) )
MOV32ItoRmOffset( to, g_psxConstRegs[fromgpr], 0 );
MOV32ItoRm( to, g_psxConstRegs[fromgpr] );
else {
// check x86
MOV32MtoR(EAX, (uptr)&psxRegs.GPR.r[ fromgpr ] );
@ -647,7 +649,7 @@ static void recExecute()
//for (;;) R3000AExecute();
}
static s32 recExecuteBlock( s32 eeCycles )
static __forceinline s32 recExecuteBlock( s32 eeCycles )
{
psxBreak = 0;
psxCycleEE = eeCycles;
@ -741,7 +743,7 @@ static __forceinline u32 psxRecClearMem(u32 pc)
return upperextent - pc;
}
static void recClear(u32 Addr, u32 Size)
static __forceinline void recClearIOP(u32 Addr, u32 Size)
{
u32 pc = Addr;
while (pc < Addr + Size*4)
@ -772,7 +774,7 @@ void psxSetBranchReg(u32 reg)
_psxFlushCall(FLUSH_EVERYTHING);
iPsxBranchTest(0xffffffff, 1);
JMP32((uptr)iopDispatcherReg - ( (uptr)x86Ptr[0] + 5 ));
JMP32((uptr)iopDispatcherReg - ( (uptr)x86Ptr + 5 ));
}
void psxSetBranchImm( u32 imm )
@ -796,7 +798,7 @@ void psxSetBranchImm( u32 imm )
// So for now these are new settings that work.
// (rama)
static u32 psxScaleBlockCycles()
static __forceinline u32 psxScaleBlockCycles()
{
return s_psxBlockCycles * (CHECK_IOP_CYCLERATE ? 2 : 1);
}
@ -828,7 +830,7 @@ static void iPsxBranchTest(u32 newpc, u32 cpuBranch)
if( newpc != 0xffffffff )
{
CMP32ItoM((uptr)&psxRegs.pc, newpc);
JNE32((uptr)iopDispatcherReg - ( (uptr)x86Ptr[0] + 6 ));
JNE32((uptr)iopDispatcherReg - ( (uptr)x86Ptr + 6 ));
}
// Skip branch jump target here:
@ -864,7 +866,7 @@ void rpsxSYSCALL()
ADD32ItoM((uptr)&psxRegs.cycle, psxScaleBlockCycles() );
SUB32ItoM((uptr)&psxCycleEE, psxScaleBlockCycles()*8 );
JMP32((uptr)iopDispatcherReg - ( (uptr)x86Ptr[0] + 5 ));
JMP32((uptr)iopDispatcherReg - ( (uptr)x86Ptr + 5 ));
// jump target for skipping blockCycle updates
x86SetJ8(j8Ptr[0]);
@ -884,7 +886,7 @@ void rpsxBREAK()
j8Ptr[0] = JE8(0);
ADD32ItoM((uptr)&psxRegs.cycle, psxScaleBlockCycles() );
SUB32ItoM((uptr)&psxCycleEE, psxScaleBlockCycles()*8 );
JMP32((uptr)iopDispatcherReg - ( (uptr)x86Ptr[0] + 5 ));
JMP32((uptr)iopDispatcherReg - ( (uptr)x86Ptr + 5 ));
x86SetJ8(j8Ptr[0]);
//if (!psxbranch) psxbranch = 2;
@ -1004,7 +1006,7 @@ void iopRecRecompile(u32 startpc)
x86SetPtr( recPtr );
x86Align(16);
recPtr = x86Ptr[_EmitterId_];
recPtr = x86Ptr;
s_pCurBlock = PSX_GETBLOCK(startpc);
@ -1025,7 +1027,7 @@ void iopRecRecompile(u32 startpc)
psxbranch = 0;
s_pCurBlock->SetFnptr( (uptr)x86Ptr[0] );
s_pCurBlock->SetFnptr( (uptr)x86Ptr );
s_psxBlockCycles = 0;
// reset recomp state variables
@ -1160,7 +1162,7 @@ StartRecomp:
iPsxBranchTest(0xffffffff, 1);
JMP32((uptr)iopDispatcherReg - ( (uptr)x86Ptr[0] + 5 ));
JMP32((uptr)iopDispatcherReg - ( (uptr)x86Ptr + 5 ));
}
else {
if( psxbranch ) assert( !willbranch3 );
@ -1180,12 +1182,12 @@ StartRecomp:
}
}
assert( x86Ptr[0] < recMem+RECMEM_SIZE );
assert( x86Ptr < recMem+RECMEM_SIZE );
assert(x86Ptr[_EmitterId_] - recPtr < 0x10000);
s_pCurBlockEx->x86size = x86Ptr[_EmitterId_] - recPtr;
assert(x86Ptr - recPtr < 0x10000);
s_pCurBlockEx->x86size = x86Ptr - recPtr;
recPtr = x86Ptr[0];
recPtr = x86Ptr;
assert( (g_psxHasConstReg&g_psxFlushedConstReg) == g_psxHasConstReg );
@ -1198,7 +1200,7 @@ R3000Acpu psxRec = {
recResetIOP,
recExecute,
recExecuteBlock,
recClear,
recClearIOP,
recShutdown
};

View File

@ -18,12 +18,10 @@
#ifndef _R3000A_SUPERREC_
#define _R3000A_SUPERREC_
#define _EmitterId_ EmitterId_R3000a
#include "ix86/ix86.h"
#include "R3000A.h"
#include "iCore.h"
#include "BaseblockEx.h"
// Cycle penalties for particularly slow instructions.
static const int psxInstCycles_Mult = 7;

View File

@ -1258,7 +1258,7 @@ void rpsxJALR()
static void* s_pbranchjmp;
static u32 s_do32 = 0;
#define JUMPVALID(pjmp) (( x86Ptr[0] - (u8*)pjmp ) <= 0x80)
#define JUMPVALID(pjmp) (( x86Ptr - (u8*)pjmp ) <= 0x80)
void rpsxSetBranchEQ(int info, int process)
{
@ -1305,7 +1305,7 @@ void rpsxBEQ_process(int info, int process)
else
{
_psxFlushAllUnused();
u8* prevx86 = x86Ptr[0];
u8* prevx86 = x86Ptr;
s_do32 = 0;
psxSaveBranchState();
@ -1318,7 +1318,7 @@ void rpsxBEQ_process(int info, int process)
x86SetJ8A( (u8*)s_pbranchjmp );
}
else {
x86Ptr[0] = prevx86;
x86SetPtr( prevx86 );
s_do32 = 1;
psxpc -= 4;
psxRegs.code = iopMemRead32( psxpc - 4 );
@ -1369,7 +1369,7 @@ void rpsxBNE_process(int info, int process)
}
_psxFlushAllUnused();
u8* prevx86 = x86Ptr[0];
u8* prevx86 = x86Ptr;
s_do32 = 0;
rpsxSetBranchEQ(info, process);
@ -1381,7 +1381,7 @@ void rpsxBNE_process(int info, int process)
x86SetJ8A( (u8*)s_pbranchjmp );
}
else {
x86Ptr[0] = prevx86;
x86SetPtr( prevx86 );
s_do32 = 1;
psxpc -= 4;
psxRegs.code = iopMemRead32( psxpc - 4 );
@ -1423,7 +1423,7 @@ void rpsxBLTZ()
}
CMP32ItoM((uptr)&psxRegs.GPR.r[_Rs_], 0);
u8* prevx86 = x86Ptr[0];
u8* prevx86 = x86Ptr;
u8* pjmp = JL8(0);
psxSaveBranchState();
@ -1435,7 +1435,7 @@ void rpsxBLTZ()
x86SetJ8A( pjmp );
}
else {
x86Ptr[0] = prevx86;
x86SetPtr( prevx86 );
psxpc -= 4;
psxRegs.code = iopMemRead32( psxpc - 4 );
psxLoadBranchState();
@ -1470,7 +1470,7 @@ void rpsxBGEZ()
}
CMP32ItoM((uptr)&psxRegs.GPR.r[_Rs_], 0);
u8* prevx86 = x86Ptr[0];
u8* prevx86 = x86Ptr;
u8* pjmp = JGE8(0);
psxSaveBranchState();
@ -1482,7 +1482,7 @@ void rpsxBGEZ()
x86SetJ8A( pjmp );
}
else {
x86Ptr[0] = prevx86;
x86SetPtr( prevx86 );
psxpc -= 4;
psxRegs.code = iopMemRead32( psxpc - 4 );
psxLoadBranchState();
@ -1524,7 +1524,7 @@ void rpsxBLTZAL()
}
CMP32ItoM((uptr)&psxRegs.GPR.r[_Rs_], 0);
u8* prevx86 = x86Ptr[0];
u8* prevx86 = x86Ptr;
u8* pjmp = JL8(0);
psxSaveBranchState();
@ -1538,7 +1538,7 @@ void rpsxBLTZAL()
x86SetJ8A( pjmp );
}
else {
x86Ptr[0] = prevx86;
x86SetPtr( prevx86 );
psxpc -= 4;
psxRegs.code = iopMemRead32( psxpc - 4 );
psxLoadBranchState();
@ -1577,7 +1577,7 @@ void rpsxBGEZAL()
}
CMP32ItoM((uptr)&psxRegs.GPR.r[_Rs_], 0);
u8* prevx86 = x86Ptr[0];
u8* prevx86 = x86Ptr;
u8* pjmp = JGE8(0);
MOV32ItoM((uptr)&psxRegs.GPR.r[31], psxpc+4);
@ -1591,7 +1591,7 @@ void rpsxBGEZAL()
x86SetJ8A( pjmp );
}
else {
x86Ptr[0] = prevx86;
x86SetPtr( prevx86 );
psxpc -= 4;
psxRegs.code = iopMemRead32( psxpc - 4 );
psxLoadBranchState();
@ -1631,7 +1631,7 @@ void rpsxBLEZ()
_clearNeededX86regs();
CMP32ItoM((uptr)&psxRegs.GPR.r[_Rs_], 0);
u8* prevx86 = x86Ptr[0];
u8* prevx86 = x86Ptr;
u8* pjmp = JLE8(0);
psxSaveBranchState();
@ -1642,7 +1642,7 @@ void rpsxBLEZ()
x86SetJ8A( pjmp );
}
else {
x86Ptr[0] = prevx86;
x86SetPtr( prevx86 );
psxpc -= 4;
psxRegs.code = iopMemRead32( psxpc - 4 );
psxLoadBranchState();
@ -1679,7 +1679,7 @@ void rpsxBGTZ()
_clearNeededX86regs();
CMP32ItoM((uptr)&psxRegs.GPR.r[_Rs_], 0);
u8* prevx86 = x86Ptr[0];
u8* prevx86 = x86Ptr;
u8* pjmp = JG8(0);
psxSaveBranchState();
@ -1690,7 +1690,7 @@ void rpsxBGTZ()
x86SetJ8A( pjmp );
}
else {
x86Ptr[0] = prevx86;
x86SetPtr( prevx86 );
psxpc -= 4;
psxRegs.code = iopMemRead32( psxpc - 4 );
psxLoadBranchState();

View File

@ -19,13 +19,11 @@
#ifndef __IR5900_H__
#define __IR5900_H__
#define _EmitterId_ EmitterId_R5900
#include "ix86/ix86.h"
#include "ix86/ix86_sse_helpers.h"
#include "R5900.h"
#include "VU.h"
#include "iCore.h"
#include "BaseblockEx.h" // needed for recClear and stuff
// Yay! These work now! (air) ... almost (air)
#define ARITHMETICIMM_RECOMPILE

View File

@ -23,6 +23,7 @@
#include "VUmicro.h"
#include "iVUzerorec.h"
#ifndef PCSX2_MICROVU_
namespace VU0micro
{
void recAlloc()
@ -62,6 +63,34 @@ namespace VU0micro
FreezeXMMRegs(0);
}
}
#else
extern void initVUrec(VURegs* vuRegs, const int vuIndex);
extern void closeVUrec(const int vuIndex);
extern void resetVUrec(const int vuIndex);
extern void clearVUrec(u32 addr, u32 size, const int vuIndex);
extern void runVUrec(u32 startPC, u32 cycles, const int vuIndex);
namespace VU0micro
{
void recAlloc() { initVUrec(&VU0, 0); }
void __fastcall recClear(u32 Addr, u32 Size) { clearVUrec(Addr, Size, 0); }
void recShutdown() { closeVUrec(0); }
static void recReset() { resetVUrec(0); x86FpuState = FPU_STATE; }
static void recStep() {}
static void recExecuteBlock()
{
if((VU0.VI[REG_VPU_STAT].UL & 1) == 0) return;
FreezeXMMRegs(1);
FreezeMMXRegs(1);
runVUrec(VU0.VI[REG_TPC].UL & 0xfff, 0xffffffff, 0);
FreezeXMMRegs(0);
FreezeMMXRegs(0);
}
}
#endif
using namespace VU0micro;

View File

@ -29,7 +29,7 @@
#ifdef _DEBUG
extern u32 vudump;
#endif
#ifndef PCSX2_MICROVU_
namespace VU1micro
{
void recAlloc()
@ -121,6 +121,34 @@ namespace VU1micro
FreezeXMMRegs(0);
}
}
#else
extern void initVUrec(VURegs* vuRegs, const int vuIndex);
extern void closeVUrec(const int vuIndex);
extern void resetVUrec(const int vuIndex);
extern void clearVUrec(u32 addr, u32 size, const int vuIndex);
extern void runVUrec(u32 startPC, u32 cycles, const int vuIndex);
namespace VU1micro
{
void recAlloc() { initVUrec(&VU1, 1); }
void __fastcall recClear(u32 Addr, u32 Size) { clearVUrec(Addr, Size, 1); }
void recShutdown() { closeVUrec(1); }
static void recReset() { resetVUrec(1); x86FpuState = FPU_STATE; }
static void recStep() {}
static void recExecuteBlock() {
if((VU0.VI[REG_VPU_STAT].UL & 0x100) == 0) return;
assert( (VU1.VI[REG_TPC].UL&7) == 0 );
FreezeXMMRegs(1);
FreezeMMXRegs(0);
runVUrec(VU1.VI[REG_TPC].UL & 0x3fff, 0xffffffff, 1);
FreezeXMMRegs(0);
FreezeMMXRegs(0);
}
}
#endif
using namespace VU1micro;

View File

@ -280,6 +280,7 @@ void _recvuIALUTestStall(VURegs * VU, int reg) {
VU->ialu[i].enable = 0;
vucycle+= cycle;
_recvuTestPipes(VU, true);
}
void _recvuFMACAdd(VURegs * VU, int reg, int xyzw) {
@ -387,7 +388,7 @@ void _recvuFlushFDIV(VURegs * VU) {
if (VU->fdiv.enable == 0) return;
cycle = VU->fdiv.Cycle - (vucycle - VU->fdiv.sCycle);
cycle = VU->fdiv.Cycle + 1 - (vucycle - VU->fdiv.sCycle); //VU->fdiv.Cycle contains the latency minus 1 (6 or 12)
// Console::WriteLn("waiting FDIV pipe %d", params cycle);
VU->fdiv.enable = 0;
vucycle+= cycle;
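The off-by-one being corrected here: fdiv.Cycle stores the pipeline latency minus one (6 for the 7-cycle ops, 12 for the 13-cycle RSQRT, per the comment), so the remaining stall has to add the 1 back. The arithmetic, spelled out with the surrounding variables:

/* Sketch of the fixed stall computation: */
u32 elapsed   = vucycle - VU->fdiv.sCycle;    /* cycles since the op issued  */
u32 remaining = VU->fdiv.Cycle + 1 - elapsed; /* full latency minus elapsed; */
vucycle += remaining;                         /* the old code dropped the +1 */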

View File

@ -354,7 +354,7 @@ void recVUMI_IADD( VURegs *VU, int info )
if( fdreg == fsreg ) ADD32RtoR(fdreg, ftreg);
else if( fdreg == ftreg ) ADD32RtoR(fdreg, fsreg);
else LEA16RRtoR(fdreg, fsreg, ftreg);
else LEA32RRtoR(fdreg, fsreg, ftreg);
MOVZX32R16toR(fdreg, fdreg); // needed since we don't know if fdreg's upper bits are 0
}
}
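The LEA16-to-LEA32 switch works because VU integer registers are only 16 bits wide: the 32-bit LEA adds the two sources without an operand-size prefix, and the MOVZX that follows discards whatever lands in the upper half. In C terms (register names hypothetical):

/* Sketch: 32-bit add, then truncate to the VU's 16-bit integer width. */
u32 wide  = (u32)vi_fs + (u32)vi_ft; /* lea fd, [fs+ft] (32-bit form)     */
u16 vi_fd = (u16)wide;               /* movzx fd, fd_16 clears bits 16-31 */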
@ -609,31 +609,31 @@ void _loadEAX(VURegs *VU, int x86reg, uptr offset, int info)
if( x86reg >= 0 ) {
switch(_X_Y_Z_W) {
case 3: // ZW
SSE_MOVHPS_RmOffset_to_XMM(EEREC_T, x86reg, offset+8);
SSE_MOVHPS_Rm_to_XMM(EEREC_T, x86reg, offset+8);
break;
case 6: // YZ
SSE_SHUFPS_RmOffset_to_XMM(EEREC_T, x86reg, offset, 0x9c);
SSE_SHUFPS_Rm_to_XMM(EEREC_T, x86reg, offset, 0x9c);
SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0x78);
break;
case 8: // X
SSE_MOVSS_RmOffset_to_XMM(EEREC_TEMP, x86reg, offset);
SSE_MOVSS_Rm_to_XMM(EEREC_TEMP, x86reg, offset);
SSE_MOVSS_XMM_to_XMM(EEREC_T, EEREC_TEMP);
break;
case 9: // XW
SSE_SHUFPS_RmOffset_to_XMM(EEREC_T, x86reg, offset, 0xc9);
SSE_SHUFPS_Rm_to_XMM(EEREC_T, x86reg, offset, 0xc9);
SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, 0xd2);
break;
case 12: // XY
SSE_MOVLPS_RmOffset_to_XMM(EEREC_T, x86reg, offset);
SSE_MOVLPS_Rm_to_XMM(EEREC_T, x86reg, offset);
break;
case 15:
if( VU == &VU1 ) SSE_MOVAPSRmtoROffset(EEREC_T, x86reg, offset);
else SSE_MOVUPSRmtoROffset(EEREC_T, x86reg, offset);
if( VU == &VU1 ) SSE_MOVAPSRmtoR(EEREC_T, x86reg, offset);
else SSE_MOVUPSRmtoR(EEREC_T, x86reg, offset);
break;
default:
if( VU == &VU1 ) SSE_MOVAPSRmtoROffset(EEREC_TEMP, x86reg, offset);
else SSE_MOVUPSRmtoROffset(EEREC_TEMP, x86reg, offset);
if( VU == &VU1 ) SSE_MOVAPSRmtoR(EEREC_TEMP, x86reg, offset);
else SSE_MOVUPSRmtoR(EEREC_TEMP, x86reg, offset);
VU_MERGE_REGS(EEREC_T, EEREC_TEMP);
break;
@ -795,15 +795,15 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
if ( _Fs_ == 0 ) {
if ( _XYZW_SS ) {
u32 c = _W ? 0x3f800000 : 0;
if ( x86reg >= 0 ) MOV32ItoRmOffset(x86reg, c, offset+(_W?12:(_Z?8:(_Y?4:0))));
if ( x86reg >= 0 ) MOV32ItoRm(x86reg, c, offset+(_W?12:(_Z?8:(_Y?4:0))));
else MOV32ItoM(offset+(_W?12:(_Z?8:(_Y?4:0))), c);
}
else {
if ( x86reg >= 0 ) {
if ( _X ) MOV32ItoRmOffset(x86reg, 0x00000000, offset);
if ( _Y ) MOV32ItoRmOffset(x86reg, 0x00000000, offset+4);
if ( _Z ) MOV32ItoRmOffset(x86reg, 0x00000000, offset+8);
if ( _W ) MOV32ItoRmOffset(x86reg, 0x3f800000, offset+12);
if ( _X ) MOV32ItoRm(x86reg, 0x00000000, offset);
if ( _Y ) MOV32ItoRm(x86reg, 0x00000000, offset+4);
if ( _Z ) MOV32ItoRm(x86reg, 0x00000000, offset+8);
if ( _W ) MOV32ItoRm(x86reg, 0x3f800000, offset+12);
}
else {
if ( _X ) MOV32ItoM(offset, 0x00000000);
@ -818,29 +818,29 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
switch ( _X_Y_Z_W ) {
case 1: // W
SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x27);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_TEMP, offset+12);
else SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
break;
case 2: // Z
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+8);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_TEMP, offset+8);
else SSE_MOVSS_XMM_to_M32(offset+8, EEREC_TEMP);
break;
case 3: // ZW
if ( x86reg >= 0 ) SSE_MOVHPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+8);
if ( x86reg >= 0 ) SSE_MOVHPS_XMM_to_Rm(x86reg, EEREC_S, offset+8);
else SSE_MOVHPS_XMM_to_M64(offset+8, EEREC_S);
break;
case 4: // Y
SSE2_PSHUFLW_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x4e);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_TEMP, offset+4);
else SSE_MOVSS_XMM_to_M32(offset+4, EEREC_TEMP);
break;
case 5: // YW
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB1);
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
if ( x86reg >= 0 ) {
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_S, offset+4);
SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_TEMP, offset+12);
}
else {
SSE_MOVSS_XMM_to_M32(offset+4, EEREC_S);
@ -850,14 +850,14 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
break;
case 6: // YZ
SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0xc9);
if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4);
if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_Rm(x86reg, EEREC_TEMP, offset+4);
else SSE_MOVLPS_XMM_to_M64(offset+4, EEREC_TEMP);
break;
case 7: // YZW
SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x93); //ZYXW
if ( x86reg >= 0 ) {
SSE_MOVHPS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
SSE_MOVHPS_XMM_to_Rm(x86reg, EEREC_TEMP, offset+4);
SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_TEMP, offset+12);
}
else {
SSE_MOVHPS_XMM_to_M64(offset+4, EEREC_TEMP);
@ -865,26 +865,26 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
}
break;
case 8: // X
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_S, offset);
else SSE_MOVSS_XMM_to_M32(offset, EEREC_S);
break;
case 9: // XW
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_S, offset);
else SSE_MOVSS_XMM_to_M32(offset, EEREC_S);
if ( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSLDUP_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
else SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x55);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_TEMP, offset+12);
else SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
break;
case 10: //XZ
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
if ( x86reg >= 0 ) {
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+8);
SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_S, offset);
SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_TEMP, offset+8);
}
else {
SSE_MOVSS_XMM_to_M32(offset, EEREC_S);
@ -893,8 +893,8 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
break;
case 11: //XZW
if ( x86reg >= 0 ) {
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset);
SSE_MOVHPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+8);
SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_S, offset);
SSE_MOVHPS_XMM_to_Rm(x86reg, EEREC_S, offset+8);
}
else {
SSE_MOVSS_XMM_to_M32(offset, EEREC_S);
@ -902,14 +902,14 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
}
break;
case 12: // XY
if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+0);
if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_Rm(x86reg, EEREC_S, offset+0);
else SSE_MOVLPS_XMM_to_M64(offset, EEREC_S);
break;
case 13: // XYW
SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x4b); //YXZW
if ( x86reg >= 0 ) {
SSE_MOVHPS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+0);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
SSE_MOVHPS_XMM_to_Rm(x86reg, EEREC_TEMP, offset+0);
SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_TEMP, offset+12);
}
else {
SSE_MOVHPS_XMM_to_M64(offset, EEREC_TEMP);
@ -919,8 +919,8 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
case 14: // XYZ
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
if ( x86reg >= 0 ) {
SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+0);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+8);
SSE_MOVLPS_XMM_to_Rm(x86reg, EEREC_S, offset+0);
SSE_MOVSS_XMM_to_Rm(x86reg, EEREC_TEMP, offset+8);
}
else {
SSE_MOVLPS_XMM_to_M64(offset, EEREC_S);
@ -929,11 +929,11 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
break;
case 15: // XYZW
if ( VU == &VU1 ) {
if( x86reg >= 0 ) SSE_MOVAPSRtoRmOffset(x86reg, EEREC_S, offset+0);
if( x86reg >= 0 ) SSE_MOVAPSRtoRm(x86reg, EEREC_S, offset+0);
else SSE_MOVAPS_XMM_to_M128(offset, EEREC_S);
}
else {
if( x86reg >= 0 ) SSE_MOVUPSRtoRmOffset(x86reg, EEREC_S, offset+0);
if( x86reg >= 0 ) SSE_MOVUPSRtoRm(x86reg, EEREC_S, offset+0);
else {
if( offset & 15 ) SSE_MOVUPS_XMM_to_M128(offset, EEREC_S);
else SSE_MOVAPS_XMM_to_M128(offset, EEREC_S);
@ -1018,7 +1018,7 @@ void recVUMI_ILW(VURegs *VU, int info)
}
else {
int fsreg = ALLOCVI(_Fs_, MODE_READ);
MOV32RmtoROffset(ftreg, recVUTransformAddr(fsreg, VU, _Fs_, imm), (uptr)VU->Mem + off);
MOV32RmtoR(ftreg, recVUTransformAddr(fsreg, VU, _Fs_, imm), (uptr)VU->Mem + off);
}
}
//------------------------------------------------------------------
@ -1051,10 +1051,10 @@ void recVUMI_ISW( VURegs *VU, int info )
x86reg = recVUTransformAddr(fsreg, VU, _Fs_, imm);
if (_X) MOV32RtoRmOffset(x86reg, ftreg, (uptr)VU->Mem);
if (_Y) MOV32RtoRmOffset(x86reg, ftreg, (uptr)VU->Mem+4);
if (_Z) MOV32RtoRmOffset(x86reg, ftreg, (uptr)VU->Mem+8);
if (_W) MOV32RtoRmOffset(x86reg, ftreg, (uptr)VU->Mem+12);
if (_X) MOV32RtoRm(x86reg, ftreg, (uptr)VU->Mem);
if (_Y) MOV32RtoRm(x86reg, ftreg, (uptr)VU->Mem+4);
if (_Z) MOV32RtoRm(x86reg, ftreg, (uptr)VU->Mem+8);
if (_W) MOV32RtoRm(x86reg, ftreg, (uptr)VU->Mem+12);
}
}
//------------------------------------------------------------------
@ -1082,7 +1082,7 @@ void recVUMI_ILWR( VURegs *VU, int info )
}
else {
int fsreg = ALLOCVI(_Fs_, MODE_READ);
MOVZX32Rm16toROffset(ftreg, recVUTransformAddr(fsreg, VU, _Fs_, 0), (uptr)VU->Mem + off);
MOVZX32Rm16toR(ftreg, recVUTransformAddr(fsreg, VU, _Fs_, 0), (uptr)VU->Mem + off);
}
}
//------------------------------------------------------------------
@ -1109,10 +1109,10 @@ void recVUMI_ISWR( VURegs *VU, int info )
int fsreg = ALLOCVI(_Fs_, MODE_READ);
x86reg = recVUTransformAddr(fsreg, VU, _Fs_, 0);
if (_X) MOV32RtoRmOffset(x86reg, ftreg, (uptr)VU->Mem);
if (_Y) MOV32RtoRmOffset(x86reg, ftreg, (uptr)VU->Mem+4);
if (_Z) MOV32RtoRmOffset(x86reg, ftreg, (uptr)VU->Mem+8);
if (_W) MOV32RtoRmOffset(x86reg, ftreg, (uptr)VU->Mem+12);
if (_X) MOV32RtoRm(x86reg, ftreg, (uptr)VU->Mem);
if (_Y) MOV32RtoRm(x86reg, ftreg, (uptr)VU->Mem+4);
if (_Z) MOV32RtoRm(x86reg, ftreg, (uptr)VU->Mem+8);
if (_W) MOV32RtoRm(x86reg, ftreg, (uptr)VU->Mem+12);
}
}
//------------------------------------------------------------------

View File

@ -58,7 +58,7 @@ extern void iDumpVU1Registers();
#define SUPERVU_PROPAGATEFLAGS // the correct behavior of the VUs; for some reason Superman breaks gfx with it on...
#ifndef _DEBUG
#define SUPERVU_INTERCACHING // registers won't be flushed at block boundaries (faster)
//#define SUPERVU_INTERCACHING // registers won't be flushed at block boundaries (faster) (nothing noticeable speed-wise; causes SPS in Ratchet and Clank (Nneeve))
#endif
#define SUPERVU_CHECKCONDITION 0 // has to be 0!!
@ -833,7 +833,7 @@ static VuFunctionHeader* SuperVURecompileProgram(u32 startpc, int vuindex)
SuperVURecompile();
s_recVUPtr = x86Ptr[0];
s_recVUPtr = x86Ptr;
// set the function's range
VuFunctionHeader::RANGE r;
@ -1889,7 +1889,7 @@ void VuBaseBlock::AssignVFRegs()
if( i == XMMREGS ) return; // nothing changed
}
u8* oldX86 = x86Ptr[0];
u8* oldX86 = x86Ptr;
FORIT(itinst, insts) {
@ -2060,7 +2060,7 @@ void VuBaseBlock::AssignVFRegs()
_freeXMMreg(free1);
_freeXMMreg(free2);
}
else if( regs->VIwrite & (1<<REG_P) || regs->VIwrite & (1<<REG_Q)) {
else if( regs->VIwrite & (1<<REG_P) || regs->VIwrite & (1<<REG_Q) || regs->VIread & (1<<REG_VF0_FLAG)) {
free1 = _allocTempXMMreg(XMMT_FPS, -1);
// protects against insts like esadd vf0 and sqrt vf0
if( free0 == -1 )
@ -2078,7 +2078,7 @@ void VuBaseBlock::AssignVFRegs()
}
}
assert( x86Ptr[0] == oldX86 );
assert( x86Ptr == oldX86 );
u32 analyzechildren = !(type&BLOCKTYPE_ANALYZED);
type |= BLOCKTYPE_ANALYZED;
@ -2302,10 +2302,11 @@ void SuperVUCleanupProgram(u32 startpc, int vuindex)
//memset(recVUStack, 0, SUPERVU_STACKSIZE * 4);
// Clear allocation info to prevent bad data being used in other parts of pcsx2; doing this just in case (cottonvibes)
_initXMMregs();
_initMMXregs();
_initX86regs();
// Could clear allocation info to prevent possibly bad data being used in other parts of pcsx2;
// not doing this because it's slow and not needed (rama)
// _initXMMregs();
// _initMMXregs();
// _initX86regs();
}
#if defined(_MSC_VER)
@ -2466,7 +2467,7 @@ static void SuperVURecompile()
AND32ItoM( (uptr)&VU->vifRegs->stat, ~0x4 );
MOV32ItoM((uptr)&VU->VI[REG_TPC], pchild->endpc);
JMP32( (uptr)SuperVUEndProgram - ( (uptr)x86Ptr[0] + 5 ));
JMP32( (uptr)SuperVUEndProgram - ( (uptr)x86Ptr + 5 ));
}
// only other case is when there are two branches
else assert( (*itblock)->insts.back().regs[0].pipe == VUPIPE_BRANCH );
@ -2606,11 +2607,11 @@ void SuperVUTestVU0Condition(u32 incstack)
ADD32ItoR(ESP, incstack);
//CALLFunc((u32)timeout);
JMP32( (uptr)SuperVUEndProgram - ( (uptr)x86Ptr[0] + 5 ));
JMP32( (uptr)SuperVUEndProgram - ( (uptr)x86Ptr + 5 ));
x86SetJ8(ptr);
}
else JAE32( (uptr)SuperVUEndProgram - ( (uptr)x86Ptr[0] + 6 ) );
else JAE32( (uptr)SuperVUEndProgram - ( (uptr)x86Ptr + 6 ) );
}
void VuBaseBlock::Recompile()
@ -2618,7 +2619,7 @@ void VuBaseBlock::Recompile()
if( type & BLOCKTYPE_ANALYZED ) return;
x86Align(16);
pcode = x86Ptr[0];
pcode = x86Ptr;
#ifdef _DEBUG
MOV32ItoM((uptr)&s_vufnheader, s_pFnHeader->startpc);
@ -2726,7 +2727,7 @@ void VuBaseBlock::Recompile()
AND32ItoM( (uptr)&VU0.VI[ REG_VPU_STAT ].UL, s_vu?~0x100:~0x001 ); // E flag
AND32ItoM( (uptr)&VU->vifRegs->stat, ~0x4 );
if( !branch ) MOV32ItoM((uptr)&VU->VI[REG_TPC], endpc);
JMP32( (uptr)SuperVUEndProgram - ( (uptr)x86Ptr[0] + 5 ));
JMP32( (uptr)SuperVUEndProgram - ( (uptr)x86Ptr + 5 ));
}
else {
@ -2868,7 +2869,7 @@ void VuBaseBlock::Recompile()
}
}
pendcode = x86Ptr[0];
pendcode = x86Ptr;
type |= BLOCKTYPE_ANALYZED;
LISTBLOCKS::iterator itchild;
@ -3569,7 +3570,7 @@ void recVUMI_BranchHandle()
if( (s_pCurBlock->type & BLOCKTYPE_HASEOP) || s_vu == 0 || SUPERVU_CHECKCONDITION)
MOV32ItoM(SuperVUGetVIAddr(REG_TPC, 0), bpc);
MOV32ItoR(s_JumpX86, 0);
s_pCurBlock->pChildJumps[curjump] = (u32*)x86Ptr[0]-1;
s_pCurBlock->pChildJumps[curjump] = (u32*)x86Ptr-1;
if( !(s_pCurInst->type & INST_BRANCH_DELAY) ) {
j8Ptr[1] = JMP8(0);
@ -3578,7 +3579,7 @@ void recVUMI_BranchHandle()
if( (s_pCurBlock->type & BLOCKTYPE_HASEOP) || s_vu == 0 || SUPERVU_CHECKCONDITION )
MOV32ItoM(SuperVUGetVIAddr(REG_TPC, 0), pc+8);
MOV32ItoR(s_JumpX86, 0);
s_pCurBlock->pChildJumps[curjump+1] = (u32*)x86Ptr[0]-1;
s_pCurBlock->pChildJumps[curjump+1] = (u32*)x86Ptr-1;
x86SetJ8( j8Ptr[ 1 ] );
}
@ -3815,7 +3816,7 @@ void recVUMI_B( VURegs* vuu, s32 info )
if( s_pCurBlock->blocks.size() > 1 ) {
s_JumpX86 = _allocX86reg(-1, X86TYPE_VUJUMP, 0, MODE_WRITE);
MOV32ItoR(s_JumpX86, 0);
s_pCurBlock->pChildJumps[(s_pCurInst->type & INST_BRANCH_DELAY)?1:0] = (u32*)x86Ptr[0]-1;
s_pCurBlock->pChildJumps[(s_pCurInst->type & INST_BRANCH_DELAY)?1:0] = (u32*)x86Ptr-1;
s_UnconditionalDelay = 1;
}
@ -3841,7 +3842,7 @@ void recVUMI_BAL( VURegs* vuu, s32 info )
if( s_pCurBlock->blocks.size() > 1 ) {
s_JumpX86 = _allocX86reg(-1, X86TYPE_VUJUMP, 0, MODE_WRITE);
MOV32ItoR(s_JumpX86, 0);
s_pCurBlock->pChildJumps[(s_pCurInst->type & INST_BRANCH_DELAY)?1:0] = (u32*)x86Ptr[0]-1;
s_pCurBlock->pChildJumps[(s_pCurInst->type & INST_BRANCH_DELAY)?1:0] = (u32*)x86Ptr-1;
s_UnconditionalDelay = 1;
}

View File

@ -5,9 +5,9 @@
.xmm
extern _vifRegs:ptr
extern _vifMaskRegs:ptr
extern _vifRow:ptr
extern vifRegs:ptr
extern vifMaskRegs:ptr
extern vifRow:ptr
extern s_TempDecompress:ptr
@ -104,7 +104,7 @@ UNPACK_Regular_SSE_2 macro r0
UNPACK_Setup_Mask_SSE macro CL
mov eax, [_vifMaskRegs]
mov eax, [vifMaskRegs]
movdqa xmm4, [eax + 64*(CL) + 16]
movdqa xmm5, [eax + 64*(CL) + 32]
movdqa xmm3, [eax + 64*(CL)]
@ -118,7 +118,7 @@ UNPACK_Start_Setup_Mask_SSE_0 macro CL
endm
UNPACK_Start_Setup_Mask_SSE_1 macro CL
mov eax, [_vifMaskRegs]
mov eax, [vifMaskRegs]
movdqa xmm4, [eax + 64*(CL) + 16]
movdqa xmm5, [eax + 64*(CL) + 32]
pand xmm4, xmm6
@ -132,14 +132,14 @@ UNPACK_Start_Setup_Mask_SSE_2 macro CL
UNPACK_Setup_Mask_SSE_0_1 macro CL
endm
UNPACK_Setup_Mask_SSE_1_1 macro CL
mov eax, [_vifMaskRegs]
mov eax, [vifMaskRegs]
movdqa xmm3, [eax + 64*(0)]
endm
UNPACK_Setup_Mask_SSE_2_1 macro CL
mov eax, [_vifMaskRegs]
mov eax, [vifMaskRegs]
movdqa xmm4, [eax + 64*(0) + 16]
movdqa xmm5, [eax + 64*(0) + 32]
movdqa xmm3, [eax + 64*(0)]
@ -1521,9 +1521,9 @@ UNPACK_V4_5SSE_1A macro CL, TOTALCL, MaskType, ModeType
SAVE_ROW_REG_BASE macro
mov eax, [_vifRow]
mov eax, [vifRow]
movdqa [eax], xmm6
mov eax, [_vifRegs]
mov eax, [vifRegs]
movss dword ptr [eax+0100h], xmm6
psrldq xmm6, 4
movss dword ptr [eax+0110h], xmm6
@ -1557,7 +1557,7 @@ defUNPACK_SkippingWrite macro name, MaskType, ModeType, qsize, sign, SAVE_ROW_RE
push ebx
INIT_ARGS
mov eax, [_vifRegs]
mov eax, [vifRegs]
movzx ecx, byte ptr [eax + 040h]
movzx ebx, byte ptr [eax + 041h]
sub ecx, ebx

View File

@ -17,7 +17,7 @@
*/
#include "PrecompiledHeader.h"
#include "Misc.h"
#include "System.h"
#include "iR5900.h"
#include "Vif.h"
#include "VU.h"
@ -161,7 +161,7 @@ void _flushConstRegs()
zero_cnt++;
}
rewindPtr = x86Ptr[_EmitterId_];
rewindPtr = x86Ptr;
for (i = 1, j = 0; i < 32; j++ && ++i, j %= 2) {
if (!GPR_IS_CONST1(i) || g_cpuFlushedConstReg & (1<<i))
@ -178,7 +178,7 @@ void _flushConstRegs()
}
if (minusone_cnt == 1 && !zero_cnt) { // not worth it for one byte
x86Ptr[_EmitterId_] = rewindPtr;
x86SetPtr( rewindPtr );
} else {
done[0] |= done[2];
done[1] |= done[3];
@ -1050,12 +1050,12 @@ void _recMove128MtoM(u32 to, u32 from)
// fixme - see above function!
void _recMove128RmOffsettoM(u32 to, u32 offset)
{
MOV32RmtoROffset(EAX, ECX, offset);
MOV32RmtoROffset(EDX, ECX, offset+4);
MOV32RmtoR(EAX, ECX, offset);
MOV32RmtoR(EDX, ECX, offset+4);
MOV32RtoM(to, EAX);
MOV32RtoM(to+4, EDX);
MOV32RmtoROffset(EAX, ECX, offset+8);
MOV32RmtoROffset(EDX, ECX, offset+12);
MOV32RmtoR(EAX, ECX, offset+8);
MOV32RmtoR(EDX, ECX, offset+12);
MOV32RtoM(to+8, EAX);
MOV32RtoM(to+12, EDX);
}
@ -1065,12 +1065,12 @@ void _recMove128MtoRmOffset(u32 offset, u32 from)
{
MOV32MtoR(EAX, from);
MOV32MtoR(EDX, from+4);
MOV32RtoRmOffset(ECX, EAX, offset);
MOV32RtoRmOffset(ECX, EDX, offset+4);
MOV32RtoRm(ECX, EAX, offset);
MOV32RtoRm(ECX, EDX, offset+4);
MOV32MtoR(EAX, from+8);
MOV32MtoR(EDX, from+12);
MOV32RtoRmOffset(ECX, EAX, offset+8);
MOV32RtoRmOffset(ECX, EDX, offset+12);
MOV32RtoRm(ECX, EAX, offset+8);
MOV32RtoRm(ECX, EDX, offset+12);
}
static PCSX2_ALIGNED16(u32 s_ones[2]) = {0xffffffff, 0xffffffff};

View File

@ -30,6 +30,9 @@
#include "iR5900Jump.h"
#include "iR5900LoadStore.h"
#include "iR5900Move.h"
#include "BaseblockEx.h"
#include "iMMI.h"
#include "iFPU.h"
#include "iCOP0.h"
@ -73,7 +76,7 @@ u32 g_cpuHasConstReg = 0, g_cpuFlushedConstReg = 0;
static const int RECSTACK_SIZE = 0x00010000;
static const int EE_NUMBLOCKS = (1<<15);
static u8 *recMem = NULL; // the recompiled blocks will be here
u8 *recMem = NULL; // the recompiled blocks will be here
static u8* recStack = NULL; // stack mem
static BASEBLOCK *recRAM = NULL; // and the ptr to the blocks here
static BASEBLOCK *recROM = NULL; // and here
@ -128,11 +131,14 @@ static void iDumpBlock( int startpc, u8 * ptr )
Console::Status( "dump1 %x:%x, %x", params startpc, pc, cpuRegs.cycle );
Path::CreateDirectory( "dumps" );
#ifndef __LINUX__
ssprintf( filename, "dumps\\R5900dump%.8X.txt", startpc );
#else
ssprintf( filename, "dumps/R5900dump%.8X.txt", startpc );
#endif
fflush( stdout );
// f = fopen( "dump1", "wb" );
// fwrite( ptr, 1, (u32)x86Ptr[0] - (u32)ptr, f );
// fwrite( ptr, 1, (u32)x86Ptr - (u32)ptr, f );
// fclose( f );
//
// sprintf( command, "objdump -D --target=binary --architecture=i386 dump1 > %s", filename );
@ -367,7 +373,7 @@ void _eeMoveGPRtoM(u32 to, int fromgpr)
void _eeMoveGPRtoRm(x86IntRegType to, int fromgpr)
{
if( GPR_IS_CONST1(fromgpr) )
MOV32ItoRmOffset( to, g_cpuConstRegs[fromgpr].UL[0], 0 );
MOV32ItoRm( to, g_cpuConstRegs[fromgpr].UL[0] );
else {
int mmreg;
@ -380,7 +386,7 @@ void _eeMoveGPRtoRm(x86IntRegType to, int fromgpr)
}
else {
MOV32MtoR(EAX, (int)&cpuRegs.GPR.r[ fromgpr ].UL[ 0 ] );
MOV32RtoRm(to, EAX );
MOV32RtoRm( to, EAX );
}
}
}
@ -579,8 +585,8 @@ void recResetEE( void )
// so a fix will have to wait until later. -_- (air)
//x86SetPtr(recMem+REC_CACHEMEM);
//dyna_block_discard_recmem=(u8*)x86Ptr[0];
//JMP32( (uptr)&dyna_block_discard - ( (u32)x86Ptr[0] + 5 ));
//dyna_block_discard_recmem=(u8*)x86Ptr;
//JMP32( (uptr)&dyna_block_discard - ( (u32)x86Ptr + 5 ));
x86SetPtr(recMem);
@ -677,7 +683,7 @@ static void __naked DispatcherReg()
}
}
__forceinline void recExecute()
void recExecute()
{
// Optimization note : Compared pushad against manually pushing the regs one-by-one.
// Manually pushing is faster, especially on Core2's and such. :)
@ -791,7 +797,7 @@ void recSYSCALL( void ) {
CMP32ItoM((uptr)&cpuRegs.pc, pc);
j8Ptr[0] = JE8(0);
ADD32ItoM((uptr)&cpuRegs.cycle, eeScaleBlockCycles());
JMP32((uptr)DispatcherReg - ( (uptr)x86Ptr[0] + 5 ));
JMP32((uptr)DispatcherReg - ( (uptr)x86Ptr + 5 ));
x86SetJ8(j8Ptr[0]);
//branch = 2;
}
@ -1148,7 +1154,7 @@ static void iBranchTest(u32 newpc, bool noDispatch)
if (!noDispatch) {
if (newpc == 0xffffffff)
JS32((uptr)DispatcherReg - ( (uptr)x86Ptr[0] + 6 ));
JS32((uptr)DispatcherReg - ( (uptr)x86Ptr + 6 ));
else
iBranch(newpc, 1);
}
@ -1379,7 +1385,7 @@ void recRecompile( const u32 startpc )
x86SetPtr( recPtr );
x86Align(16);
recPtr = x86Ptr[_EmitterId_];
recPtr = x86Ptr;
s_pCurBlock = PC_GETBLOCK(startpc);
@ -1732,8 +1738,11 @@ StartRecomp:
if (bit==31)
{
vtlb_alloc_bits[writen_start]&=~mask;
if ((u8)mask==mask)
TEST8ItoM((uptr)&vtlb_alloc_bits[writen_start],mask);
else
TEST32ItoM((uptr)&vtlb_alloc_bits[writen_start],mask);
JNZ32(((u32)&dyna_block_discard)- ( (u32)x86Ptr[0] + 6 ));
JNZ32(((u32)&dyna_block_discard)- ( (u32)x86Ptr + 6 ));
SysPrintf("%08X %d %d\n",mask,pgsz,pgsz>>4);
mask=0;
}
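The (u8)mask==mask guard picks the narrowest TEST encoding: it is true exactly when every set bit of the dirty-page mask fits in the low byte, in which case the imm8 form saves three immediate bytes. A standalone sketch of the width check, using the project's u8/u32 typedefs:

/* Sketch: true exactly when mask < 0x100, i.e. the immediate fits the
   TEST r/m8, imm8 encoding (imm8 instead of imm32).                   */
static inline int fits_imm8(u32 mask)
{
    return (u8)mask == mask; /* 0x000000F0 -> yes, 0x0000F000 -> no */
}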
@ -1755,8 +1764,11 @@ StartRecomp:
if (mask)
{
vtlb_alloc_bits[writen_start]&=~mask;
if ((u8)mask==mask)
TEST8ItoM((uptr)&vtlb_alloc_bits[writen_start],mask);
else
TEST32ItoM((uptr)&vtlb_alloc_bits[writen_start],mask);
JNZ32(((u32)&dyna_block_discard)- ( (u32)x86Ptr[0] + 6 ));
JNZ32(((u32)&dyna_block_discard)- ( (u32)x86Ptr + 6 ));
SysPrintf("%08X %d %d\n",mask,pgsz,pgsz>>4);
mask=0;
}
@ -1768,14 +1780,14 @@ StartRecomp:
{
// was dyna_block_discard_recmem. See note in recResetEE for details.
CMP32ItoM((uptr)PSM(lpc),*(u32*)PSM(lpc));
JNE32(((u32)&dyna_block_discard)- ( (u32)x86Ptr[0] + 6 ));
JNE32(((u32)&dyna_block_discard)- ( (u32)x86Ptr + 6 ));
stg-=4;
lpc+=4;
}
*/
DbgCon::WriteLn("Manual block @ %08X : %08X %d %d %d %d", params
startpc,inpage_ptr,pgsz,0x1000-inpage_offs,inpage_sz,sz*4);
//DbgCon::WriteLn("Manual block @ %08X : %08X %d %d %d %d", params
// startpc,inpage_ptr,pgsz,0x1000-inpage_offs,inpage_sz,sz*4);
}
}
inpage_ptr+=pgsz;
@ -1855,14 +1867,14 @@ StartRecomp:
}
}
assert( x86Ptr[0] < recMem+REC_CACHEMEM );
assert( x86Ptr < recMem+REC_CACHEMEM );
assert( recStackPtr < recStack+RECSTACK_SIZE );
assert( x86FpuState == 0 );
assert(x86Ptr[_EmitterId_] - recPtr < 0x10000);
s_pCurBlockEx->x86size = x86Ptr[_EmitterId_] - recPtr;
assert(x86Ptr - recPtr < 0x10000);
s_pCurBlockEx->x86size = x86Ptr - recPtr;
recPtr = x86Ptr[0];
recPtr = x86Ptr;
assert( (g_cpuHasConstReg&g_cpuFlushedConstReg) == g_cpuHasConstReg );

View File

@ -1930,7 +1930,7 @@ void recLQC2( void )
dohw = recSetMemLocation(_Rs_, _Imm_, mmregs, 2, 0);
if( _Ft_ ) {
u8* rawreadptr = x86Ptr[0];
u8* rawreadptr = x86Ptr;
if( mmreg >= 0 ) {
SSEX_MOVDQARmtoROffset(mmreg, ECX, PS2MEM_BASE_+s_nAddMemOffset);
@ -1945,7 +1945,7 @@ void recLQC2( void )
// check if writing to VUs
CMP32ItoR(ECX, 0x11000000);
JAE8(rawreadptr - (x86Ptr[0]+2));
JAE8(rawreadptr - (x86Ptr+2));
PUSH32I( (int)&VU0.VF[_Ft_].UD[0] );
CALLFunc( (int)recMemRead128 );
@ -1999,7 +1999,7 @@ void recSQC2( void )
mmregs = _eePrepareReg(_Rs_);
dohw = recSetMemLocation(_Rs_, _Imm_, mmregs, 2, 0);
rawreadptr = x86Ptr[0];
rawreadptr = x86Ptr;
if( (mmreg = _checkXMMreg(XMMTYPE_VFREG, _Ft_, MODE_READ)) >= 0) {
SSEX_MOVDQARtoRmOffset(ECX, mmreg, PS2MEM_BASE_+s_nAddMemOffset);
@ -2039,7 +2039,7 @@ void recSQC2( void )
// check if writing to VUs
CMP32ItoR(ECX, 0x11000000);
JAE8(rawreadptr - (x86Ptr[0]+2));
JAE8(rawreadptr - (x86Ptr+2));
// some type of hardware write
if( (mmreg = _checkXMMreg(XMMTYPE_VFREG, _Ft_, MODE_READ)) >= 0) {
@ -2101,7 +2101,7 @@ void recLoad64( u32 bits, bool sign )
if ( _Imm_ != 0 )
ADD32ItoR( ECX, _Imm_ );
if( bits == 128 ) // force 16 byte alignment on 128 bit reads
AND32I8toR(ECX,0xF0);
AND32ItoR(ECX,~0x0F); // emitter automatically encodes this as an 8-bit sign-extended imm8
_eeOnLoadWrite(_Rt_);
EEINST_RESETSIGNEXT(_Rt_); // remove the sign extension
@ -2198,7 +2198,7 @@ void recStore(u32 sz, bool edxAlreadyAssigned=false)
if ( _Imm_ != 0 )
ADD32ItoR(ECX, _Imm_);
if (sz==128)
AND32I8toR(ECX,0xF0);
AND32ItoR(ECX,~0x0F);
vtlb_DynGenWrite(sz);
}
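Both the 128-bit load and store paths force 16-byte alignment by clearing the low four address bits; ~0x0F is 0xFFFFFFF0, which (as the comment notes) still encodes as a sign-extended imm8, so the new spelling costs nothing over the old AND32I8toR. A quick sketch:

/* ~0x0F == 0xFFFFFFF0: rounds the address down to a 16-byte boundary.
   Encodes as AND r/m32, imm8 (83 /4 F0).                              */
u32 aligned = addr & ~0x0Fu; /* e.g. 0x00123457 -> 0x00123450 */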

View File

@ -23,6 +23,7 @@
#include "iCore.h"
#include "iR5900.h"
#include "x86\ix86\ix86_internal.h"
u8* code_pos=0;
u8* code_start=0;
@ -63,7 +64,7 @@ void execuCode(bool set)
SysPrintf("Leaking 2 megabytes of ram\n");
code_start=code_pos=(u8*)VirtualAlloc(0,2*1024*1024,MEM_COMMIT,PAGE_EXECUTE_READWRITE);
code_sz+=2*1024*1024;
int i=0;
u32 i=0;
while(i<code_sz)
{
//UD2 is 0x0F 0x0B. Fill the stream with it so that the CPU doesn't try to execute past branches...
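Padding the fresh buffer this way means any stray jump into not-yet-emitted space raises #UD immediately instead of decoding leftover bytes. A sketch of the fill loop (using the buffer allocated just above):

/* Fill the whole buffer with UD2 (0F 0B) so bad control flow faults fast. */
for (u32 i = 0; i + 1 < code_sz; i += 2)
{
    code_start[i]     = 0x0F;
    code_start[i + 1] = 0x0B;
}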
@ -87,11 +88,11 @@ void execuCode(bool set)
u8* IndirectPlaceholderA()
{
//Add32 <eax>,imm, 6 bytes form.
write8<_EmitterId_>( 0x81 );
ModRM<_EmitterId_>( 3, 0, EAX );
write8( 0x81 );
ModRM( 3, 0, EAX );
u8* rv=x86SetPtr(0);
write32<_EmitterId_>(0);
write32(0);
return rv;
}
@ -106,10 +107,10 @@ void IndirectPlaceholderB(u8* pl,bool read,u32 sz,bool sx)
u8* old=x86SetPtr(pl);
inf.skip=old-pl-4;
//Add32 <eax>,imm, 6 bytes form, patch the imm value
write32<_EmitterId_>( inf.full );
write32( inf.full );
x86SetPtr(old);
}
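Together these two routines implement late-bound immediates: IndirectPlaceholderA emits the 6-byte add eax, imm32 form and returns a pointer to its 4 immediate bytes, and IndirectPlaceholderB later rewinds the emitter to that spot, writes the now-known value, and restores the cursor. A generic sketch of the pattern (helper names hypothetical; this assumes, as the code above suggests, that x86SetPtr returns the previous cursor and that calling it with 0 merely queries the current one):

static u8* reserve_imm32(void)               /* cf. IndirectPlaceholderA */
{
    write8( 0x81 );                          /* add r/m32, imm32 (6-byte form) */
    ModRM( 3, 0, EAX );                      /*   /0 = ADD, rm = eax           */
    u8* slot = x86SetPtr(0);                 /* remember where the imm lives   */
    write32( 0 );                            /* placeholder, patched later     */
    return slot;
}

static void patch_imm32(u8* slot, u32 value) /* cf. IndirectPlaceholderB */
{
    u8* saved = x86SetPtr(slot);             /* rewind to the reserved slot */
    write32( value );                        /* fill in the real immediate  */
    x86SetPtr(saved);                        /* restore the emit cursor     */
}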
PCSX2_ALIGNED16( static u64 g_globalXMMData[2*XMMREGS] );
PCSX2_ALIGNED16( extern u64 g_globalXMMData[2*XMMREGS] );
void MOVx_SSE( x86IntRegType destRm, x86IntRegType srcRm,u32 srcAddr=0,u32 dstAddr=0,bool half=false )
{
int reg;
@ -130,24 +131,24 @@ void MOVx_SSE( x86IntRegType destRm, x86IntRegType srcRm,u32 srcAddr=0,u32 dstAd
if (srcAddr)
SSE_MOVLPS_M64_to_XMM(reg,srcAddr);
else
SSE_MOVLPS_RmOffset_to_XMM(reg,srcRm,0);
SSE_MOVLPS_Rm_to_XMM(reg,srcRm);
if (dstAddr)
SSE_MOVLPS_XMM_to_M64(dstAddr,reg);
else
SSE_MOVLPS_XMM_to_RmOffset(destRm,reg,0);
SSE_MOVLPS_XMM_to_Rm(destRm,reg);
}
else
{
if (srcAddr)
SSE2_MOVDQA_M128_to_XMM(reg,srcAddr);
else
SSE2_MOVDQARmtoROffset(reg,srcRm,0);
SSE2_MOVDQARmtoR(reg,srcRm);
if (dstAddr)
SSE2_MOVDQA_XMM_to_M128(dstAddr,reg);
else
SSE2_MOVDQARtoRmOffset(destRm,reg,0);
SSE2_MOVDQARtoRm(destRm,reg);
}
@ -167,12 +168,12 @@ void MOV64_MMX( x86IntRegType destRm, x86IntRegType srcRm,u32 srcAddr=0,u32 dstA
if (srcAddr)
MOVQMtoR(freereg,srcAddr);
else
MOVQRmtoROffset(freereg,srcRm,0);
MOVQRmtoR(freereg,srcRm);
if (dstAddr)
MOVQRtoM(dstAddr,freereg);
else
MOVQRtoRmOffset(destRm,freereg,0);
MOVQRtoRm(destRm,freereg);
_freeMMXreg(freereg);
}
@ -482,7 +483,6 @@ static void _vtlb_DynGen_DirectWrite( u32 bits )
bits_base-=(alloc_base>>4)/8;//in bytes
BTS32MtoR(bits_base,ECX);
// BTS_wtf(asdasd,ECX);
}
static void _vtlb_DynGen_IndirectWrite( u32 bits )
@ -614,8 +614,7 @@ uptr _vtlb_HandleRewrite(u32 info,u8* ra)
u32 skip=GenIndirectMemOp(info);
JMP32(ra-x86Ptr[_EmitterId_]-5+skip);
JMP32(ra-x86Ptr-5+skip);
execuCode(false);
return rv;

View File

@ -1,4 +1,5 @@
INCLUDES = -I@srcdir@/.. -I@srcdir@/../../ -I@srcdir@/../../../common/include -I@srcdir@/../../../3rdparty
noinst_LIBRARIES = libix86.a
libix86_a_SOURCES = ix86.cpp ix86.inl ix86_3dnow.inl ix86.h ix86_fpu.inl ix86_mmx.inl ix86_sse.inl ix86_tools.cpp ix86_cpudetect.cpp ix86_macros.h
libix86_a_SOURCES = ix86_mmx.cpp ix86_tools.cpp ix86.cpp ix86_3dnow.cpp ix86_fpu.cpp ix86_legacy.cpp ix86_sse.cpp ix86_cpudetect.cpp ix86_group1.cpp \
ix86_internal.h ix86.h ix86_macros.h ix86_sse_helpers.h ix86_types.h

View File

@ -27,15 +27,465 @@
#include "PrecompiledHeader.h"
#include "System.h"
#include "ix86.h"
#include "ix86_internal.h"
u8 *x86Ptr[EmitterId_Count];
u8 *j8Ptr[32];
u32 *j32Ptr[32];
__threadlocal u8 *x86Ptr;
__threadlocal u8 *j8Ptr[32];
__threadlocal u32 *j32Ptr[32];
PCSX2_ALIGNED16(u32 p[4]);
PCSX2_ALIGNED16(u32 p2[4]);
PCSX2_ALIGNED16(float f[4]);
XMMSSEType g_xmmtypes[XMMREGS] = { XMMT_INT };
namespace x86Emitter {
const x86IndexerType ptr;
//////////////////////////////////////////////////////////////////////////////////////////
//
const x86Register32 x86Register32::Empty( -1 );
const x86Register32 eax( 0 );
const x86Register32 ebx( 3 );
const x86Register32 ecx( 1 );
const x86Register32 edx( 2 );
const x86Register32 esi( 6 );
const x86Register32 edi( 7 );
const x86Register32 ebp( 5 );
const x86Register32 esp( 4 );
const x86Register16 ax( 0 );
const x86Register16 bx( 3 );
const x86Register16 cx( 1 );
const x86Register16 dx( 2 );
const x86Register16 si( 6 );
const x86Register16 di( 7 );
const x86Register16 bp( 5 );
const x86Register16 sp( 4 );
const x86Register8 al( 0 );
const x86Register8 cl( 1 );
const x86Register8 dl( 2 );
const x86Register8 bl( 3 );
const x86Register8 ah( 4 );
const x86Register8 ch( 5 );
const x86Register8 dh( 6 );
const x86Register8 bh( 7 );
//////////////////////////////////////////////////////////////////////////////////////////
// x86Register Method Implementations
//
x86ModRm x86Register32::operator+( const x86Register32& right ) const
{
return x86ModRm( *this, right );
}
x86ModRm x86Register32::operator+( const x86ModRm& right ) const
{
return right + *this;
}
x86ModRm x86Register32::operator+( s32 right ) const
{
return x86ModRm( *this, right );
}
x86ModRm x86Register32::operator*( u32 right ) const
{
return x86ModRm( Empty, *this, right );
}
//////////////////////////////////////////////////////////////////////////////////////////
// x86ModRm Method Implementations
//
x86ModRm& x86ModRm::Add( const x86IndexReg& src )
{
if( src == Index )
{
Factor++;
}
else if( src == Base )
{
// Compound the existing register reference into the Index/Scale pair.
Base = x86IndexReg::Empty;
if( src == Index )
Factor++;
else
{
jASSUME( Index.IsEmpty() ); // or die if we already have an index!
Index = src;
Factor = 2;
}
}
else if( Base.IsEmpty() )
Base = src;
else if( Index.IsEmpty() )
Index = src;
else
assert( false ); // oops, only 2 regs allowed per ModRm!
return *this;
}
x86ModRm& x86ModRm::Add( const x86ModRm& src )
{
Add( src.Base );
Add( src.Displacement );
// If the factor is 1, we can just treat index like a base register also.
if( src.Factor == 1 )
{
Add( src.Index );
}
else if( Index.IsEmpty() )
{
Index = src.Index;
Factor = 1;
}
else if( Index == src.Index )
Factor++;
else
assert( false ); // oops, only 2 regs allowed!
return *this;
}
//////////////////////////////////////////////////////////////////////////////////////////
// ModSib Method Implementations
//
// ------------------------------------------------------------------------
// Generates a 'reduced' ModSib form, which has valid Base, Index, and Scale values.
// Necessary because by default ModSib compounds registers into Index when possible.
//
void ModSib::Reduce()
{
// If no index reg, then load the base register into the index slot.
if( Index.IsEmpty() )
{
Index = Base;
Scale = 0;
Base = x86IndexReg::Empty;
return;
}
// The Scale has a series of valid forms, all shown here:
switch( Scale )
{
case 0: break;
case 1: Scale = 0; break;
case 2: Scale = 1; break;
case 3: // becomes [reg*2+reg]
jASSUME( Base.IsEmpty() );
Base = Index;
Scale = 1;
break;
case 4: Scale = 2; break;
case 5: // becomes [reg*4+reg]
jASSUME( Base.IsEmpty() );
Base = Index;
Scale = 2;
break;
case 6: // invalid!
assert( false );
break;
case 7: // so invalid!
assert( false );
break;
case 8: Scale = 3; break;
case 9: // becomes [reg*8+reg]
jASSUME( Base.IsEmpty() );
Base = Index;
Scale = 3;
break;
}
}
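Reduce() normalizes user-written scale factors into the two-bit SIB field: powers of two collapse to their log2, while 3, 5, and 9 become reg + reg*2/4/8 by moving the index into the (asserted-empty) base. Worked examples:

/* What Reduce() produces for a few inputs (hypothetical operands):
     [ebx]    -> Base=empty, Index=ebx, Scale=0  (base moved to index slot)
     [eax*2]  -> Base=empty, Index=eax, Scale=1
     [eax*5]  -> Base=eax,   Index=eax, Scale=2  (reg + reg*4)
     [eax*9]  -> Base=eax,   Index=eax, Scale=3  (reg + reg*8)
     [eax*6]  -> assertion failure: not encodable in SIB             */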
ModSib::ModSib( const x86ModRm& src ) :
Base( src.Base ),
Index( src.Index ),
Scale( src.Factor ),
Displacement( src.Displacement )
{
Reduce();
}
ModSib::ModSib( x86IndexReg base, x86IndexReg index, int scale, s32 displacement ) :
Base( base ),
Index( index ),
Scale( scale ),
Displacement( displacement )
{
Reduce();
}
ModSib::ModSib( s32 displacement ) :
Base(),
Index(),
Scale(0),
Displacement( displacement )
{
}
// ------------------------------------------------------------------------
// returns TRUE if this instruction requires SIB to be encoded, or FALSE if the
// instruction can be encoded as ModRm alone.
bool NeedsSibMagic( const ModSib& info )
{
// no registers? no sibs!
if( info.Index.IsEmpty() ) return false;
// A scaled register needs a SIB
if( info.Scale != 0 ) return true;
// two registers needs a SIB
if( !info.Base.IsEmpty() ) return true;
// If index register is ESP, then we need a SIB:
// (the ModSib::Reduce() ensures that stand-alone ESP will be in the
// index position for us)
if( info.Index == esp ) return true;
return false;
}
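Put concretely: bare [reg] and [disp32] forms fit in ModRM alone, while a scale, a second register, or ESP as the lone register forces the extra SIB byte (rm=100b is reserved as the SIB escape, so ESP can never sit in the rm field by itself):

/* NeedsSibMagic over a few reduced forms:
     [disp32]   -> false  (ModRM alone, mod=00 rm=101b)
     [eax]      -> false
     [eax+8]    -> false  (disp8/disp32 folds into the mod bits)
     [eax*2]    -> true   (nonzero scale)
     [eax+ebx]  -> true   (two registers)
     [esp]      -> true   (rm=100b is the SIB escape)                */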
// ------------------------------------------------------------------------
// Conditionally generates Sib encoding information!
//
// regfield - register field to be written to the ModRm. This is either a register specifier
// or an opcode extension. In either case, the instruction determines the value for us.
//
void EmitSibMagic( int regfield, const ModSib& info )
{
int displacement_size = (info.Displacement == 0) ? 0 :
( ( info.IsByteSizeDisp() ) ? 1 : 2 );
if( !NeedsSibMagic( info ) )
{
// Use ModRm-only encoding, with the rm field holding an index/base register, if
// one has been specified. If neither register is specified then use Disp32 form,
// which is encoded as "EBP w/o displacement" (which is why EBP must always be
// encoded *with* a displacement of 0, if it would otherwise not have one).
if( info.Index.IsEmpty() )
ModRM( 0, regfield, ModRm_UseDisp32 );
else
{
if( info.Index == ebp && displacement_size == 0 )
displacement_size = 1; // forces [ebp] to be encoded as [ebp+0]!
ModRM( displacement_size, regfield, info.Index.Id );
}
}
else
{
// In order to encode "just" index*scale (and no base), we have to encode
// it as a special [index*scale + displacement] form, which is done by
// specifying EBP as the base register and setting the displacement field
// to zero. (same as ModRm w/o SIB form above, basically, except the
// ModRm_UseDisp flag is specified in the SIB instead of the ModRM field).
if( info.Base.IsEmpty() )
{
ModRM( 0, regfield, ModRm_UseSib );
SibSB( info.Scale, info.Index.Id, ModRm_UseDisp32 );
displacement_size = 2;
}
else
{
if( info.Base == ebp && displacement_size == 0 )
displacement_size = 1; // forces [ebp] to be encoded as [ebp+0]!
ModRM( displacement_size, regfield, ModRm_UseSib );
SibSB( info.Scale, info.Index.Id, info.Base.Id );
}
}
switch( displacement_size )
{
case 0: break;
case 1: write8( info.Displacement ); break;
case 2: write32( info.Displacement ); break;
jNO_DEFAULT
}
}
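The two ebp special cases exist because mod=00 with a base/rm of 101b does not mean [ebp] at all; it selects the disp32 form instead, so a bare [ebp] must be emitted as [ebp+0] with a one-byte displacement. At the byte level:

/* mov eax, [ebp] cannot use mod=00; it becomes [ebp+disp8=0]:
     8B 45 00             mov eax, [ebp+0]
   whereas mod=00 with rm=101b is the absolute disp32 form:
     8B 05 44 33 22 11    mov eax, [0x11223344]                      */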
// ------------------------------------------------------------------------
// Conditionally generates Sib encoding information!
//
// regfield - register field to be written to the ModRm. This is either a register specifier
// or an opcode extension. In either case, the instruction determines the value for us.
//
emitterT void EmitSibMagic( x86Register32 regfield, const ModSib& info )
{
EmitSibMagic( regfield.Id, info );
}
template< typename ToReg >
static void EmitLeaMagic( ToReg to, const ModSib& src, bool is16bit=false )
{
int displacement_size = (src.Displacement == 0) ? 0 :
( ( src.IsByteSizeDisp() ) ? 1 : 2 );
// See EmitSibMagic for commenting on SIB encoding.
if( !NeedsSibMagic( src ) )
{
// LEA Land: we have either a one-register encoding or just an offset.
// An offset is encodable as an immediate MOV, and a register as a
// register-to-register MOV.
if( src.Index.IsEmpty() )
{
if( is16bit )
MOV16ItoR( to.Id, src.Displacement );
else
MOV32ItoR( to.Id, src.Displacement );
return;
}
else if( displacement_size == 0 )
{
if( is16bit )
MOV16RtoR( to.Id, src.Index.Id );
else
MOV32RtoR( to.Id, src.Index.Id );
return;
}
else
{
// note: no need to do ebp+0 check since we encode all 0 displacements as
// register assignments above (via MOV)
write8( 0x8d );
ModRM( displacement_size, to.Id, src.Index.Id );
}
}
else
{
if( src.Base.IsEmpty() )
{
if( displacement_size == 0 )
{
// Encode [Index*Scale] as a combination of Mov and Shl.
// This is more efficient because of the bloated format which requires
// a 32 bit displacement.
if( is16bit )
{
MOV16RtoR( to.Id, src.Index.Id );
SHL16ItoR( to.Id, src.Scale );
}
else
{
MOV32RtoR( to.Id, src.Index.Id );
SHL32ItoR( to.Id, src.Scale );
}
return;
}
write8( 0x8d );
ModRM( 0, to.Id, ModRm_UseSib );
SibSB( src.Scale, src.Index.Id, ModRm_UseDisp32 );
displacement_size = 2; // force 32bit displacement.
}
else
{
if( src.Base == ebp && displacement_size == 0 )
displacement_size = 1; // forces [ebp] to be encoded as [ebp+0]!
write8( 0x8d );
ModRM( displacement_size, to.Id, ModRm_UseSib );
SibSB( src.Scale, src.Index.Id, src.Base.Id );
}
}
switch( displacement_size )
{
case 0: break;
case 1: write8( src.Displacement ); break;
case 2: write32( src.Displacement ); break;
jNO_DEFAULT
}
}
emitterT void LEA32( x86Register32 to, const ModSib& src )
{
EmitLeaMagic( to, src );
}
emitterT void LEA16( x86Register16 to, const ModSib& src )
{
// fixme: is this right? Does Lea16 use 32 bit displacement and ModRM form?
write8( 0x66 );
EmitLeaMagic( to, src );
}
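With the operator overloads defined earlier in the file, these entry points let address math be written directly. A speculative usage sketch, assuming the x86ModRm operator forms compose as their declarations suggest:

LEA32( eax, ModSib( ebx + ecx*4 + 8 ) ); /* lea eax, [ebx+ecx*4+8]          */
LEA32( edx, ModSib( ecx*3 ) );           /* reduces to lea edx, [ecx+ecx*2] */
LEA32( esi, ModSib( 0x1000 ) );          /* degenerates to mov esi, 0x1000  */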
//////////////////////////////////////////////////////////////////////////////////////////
// Miscellaneous Section!
// Various Instructions with no parameter and no special encoding logic.
//
emitterT void RET() { write8( 0xC3 ); }
emitterT void CBW() { write16( 0x9866 ); }
emitterT void CWD() { write8( 0x98 ); }
emitterT void CDQ() { write8( 0x99 ); }
emitterT void CWDE() { write8( 0x98 ); }
emitterT void LAHF() { write8( 0x9f ); }
emitterT void SAHF() { write8( 0x9e ); }
//////////////////////////////////////////////////////////////////////////////////////////
// Push / Pop Emitters
//
// fixme? push/pop instructions always push and pop aligned to whatever mode the cpu
// is running in. So even though these say push32, they would essentially be push64 on
// an x64 build. Should I rename them accordingly? --air
//
// Note: pushad/popad implementations are intentionally left out. The instructions are
// invalid in x64, and are super slow on x32. Use multiple Push/Pop instructions instead.
emitterT void POP( x86Register32 from )
{
write8( 0x58 | from.Id );
}
emitterT void POP( const ModSib& from )
{
write8( 0x8f ); EmitSibMagic( 0, from );
}
emitterT void PUSH( u32 imm )
{
write8( 0x68 ); write32( imm );
}
emitterT void PUSH( x86Register32 from )
{
write8( 0x50 | from.Id );
}
emitterT void PUSH( const ModSib& from )
{
write8( 0xff ); EmitSibMagic( 6, from );
}
// pushes the EFLAGS register onto the stack
emitterT void PUSHFD() { write8( 0x9C ); }
// pops the EFLAGS register from the stack
emitterT void POPFD() { write8( 0x9D ); }
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,202 @@
/* Pcsx2 - Pc Ps2 Emulator
* Copyright (C) 2002-2009 Pcsx2 Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#include "PrecompiledHeader.h"
#include "ix86_internal.h"
//------------------------------------------------------------------
// 3DNOW instructions
//------------------------------------------------------------------
/* femms */
emitterT void FEMMS( void )
{
write16( 0x0E0F );
}
emitterT void PFCMPEQMtoR( x86IntRegType to, uptr from )
{
write16( 0x0F0F );
ModRM( 0, to, DISP32 );
write32( from );
write8( 0xB0 );
}
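This emitter and the ones that follow all share one shape: the 0F 0F escape, a ModRM (plus a disp32 for the memory forms), and the real opcode as a trailing suffix byte. The repetition could be folded into one hypothetical helper:

/* Sketch (not in the source): common 3DNow! reg,reg emitter.
   Suffixes visible below: 0x9E PFADD, 0x9A PFSUB, 0xB4 PFMUL, 0x96 PFRCP. */
static void em3DNowRtoR( x86IntRegType to, x86IntRegType from, u8 suffix )
{
    write16( 0x0F0F );    /* two-byte 3DNow! escape        */
    ModRM( 3, to, from ); /* register-direct operands      */
    write8( suffix );     /* operation selector comes last */
}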
emitterT void PFCMPGTMtoR( x86IntRegType to, uptr from )
{
write16( 0x0F0F );
ModRM( 0, to, DISP32 );
write32( from );
write8( 0xA0 );
}
emitterT void PFCMPGEMtoR( x86IntRegType to, uptr from )
{
write16( 0x0F0F );
ModRM( 0, to, DISP32 );
write32( from );
write8( 0x90 );
}
emitterT void PFADDMtoR( x86IntRegType to, uptr from )
{
write16( 0x0F0F );
ModRM( 0, to, DISP32 );
write32( from );
write8( 0x9E );
}
emitterT void PFADDRtoR( x86IntRegType to, x86IntRegType from )
{
write16( 0x0F0F );
ModRM( 3, to, from );
write8( 0x9E );
}
emitterT void PFSUBMtoR( x86IntRegType to, uptr from )
{
write16( 0x0F0F );
ModRM( 0, to, DISP32 );
write32( from );
write8( 0x9A );
}
emitterT void PFSUBRtoR( x86IntRegType to, x86IntRegType from )
{
write16( 0x0F0F );
ModRM( 3, to, from );
write8( 0x9A );
}
emitterT void PFMULMtoR( x86IntRegType to, uptr from )
{
write16( 0x0F0F );
ModRM( 0, to, DISP32 );
write32( from );
write8( 0xB4 );
}
emitterT void PFMULRtoR( x86IntRegType to, x86IntRegType from )
{
write16( 0x0F0F );
ModRM( 3, to, from );
write8( 0xB4 );
}
emitterT void PFRCPMtoR( x86IntRegType to, uptr from )
{
write16( 0x0F0F );
ModRM( 0, to, DISP32 );
write32( from );
write8( 0x96 );
}
emitterT void PFRCPRtoR( x86IntRegType to, x86IntRegType from )
{
write16( 0x0F0F );
ModRM( 3, to, from );
write8( 0x96 );
}
emitterT void PFRCPIT1RtoR( x86IntRegType to, x86IntRegType from )
{
write16( 0x0F0F );
ModRM( 3, to, from );
write8( 0xA6 );
}
emitterT void PFRCPIT2RtoR( x86IntRegType to, x86IntRegType from )
{
write16( 0x0F0F );
ModRM( 3, to, from );
write8( 0xB6 );
}
emitterT void PFRSQRTRtoR( x86IntRegType to, x86IntRegType from )
{
write16( 0x0F0F );
ModRM( 3, to, from );
write8( 0x97 );
}
emitterT void PFRSQIT1RtoR( x86IntRegType to, x86IntRegType from )
{
write16( 0x0F0F );
ModRM( 3, to, from );
write8( 0xA7 );
}
emitterT void PF2IDMtoR( x86IntRegType to, uptr from )
{
write16( 0x0F0F );
ModRM( 0, to, DISP32 );
write32( from );
write8( 0x1D );
}
emitterT void PF2IDRtoR( x86IntRegType to, x86IntRegType from )
{
write16( 0x0F0F );
ModRM( 3, to, from );
write8( 0x1D );
}
emitterT void PI2FDMtoR( x86IntRegType to, uptr from )
{
write16( 0x0F0F );
ModRM( 0, to, DISP32 );
write32( from );
write8( 0x0D );
}
emitterT void PI2FDRtoR( x86IntRegType to, x86IntRegType from )
{
write16( 0x0F0F );
ModRM( 3, to, from );
write8( 0x0D );
}
emitterT void PFMAXMtoR( x86IntRegType to, uptr from )
{
write16( 0x0F0F );
ModRM( 0, to, DISP32 );
write32( from );
write8( 0xA4 );
}
emitterT void PFMAXRtoR( x86IntRegType to, x86IntRegType from )
{
write16( 0x0F0F );
ModRM( 3, to, from );
write8( 0xA4 );
}
emitterT void PFMINMtoR( x86IntRegType to, uptr from )
{
write16( 0x0F0F );
ModRM( 0, to, DISP32 );
write32( from );
write8( 0x94 );
}
emitterT void PFMINRtoR( x86IntRegType to, x86IntRegType from )
{
write16( 0x0F0F );
ModRM( 3, to, from );
write8( 0x94 );
}

View File

@ -1,201 +0,0 @@
/* Pcsx2 - Pc Ps2 Emulator
* Copyright (C) 2002-2009 Pcsx2 Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#pragma once
//------------------------------------------------------------------
// 3DNOW instructions
//------------------------------------------------------------------
/* femms */
emitterT void eFEMMS( void )
{
write16<I>( 0x0E0F );
}
emitterT void ePFCMPEQMtoR( x86IntRegType to, uptr from )
{
write16<I>( 0x0F0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( from );
write8<I>( 0xB0 );
}
emitterT void ePFCMPGTMtoR( x86IntRegType to, uptr from )
{
write16<I>( 0x0F0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( from );
write8<I>( 0xA0 );
}
emitterT void ePFCMPGEMtoR( x86IntRegType to, uptr from )
{
write16<I>( 0x0F0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( from );
write8<I>( 0x90 );
}
emitterT void ePFADDMtoR( x86IntRegType to, uptr from )
{
write16<I>( 0x0F0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( from );
write8<I>( 0x9E );
}
emitterT void ePFADDRtoR( x86IntRegType to, x86IntRegType from )
{
write16<I>( 0x0F0F );
ModRM<I>( 3, to, from );
write8<I>( 0x9E );
}
emitterT void ePFSUBMtoR( x86IntRegType to, uptr from )
{
write16<I>( 0x0F0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( from );
write8<I>( 0x9A );
}
emitterT void ePFSUBRtoR( x86IntRegType to, x86IntRegType from )
{
write16<I>( 0x0F0F );
ModRM<I>( 3, to, from );
write8<I>( 0x9A );
}
emitterT void ePFMULMtoR( x86IntRegType to, uptr from )
{
write16<I>( 0x0F0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( from );
write8<I>( 0xB4 );
}
emitterT void ePFMULRtoR( x86IntRegType to, x86IntRegType from )
{
write16<I>( 0x0F0F );
ModRM<I>( 3, to, from );
write8<I>( 0xB4 );
}
emitterT void ePFRCPMtoR( x86IntRegType to, uptr from )
{
write16<I>( 0x0F0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( from );
write8<I>( 0x96 );
}
emitterT void ePFRCPRtoR( x86IntRegType to, x86IntRegType from )
{
write16<I>( 0x0F0F );
ModRM<I>( 3, to, from );
write8<I>( 0x96 );
}
emitterT void ePFRCPIT1RtoR( x86IntRegType to, x86IntRegType from )
{
write16<I>( 0x0F0F );
ModRM<I>( 3, to, from );
write8<I>( 0xA6 );
}
emitterT void ePFRCPIT2RtoR( x86IntRegType to, x86IntRegType from )
{
write16<I>( 0x0F0F );
ModRM<I>( 3, to, from );
write8<I>( 0xB6 );
}
emitterT void ePFRSQRTRtoR( x86IntRegType to, x86IntRegType from )
{
write16<I>( 0x0F0F );
ModRM<I>( 3, to, from );
write8<I>( 0x97 );
}
emitterT void ePFRSQIT1RtoR( x86IntRegType to, x86IntRegType from )
{
write16<I>( 0x0F0F );
ModRM<I>( 3, to, from );
write8<I>( 0xA7 );
}
emitterT void ePF2IDMtoR( x86IntRegType to, uptr from )
{
write16<I>( 0x0F0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( from );
write8<I>( 0x1D );
}
emitterT void ePF2IDRtoR( x86IntRegType to, x86IntRegType from )
{
write16<I>( 0x0F0F );
ModRM<I>( 3, to, from );
write8<I>( 0x1D );
}
emitterT void ePI2FDMtoR( x86IntRegType to, uptr from )
{
write16<I>( 0x0F0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( from );
write8<I>( 0x0D );
}
emitterT void ePI2FDRtoR( x86IntRegType to, x86IntRegType from )
{
write16<I>( 0x0F0F );
ModRM<I>( 3, to, from );
write8<I>( 0x0D );
}
emitterT void ePFMAXMtoR( x86IntRegType to, uptr from )
{
write16<I>( 0x0F0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( from );
write8<I>( 0xA4 );
}
emitterT void ePFMAXRtoR( x86IntRegType to, x86IntRegType from )
{
write16<I>( 0x0F0F );
ModRM<I>( 3, to, from );
write8<I>( 0xA4 );
}
emitterT void ePFMINMtoR( x86IntRegType to, uptr from )
{
write16<I>( 0x0F0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( from );
write8<I>( 0x94 );
}
emitterT void ePFMINRtoR( x86IntRegType to, x86IntRegType from )
{
write16<I>( 0x0F0F );
ModRM<I>( 3, to, from );
write8<I>( 0x94 );
}

View File

@ -18,10 +18,8 @@
#include "PrecompiledHeader.h"
#define _EmitterId_ 0
#include "ix86.h"
#include "Misc.h"
#include "ix86_internal.h"
#include "System.h"
#include "Threading.h"
#include "RedtapeWindows.h"
@ -400,6 +398,7 @@ void cpudetectInit()
cpudetectSSE3(recSSE);
HostSys::Munmap( recSSE, 0x1000 );
}
else { Console::Error("Error: Failed to allocate memory for SSE3 State detection."); }
//////////////////////////////////////
// Core Counting!

pcsx2/x86/ix86/ix86_fpu.cpp
View File

@ -0,0 +1,276 @@
/* Pcsx2 - Pc Ps2 Emulator
* Copyright (C) 2002-2009 Pcsx2 Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#include "PrecompiledHeader.h"
#include "ix86_internal.h"
//------------------------------------------------------------------
// FPU instructions
//------------------------------------------------------------------
/* fild m32 to fpu reg stack */
emitterT void FILD32( u32 from )
{
write8( 0xDB );
ModRM( 0, 0x0, DISP32 );
write32( MEMADDR(from, 4) );
}
/* fistp m32 from fpu reg stack */
emitterT void FISTP32( u32 from )
{
write8( 0xDB );
ModRM( 0, 0x3, DISP32 );
write32( MEMADDR(from, 4) );
}
/* fld m32 to fpu reg stack */
emitterT void FLD32( u32 from )
{
write8( 0xD9 );
ModRM( 0, 0x0, DISP32 );
write32( MEMADDR(from, 4) );
}
// fld st(i)
emitterT void FLD(int st) { write16(0xc0d9+(st<<8)); }
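// (write16 emits little-endian, so 0xc0d9 produces the byte pair D9 C0; adding
// st<<8 bumps the second byte to C0+st, selecting ST(i). The same trick is used
// by the other two-byte FPU forms in this file.)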
emitterT void FLD1() { write16(0xe8d9); }
emitterT void FLDL2E() { write16(0xead9); }
/* fst m32 from fpu reg stack */
emitterT void FST32( u32 to )
{
write8( 0xD9 );
ModRM( 0, 0x2, DISP32 );
write32( MEMADDR(to, 4) );
}
/* fstp m32 from fpu reg stack */
emitterT void FSTP32( u32 to )
{
write8( 0xD9 );
ModRM( 0, 0x3, DISP32 );
write32( MEMADDR(to, 4) );
}
// fstp st(i)
emitterT void FSTP(int st) { write16(0xd8dd+(st<<8)); }
/* fldcw fpu control word from m16 */
emitterT void FLDCW( u32 from )
{
write8( 0xD9 );
ModRM( 0, 0x5, DISP32 );
write32( MEMADDR(from, 4) );
}
/* fnstcw fpu control word to m16 */
emitterT void FNSTCW( u32 to )
{
write8( 0xD9 );
ModRM( 0, 0x7, DISP32 );
write32( MEMADDR(to, 4) );
}
emitterT void FNSTSWtoAX() { write16(0xE0DF); }
emitterT void FXAM() { write16(0xe5d9); }
emitterT void FDECSTP() { write16(0xf6d9); }
emitterT void FRNDINT() { write16(0xfcd9); }
emitterT void FXCH(int st) { write16(0xc8d9+(st<<8)); }
emitterT void F2XM1() { write16(0xf0d9); }
emitterT void FSCALE() { write16(0xfdd9); }
emitterT void FPATAN(void) { write16(0xf3d9); }
emitterT void FSIN(void) { write16(0xfed9); }
/* fadd ST(src) to fpu reg stack ST(0) */
emitterT void FADD32Rto0( x86IntRegType src )
{
write8( 0xD8 );
write8( 0xC0 + src );
}
/* fadd ST(0) to fpu reg stack ST(src) */
emitterT void FADD320toR( x86IntRegType src )
{
write8( 0xDC );
write8( 0xC0 + src );
}
/* fsub ST(src) to fpu reg stack ST(0) */
emitterT void FSUB32Rto0( x86IntRegType src )
{
write8( 0xD8 );
write8( 0xE0 + src );
}
/* fsub ST(0) to fpu reg stack ST(src) */
emitterT void FSUB320toR( x86IntRegType src )
{
write8( 0xDC );
write8( 0xE8 + src );
}
/* fsubp -> subtract ST(0) from ST(1), store in ST(1) and POP stack */
emitterT void FSUBP( void )
{
write8( 0xDE );
write8( 0xE9 );
}
/* fmul ST(src) to fpu reg stack ST(0) */
emitterT void FMUL32Rto0( x86IntRegType src )
{
write8( 0xD8 );
write8( 0xC8 + src );
}
/* fmul ST(0) to fpu reg stack ST(src) */
emitterT void FMUL320toR( x86IntRegType src )
{
write8( 0xDC );
write8( 0xC8 + src );
}
/* fdiv ST(src) to fpu reg stack ST(0) */
emitterT void FDIV32Rto0( x86IntRegType src )
{
write8( 0xD8 );
write8( 0xF0 + src );
}
/* fdiv ST(0) to fpu reg stack ST(src) */
emitterT void FDIV320toR( x86IntRegType src )
{
write8( 0xDC );
write8( 0xF8 + src );
}
emitterT void FDIV320toRP( x86IntRegType src )
{
write8( 0xDE );
write8( 0xF8 + src );
}
/* fadd m32 to fpu reg stack */
emitterT void FADD32( u32 from )
{
write8( 0xD8 );
ModRM( 0, 0x0, DISP32 );
write32( MEMADDR(from, 4) );
}
/* fsub m32 to fpu reg stack */
emitterT void FSUB32( u32 from )
{
write8( 0xD8 );
ModRM( 0, 0x4, DISP32 );
write32( MEMADDR(from, 4) );
}
/* fmul m32 to fpu reg stack */
emitterT void FMUL32( u32 from )
{
write8( 0xD8 );
ModRM( 0, 0x1, DISP32 );
write32( MEMADDR(from, 4) );
}
/* fdiv m32 to fpu reg stack */
emitterT void FDIV32( u32 from )
{
write8( 0xD8 );
ModRM( 0, 0x6, DISP32 );
write32( MEMADDR(from, 4) );
}
/* fabs fpu reg stack */
emitterT void FABS( void )
{
write16( 0xE1D9 );
}
/* fsqrt fpu reg stack */
emitterT void FSQRT( void )
{
write16( 0xFAD9 );
}
/* fchs fpu reg stack */
emitterT void FCHS( void )
{
write16( 0xE0D9 );
}
/* fcomi st, st(i) */
emitterT void FCOMI( x86IntRegType src )
{
write8( 0xDB );
write8( 0xF0 + src );
}
/* fcomip st, st(i) */
emitterT void FCOMIP( x86IntRegType src )
{
write8( 0xDF );
write8( 0xF0 + src );
}
/* fucomi st, st(i) */
emitterT void FUCOMI( x86IntRegType src )
{
write8( 0xDB );
write8( 0xE8 + src );
}
/* fucomip st, st(i) */
emitterT void FUCOMIP( x86IntRegType src )
{
write8( 0xDF );
write8( 0xE8 + src );
}
/* fcom m32 to fpu reg stack */
emitterT void FCOM32( u32 from )
{
write8( 0xD8 );
ModRM( 0, 0x2, DISP32 );
write32( MEMADDR(from, 4) );
}
/* fcomp m32 to fpu reg stack */
emitterT void FCOMP32( u32 from )
{
write8( 0xD8 );
ModRM( 0, 0x3, DISP32 );
write32( MEMADDR(from, 4) );
}
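// Note: the FCMOV32 macro below deliberately references 'from', which it picks
// up from the parameter of whichever wrapper function it is expanded inside.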
#define FCMOV32( low, high ) \
{ \
write8( low ); \
write8( high + from ); \
}
emitterT void FCMOVB32( x86IntRegType from ) { FCMOV32( 0xDA, 0xC0 ); }
emitterT void FCMOVE32( x86IntRegType from ) { FCMOV32( 0xDA, 0xC8 ); }
emitterT void FCMOVBE32( x86IntRegType from ) { FCMOV32( 0xDA, 0xD0 ); }
emitterT void FCMOVU32( x86IntRegType from ) { FCMOV32( 0xDA, 0xD8 ); }
emitterT void FCMOVNB32( x86IntRegType from ) { FCMOV32( 0xDB, 0xC0 ); }
emitterT void FCMOVNE32( x86IntRegType from ) { FCMOV32( 0xDB, 0xC8 ); }
emitterT void FCMOVNBE32( x86IntRegType from ) { FCMOV32( 0xDB, 0xD0 ); }
emitterT void FCMOVNU32( x86IntRegType from ) { FCMOV32( 0xDB, 0xD8 ); }


@ -1,276 +0,0 @@
/* Pcsx2 - Pc Ps2 Emulator
* Copyright (C) 2002-2009 Pcsx2 Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#pragma once
//#include "PrecompiledHeader.h"
//------------------------------------------------------------------
// FPU instructions
//------------------------------------------------------------------
/* fild m32 to fpu reg stack */
emitterT void eFILD32( u32 from )
{
write8<I>( 0xDB );
ModRM<I>( 0, 0x0, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* fistp m32 from fpu reg stack */
emitterT void eFISTP32( u32 from )
{
write8<I>( 0xDB );
ModRM<I>( 0, 0x3, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* fld m32 to fpu reg stack */
emitterT void eFLD32( u32 from )
{
write8<I>( 0xD9 );
ModRM<I>( 0, 0x0, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
// fld st(i)
emitterT void eFLD(int st) { write16<I>(0xc0d9+(st<<8)); }
emitterT void eFLD1() { write16<I>(0xe8d9); }
emitterT void eFLDL2E() { write16<I>(0xead9); }
/* fst m32 from fpu reg stack */
emitterT void eFST32( u32 to )
{
write8<I>( 0xD9 );
ModRM<I>( 0, 0x2, DISP32 );
write32<I>( MEMADDR(to, 4) );
}
/* fstp m32 from fpu reg stack */
emitterT void eFSTP32( u32 to )
{
write8<I>( 0xD9 );
ModRM<I>( 0, 0x3, DISP32 );
write32<I>( MEMADDR(to, 4) );
}
// fstp st(i)
emitterT void eFSTP(int st) { write16<I>(0xd8dd+(st<<8)); }
/* fldcw fpu control word from m16 */
emitterT void eFLDCW( u32 from )
{
write8<I>( 0xD9 );
ModRM<I>( 0, 0x5, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* fnstcw fpu control word to m16 */
emitterT void eFNSTCW( u32 to )
{
write8<I>( 0xD9 );
ModRM<I>( 0, 0x7, DISP32 );
write32<I>( MEMADDR(to, 4) );
}
emitterT void eFNSTSWtoAX() { write16<I>(0xE0DF); }
emitterT void eFXAM() { write16<I>(0xe5d9); }
emitterT void eFDECSTP() { write16<I>(0xf6d9); }
emitterT void eFRNDINT() { write16<I>(0xfcd9); }
emitterT void eFXCH(int st) { write16<I>(0xc8d9+(st<<8)); }
emitterT void eF2XM1() { write16<I>(0xf0d9); }
emitterT void eFSCALE() { write16<I>(0xfdd9); }
emitterT void eFPATAN(void) { write16<I>(0xf3d9); }
emitterT void eFSIN(void) { write16<I>(0xfed9); }
/* fadd ST(src) to fpu reg stack ST(0) */
emitterT void eFADD32Rto0( x86IntRegType src )
{
write8<I>( 0xD8 );
write8<I>( 0xC0 + src );
}
/* fadd ST(0) to fpu reg stack ST(src) */
emitterT void eFADD320toR( x86IntRegType src )
{
write8<I>( 0xDC );
write8<I>( 0xC0 + src );
}
/* fsub ST(src) to fpu reg stack ST(0) */
emitterT void eFSUB32Rto0( x86IntRegType src )
{
write8<I>( 0xD8 );
write8<I>( 0xE0 + src );
}
/* fsub ST(0) to fpu reg stack ST(src) */
emitterT void eFSUB320toR( x86IntRegType src )
{
write8<I>( 0xDC );
write8<I>( 0xE8 + src );
}
/* fsubp -> subtract ST(0) from ST(1), store in ST(1) and POP stack */
emitterT void eFSUBP( void )
{
write8<I>( 0xDE );
write8<I>( 0xE9 );
}
/* fmul ST(src) to fpu reg stack ST(0) */
emitterT void eFMUL32Rto0( x86IntRegType src )
{
write8<I>( 0xD8 );
write8<I>( 0xC8 + src );
}
/* fmul ST(0) to fpu reg stack ST(src) */
emitterT void eFMUL320toR( x86IntRegType src )
{
write8<I>( 0xDC );
write8<I>( 0xC8 + src );
}
/* fdiv ST(src) to fpu reg stack ST(0) */
emitterT void eFDIV32Rto0( x86IntRegType src )
{
write8<I>( 0xD8 );
write8<I>( 0xF0 + src );
}
/* fdiv ST(0) to fpu reg stack ST(src) */
emitterT void eFDIV320toR( x86IntRegType src )
{
write8<I>( 0xDC );
write8<I>( 0xF8 + src );
}
emitterT void eFDIV320toRP( x86IntRegType src )
{
write8<I>( 0xDE );
write8<I>( 0xF8 + src );
}
/* fadd m32 to fpu reg stack */
emitterT void eFADD32( u32 from )
{
write8<I>( 0xD8 );
ModRM<I>( 0, 0x0, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* fsub m32 to fpu reg stack */
emitterT void eFSUB32( u32 from )
{
write8<I>( 0xD8 );
ModRM<I>( 0, 0x4, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* fmul m32 to fpu reg stack */
emitterT void eFMUL32( u32 from )
{
write8<I>( 0xD8 );
ModRM<I>( 0, 0x1, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* fdiv m32 to fpu reg stack */
emitterT void eFDIV32( u32 from )
{
write8<I>( 0xD8 );
ModRM<I>( 0, 0x6, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* fabs fpu reg stack */
emitterT void eFABS( void )
{
write16<I>( 0xE1D9 );
}
/* fsqrt fpu reg stack */
emitterT void eFSQRT( void )
{
write16<I>( 0xFAD9 );
}
/* fchs fpu reg stack */
emitterT void eFCHS( void )
{
write16<I>( 0xE0D9 );
}
/* fcomi st, st(i) */
emitterT void eFCOMI( x86IntRegType src )
{
write8<I>( 0xDB );
write8<I>( 0xF0 + src );
}
/* fcomip st, st(i) */
emitterT void eFCOMIP( x86IntRegType src )
{
write8<I>( 0xDF );
write8<I>( 0xF0 + src );
}
/* fucomi st, st(i) */
emitterT void eFUCOMI( x86IntRegType src )
{
write8<I>( 0xDB );
write8<I>( 0xE8 + src );
}
/* fucomip st, st(i) */
emitterT void eFUCOMIP( x86IntRegType src )
{
write8<I>( 0xDF );
write8<I>( 0xE8 + src );
}
/* fcom m32 to fpu reg stack */
emitterT void eFCOM32( u32 from )
{
write8<I>( 0xD8 );
ModRM<I>( 0, 0x2, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* fcomp m32 to fpu reg stack */
emitterT void eFCOMP32( u32 from )
{
write8<I>( 0xD8 );
ModRM<I>( 0, 0x3, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
#define FCMOV32( low, high ) \
{ \
write8<I>( low ); \
write8<I>( high + from ); \
}
emitterT void eFCMOVB32( x86IntRegType from ) { FCMOV32( 0xDA, 0xC0 ); }
emitterT void eFCMOVE32( x86IntRegType from ) { FCMOV32( 0xDA, 0xC8 ); }
emitterT void eFCMOVBE32( x86IntRegType from ) { FCMOV32( 0xDA, 0xD0 ); }
emitterT void eFCMOVU32( x86IntRegType from ) { FCMOV32( 0xDA, 0xD8 ); }
emitterT void eFCMOVNB32( x86IntRegType from ) { FCMOV32( 0xDB, 0xC0 ); }
emitterT void eFCMOVNE32( x86IntRegType from ) { FCMOV32( 0xDB, 0xC8 ); }
emitterT void eFCMOVNBE32( x86IntRegType from ) { FCMOV32( 0xDB, 0xD0 ); }
emitterT void eFCMOVNU32( x86IntRegType from ) { FCMOV32( 0xDB, 0xD8 ); }


@ -0,0 +1,225 @@
/* Pcsx2 - Pc Ps2 Emulator
* Copyright (C) 2002-2009 Pcsx2 Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#include "PrecompiledHeader.h"
#include "ix86_internal.h"
//------------------------------------------------------------------
// x86 Group 1 Instructions
//------------------------------------------------------------------
// Group 1 instructions all adhere to the same encoding scheme, and so they all
// share the same emitter which has been coded here.
//
// Group 1 Table: [column value is the Reg field of the ModRM byte]
//
// 0 1 2 3 4 5 6 7
// ADD OR ADC SBB AND SUB XOR CMP
//
namespace x86Emitter {
//////////////////////////////////////////////////////////////////////////////////////////
// x86RegConverter - this class is used internally by the emitter as a helper for
// converting 8- and 16-bit register forms into 32-bit forms.  This way the end-user exposed API
// can use type-safe 8/16/32 bit register types, and the underlying code can use a single
// unified emitter to generate all function variations + prefixes and such. :)
//
class x86RegConverter : public x86Register32
{
public:
x86RegConverter( x86Register32 src ) : x86Register32( src ) {}
x86RegConverter( x86Register16 src ) : x86Register32( src.Id ) {}
x86RegConverter( x86Register8 src ) : x86Register32( src.Id ) {}
};
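// Usage sketch (hypothetical, assuming the usual register instances such as
// ecx/cx/cl are declared elsewhere in the emitter headers): all three widths
// funnel into the same Group1() body --
//   Group1( G1Type_ADD, x86RegConverter(ecx), x86RegConverter(edx) );       // 32-bit
//   Group1( G1Type_ADD, x86RegConverter(cx),  x86RegConverter(dx) );        // 16-bit (0x66 prefix emitted separately by the caller)
//   Group1( G1Type_ADD, x86RegConverter(cl),  x86RegConverter(dl), true );  // 8-bit form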
enum Group1InstructionType
{
G1Type_ADD=0,
G1Type_OR,
G1Type_ADC,
G1Type_SBB,
G1Type_AND,
G1Type_SUB,
G1Type_XOR,
G1Type_CMP
};
static emitterT void Group1( Group1InstructionType inst, x86RegConverter to, x86RegConverter from, bool bit8form=false )
{
write8( (bit8form ? 0 : 1) | (inst<<3) );
ModRM( 3, from.Id, to.Id );
}
static emitterT void Group1( Group1InstructionType inst, const ModSib& sibdest, x86RegConverter from, bool bit8form=false )
{
write8( (bit8form ? 0 : 1) | (inst<<3) );
EmitSibMagic( from, sibdest );
}
static emitterT void Group1( Group1InstructionType inst, x86RegConverter to, const ModSib& sibsrc, bool bit8form=false )
{
write8( (bit8form ? 2 : 3) | (inst<<3) );
EmitSibMagic( to, sibsrc );
}
// Note: this function emits based on the operand size of imm, so a u16 imm
// generates a 16-bit instruction form (the 0x66 prefix is added by the caller).
template< typename T >
static emitterT void Group1_Imm( Group1InstructionType inst, x86RegConverter to, T imm )
{
bool bit8form = (sizeof(T) == 1);
if( !bit8form && is_s8( imm ) )
{
write8( 0x83 );
ModRM( 3, inst, to.Id );
write8( (s8)imm );
}
else
{
if( to == eax )
write8( (bit8form ? 4 : 5) | (inst<<3) );
else
{
write8( bit8form ? 0x80 : 0x81 );
ModRM( 3, inst, to.Id );
}
x86write<T>( imm );
}
}
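// Illustrative byte sequences (assuming the ADD32 forms generated by the macro
// further below):
//   ADD32( ecx, 4 )          -> 83 C1 04           (sign-extended imm8 path)
//   ADD32( ecx, 0x12345678 ) -> 81 C1 78 56 34 12  (full imm32 form)
//   ADD32( eax, 0x12345678 ) -> 05 78 56 34 12     (short accumulator form)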
// Note: this function emits based on the operand size of imm, so a u16 imm
// generates a 16-bit instruction form (the 0x66 prefix is added by the caller).
template< typename T >
static emitterT void Group1_Imm( Group1InstructionType inst, const ModSib& sibdest, T imm )
{
bool bit8form = (sizeof(T) == 1);
write8( bit8form ? 0x80 : (is_s8( imm ) ? 0x83 : 0x81) );
EmitSibMagic( inst, sibdest );
if( !bit8form && is_s8( imm ) )
write8( (s8)imm );
else
x86write<T>( imm );
}
// 16 bit instruction prefix!
static __forceinline void prefix16() { write8(0x66); }
//////////////////////////////////////////////////////////////////////////////////////////
//
#define DEFINE_GROUP1_OPCODE( cod ) \
emitterT void cod##32( x86Register32 to, x86Register32 from ) { Group1( G1Type_##cod, to, from ); } \
emitterT void cod##32( x86Register32 to, void* from ) { Group1( G1Type_##cod, to, ptr[from] ); } \
emitterT void cod##32( x86Register32 to, const ModSib& from ) { Group1( G1Type_##cod, to, from ); } \
emitterT void cod##32( x86Register32 to, u32 imm ) { Group1_Imm( G1Type_##cod, to, imm ); } \
emitterT void cod##32( const ModSib& to, x86Register32 from ) { Group1( G1Type_##cod, to, from ); } \
emitterT void cod##32( void* to, x86Register32 from ) { Group1( G1Type_##cod, ptr[to], from ); } \
emitterT void cod##32( void* to, u32 imm ) { Group1_Imm( G1Type_##cod, ptr[to], imm ); } \
emitterT void cod##32( const ModSib& to, u32 imm ) { Group1_Imm( G1Type_##cod, to, imm ); } \
\
emitterT void cod##16( x86Register16 to, x86Register16 from ) { prefix16(); Group1( G1Type_##cod, to, from ); } \
emitterT void cod##16( x86Register16 to, void* from ) { prefix16(); Group1( G1Type_##cod, to, ptr[from] ); } \
emitterT void cod##16( x86Register16 to, const ModSib& from ) { prefix16(); Group1( G1Type_##cod, to, from ); } \
emitterT void cod##16( x86Register16 to, u16 imm ) { prefix16(); Group1_Imm( G1Type_##cod, to, imm ); } \
emitterT void cod##16( const ModSib& to, x86Register16 from ) { prefix16(); Group1( G1Type_##cod, to, from ); } \
emitterT void cod##16( void* to, x86Register16 from ) { prefix16(); Group1( G1Type_##cod, ptr[to], from ); } \
emitterT void cod##16( void* to, u16 imm ) { prefix16(); Group1_Imm( G1Type_##cod, ptr[to], imm ); } \
emitterT void cod##16( const ModSib& to, u16 imm ) { prefix16(); Group1_Imm( G1Type_##cod, to, imm ); } \
\
emitterT void cod##8( x86Register8 to, x86Register8 from ) { Group1( G1Type_##cod, to, from , true ); } \
emitterT void cod##8( x86Register8 to, void* from ) { Group1( G1Type_##cod, to, ptr[from], true ); } \
emitterT void cod##8( x86Register8 to, const ModSib& from ) { Group1( G1Type_##cod, to, from , true ); } \
emitterT void cod##8( x86Register8 to, u8 imm ) { Group1_Imm( G1Type_##cod, to, imm ); } \
emitterT void cod##8( const ModSib& to, x86Register8 from ) { Group1( G1Type_##cod, to, from , true ); } \
emitterT void cod##8( void* to, x86Register8 from ) { Group1( G1Type_##cod, ptr[to], from , true ); } \
emitterT void cod##8( void* to, u8 imm ) { Group1_Imm( G1Type_##cod, ptr[to], imm ); } \
emitterT void cod##8( const ModSib& to, u8 imm ) { Group1_Imm( G1Type_##cod, to, imm ); }
DEFINE_GROUP1_OPCODE( ADD )
DEFINE_GROUP1_OPCODE( CMP )
DEFINE_GROUP1_OPCODE( OR )
DEFINE_GROUP1_OPCODE( ADC )
DEFINE_GROUP1_OPCODE( SBB )
DEFINE_GROUP1_OPCODE( AND )
DEFINE_GROUP1_OPCODE( SUB )
DEFINE_GROUP1_OPCODE( XOR )
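// The eight invocations above expand to the full type-safe API surface;
// hypothetical call sites (register instances assumed declared elsewhere):
//   ADD32( ecx, edx );     // reg, reg
//   XOR32( eax, eax );     // the classic register-zeroing idiom
//   AND8( cl, (u8)0x0f );  // 8-bit immediate form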
} // end namespace x86Emitter
static __forceinline x86Emitter::x86Register32 _reghlp32( x86IntRegType src )
{
return x86Emitter::x86Register32( src );
}
static __forceinline x86Emitter::x86Register16 _reghlp16( x86IntRegType src )
{
return x86Emitter::x86Register16( src );
}
static __forceinline x86Emitter::x86Register8 _reghlp8( x86IntRegType src )
{
return x86Emitter::x86Register8( src );
}
static __forceinline x86Emitter::ModSib _mrmhlp( x86IntRegType src )
{
return x86Emitter::ModSib( x86Emitter::x86ModRm( _reghlp32(src) ) );
}
//////////////////////////////////////////////////////////////////////////////////////////
//
#define DEFINE_LEGACY_HELPER( cod, bits ) \
emitterT void cod##bits##RtoR( x86IntRegType to, x86IntRegType from ) { x86Emitter::cod##bits( _reghlp##bits(to), _reghlp##bits(from) ); } \
emitterT void cod##bits##ItoR( x86IntRegType to, u##bits imm ) { x86Emitter::cod##bits( _reghlp##bits(to), imm ); } \
emitterT void cod##bits##MtoR( x86IntRegType to, uptr from ) { x86Emitter::cod##bits( _reghlp##bits(to), (void*)from ); } \
emitterT void cod##bits##RtoM( uptr to, x86IntRegType from ) { x86Emitter::cod##bits( (void*)to, _reghlp##bits(from) ); } \
emitterT void cod##bits##ItoM( uptr to, u##bits imm ) { x86Emitter::cod##bits( (void*)to, imm ); } \
emitterT void cod##bits##ItoRm( x86IntRegType to, u##bits imm, int offset ) { x86Emitter::cod##bits( _mrmhlp(to) + offset, imm ); } \
emitterT void cod##bits##RmtoR( x86IntRegType to, x86IntRegType from, int offset ) { x86Emitter::cod##bits( _reghlp##bits(to), _mrmhlp(from) + offset ); } \
emitterT void cod##bits##RtoRm( x86IntRegType to, x86IntRegType from, int offset ) { x86Emitter::cod##bits( _mrmhlp(to) + offset, _reghlp##bits(from) ); }
#define DEFINE_GROUP1_OPCODE_LEGACY( cod ) \
DEFINE_LEGACY_HELPER( cod, 32 ) \
DEFINE_LEGACY_HELPER( cod, 16 ) \
DEFINE_LEGACY_HELPER( cod, 8 )
DEFINE_GROUP1_OPCODE_LEGACY( ADD )
DEFINE_GROUP1_OPCODE_LEGACY( CMP )
DEFINE_GROUP1_OPCODE_LEGACY( OR )
DEFINE_GROUP1_OPCODE_LEGACY( ADC )
DEFINE_GROUP1_OPCODE_LEGACY( SBB )
DEFINE_GROUP1_OPCODE_LEGACY( AND )
DEFINE_GROUP1_OPCODE_LEGACY( SUB )
DEFINE_GROUP1_OPCODE_LEGACY( XOR )
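// e.g. the legacy call ADD32ItoR( ECX, 4 ) now simply forwards to the type-safe
// x86Emitter::ADD32( x86Register32(ECX), 4 ) defined above.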
// Special forms needed by the legacy emitter syntax:
emitterT void AND32I8toR( x86IntRegType to, s8 from )
{
x86Emitter::AND32( _reghlp32(to), from );
}
emitterT void AND32I8toM( uptr to, s8 from )
{
x86Emitter::AND32( (void*)to, from );
}


@ -0,0 +1,43 @@
#pragma once
#include "ix86.h"
//------------------------------------------------------------------
// Helper Macros
//------------------------------------------------------------------
#define MEMADDR(addr, oplen) (addr)
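// (On 32-bit targets the address is used verbatim and oplen is ignored; the
// parameter presumably exists so a RIP-relative x86-64 form could compute
// addr - rip - oplen without touching every call site.)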
#define Rex(w,r,x,b) assert(0)
#define RexR(w, reg) assert( !(w || (reg)>=8) )
#define RexB(w, base) assert( !(w || (base)>=8) )
#define RexRB(w, reg, base) assert( !(w || (reg) >= 8 || (base)>=8) )
#define RexRXB(w, reg, index, base) assert( !(w || (reg) >= 8 || (index) >= 8 || (base) >= 8) )
#define _MM_MK_INSERTPS_NDX(srcField, dstField, zeroMask) (((srcField)<<6) | ((dstField)<<4) | (zeroMask))
static const int ModRm_UseSib = 4; // same index value as ESP (used in RM field)
static const int ModRm_UseDisp32 = 5;	// same index value as EBP (used in RM field when Mod is 0)
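// i.e. in a ModRM byte with mod != 3, rm == 4 means "a SIB byte follows", and
// with mod == 0, rm == 5 means "no base register; a 32-bit displacement follows".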
//------------------------------------------------------------------
// General Emitter Helper functions
//------------------------------------------------------------------
namespace x86Emitter
{
extern void EmitSibMagic( int regfield, const ModSib& info );
extern void EmitSibMagic( x86Register32 regfield, const ModSib& info );
extern bool NeedsSibMagic( const ModSib& info );
}
// From here out are the legacy (old) emitter functions...
extern void WriteRmOffsetFrom(x86IntRegType to, x86IntRegType from, int offset);
extern void ModRM( int mod, int reg, int rm );
extern void SibSB( int ss, int index, int base );
extern void SET8R( int cc, int to );
extern u8* J8Rel( int cc, int to );
extern u32* J32Rel( int cc, u32 to );
extern u64 GetCPUTick( void );
//------------------------------------------------------------------

File diff suppressed because it is too large


@ -51,66 +51,51 @@
//------------------------------------------------------------------
// mov instructions
//------------------------------------------------------------------
#define MOV64RtoR eMOV64RtoR<_EmitterId_>
#define MOV64RtoM eMOV64RtoM<_EmitterId_>
#define MOV64MtoR eMOV64MtoR<_EmitterId_>
#define MOV64I32toM eMOV64I32toM<_EmitterId_>
#define MOV64I32toR eMOV64I32toR<_EmitterId_>
#define MOV64ItoR eMOV64ItoR<_EmitterId_>
#define MOV64ItoRmOffset eMOV64ItoRmOffset<_EmitterId_>
#define MOV64RmOffsettoR eMOV64RmOffsettoR<_EmitterId_>
#define MOV64RmStoR eMOV64RmStoR<_EmitterId_>
#define MOV64RtoRmOffset eMOV64RtoRmOffset<_EmitterId_>
#define MOV64RtoRmS eMOV64RtoRmS<_EmitterId_>
#define MOV32RtoR eMOV32RtoR<_EmitterId_>
#define MOV32RtoM eMOV32RtoM<_EmitterId_>
#define MOV32MtoR eMOV32MtoR<_EmitterId_>
#define MOV32RmtoR eMOV32RmtoR<_EmitterId_>
#define MOV32RmtoROffset eMOV32RmtoROffset<_EmitterId_>
#define MOV32RmtoR eMOV32RmtoR<_EmitterId_>
#define MOV32RmStoR eMOV32RmStoR<_EmitterId_>
#define MOV32RmSOffsettoR eMOV32RmSOffsettoR<_EmitterId_>
#define MOV32RtoRm eMOV32RtoRm<_EmitterId_>
#define MOV32RtoRmS eMOV32RtoRmS<_EmitterId_>
#define MOV32ItoR eMOV32ItoR<_EmitterId_>
#define MOV32ItoM eMOV32ItoM<_EmitterId_>
#define MOV32ItoRmOffset eMOV32ItoRmOffset<_EmitterId_>
#define MOV32RtoRmOffset eMOV32RtoRmOffset<_EmitterId_>
#define MOV32ItoRm eMOV32ItoRm<_EmitterId_>
#define MOV32RtoRm eMOV32RtoRm<_EmitterId_>
#define MOV16RtoM eMOV16RtoM<_EmitterId_>
#define MOV16MtoR eMOV16MtoR<_EmitterId_>
#define MOV16RmtoR eMOV16RmtoR<_EmitterId_>
#define MOV16RmtoROffset eMOV16RmtoROffset<_EmitterId_>
#define MOV16RmtoR eMOV16RmtoR<_EmitterId_>
#define MOV16RmSOffsettoR eMOV16RmSOffsettoR<_EmitterId_>
#define MOV16RtoRm eMOV16RtoRm<_EmitterId_>
#define MOV16ItoM eMOV16ItoM<_EmitterId_>
#define MOV16RtoRmS eMOV16RtoRmS<_EmitterId_>
#define MOV16ItoR eMOV16ItoR<_EmitterId_>
#define MOV16ItoRmOffset eMOV16ItoRmOffset<_EmitterId_>
#define MOV16RtoRmOffset eMOV16RtoRmOffset<_EmitterId_>
#define MOV16ItoRm eMOV16ItoRm<_EmitterId_>
#define MOV16RtoRm eMOV16RtoRm<_EmitterId_>
#define MOV8RtoM eMOV8RtoM<_EmitterId_>
#define MOV8MtoR eMOV8MtoR<_EmitterId_>
#define MOV8RmtoR eMOV8RmtoR<_EmitterId_>
#define MOV8RmtoROffset eMOV8RmtoROffset<_EmitterId_>
#define MOV8RmtoR eMOV8RmtoR<_EmitterId_>
#define MOV8RmSOffsettoR eMOV8RmSOffsettoR<_EmitterId_>
#define MOV8RtoRm eMOV8RtoRm<_EmitterId_>
#define MOV8ItoM eMOV8ItoM<_EmitterId_>
#define MOV8ItoR eMOV8ItoR<_EmitterId_>
#define MOV8ItoRmOffset eMOV8ItoRmOffset<_EmitterId_>
#define MOV8RtoRmOffset eMOV8RtoRmOffset<_EmitterId_>
#define MOV8ItoRm eMOV8ItoRm<_EmitterId_>
#define MOV8RtoRm eMOV8RtoRm<_EmitterId_>
#define MOVSX32R8toR eMOVSX32R8toR<_EmitterId_>
#define MOVSX32Rm8toR eMOVSX32Rm8toR<_EmitterId_>
#define MOVSX32Rm8toROffset eMOVSX32Rm8toROffset<_EmitterId_>
#define MOVSX32M8toR eMOVSX32M8toR<_EmitterId_>
#define MOVSX32R16toR eMOVSX32R16toR<_EmitterId_>
#define MOVSX32Rm16toR eMOVSX32Rm16toR<_EmitterId_>
#define MOVSX32Rm16toROffset eMOVSX32Rm16toROffset<_EmitterId_>
#define MOVSX32M16toR eMOVSX32M16toR<_EmitterId_>
#define MOVZX32R8toR eMOVZX32R8toR<_EmitterId_>
#define MOVZX32Rm8toR eMOVZX32Rm8toR<_EmitterId_>
#define MOVZX32Rm8toROffset eMOVZX32Rm8toROffset<_EmitterId_>
#define MOVZX32M8toR eMOVZX32M8toR<_EmitterId_>
#define MOVZX32R16toR eMOVZX32R16toR<_EmitterId_>
#define MOVZX32Rm16toR eMOVZX32Rm16toR<_EmitterId_>
#define MOVZX32Rm16toROffset eMOVZX32Rm16toROffset<_EmitterId_>
#define MOVZX32M16toR eMOVZX32M16toR<_EmitterId_>
#define CMOVBE32RtoR eCMOVBE32RtoR<_EmitterId_>
#define CMOVBE32MtoR eCMOVBE32MtoR<_EmitterId_>
@ -147,12 +132,10 @@
//------------------------------------------------------------------
// arithmetic instructions
//------------------------------------------------------------------
#define ADD64ItoR eADD64ItoR<_EmitterId_>
#define ADD64MtoR eADD64MtoR<_EmitterId_>
#define ADD32ItoEAX eADD32ItoEAX<_EmitterId_>
#define ADD32ItoR eADD32ItoR<_EmitterId_>
#define ADD32ItoM eADD32ItoM<_EmitterId_>
#define ADD32ItoRmOffset eADD32ItoRmOffset<_EmitterId_>
#define ADD32ItoRm eADD32ItoRm<_EmitterId_>
#define ADD32RtoR eADD32RtoR<_EmitterId_>
#define ADD32RtoM eADD32RtoM<_EmitterId_>
#define ADD32MtoR eADD32MtoR<_EmitterId_>
@ -171,7 +154,6 @@
#define INC32M eINC32M<_EmitterId_>
#define INC16R eINC16R<_EmitterId_>
#define INC16M eINC16M<_EmitterId_>
#define SUB64MtoR eSUB64MtoR<_EmitterId_>
#define SUB32ItoR eSUB32ItoR<_EmitterId_>
#define SUB32ItoM eSUB32ItoM<_EmitterId_>
#define SUB32RtoR eSUB32RtoR<_EmitterId_>
@ -181,7 +163,6 @@
#define SUB16ItoR eSUB16ItoR<_EmitterId_>
#define SUB16ItoM eSUB16ItoM<_EmitterId_>
#define SUB16MtoR eSUB16MtoR<_EmitterId_>
#define SBB64RtoR eSBB64RtoR<_EmitterId_>
#define SBB32ItoR eSBB32ItoR<_EmitterId_>
#define SBB32ItoM eSBB32ItoM<_EmitterId_>
#define SBB32RtoR eSBB32RtoR<_EmitterId_>
@ -203,12 +184,6 @@
//------------------------------------------------------------------
// shifting instructions
//------------------------------------------------------------------
#define SHL64ItoR eSHL64ItoR<_EmitterId_>
#define SHL64CLtoR eSHL64CLtoR<_EmitterId_>
#define SHR64ItoR eSHR64ItoR<_EmitterId_>
#define SHR64CLtoR eSHR64CLtoR<_EmitterId_>
#define SAR64ItoR eSAR64ItoR<_EmitterId_>
#define SAR64CLtoR eSAR64CLtoR<_EmitterId_>
#define SHL32ItoR eSHL32ItoR<_EmitterId_>
#define SHL32ItoM eSHL32ItoM<_EmitterId_>
#define SHL32CLtoR eSHL32CLtoR<_EmitterId_>
@ -231,10 +206,6 @@
//------------------------------------------------------------------
// logical instructions
//------------------------------------------------------------------
#define OR64ItoR eOR64ItoR<_EmitterId_>
#define OR64MtoR eOR64MtoR<_EmitterId_>
#define OR64RtoR eOR64RtoR<_EmitterId_>
#define OR64RtoM eOR64RtoM<_EmitterId_>
#define OR32ItoR eOR32ItoR<_EmitterId_>
#define OR32ItoM eOR32ItoM<_EmitterId_>
#define OR32RtoR eOR32RtoR<_EmitterId_>
@ -249,11 +220,6 @@
#define OR8RtoM eOR8RtoM<_EmitterId_>
#define OR8ItoM eOR8ItoM<_EmitterId_>
#define OR8MtoR eOR8MtoR<_EmitterId_>
#define XOR64ItoR eXOR64ItoR<_EmitterId_>
#define XOR64RtoR eXOR64RtoR<_EmitterId_>
#define XOR64MtoR eXOR64MtoR<_EmitterId_>
#define XOR64RtoR eXOR64RtoR<_EmitterId_>
#define XOR64RtoM eXOR64RtoM<_EmitterId_>
#define XOR32ItoR eXOR32ItoR<_EmitterId_>
#define XOR32ItoM eXOR32ItoM<_EmitterId_>
#define XOR32RtoR eXOR32RtoR<_EmitterId_>
@ -262,11 +228,6 @@
#define XOR32MtoR eXOR32MtoR<_EmitterId_>
#define XOR16RtoM eXOR16RtoM<_EmitterId_>
#define XOR16ItoR eXOR16ItoR<_EmitterId_>
#define AND64I32toR eAND64I32toR<_EmitterId_>
#define AND64MtoR eAND64MtoR<_EmitterId_>
#define AND64RtoM eAND64RtoM<_EmitterId_>
#define AND64RtoR eAND64RtoR<_EmitterId_>
#define AND64I32toM eAND64I32toM<_EmitterId_>
#define AND32ItoR eAND32ItoR<_EmitterId_>
#define AND32I8toR eAND32I8toR<_EmitterId_>
#define AND32ItoM eAND32ItoM<_EmitterId_>
@ -275,7 +236,7 @@
#define AND32RtoM eAND32RtoM<_EmitterId_>
#define AND32MtoR eAND32MtoR<_EmitterId_>
#define AND32RmtoR eAND32RmtoR<_EmitterId_>
#define AND32RmtoROffset eAND32RmtoROffset<_EmitterId_>
#define AND32RmtoR eAND32RmtoR<_EmitterId_>
#define AND16RtoR eAND16RtoR<_EmitterId_>
#define AND16ItoR eAND16ItoR<_EmitterId_>
#define AND16ItoM eAND16ItoM<_EmitterId_>
@ -286,11 +247,8 @@
#define AND8RtoM eAND8RtoM<_EmitterId_>
#define AND8MtoR eAND8MtoR<_EmitterId_>
#define AND8RtoR eAND8RtoR<_EmitterId_>
#define BTS32MtoR eBTS32MtoR<_EmitterId_>
#define NOT64R eNOT64R<_EmitterId_>
#define NOT32R eNOT32R<_EmitterId_>
#define NOT32M eNOT32M<_EmitterId_>
#define NEG64R eNEG64R<_EmitterId_>
#define NEG32R eNEG32R<_EmitterId_>
#define NEG32M eNEG32M<_EmitterId_>
#define NEG16R eNEG16R<_EmitterId_>
@ -350,15 +308,13 @@
//------------------------------------------------------------------
// misc instructions
//------------------------------------------------------------------
#define CMP64I32toR eCMP64I32toR<_EmitterId_>
#define CMP64MtoR eCMP64MtoR<_EmitterId_>
#define CMP64RtoR eCMP64RtoR<_EmitterId_>
#define CMP32ItoR eCMP32ItoR<_EmitterId_>
#define CMP32ItoM eCMP32ItoM<_EmitterId_>
#define CMP32RtoR eCMP32RtoR<_EmitterId_>
#define CMP32MtoR eCMP32MtoR<_EmitterId_>
#define CMP32ItoRm eCMP32ItoRm<_EmitterId_>
#define CMP8I8toRm eCMP8I8toRm<_EmitterId_>
#define CMP32I8toRm eCMP32I8toRm<_EmitterId_>
#define CMP32I8toRmOffset8 eCMP32I8toRmOffset8<_EmitterId_>
#define CMP32I8toM eCMP32I8toM<_EmitterId_>
#define CMP16ItoR eCMP16ItoR<_EmitterId_>
#define CMP16ItoM eCMP16ItoM<_EmitterId_>
@ -540,16 +496,16 @@
#define PUNPCKHDQMtoR ePUNPCKHDQMtoR<_EmitterId_>
#define MOVQ64ItoR eMOVQ64ItoR<_EmitterId_>
#define MOVQRtoR eMOVQRtoR<_EmitterId_>
#define MOVQRmtoROffset eMOVQRmtoROffset<_EmitterId_>
#define MOVQRtoRmOffset eMOVQRtoRmOffset<_EmitterId_>
#define MOVQRmtoR eMOVQRmtoR<_EmitterId_>
#define MOVQRtoRm eMOVQRtoRm<_EmitterId_>
#define MOVDMtoMMX eMOVDMtoMMX<_EmitterId_>
#define MOVDMMXtoM eMOVDMMXtoM<_EmitterId_>
#define MOVD32RtoMMX eMOVD32RtoMMX<_EmitterId_>
#define MOVD32RmtoMMX eMOVD32RmtoMMX<_EmitterId_>
#define MOVD32RmOffsettoMMX eMOVD32RmOffsettoMMX<_EmitterId_>
#define MOVD32RmtoMMX eMOVD32RmtoMMX<_EmitterId_>
#define MOVD32MMXtoR eMOVD32MMXtoR<_EmitterId_>
#define MOVD32MMXtoRm eMOVD32MMXtoRm<_EmitterId_>
#define MOVD32MMXtoRmOffset eMOVD32MMXtoRmOffset<_EmitterId_>
#define MOVD32MMXtoRm eMOVD32MMXtoRm<_EmitterId_>
#define PINSRWRtoMMX ePINSRWRtoMMX<_EmitterId_>
#define PSHUFWRtoR ePSHUFWRtoR<_EmitterId_>
#define PSHUFWMtoR ePSHUFWMtoR<_EmitterId_>
@ -576,33 +532,31 @@
#define SSE_MOVSS_XMM_to_M32 eSSE_MOVSS_XMM_to_M32<_EmitterId_>
#define SSE_MOVSS_XMM_to_Rm eSSE_MOVSS_XMM_to_Rm<_EmitterId_>
#define SSE_MOVSS_XMM_to_XMM eSSE_MOVSS_XMM_to_XMM<_EmitterId_>
#define SSE_MOVSS_RmOffset_to_XMM eSSE_MOVSS_RmOffset_to_XMM<_EmitterId_>
#define SSE_MOVSS_XMM_to_RmOffset eSSE_MOVSS_XMM_to_RmOffset<_EmitterId_>
#define SSE_MOVSS_Rm_to_XMM eSSE_MOVSS_Rm_to_XMM<_EmitterId_>
#define SSE_MOVSS_XMM_to_Rm eSSE_MOVSS_XMM_to_Rm<_EmitterId_>
#define SSE_MASKMOVDQU_XMM_to_XMM eSSE_MASKMOVDQU_XMM_to_XMM<_EmitterId_>
#define SSE_MOVLPS_M64_to_XMM eSSE_MOVLPS_M64_to_XMM<_EmitterId_>
#define SSE_MOVLPS_XMM_to_M64 eSSE_MOVLPS_XMM_to_M64<_EmitterId_>
#define SSE_MOVLPS_RmOffset_to_XMM eSSE_MOVLPS_RmOffset_to_XMM<_EmitterId_>
#define SSE_MOVLPS_XMM_to_RmOffset eSSE_MOVLPS_XMM_to_RmOffset<_EmitterId_>
#define SSE_MOVLPS_Rm_to_XMM eSSE_MOVLPS_Rm_to_XMM<_EmitterId_>
#define SSE_MOVLPS_XMM_to_Rm eSSE_MOVLPS_XMM_to_Rm<_EmitterId_>
#define SSE_MOVHPS_M64_to_XMM eSSE_MOVHPS_M64_to_XMM<_EmitterId_>
#define SSE_MOVHPS_XMM_to_M64 eSSE_MOVHPS_XMM_to_M64<_EmitterId_>
#define SSE_MOVHPS_RmOffset_to_XMM eSSE_MOVHPS_RmOffset_to_XMM<_EmitterId_>
#define SSE_MOVHPS_XMM_to_RmOffset eSSE_MOVHPS_XMM_to_RmOffset<_EmitterId_>
#define SSE_MOVHPS_Rm_to_XMM eSSE_MOVHPS_Rm_to_XMM<_EmitterId_>
#define SSE_MOVHPS_XMM_to_Rm eSSE_MOVHPS_XMM_to_Rm<_EmitterId_>
#define SSE_MOVLHPS_XMM_to_XMM eSSE_MOVLHPS_XMM_to_XMM<_EmitterId_>
#define SSE_MOVHLPS_XMM_to_XMM eSSE_MOVHLPS_XMM_to_XMM<_EmitterId_>
#define SSE_MOVLPSRmtoR eSSE_MOVLPSRmtoR<_EmitterId_>
#define SSE_MOVLPSRmtoROffset eSSE_MOVLPSRmtoROffset<_EmitterId_>
#define SSE_MOVLPSRtoRm eSSE_MOVLPSRtoRm<_EmitterId_>
#define SSE_MOVLPSRtoRmOffset eSSE_MOVLPSRtoRmOffset<_EmitterId_>
#define SSE_MOVAPSRmStoR eSSE_MOVAPSRmStoR<_EmitterId_>
#define SSE_MOVAPSRtoRmS eSSE_MOVAPSRtoRmS<_EmitterId_>
#define SSE_MOVAPSRtoRmOffset eSSE_MOVAPSRtoRmOffset<_EmitterId_>
#define SSE_MOVAPSRmtoROffset eSSE_MOVAPSRmtoROffset<_EmitterId_>
#define SSE_MOVAPSRtoRm eSSE_MOVAPSRtoRm<_EmitterId_>
#define SSE_MOVAPSRmtoR eSSE_MOVAPSRmtoR<_EmitterId_>
#define SSE_MOVUPSRmStoR eSSE_MOVUPSRmStoR<_EmitterId_>
#define SSE_MOVUPSRtoRmS eSSE_MOVUPSRtoRmS<_EmitterId_>
#define SSE_MOVUPSRtoRm eSSE_MOVUPSRtoRm<_EmitterId_>
#define SSE_MOVUPSRmtoR eSSE_MOVUPSRmtoR<_EmitterId_>
#define SSE_MOVUPSRmtoROffset eSSE_MOVUPSRmtoROffset<_EmitterId_>
#define SSE_MOVUPSRtoRmOffset eSSE_MOVUPSRtoRmOffset<_EmitterId_>
#define SSE_MOVUPSRmtoR eSSE_MOVUPSRmtoR<_EmitterId_>
#define SSE_MOVUPSRtoRm eSSE_MOVUPSRtoRm<_EmitterId_>
#define SSE_RCPPS_XMM_to_XMM eSSE_RCPPS_XMM_to_XMM<_EmitterId_>
#define SSE_RCPPS_M128_to_XMM eSSE_RCPPS_M128_to_XMM<_EmitterId_>
#define SSE_RCPSS_XMM_to_XMM eSSE_RCPSS_XMM_to_XMM<_EmitterId_>
@ -677,7 +631,7 @@
#define SSE_UNPCKHPS_XMM_to_XMM eSSE_UNPCKHPS_XMM_to_XMM<_EmitterId_>
#define SSE_SHUFPS_XMM_to_XMM eSSE_SHUFPS_XMM_to_XMM<_EmitterId_>
#define SSE_SHUFPS_M128_to_XMM eSSE_SHUFPS_M128_to_XMM<_EmitterId_>
#define SSE_SHUFPS_RmOffset_to_XMM eSSE_SHUFPS_RmOffset_to_XMM<_EmitterId_>
#define SSE_SHUFPS_Rm_to_XMM eSSE_SHUFPS_Rm_to_XMM<_EmitterId_>
#define SSE_CMPEQPS_M128_to_XMM eSSE_CMPEQPS_M128_to_XMM<_EmitterId_>
#define SSE_CMPEQPS_XMM_to_XMM eSSE_CMPEQPS_XMM_to_XMM<_EmitterId_>
#define SSE_CMPLTPS_M128_to_XMM eSSE_CMPLTPS_M128_to_XMM<_EmitterId_>
@ -781,8 +735,8 @@
#define SSE2_MOVQ_XMM_to_M64 eSSE2_MOVQ_XMM_to_M64<_EmitterId_>
#define SSE2_MOVDQ2Q_XMM_to_MM eSSE2_MOVDQ2Q_XMM_to_MM<_EmitterId_>
#define SSE2_MOVQ2DQ_MM_to_XMM eSSE2_MOVQ2DQ_MM_to_XMM<_EmitterId_>
#define SSE2_MOVDQARtoRmOffset eSSE2_MOVDQARtoRmOffset<_EmitterId_>
#define SSE2_MOVDQARmtoROffset eSSE2_MOVDQARmtoROffset<_EmitterId_>
#define SSE2_MOVDQARtoRm eSSE2_MOVDQARtoRm<_EmitterId_>
#define SSE2_MOVDQARmtoR eSSE2_MOVDQARmtoR<_EmitterId_>
#define SSE2_CVTDQ2PS_M128_to_XMM eSSE2_CVTDQ2PS_M128_to_XMM<_EmitterId_>
#define SSE2_CVTDQ2PS_XMM_to_XMM eSSE2_CVTDQ2PS_XMM_to_XMM<_EmitterId_>
#define SSE2_CVTPS2DQ_M128_to_XMM eSSE2_CVTPS2DQ_M128_to_XMM<_EmitterId_>
@ -921,11 +875,11 @@
#define SSE2_MOVD_M32_to_XMM eSSE2_MOVD_M32_to_XMM<_EmitterId_>
#define SSE2_MOVD_R_to_XMM eSSE2_MOVD_R_to_XMM<_EmitterId_>
#define SSE2_MOVD_Rm_to_XMM eSSE2_MOVD_Rm_to_XMM<_EmitterId_>
#define SSE2_MOVD_RmOffset_to_XMM eSSE2_MOVD_RmOffset_to_XMM<_EmitterId_>
#define SSE2_MOVD_Rm_to_XMM eSSE2_MOVD_Rm_to_XMM<_EmitterId_>
#define SSE2_MOVD_XMM_to_M32 eSSE2_MOVD_XMM_to_M32<_EmitterId_>
#define SSE2_MOVD_XMM_to_R eSSE2_MOVD_XMM_to_R<_EmitterId_>
#define SSE2_MOVD_XMM_to_Rm eSSE2_MOVD_XMM_to_Rm<_EmitterId_>
#define SSE2_MOVD_XMM_to_RmOffset eSSE2_MOVD_XMM_to_RmOffset<_EmitterId_>
#define SSE2_MOVD_XMM_to_Rm eSSE2_MOVD_XMM_to_Rm<_EmitterId_>
#define SSE2_MOVQ_XMM_to_R eSSE2_MOVQ_XMM_to_R<_EmitterId_>
#define SSE2_MOVQ_R_to_XMM eSSE2_MOVQ_R_to_XMM<_EmitterId_>
//------------------------------------------------------------------

pcsx2/x86/ix86/ix86_mmx.cpp Normal file

@ -0,0 +1,584 @@
/* Pcsx2 - Pc Ps2 Emulator
* Copyright (C) 2002-2009 Pcsx2 Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#include "PrecompiledHeader.h"
#include "ix86_internal.h"
//------------------------------------------------------------------
// MMX instructions
//
// note: r64 = mm
//------------------------------------------------------------------
/* movq m64 to r64 */
emitterT void MOVQMtoR( x86MMXRegType to, uptr from )
{
write16( 0x6F0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
/* movq r64 to m64 */
emitterT void MOVQRtoM( uptr to, x86MMXRegType from )
{
write16( 0x7F0F );
ModRM( 0, from, DISP32 );
write32(MEMADDR(to, 4));
}
/* pand r64 to r64 */
emitterT void PANDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xDB0F );
ModRM( 3, to, from );
}
emitterT void PANDNRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xDF0F );
ModRM( 3, to, from );
}
/* por r64 to r64 */
emitterT void PORRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xEB0F );
ModRM( 3, to, from );
}
/* pxor r64 to r64 */
emitterT void PXORRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xEF0F );
ModRM( 3, to, from );
}
/* psllq r64 to r64 */
emitterT void PSLLQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xF30F );
ModRM( 3, to, from );
}
/* psllq m64 to r64 */
emitterT void PSLLQMtoR( x86MMXRegType to, uptr from )
{
write16( 0xF30F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
/* psllq imm8 to r64 */
emitterT void PSLLQItoR( x86MMXRegType to, u8 from )
{
write16( 0x730F );
ModRM( 3, 6, to);
write8( from );
}
/* psrlq r64 to r64 */
emitterT void PSRLQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xD30F );
ModRM( 3, to, from );
}
/* psrlq m64 to r64 */
emitterT void PSRLQMtoR( x86MMXRegType to, uptr from )
{
write16( 0xD30F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
/* psrlq imm8 to r64 */
emitterT void PSRLQItoR( x86MMXRegType to, u8 from )
{
write16( 0x730F );
ModRM( 3, 2, to);
write8( from );
}
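// For the MMX shift-by-immediate group (0F 71/72/73), the ModRM reg field picks
// the operation: /2 = logical shift right, /4 = arithmetic shift right (PSRAW/D
// below), /6 = shift left -- which is why ModRM() is passed 2 or 6 above.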
/* paddusb r64 to r64 */
emitterT void PADDUSBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xDC0F );
ModRM( 3, to, from );
}
/* paddusb m64 to r64 */
emitterT void PADDUSBMtoR( x86MMXRegType to, uptr from )
{
write16( 0xDC0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
/* paddusw r64 to r64 */
emitterT void PADDUSWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xDD0F );
ModRM( 3, to, from );
}
/* paddusw m64 to r64 */
emitterT void PADDUSWMtoR( x86MMXRegType to, uptr from )
{
write16( 0xDD0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
/* paddb r64 to r64 */
emitterT void PADDBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xFC0F );
ModRM( 3, to, from );
}
/* paddb m64 to r64 */
emitterT void PADDBMtoR( x86MMXRegType to, uptr from )
{
write16( 0xFC0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
/* paddw r64 to r64 */
emitterT void PADDWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xFD0F );
ModRM( 3, to, from );
}
/* paddw m64 to r64 */
emitterT void PADDWMtoR( x86MMXRegType to, uptr from )
{
write16( 0xFD0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
/* paddd r64 to r64 */
emitterT void PADDDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xFE0F );
ModRM( 3, to, from );
}
/* paddd m64 to r64 */
emitterT void PADDDMtoR( x86MMXRegType to, uptr from )
{
write16( 0xFE0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
/* emms */
emitterT void EMMS()
{
write16( 0x770F );
}
emitterT void PADDSBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xEC0F );
ModRM( 3, to, from );
}
emitterT void PADDSWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xED0F );
ModRM( 3, to, from );
}
// paddq m64 to r64 (SSE2)
emitterT void PADDQMtoR( x86MMXRegType to, uptr from )
{
write16( 0xD40F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
// paddq r64 to r64 (SSE2)
emitterT void PADDQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xD40F );
ModRM( 3, to, from );
}
emitterT void PSUBSBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xE80F );
ModRM( 3, to, from );
}
emitterT void PSUBSWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xE90F );
ModRM( 3, to, from );
}
emitterT void PSUBBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xF80F );
ModRM( 3, to, from );
}
emitterT void PSUBWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xF90F );
ModRM( 3, to, from );
}
emitterT void PSUBDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xFA0F );
ModRM( 3, to, from );
}
emitterT void PSUBDMtoR( x86MMXRegType to, uptr from )
{
write16( 0xFA0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
emitterT void PSUBUSBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xD80F );
ModRM( 3, to, from );
}
emitterT void PSUBUSWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xD90F );
ModRM( 3, to, from );
}
// psubq m64 to r64 (SSE2)
emitterT void PSUBQMtoR( x86MMXRegType to, uptr from )
{
write16( 0xFB0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
// psubq r64 to r64 (SSE2)
emitterT void PSUBQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xFB0F );
ModRM( 3, to, from );
}
// pmuludq m64 to r64 (SSE2)
emitterT void PMULUDQMtoR( x86MMXRegType to, uptr from )
{
write16( 0xF40F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
// pmuludq r64 to r64 (SSE2)
emitterT void PMULUDQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xF40F );
ModRM( 3, to, from );
}
emitterT void PCMPEQBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0x740F );
ModRM( 3, to, from );
}
emitterT void PCMPEQWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0x750F );
ModRM( 3, to, from );
}
emitterT void PCMPEQDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0x760F );
ModRM( 3, to, from );
}
emitterT void PCMPEQDMtoR( x86MMXRegType to, uptr from )
{
write16( 0x760F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
emitterT void PCMPGTBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0x640F );
ModRM( 3, to, from );
}
emitterT void PCMPGTWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0x650F );
ModRM( 3, to, from );
}
emitterT void PCMPGTDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0x660F );
ModRM( 3, to, from );
}
emitterT void PCMPGTDMtoR( x86MMXRegType to, uptr from )
{
write16( 0x660F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
emitterT void PSRLWItoR( x86MMXRegType to, u8 from )
{
write16( 0x710F );
ModRM( 3, 2 , to );
write8( from );
}
emitterT void PSRLDItoR( x86MMXRegType to, u8 from )
{
write16( 0x720F );
ModRM( 3, 2 , to );
write8( from );
}
emitterT void PSRLDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xD20F );
ModRM( 3, to, from );
}
emitterT void PSLLWItoR( x86MMXRegType to, u8 from )
{
write16( 0x710F );
ModRM( 3, 6 , to );
write8( from );
}
emitterT void PSLLDItoR( x86MMXRegType to, u8 from )
{
write16( 0x720F );
ModRM( 3, 6 , to );
write8( from );
}
emitterT void PSLLDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xF20F );
ModRM( 3, to, from );
}
emitterT void PSRAWItoR( x86MMXRegType to, u8 from )
{
write16( 0x710F );
ModRM( 3, 4 , to );
write8( from );
}
emitterT void PSRADItoR( x86MMXRegType to, u8 from )
{
write16( 0x720F );
ModRM( 3, 4 , to );
write8( from );
}
emitterT void PSRADRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0xE20F );
ModRM( 3, to, from );
}
/* por m64 to r64 */
emitterT void PORMtoR( x86MMXRegType to, uptr from )
{
write16( 0xEB0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
/* pxor m64 to r64 */
emitterT void PXORMtoR( x86MMXRegType to, uptr from )
{
write16( 0xEF0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
/* pand m64 to r64 */
emitterT void PANDMtoR( x86MMXRegType to, uptr from )
{
//u64 rip = (u64)x86Ptr + 7;
write16( 0xDB0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
emitterT void PANDNMtoR( x86MMXRegType to, uptr from )
{
write16( 0xDF0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
emitterT void PUNPCKHDQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0x6A0F );
ModRM( 3, to, from );
}
emitterT void PUNPCKHDQMtoR( x86MMXRegType to, uptr from )
{
write16( 0x6A0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
emitterT void PUNPCKLDQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0x620F );
ModRM( 3, to, from );
}
emitterT void PUNPCKLDQMtoR( x86MMXRegType to, uptr from )
{
write16( 0x620F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
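// Loads a 64-bit immediate by embedding it in the code stream: the movq below
// is 7 bytes and the JMP8 is 2, so x86Ptr + 2 + 7 is the address of the constant
// that write64() places right after the jump (which skips over it).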
emitterT void MOVQ64ItoR( x86MMXRegType reg, u64 i )
{
MOVQMtoR( reg, ( uptr )(x86Ptr) + 2 + 7 );
JMP8( 8 );
write64( i );
}
emitterT void MOVQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16( 0x6F0F );
ModRM( 3, to, from );
}
emitterT void MOVQRmtoR( x86MMXRegType to, x86IntRegType from, int offset )
{
write16( 0x6F0F );
WriteRmOffsetFrom( to, from, offset );
}
emitterT void MOVQRtoRm( x86IntRegType to, x86MMXRegType from, int offset )
{
write16( 0x7F0F );
WriteRmOffsetFrom( from, to, offset );
}
/* movd m32 to r64 */
emitterT void MOVDMtoMMX( x86MMXRegType to, uptr from )
{
write16( 0x6E0F );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
}
/* movd r64 to m32 */
emitterT void MOVDMMXtoM( uptr to, x86MMXRegType from )
{
write16( 0x7E0F );
ModRM( 0, from, DISP32 );
write32( MEMADDR(to, 4) );
}
emitterT void MOVD32RtoMMX( x86MMXRegType to, x86IntRegType from )
{
write16( 0x6E0F );
ModRM( 3, to, from );
}
emitterT void MOVD32RmtoMMX( x86MMXRegType to, x86IntRegType from, int offset )
{
write16( 0x6E0F );
WriteRmOffsetFrom( to, from, offset );
}
emitterT void MOVD32MMXtoR( x86IntRegType to, x86MMXRegType from )
{
write16( 0x7E0F );
ModRM( 3, from, to );
}
emitterT void MOVD32MMXtoRm( x86IntRegType to, x86MMXRegType from, int offset )
{
write16( 0x7E0F );
WriteRmOffsetFrom( from, to, offset );
}
// untested
emitterT void PACKSSWBMMXtoMMX(x86MMXRegType to, x86MMXRegType from)
{
write16( 0x630F );
ModRM( 3, to, from );
}
emitterT void PACKSSDWMMXtoMMX(x86MMXRegType to, x86MMXRegType from)
{
write16( 0x6B0F );
ModRM( 3, to, from );
}
emitterT void PMOVMSKBMMXtoR(x86IntRegType to, x86MMXRegType from)
{
write16( 0xD70F );
ModRM( 3, to, from );
}
emitterT void PINSRWRtoMMX( x86MMXRegType to, x86SSERegType from, u8 imm8 )
{
if (to > 7 || from > 7) Rex(1, to >> 3, 0, from >> 3);
write16( 0xc40f );
ModRM( 3, to, from );
write8( imm8 );
}
emitterT void PSHUFWRtoR(x86MMXRegType to, x86MMXRegType from, u8 imm8)
{
write16(0x700f);
ModRM( 3, to, from );
write8(imm8);
}
emitterT void PSHUFWMtoR(x86MMXRegType to, uptr from, u8 imm8)
{
write16( 0x700f );
ModRM( 0, to, DISP32 );
write32( MEMADDR(from, 4) );
write8(imm8);
}
emitterT void MASKMOVQRtoR(x86MMXRegType to, x86MMXRegType from)
{
write16(0xf70f);
ModRM( 3, to, from );
}


@ -1,647 +0,0 @@
/* Pcsx2 - Pc Ps2 Emulator
* Copyright (C) 2002-2009 Pcsx2 Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#pragma once
//------------------------------------------------------------------
// MMX instructions
//
// note: r64 = mm
//------------------------------------------------------------------
/* movq m64 to r64 */
emitterT void eMOVQMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0x6F0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* movq r64 to m64 */
emitterT void eMOVQRtoM( uptr to, x86MMXRegType from )
{
write16<I>( 0x7F0F );
ModRM<I>( 0, from, DISP32 );
write32<I>(MEMADDR(to, 4));
}
/* pand r64 to r64 */
emitterT void ePANDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xDB0F );
ModRM<I>( 3, to, from );
}
emitterT void ePANDNRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xDF0F );
ModRM<I>( 3, to, from );
}
/* por r64 to r64 */
emitterT void ePORRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xEB0F );
ModRM<I>( 3, to, from );
}
/* pxor r64 to r64 */
emitterT void ePXORRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xEF0F );
ModRM<I>( 3, to, from );
}
/* psllq r64 to r64 */
emitterT void ePSLLQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xF30F );
ModRM<I>( 3, to, from );
}
/* psllq m64 to r64 */
emitterT void ePSLLQMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xF30F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* psllq imm8 to r64 */
emitterT void ePSLLQItoR( x86MMXRegType to, u8 from )
{
write16<I>( 0x730F );
ModRM<I>( 3, 6, to);
write8<I>( from );
}
/* psrlq r64 to r64 */
emitterT void ePSRLQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xD30F );
ModRM<I>( 3, to, from );
}
/* psrlq m64 to r64 */
emitterT void ePSRLQMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xD30F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* psrlq imm8 to r64 */
emitterT void ePSRLQItoR( x86MMXRegType to, u8 from )
{
write16<I>( 0x730F );
ModRM<I>( 3, 2, to);
write8<I>( from );
}
/* paddusb r64 to r64 */
emitterT void ePADDUSBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xDC0F );
ModRM<I>( 3, to, from );
}
/* paddusb m64 to r64 */
emitterT void ePADDUSBMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xDC0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* paddusw r64 to r64 */
emitterT void ePADDUSWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xDD0F );
ModRM<I>( 3, to, from );
}
/* paddusw m64 to r64 */
emitterT void ePADDUSWMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xDD0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* paddb r64 to r64 */
emitterT void ePADDBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xFC0F );
ModRM<I>( 3, to, from );
}
/* paddb m64 to r64 */
emitterT void ePADDBMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xFC0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* paddw r64 to r64 */
emitterT void ePADDWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xFD0F );
ModRM<I>( 3, to, from );
}
/* paddw m64 to r64 */
emitterT void ePADDWMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xFD0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* paddd r64 to r64 */
emitterT void ePADDDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xFE0F );
ModRM<I>( 3, to, from );
}
/* paddd m64 to r64 */
emitterT void ePADDDMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xFE0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* emms */
emitterT void eEMMS()
{
write16<I>( 0x770F );
}
emitterT void ePADDSBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xEC0F );
ModRM<I>( 3, to, from );
}
emitterT void ePADDSWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xED0F );
ModRM<I>( 3, to, from );
}
// paddq m64 to r64 (sse2 only?)
emitterT void ePADDQMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xD40F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
// paddq r64 to r64 (sse2 only?)
emitterT void ePADDQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xD40F );
ModRM<I>( 3, to, from );
}
emitterT void ePSUBSBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xE80F );
ModRM<I>( 3, to, from );
}
emitterT void ePSUBSWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xE90F );
ModRM<I>( 3, to, from );
}
emitterT void ePSUBBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xF80F );
ModRM<I>( 3, to, from );
}
emitterT void ePSUBWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xF90F );
ModRM<I>( 3, to, from );
}
emitterT void ePSUBDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xFA0F );
ModRM<I>( 3, to, from );
}
emitterT void ePSUBDMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xFA0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
emitterT void ePSUBUSBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xD80F );
ModRM<I>( 3, to, from );
}
emitterT void ePSUBUSWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xD90F );
ModRM<I>( 3, to, from );
}
// psubq m64 to r64 (sse2 only?)
emitterT void ePSUBQMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xFB0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
// psubq r64 to r64 (sse2 only?)
emitterT void ePSUBQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xFB0F );
ModRM<I>( 3, to, from );
}
// pmuludq m64 to r64 (sse2 only?)
emitterT void ePMULUDQMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xF40F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
// pmuludq r64 to r64 (sse2 only?)
emitterT void ePMULUDQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xF40F );
ModRM<I>( 3, to, from );
}
emitterT void ePCMPEQBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0x740F );
ModRM<I>( 3, to, from );
}
emitterT void ePCMPEQWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0x750F );
ModRM<I>( 3, to, from );
}
emitterT void ePCMPEQDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0x760F );
ModRM<I>( 3, to, from );
}
emitterT void ePCMPEQDMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0x760F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
emitterT void ePCMPGTBRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0x640F );
ModRM<I>( 3, to, from );
}
emitterT void ePCMPGTWRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0x650F );
ModRM<I>( 3, to, from );
}
emitterT void ePCMPGTDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0x660F );
ModRM<I>( 3, to, from );
}
emitterT void ePCMPGTDMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0x660F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
emitterT void ePSRLWItoR( x86MMXRegType to, u8 from )
{
write16<I>( 0x710F );
ModRM<I>( 3, 2 , to );
write8<I>( from );
}
emitterT void ePSRLDItoR( x86MMXRegType to, u8 from )
{
write16<I>( 0x720F );
ModRM<I>( 3, 2 , to );
write8<I>( from );
}
emitterT void ePSRLDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xD20F );
ModRM<I>( 3, to, from );
}
emitterT void ePSLLWItoR( x86MMXRegType to, u8 from )
{
write16<I>( 0x710F );
ModRM<I>( 3, 6 , to );
write8<I>( from );
}
emitterT void ePSLLDItoR( x86MMXRegType to, u8 from )
{
write16<I>( 0x720F );
ModRM<I>( 3, 6 , to );
write8<I>( from );
}
emitterT void ePSLLDRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xF20F );
ModRM<I>( 3, to, from );
}
emitterT void ePSRAWItoR( x86MMXRegType to, u8 from )
{
write16<I>( 0x710F );
ModRM<I>( 3, 4 , to );
write8<I>( from );
}
emitterT void ePSRADItoR( x86MMXRegType to, u8 from )
{
write16<I>( 0x720F );
ModRM<I>( 3, 4 , to );
write8<I>( from );
}
emitterT void ePSRADRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0xE20F );
ModRM<I>( 3, to, from );
}
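// Note on the immediate-shift forms above: the ModRM 'reg' field is an opcode
// extension rather than a register — /2 selects PSRL, /4 PSRA and /6 PSLL
// within the 0F 71 (word) and 0F 72 (dword) shift groups — while the target
// MMX register travels in the r/m field instead.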
/* por m64 to r64 */
emitterT void ePORMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xEB0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* pxor m64 to r64 */
emitterT void ePXORMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xEF0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* pand m64 to r64 */
emitterT void ePANDMtoR( x86MMXRegType to, uptr from )
{
//u64 rip = (u64)x86Ptr[0] + 7;
write16<I>( 0xDB0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
emitterT void ePANDNMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0xDF0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
emitterT void ePUNPCKHDQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0x6A0F );
ModRM<I>( 3, to, from );
}
emitterT void ePUNPCKHDQMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0x6A0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
emitterT void ePUNPCKLDQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0x620F );
ModRM<I>( 3, to, from );
}
emitterT void ePUNPCKLDQMtoR( x86MMXRegType to, uptr from )
{
write16<I>( 0x620F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
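// eMOVQ64ItoR loads a 64-bit immediate by embedding it directly in the code
// stream: the MOVQ below encodes to 7 bytes and the JMP8 to 2, so x86Ptr + 2 + 7
// is exactly where write64 places the data, and the 8-byte jump skips over it.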
emitterT void eMOVQ64ItoR( x86MMXRegType reg, u64 i )
{
eMOVQMtoR<I>( reg, ( uptr )(x86Ptr[0]) + 2 + 7 );
eJMP8<I>( 8 );
write64<I>( i );
}
emitterT void eMOVQRtoR( x86MMXRegType to, x86MMXRegType from )
{
write16<I>( 0x6F0F );
ModRM<I>( 3, to, from );
}
emitterT void eMOVQRmtoROffset( x86MMXRegType to, x86IntRegType from, u32 offset )
{
write16<I>( 0x6F0F );
if( offset < 128 ) {
ModRM<I>( 1, to, from );
write8<I>(offset);
}
else {
ModRM<I>( 2, to, from );
write32<I>(offset);
}
}
emitterT void eMOVQRtoRmOffset( x86IntRegType to, x86MMXRegType from, u32 offset )
{
write16<I>( 0x7F0F );
if( offset < 128 ) {
ModRM<I>( 1, from , to );
write8<I>(offset);
}
else {
ModRM<I>( 2, from, to );
write32<I>(offset);
}
}
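// The RmOffset helpers pick the ModRM addressing form from the displacement
// size: mod=01b takes a one-byte displacement, mod=10b a four-byte one. Since
// 'offset' is unsigned here, only values 0..127 get the short encoding.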
/* movd m32 to r64 */
emitterT void eMOVDMtoMMX( x86MMXRegType to, uptr from )
{
write16<I>( 0x6E0F );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
}
/* movd r64 to m32 */
emitterT void eMOVDMMXtoM( uptr to, x86MMXRegType from )
{
write16<I>( 0x7E0F );
ModRM<I>( 0, from, DISP32 );
write32<I>( MEMADDR(to, 4) );
}
emitterT void eMOVD32RtoMMX( x86MMXRegType to, x86IntRegType from )
{
write16<I>( 0x6E0F );
ModRM<I>( 3, to, from );
}
emitterT void eMOVD32RmtoMMX( x86MMXRegType to, x86IntRegType from )
{
write16<I>( 0x6E0F );
ModRM<I>( 0, to, from );
}
emitterT void eMOVD32RmOffsettoMMX( x86MMXRegType to, x86IntRegType from, u32 offset )
{
write16<I>( 0x6E0F );
if( offset < 128 ) {
ModRM<I>( 1, to, from );
write8<I>(offset);
}
else {
ModRM<I>( 2, to, from );
write32<I>(offset);
}
}
emitterT void eMOVD32MMXtoR( x86IntRegType to, x86MMXRegType from )
{
write16<I>( 0x7E0F );
ModRM<I>( 3, from, to );
}
emitterT void eMOVD32MMXtoRm( x86IntRegType to, x86MMXRegType from )
{
write16<I>( 0x7E0F );
ModRM<I>( 0, from, to );
if( to >= 4 ) {
// rm == ESP (100b) selects SIB addressing, so the SIB byte 0x24 ([esp], no index) must follow
assert( to == ESP );
write8<I>(0x24);
}
}
emitterT void eMOVD32MMXtoRmOffset( x86IntRegType to, x86MMXRegType from, u32 offset )
{
write16<I>( 0x7E0F );
if( offset < 128 ) {
ModRM<I>( 1, from, to );
write8<I>(offset);
}
else {
ModRM<I>( 2, from, to );
write32<I>(offset);
}
}
///* movd r32 to r64 */
//emitterT void eMOVD32MMXtoMMX( x86MMXRegType to, x86MMXRegType from )
//{
// write16<I>( 0x6E0F );
// ModRM<I>( 3, to, from );
//}
//
///* movq r64 to r32 */
//emitterT void eMOVD64MMXtoMMX( x86MMXRegType to, x86MMXRegType from )
//{
// write16<I>( 0x7E0F );
// ModRM<I>( 3, from, to );
//}
// untested
emitterT void ePACKSSWBMMXtoMMX(x86MMXRegType to, x86MMXRegType from)
{
write16<I>( 0x630F );
ModRM<I>( 3, to, from );
}
emitterT void ePACKSSDWMMXtoMMX(x86MMXRegType to, x86MMXRegType from)
{
write16<I>( 0x6B0F );
ModRM<I>( 3, to, from );
}
emitterT void ePMOVMSKBMMXtoR(x86IntRegType to, x86MMXRegType from)
{
write16<I>( 0xD70F );
ModRM<I>( 3, to, from );
}
emitterT void ePINSRWRtoMMX( x86MMXRegType to, x86SSERegType from, u8 imm8 )
{
if (to > 7 || from > 7) Rex(1, to >> 3, 0, from >> 3);
write16<I>( 0xc40f );
ModRM<I>( 3, to, from );
write8<I>( imm8 );
}
emitterT void ePSHUFWRtoR(x86MMXRegType to, x86MMXRegType from, u8 imm8)
{
write16<I>(0x700f);
ModRM<I>( 3, to, from );
write8<I>(imm8);
}
emitterT void ePSHUFWMtoR(x86MMXRegType to, uptr from, u8 imm8)
{
write16<I>( 0x700f );
ModRM<I>( 0, to, DISP32 );
write32<I>( MEMADDR(from, 4) );
write8<I>(imm8);
}
emitterT void eMASKMOVQRtoR(x86MMXRegType to, x86MMXRegType from)
{
write16<I>(0xf70f);
ModRM<I>( 3, to, from );
}

pcsx2/x86/ix86/ix86_sse.cpp (new file, 1561 lines)

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -22,164 +22,30 @@
// SSE-X Helpers (generates either INT or FLOAT versions of certain SSE instructions)
// This header should always be included *after* ix86.h.
#ifndef _ix86_included_
#error Dependency fail: Please define _EmitterId_ and include ix86.h first.
#endif
// Added AlwaysUseMovaps check to the relevant functions here, which helps reduce the
// overhead of dynarec instructions that use these.
static __forceinline void SSEX_MOVDQA_M128_to_XMM( x86SSERegType to, uptr from )
{
if( !AlwaysUseMovaps && g_xmmtypes[to] == XMMT_INT ) SSE2_MOVDQA_M128_to_XMM(to, from);
else SSE_MOVAPS_M128_to_XMM(to, from);
}
static __forceinline void SSEX_MOVDQA_XMM_to_M128( uptr to, x86SSERegType from )
{
if( !AlwaysUseMovaps && g_xmmtypes[from] == XMMT_INT ) SSE2_MOVDQA_XMM_to_M128(to, from);
else SSE_MOVAPS_XMM_to_M128(to, from);
}
static __forceinline void SSEX_MOVDQA_XMM_to_XMM( x86SSERegType to, x86SSERegType from )
{
if( !AlwaysUseMovaps && g_xmmtypes[from] == XMMT_INT ) SSE2_MOVDQA_XMM_to_XMM(to, from);
else SSE_MOVAPS_XMM_to_XMM(to, from);
}
static __forceinline void SSEX_MOVDQARmtoROffset( x86SSERegType to, x86IntRegType from, int offset )
{
if( !AlwaysUseMovaps && g_xmmtypes[to] == XMMT_INT ) SSE2_MOVDQARmtoROffset(to, from, offset);
else SSE_MOVAPSRmtoROffset(to, from, offset);
}
static __forceinline void SSEX_MOVDQARtoRmOffset( x86IntRegType to, x86SSERegType from, int offset )
{
if( !AlwaysUseMovaps && g_xmmtypes[from] == XMMT_INT ) SSE2_MOVDQARtoRmOffset(to, from, offset);
else SSE_MOVAPSRtoRmOffset(to, from, offset);
}
static __forceinline void SSEX_MOVDQU_M128_to_XMM( x86SSERegType to, uptr from )
{
if( !AlwaysUseMovaps && g_xmmtypes[to] == XMMT_INT ) SSE2_MOVDQU_M128_to_XMM(to, from);
else SSE_MOVUPS_M128_to_XMM(to, from);
}
static __forceinline void SSEX_MOVDQU_XMM_to_M128( uptr to, x86SSERegType from )
{
if( !AlwaysUseMovaps && g_xmmtypes[from] == XMMT_INT ) SSE2_MOVDQU_XMM_to_M128(to, from);
else SSE_MOVUPS_XMM_to_M128(to, from);
}
static __forceinline void SSEX_MOVD_M32_to_XMM( x86SSERegType to, uptr from )
{
if( g_xmmtypes[to] == XMMT_INT ) SSE2_MOVD_M32_to_XMM(to, from);
else SSE_MOVSS_M32_to_XMM(to, from);
}
static __forceinline void SSEX_MOVD_XMM_to_M32( u32 to, x86SSERegType from )
{
if( g_xmmtypes[from] == XMMT_INT ) SSE2_MOVD_XMM_to_M32(to, from);
else SSE_MOVSS_XMM_to_M32(to, from);
}
static __forceinline void SSEX_MOVD_XMM_to_Rm( x86IntRegType to, x86SSERegType from )
{
if( g_xmmtypes[from] == XMMT_INT ) SSE2_MOVD_XMM_to_Rm(to, from);
else SSE_MOVSS_XMM_to_Rm(to, from);
}
static __forceinline void SSEX_MOVD_RmOffset_to_XMM( x86SSERegType to, x86IntRegType from, int offset )
{
if( g_xmmtypes[to] == XMMT_INT ) SSE2_MOVD_RmOffset_to_XMM(to, from, offset);
else SSE_MOVSS_RmOffset_to_XMM(to, from, offset);
}
static __forceinline void SSEX_MOVD_XMM_to_RmOffset( x86IntRegType to, x86SSERegType from, int offset )
{
if( g_xmmtypes[from] == XMMT_INT ) SSE2_MOVD_XMM_to_RmOffset(to, from, offset);
else SSE_MOVSS_XMM_to_RmOffset(to, from, offset);
}
static __forceinline void SSEX_POR_M128_to_XMM( x86SSERegType to, uptr from )
{
if( g_xmmtypes[to] == XMMT_INT ) SSE2_POR_M128_to_XMM(to, from);
else SSE_ORPS_M128_to_XMM(to, from);
}
static __forceinline void SSEX_POR_XMM_to_XMM( x86SSERegType to, x86SSERegType from )
{
if( g_xmmtypes[from] == XMMT_INT ) SSE2_POR_XMM_to_XMM(to, from);
else SSE_ORPS_XMM_to_XMM(to, from);
}
static __forceinline void SSEX_PXOR_M128_to_XMM( x86SSERegType to, uptr from )
{
if( g_xmmtypes[to] == XMMT_INT ) SSE2_PXOR_M128_to_XMM(to, from);
else SSE_XORPS_M128_to_XMM(to, from);
}
static __forceinline void SSEX_PXOR_XMM_to_XMM( x86SSERegType to, x86SSERegType from )
{
if( g_xmmtypes[from] == XMMT_INT ) SSE2_PXOR_XMM_to_XMM(to, from);
else SSE_XORPS_XMM_to_XMM(to, from);
}
static __forceinline void SSEX_PAND_M128_to_XMM( x86SSERegType to, uptr from )
{
if( g_xmmtypes[to] == XMMT_INT ) SSE2_PAND_M128_to_XMM(to, from);
else SSE_ANDPS_M128_to_XMM(to, from);
}
static __forceinline void SSEX_PAND_XMM_to_XMM( x86SSERegType to, x86SSERegType from )
{
if( g_xmmtypes[from] == XMMT_INT ) SSE2_PAND_XMM_to_XMM(to, from);
else SSE_ANDPS_XMM_to_XMM(to, from);
}
static __forceinline void SSEX_PANDN_M128_to_XMM( x86SSERegType to, uptr from )
{
if( g_xmmtypes[to] == XMMT_INT ) SSE2_PANDN_M128_to_XMM(to, from);
else SSE_ANDNPS_M128_to_XMM(to, from);
}
static __forceinline void SSEX_PANDN_XMM_to_XMM( x86SSERegType to, x86SSERegType from )
{
if( g_xmmtypes[from] == XMMT_INT ) SSE2_PANDN_XMM_to_XMM(to, from);
else SSE_ANDNPS_XMM_to_XMM(to, from);
}
static __forceinline void SSEX_PUNPCKLDQ_M128_to_XMM(x86SSERegType to, uptr from)
{
if( g_xmmtypes[to] == XMMT_INT ) SSE2_PUNPCKLDQ_M128_to_XMM(to, from);
else SSE_UNPCKLPS_M128_to_XMM(to, from);
}
static __forceinline void SSEX_PUNPCKLDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from)
{
if( g_xmmtypes[from] == XMMT_INT ) SSE2_PUNPCKLDQ_XMM_to_XMM(to, from);
else SSE_UNPCKLPS_XMM_to_XMM(to, from);
}
static __forceinline void SSEX_PUNPCKHDQ_M128_to_XMM(x86SSERegType to, uptr from)
{
if( g_xmmtypes[to] == XMMT_INT ) SSE2_PUNPCKHDQ_M128_to_XMM(to, from);
else SSE_UNPCKHPS_M128_to_XMM(to, from);
}
static __forceinline void SSEX_PUNPCKHDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from)
{
if( g_xmmtypes[from] == XMMT_INT ) SSE2_PUNPCKHDQ_XMM_to_XMM(to, from);
else SSE_UNPCKHPS_XMM_to_XMM(to, from);
}
static __forceinline void SSEX_MOVHLPS_XMM_to_XMM( x86SSERegType to, x86SSERegType from )
{
if( g_xmmtypes[from] == XMMT_INT ) {
SSE2_PUNPCKHQDQ_XMM_to_XMM(to, from);
if( to != from ) SSE2_PSHUFD_XMM_to_XMM(to, to, 0x4e);
}
else {
SSE_MOVHLPS_XMM_to_XMM(to, from);
}
}
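// (A note on the INT path of SSEX_MOVHLPS above, since the trick is easy to
// misread: PUNPCKHQDQ(to, from) leaves [to.hi, from.hi] in 'to', and the
// PSHUFD with 0x4e swaps the two qwords to [from.hi, to.hi] — the MOVHLPS
// result, low qword taken from the source's high half with the destination's
// high half preserved. When to == from the unpack alone already yields the
// right value, hence the 'to != from' guard.)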
extern void SSEX_MOVDQA_M128_to_XMM( x86SSERegType to, uptr from );
extern void SSEX_MOVDQA_XMM_to_M128( uptr to, x86SSERegType from );
extern void SSEX_MOVDQA_XMM_to_XMM( x86SSERegType to, x86SSERegType from );
extern void SSEX_MOVDQARmtoR( x86SSERegType to, x86IntRegType from, int offset=0 );
extern void SSEX_MOVDQARtoRm( x86IntRegType to, x86SSERegType from, int offset=0 );
extern void SSEX_MOVDQU_M128_to_XMM( x86SSERegType to, uptr from );
extern void SSEX_MOVDQU_XMM_to_M128( uptr to, x86SSERegType from );
extern void SSEX_MOVD_M32_to_XMM( x86SSERegType to, uptr from );
extern void SSEX_MOVD_XMM_to_M32( u32 to, x86SSERegType from );
extern void SSEX_MOVD_Rm_to_XMM( x86SSERegType to, x86IntRegType from, int offset=0 );
extern void SSEX_MOVD_XMM_to_Rm( x86IntRegType to, x86SSERegType from, int offset=0 );
extern void SSEX_POR_M128_to_XMM( x86SSERegType to, uptr from );
extern void SSEX_POR_XMM_to_XMM( x86SSERegType to, x86SSERegType from );
extern void SSEX_PXOR_M128_to_XMM( x86SSERegType to, uptr from );
extern void SSEX_PXOR_XMM_to_XMM( x86SSERegType to, x86SSERegType from );
extern void SSEX_PAND_M128_to_XMM( x86SSERegType to, uptr from );
extern void SSEX_PAND_XMM_to_XMM( x86SSERegType to, x86SSERegType from );
extern void SSEX_PANDN_M128_to_XMM( x86SSERegType to, uptr from );
extern void SSEX_PANDN_XMM_to_XMM( x86SSERegType to, x86SSERegType from );
extern void SSEX_PUNPCKLDQ_M128_to_XMM(x86SSERegType to, uptr from );
extern void SSEX_PUNPCKLDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from );
extern void SSEX_PUNPCKHDQ_M128_to_XMM(x86SSERegType to, uptr from );
extern void SSEX_PUNPCKHDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from );
extern void SSEX_MOVHLPS_XMM_to_XMM( x86SSERegType to, x86SSERegType from );
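// A minimal usage sketch (an assumption about caller conventions, not code from
// this commit): the SSEX wrappers consult g_xmmtypes[] so that a register the
// allocator marked as integer data travels through the SSE2 integer domain
// (movdqa/por) while float data keeps the float forms (movaps/orps), avoiding
// cross-domain bypass penalties.
static void SSEX_Example( x86SSERegType dst, x86SSERegType src )
{
	g_xmmtypes[src] = XMMT_INT;          // pretend the allocator tracked integer data here
	SSEX_MOVDQA_XMM_to_XMM( dst, src );  // emits movdqa rather than movaps
	SSEX_POR_XMM_to_XMM( dst, src );     // emits por rather than orps
}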

View File

@ -18,7 +18,7 @@
#include "PrecompiledHeader.h"
#include "Misc.h"
#include "System.h"
#include "ix86/ix86.h"
// used to make sure regs don't get changed while in recompiler
@ -27,8 +27,8 @@
u8 g_globalMMXSaved = 0;
u8 g_globalXMMSaved = 0;
PCSX2_ALIGNED16( static u64 g_globalMMXData[8] );
PCSX2_ALIGNED16( static u64 g_globalXMMData[2*XMMREGS] );
PCSX2_ALIGNED16( u64 g_globalMMXData[8] );
PCSX2_ALIGNED16( u64 g_globalXMMData[2*XMMREGS] );
/////////////////////////////////////////////////////////////////////

View File

@ -29,6 +29,7 @@
// general types
typedef int x86IntRegType;
#define EAX 0
#define EBX 3
#define ECX 1
@ -149,3 +150,252 @@ struct CPUINFO{
extern CPUINFO cpuinfo;
//------------------------------------------------------------------
// templated version of is_s8 is required, so that u16's get correct sign extension treatment.
template< typename T >
static __forceinline bool is_s8( T imm ) { return (s8)imm == (s32)imm; }
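// Worked example of the sign-extension point (illustrative): with T = u16,
// is_s8( (u16)0xFFFF ) compares (s8)0xFFFF == -1 against (s32)0xFFFF == 65535
// and returns false, while with T = s16 the same bit pattern is -1 on both
// sides of the compare and returns true; a single fixed-parameter overload
// would silently force one of those conversions on the caller.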
namespace x86Emitter
{
class x86ModRm;
//////////////////////////////////////////////////////////////////////////////////////////
//
struct x86Register32
{
static const x86Register32 Empty; // defined as an empty/unused value (-1)
int Id;
x86Register32( const x86Register32& src ) : Id( src.Id ) {}
x86Register32() : Id( -1 ) {}
explicit x86Register32( int regId ) : Id( regId ) { jASSUME( Id >= -1 && Id < 8 ); }
bool IsEmpty() const { return Id == -1; }
bool operator==( const x86Register32& src ) const { return Id == src.Id; }
bool operator!=( const x86Register32& src ) const { return Id != src.Id; }
x86ModRm operator+( const x86Register32& right ) const;
x86ModRm operator+( const x86ModRm& right ) const;
x86ModRm operator+( s32 right ) const;
x86ModRm operator*( u32 factor ) const;
x86Register32& operator=( const x86Register32& src )
{
Id = src.Id;
return *this;
}
};
//////////////////////////////////////////////////////////////////////////////////////////
// Similar to x86Register, but without the ability to add/combine them with ModSib.
//
class x86Register16
{
public:
static const x86Register16 Empty;
int Id;
x86Register16( const x86Register16& src ) : Id( src.Id ) {}
x86Register16() : Id( -1 ) {}
explicit x86Register16( int regId ) : Id( regId ) { jASSUME( Id >= -1 && Id < 8 ); }
bool IsEmpty() const { return Id == -1; }
bool operator==( const x86Register16& src ) const { return Id == src.Id; }
bool operator!=( const x86Register16& src ) const { return Id != src.Id; }
x86Register16& operator=( const x86Register16& src )
{
Id = src.Id;
return *this;
}
};
//////////////////////////////////////////////////////////////////////////////////////////
// Similar to x86Register, but without the ability to add/combine them with ModSib.
//
class x86Register8
{
public:
static const x86Register8 Empty;
int Id;
x86Register8( const x86Register8& src ) : Id( src.Id ) {}
x86Register8() : Id( -1 ) {}
explicit x86Register8( int regId ) : Id( regId ) { jASSUME( Id >= -1 && Id < 8 ); }
bool IsEmpty() const { return Id == -1; }
bool operator==( const x86Register8& src ) const { return Id == src.Id; }
bool operator!=( const x86Register8& src ) const { return Id != src.Id; }
x86Register8& operator=( const x86Register8& src )
{
Id = src.Id;
return *this;
}
};
// Use 32 bit registers as our index register (for ModSib memory address calculations)
typedef x86Register32 x86IndexReg;
//////////////////////////////////////////////////////////////////////////////////////////
//
class x86ModRm
{
public:
x86IndexReg Base; // base register (no scale)
x86IndexReg Index; // index reg gets multiplied by the scale
int Factor; // scale applied to the index register, in factor form (not a shift!)
s32 Displacement; // address displacement
public:
x86ModRm( x86IndexReg base, x86IndexReg index, int factor=1, s32 displacement=0 ) :
Base( base ),
Index( index ),
Factor( factor ),
Displacement( displacement )
{
}
explicit x86ModRm( x86IndexReg base, int displacement=0 ) :
Base( base ),
Index(),
Factor(0),
Displacement( displacement )
{
}
explicit x86ModRm( s32 displacement ) :
Base(),
Index(),
Factor(0),
Displacement( displacement )
{
}
static x86ModRm FromIndexReg( x86IndexReg index, int scale=0, s32 displacement=0 );
public:
bool IsByteSizeDisp() const { return is_s8( Displacement ); }
x86IndexReg GetEitherReg() const;
x86ModRm& Add( s32 imm )
{
Displacement += imm;
return *this;
}
x86ModRm& Add( const x86IndexReg& src );
x86ModRm& Add( const x86ModRm& src );
x86ModRm operator+( const x86IndexReg& right ) const { return x86ModRm( *this ).Add( right ); }
x86ModRm operator+( const x86ModRm& right ) const { return x86ModRm( *this ).Add( right ); }
x86ModRm operator+( const s32 imm ) const { return x86ModRm( *this ).Add( imm ); }
x86ModRm operator-( const s32 imm ) const { return x86ModRm( *this ).Add( -imm ); }
};
//////////////////////////////////////////////////////////////////////////////////////////
// ModSib - Internal low-level representation of the ModRM/SIB information.
//
// This class serves two purposes: It houses 'reduced' ModRM/SIB info only, which means that
// the Base, Index, Scale, and Displacement values are all valid, and it serves as a type-
// safe layer between the x86Register's operators (which generate x86ModRm types) and the
// emitter's ModSib instruction forms. Without this, the x86Register would pass as a
// ModSib type implicitly, and that would cause ambiguity on a number of instructions.
//
class ModSib
{
public:
x86IndexReg Base; // base register (no scale)
x86IndexReg Index; // index reg gets multiplied by the scale
int Scale; // scale applied to the index register, in scale/shift form
s32 Displacement; // offset applied to the Base/Index registers.
explicit ModSib( const x86ModRm& src );
explicit ModSib( s32 disp );
ModSib( x86IndexReg base, x86IndexReg index, int scale=0, s32 displacement=0 );
x86IndexReg GetEitherReg() const;
bool IsByteSizeDisp() const { return is_s8( Displacement ); }
ModSib& Add( s32 imm )
{
Displacement += imm;
return *this;
}
ModSib operator+( const s32 imm ) const { return ModSib( *this ).Add( imm ); }
ModSib operator-( const s32 imm ) const { return ModSib( *this ).Add( -imm ); }
protected:
void Reduce();
};
//////////////////////////////////////////////////////////////////////////////////////////
// x86IndexerType - This is a static class which provisions our ptr[] syntax.
//
struct x86IndexerType
{
// passthrough instruction, allows ModSib to pass silently through ptr translation
// without doing anything and without compiler error.
const ModSib& operator[]( const ModSib& src ) const { return src; }
ModSib operator[]( x86IndexReg src ) const
{
return ModSib( src, x86IndexReg::Empty );
}
ModSib operator[]( const x86ModRm& src ) const
{
return ModSib( src );
}
ModSib operator[]( uptr src ) const
{
return ModSib( src );
}
ModSib operator[]( void* src ) const
{
return ModSib( (uptr)src );
}
x86IndexerType() {}
};
// ------------------------------------------------------------------------
extern const x86IndexerType ptr;
extern const x86Register32 eax;
extern const x86Register32 ebx;
extern const x86Register32 ecx;
extern const x86Register32 edx;
extern const x86Register32 esi;
extern const x86Register32 edi;
extern const x86Register32 ebp;
extern const x86Register32 esp;
extern const x86Register16 ax;
extern const x86Register16 bx;
extern const x86Register16 cx;
extern const x86Register16 dx;
extern const x86Register16 si;
extern const x86Register16 di;
extern const x86Register16 bp;
extern const x86Register16 sp;
extern const x86Register8 al;
extern const x86Register8 cl;
extern const x86Register8 dl;
extern const x86Register8 bl;
extern const x86Register8 ah;
extern const x86Register8 ch;
extern const x86Register8 dh;
extern const x86Register8 bh;
}
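// A short usage sketch of the ptr[] syntax above (illustrative only; the
// 'ModSibExample' helper is not part of the API, and Scale is stored in
// shift form, so ecx*4 reduces to Scale == 2):
static void ModSibExample()
{
	using namespace x86Emitter;
	ModSib m = ptr[ebx + ecx*4 + 0x20]; // Base=ebx, Index=ecx, Scale=2, Displacement=0x20
	ModSib d = ptr[(void*)0x1000];      // displacement-only (absolute address) form
}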

View File

@ -41,8 +41,7 @@ microVUt(void) mVUinit(VURegs* vuRegsPtr) {
mVU->regs = vuRegsPtr;
mVU->index = vuIndex;
mVU->microSize = (vuIndex ? 0x4000 : 0x1000);
mVU->progSize = (vuIndex ? 0x4000 : 0x1000) / 8;
mVU->cacheAddr = 0xC0000000 + (vuIndex ? mVU->cacheSize : 0);
mVU->progSize = (vuIndex ? 0x4000 : 0x1000) / 4;
mVU->cache = NULL;
mVUreset<vuIndex>();
@ -55,22 +54,36 @@ microVUt(void) mVUreset() {
mVUclose<vuIndex>(); // Close
// Create Block Managers
for (int i; i <= mVU->prog.max; i++) {
for (u32 j; j < (mVU->progSize / 2); j++) {
for (int i = 0; i <= mVU->prog.max; i++) {
for (u32 j = 0; j < (mVU->progSize / 2); j++) {
mVU->prog.prog[i].block[j] = new microBlockManager();
}
}
// Dynarec Cache
mVU->cache = SysMmapEx(mVU->cacheAddr, mVU->cacheSize, 0x10000000, (vuIndex ? "Micro VU1" : "Micro VU0"));
if ( mVU->cache == NULL ) throw Exception::OutOfMemory(fmt_string( "microVU Error: failed to allocate recompiler memory! (addr: 0x%x)", params (u32)mVU->cache));
mVU->cache = SysMmapEx((vuIndex ? 0x1e840000 : 0x0e840000), mVU->cacheSize, 0, (vuIndex ? "Micro VU1" : "Micro VU0"));
if ( mVU->cache == NULL ) throw Exception::OutOfMemory(fmt_string( "microVU Error: Failed to allocate recompiler memory! (addr: 0x%x)", params (u32)mVU->cache));
mVU->ptr = mVU->cache;
// Other Variables
// Setup Entrance/Exit Points
mVUdispatcherA<vuIndex>();
mVUdispatcherB<vuIndex>();
// Program Variables
memset(&mVU->prog, 0, sizeof(mVU->prog));
mVU->prog.finished = 1;
mVU->prog.cleared = 1;
mVU->prog.cur = -1;
mVU->prog.total = -1;
// Setup Dynarec Cache Limits for Each Program
u8* z = (mVU->cache + 512); // Dispatcher Code is in first 512 bytes
for (int i = 0; i <= mVU->prog.max; i++) {
mVU->prog.prog[i].x86start = z;
mVU->prog.prog[i].x86ptr = z;
z += ((mVU->cacheSize - 512) / (mVU->prog.max + 1)); // split the area after the 512-byte dispatcher block, so the last program's x86end stays inside the mapped cache
mVU->prog.prog[i].x86end = z;
}
}
// Free Allocated Resources
@ -81,8 +94,8 @@ microVUt(void) mVUclose() {
if ( mVU->cache ) { HostSys::Munmap( mVU->cache, mVU->cacheSize ); mVU->cache = NULL; }
// Delete Block Managers
for (int i; i <= mVU->prog.max; i++) {
for (u32 j; j < (mVU->progSize / 2); j++) {
for (int i = 0; i <= mVU->prog.max; i++) {
for (u32 j = 0; j < (mVU->progSize / 2); j++) {
if (mVU->prog.prog[i].block[j]) delete mVU->prog.prog[i].block[j];
}
}
@ -99,33 +112,6 @@ microVUt(void) mVUclear(u32 addr, u32 size) {
// that it's probably not worth it...
}
// Executes for number of cycles
microVUt(void*) __fastcall mVUexecute(u32 startPC, u32 cycles) {
/*
Pseudocode: (ToDo: implement # of cycles)
1) Search for existing program
2) If program not found, goto 5
3) Search for recompiled block
4) If recompiled block found, goto 6
5) Recompile as much blocks as possible
6) Return start execution address of block
*/
microVU* mVU = mVUx;
if ( mVUsearchProg(mVU) ) { // Found Program
//microBlock* block = mVU->prog.prog[mVU->prog.cur].block[startPC]->search(mVU->prog.lastPipelineState);
//if (block) return block->x86ptrStart; // Found Block
}
// Recompile code
return NULL;
}
void* __fastcall mVUexecuteVU0(u32 startPC, u32 cycles) {
return mVUexecute<0>(startPC, cycles);
}
void* __fastcall mVUexecuteVU1(u32 startPC, u32 cycles) {
return mVUexecute<1>(startPC, cycles);
}
//------------------------------------------------------------------
// Micro VU - Private Functions
//------------------------------------------------------------------
@ -133,6 +119,7 @@ void* __fastcall mVUexecuteVU1(u32 startPC, u32 cycles) {
// Clears program data (Sets used to 1 because calling this function implies the program will be used at least once)
__forceinline void mVUclearProg(microVU* mVU, int progIndex) {
mVU->prog.prog[progIndex].used = 1;
mVU->prog.prog[progIndex].x86ptr = mVU->prog.prog[progIndex].x86start;
for (u32 i = 0; i < (mVU->progSize / 2); i++) {
mVU->prog.prog[progIndex].block[i]->reset();
}
@ -171,7 +158,7 @@ __forceinline int mVUsearchProg(microVU* mVU) {
for (int i = 0; i <= mVU->prog.total; i++) {
//if (i == mVU->prog.cur) continue; // We can skip the current program. (ToDo: Verify that games don't clear, and send the same microprogram :/)
if (!memcmp_mmx(mVU->prog.prog[i].data, mVU->regs->Micro, mVU->microSize)) {
if (i == mVU->prog.cur) SysPrintf("microVU: Same micro program sent!\n");
if (i == mVU->prog.cur) { mVUlog("microVU: Same micro program sent!"); }
mVU->prog.cur = i;
mVU->prog.cleared = 0;
mVU->prog.prog[i].used++;
@ -206,98 +193,31 @@ __forceinline void mVUinvalidateBlock(microVU* mVU, u32 addr, u32 size) {
}
}
//------------------------------------------------------------------
// Dispatcher Functions
//------------------------------------------------------------------
#ifdef _MSC_VER
// Runs VU0 for number of cycles
__declspec(naked) void __fastcall startVU0(u32 startPC, u32 cycles) {
__asm {
// __fastcall = The first two DWORD or smaller arguments are passed in ECX and EDX registers; all other arguments are passed right to left.
call mVUexecuteVU0
/*backup cpu state*/
push ebx;
push ebp;
push esi;
push edi;
ldmxcsr g_sseVUMXCSR
/* Should set xmmZ? */
jmp eax
}
}
// Runs VU1 for number of cycles
__declspec(naked) void __fastcall startVU1(u32 startPC, u32 cycles) {
__asm {
call mVUexecuteVU1
/*backup cpu state*/
push ebx;
push ebp;
push esi;
push edi;
ldmxcsr g_sseVUMXCSR
jmp eax
}
}
// Exit point
__declspec(naked) void __fastcall endVU0(u32 startPC, u32 cycles) {
__asm {
//call mVUcleanUpVU0
/*restore cpu state*/
pop edi;
pop esi;
pop ebp;
pop ebx;
ldmxcsr g_sseMXCSR
emms
ret
}
}
#else
extern "C" {
extern void __fastcall startVU0(u32 startPC, u32 cycles);
extern void __fastcall startVU1(u32 startPC, u32 cycles);
extern void __fastcall endVU0(u32 startPC, u32 cycles);
}
#endif
//------------------------------------------------------------------
// Wrapper Functions - Called by other parts of the Emu
//------------------------------------------------------------------
__forceinline void initVUrec(VURegs* vuRegs, const int vuIndex) {
void initVUrec(VURegs* vuRegs, const int vuIndex) {
if (!vuIndex) mVUinit<0>(vuRegs);
else mVUinit<1>(vuRegs);
}
__forceinline void closeVUrec(const int vuIndex) {
void closeVUrec(const int vuIndex) {
if (!vuIndex) mVUclose<0>();
else mVUclose<1>();
}
__forceinline void resetVUrec(const int vuIndex) {
void resetVUrec(const int vuIndex) {
if (!vuIndex) mVUreset<0>();
else mVUreset<1>();
}
__forceinline void clearVUrec(u32 addr, u32 size, const int vuIndex) {
void clearVUrec(u32 addr, u32 size, const int vuIndex) {
if (!vuIndex) mVUclear<0>(addr, size);
else mVUclear<1>(addr, size);
}
__forceinline void runVUrec(u32 startPC, u32 cycles, const int vuIndex) {
void runVUrec(u32 startPC, u32 cycles, const int vuIndex) {
if (!vuIndex) startVU0(startPC, cycles);
else startVU1(startPC, cycles);
}

View File

@ -18,7 +18,6 @@
#pragma once
#define mVUdebug // Prints Extra Info to Console
#define _EmitterId_ (vuIndex+1)
#include "Common.h"
#include "VU.h"
#include "GS.h"
@ -92,9 +91,12 @@ public:
template<u32 progSize>
struct microProgram {
u32 data[progSize];
u32 data[progSize/4];
u32 used; // Number of times its been used
microBlockManager* block[progSize / 2];
u8* x86ptr; // Pointer to program's recompilation code
u8* x86start; // Start of program's rec-cache
u8* x86end; // Limit of program's rec-cache
microBlockManager* block[progSize/8];
microAllocInfo<progSize> allocInfo;
};
@ -113,30 +115,24 @@ struct microProgManager {
struct microVU {
u32 index; // VU Index (VU0 or VU1)
u32 microSize; // VU Micro Memory Size
u32 progSize; // VU Micro Program Size (microSize/8)
u32 cacheAddr; // VU Cache Start Address
u32 progSize; // VU Micro Program Size (microSize/4)
static const u32 cacheSize = 0x500000; // VU Cache Size
microProgManager<0x1000> prog; // Micro Program Data
microProgManager<0x4000> prog; // Micro Program Data
VURegs* regs; // VU Regs Struct
u8* cache; // Dynarec Cache Start (where we will start writing the recompiled code to)
u8* startFunct; // Ptr Function to the Start code for recompiled programs
u8* exitFunct; // Ptr Function to the Exit code for recompiled programs
u8* ptr; // Pointer to next place to write recompiled code to
u32 code; // Contains the current Instruction
u32 iReg; // iReg (only used in recompilation, not execution)
u32 clipFlag[4]; // 4 instances of clip flag (used in execution)
u32 divFlag; // 1 instance of I/D flags
/*
uptr x86eax; // Accumulator register. Used in arithmetic operations.
uptr x86ecx; // Counter register. Used in shift/rotate instructions.
uptr x86edx; // Data register. Used in arithmetic operations and I/O operations.
uptr x86ebx; // Base register. Used as a pointer to data (located in DS in segmented mode).
uptr x86esp; // Stack Pointer register. Pointer to the top of the stack.
uptr x86ebp; // Stack Base Pointer register. Used to point to the base of the stack.
uptr x86esi; // Source register. Used as a pointer to a source in stream operations.
uptr x86edi; // Destination register. Used as a pointer to a destination in stream operations.
*/
u32 VIbackup[2]; // Holds a backup of a VI reg if modified before a branch
u32 branch; // Holds branch compare result (IBxx) OR Holds address to Jump to (JALR/JR)
u32 p; // Holds current P instance index
u32 q; // Holds current Q instance index
};
// microVU rec structs
@ -147,14 +143,24 @@ extern PCSX2_ALIGNED16(microVU microVU1);
extern void (*mVU_UPPER_OPCODE[64])( VURegs* VU, s32 info );
extern void (*mVU_LOWER_OPCODE[128])( VURegs* VU, s32 info );
// Main Functions
microVUt(void) mVUinit(VURegs*);
microVUt(void) mVUreset();
microVUt(void) mVUclose();
microVUt(void) mVUclear(u32, u32);
// Private Functions
__forceinline void mVUclearProg(microVU* mVU, int progIndex);
__forceinline int mVUfindLeastUsedProg(microVU* mVU);
__forceinline int mVUsearchProg(microVU* mVU);
__forceinline void mVUcacheProg(microVU* mVU, int progIndex);
void* __fastcall mVUexecuteVU0(u32 startPC, u32 cycles);
void* __fastcall mVUexecuteVU1(u32 startPC, u32 cycles);
#ifdef __LINUX__
microVUt(void) mVUreset();
microVUt(void) mVUclose();
#ifndef __LINUX__
typedef void (__fastcall *mVUrecCall)(u32, u32);
#else
typedef void (__attribute__((__fastcall__)) *mVUrecCall)(u32, u32); // gcc wants the calling-convention attribute inside the declarator
#endif
// Include all the *.inl files (Needed because C++ sucks with templates and *.cpp files)
@ -163,3 +169,4 @@ microVUt(void) mVUclose();
#include "microVU_Alloc.inl"
#include "microVU_Tables.inl"
#include "microVU_Compile.inl"
#include "microVU_Execute.inl"

View File

@ -33,6 +33,8 @@ struct microRegInfo {
u8 VI[32];
u8 q;
u8 p;
u8 r;
u8 xgkick;
};
struct microTempRegInfo {
@ -42,40 +44,19 @@ struct microTempRegInfo {
u8 VIreg; // Index of the VI reg
u8 q; // Holds cycle info for Q reg
u8 p; // Holds cycle info for P reg
u8 r; // Holds cycle info for R reg (Will never cause stalls, but useful to know if R is modified)
u8 xgkick; // Holds the cycle info for XGkick
};
template<u32 pSize>
struct microAllocInfo {
microRegInfo regs; // Pipeline info
microTempRegInfo regsTemp; // Temp Pipeline info (used so that new pipeline info isn't conflicting between upper and lower instructions in the same cycle)
u8 branch; // 0 = No Branch, 1 = Branch, 2 = Conditional Branch, 3 = Jump (JALR/JR)
u8 divFlag; // 0 = Transfer DS/IS flags normally, 1 = Clear DS/IS Flags, > 1 = set DS/IS flags to bit 2::1 of divFlag
u8 divFlagTimer; // Used to ensure divFlag's contents are merged at the appropriate time.
u8 branch; // 0 = No Branch, 1 = B, 2 = BAL, 3~8 = Conditional Branches, 9 = JALR, 10 = JR
u8 maxStall; // Helps in computing stalls (stores the max amount of cycles to stall for the current opcodes)
u32 cycles; // Cycles for current block
u32 count; // Number of VU 64bit instructions ran (starts at 0 for each block)
u32 curPC; // Current PC
u32 info[pSize]; // bit 00 = Lower Instruction is NOP
// bit 01
// bit 02
// bit 03
// bit 04
// bit 05 = Write to Q1 or Q2?
// bit 06 = Read Q1 or Q2?
// bit 07 = Read/Write to P1 or P2?
// bit 08 = Update Mac Flags?
// bit 09 = Update Status Flags?
// bit 10 = Used with bit 11 to make a 2-bit key for mac flag instance
// bit 11
// bit 12 = Used with bit 13 to make a 2-bit key for status flag instance
// bit 13
// bit 14 = Used with bit 15 to make a 2-bit key for clip flag instance
// bit 15
// bit 16 = Used with bit 17 to make a 2-bit key for mac flag instance
// bit 17
// bit 18 = Used with bit 19 to make a 2-bit key for status flag instance
// bit 19
// bit 20 = Used with bit 21 to make a 2-bit key for clip flag instance
// bit 21
// bit 22 = Read VI(Fs) from backup memory?
// bit 23 = Read VI(Ft) from backup memory?
u32 startPC; // Start PC for Cur Block
u32 info[pSize/8]; // Info for Instructions in current block
};

View File

@ -201,11 +201,11 @@ microVUt(void) mVUallocFMAC5b(int& ACC, int& Fs) {
// FMAC6 - Normal FMAC Opcodes (I Reg)
//------------------------------------------------------------------
#define getIreg(reg) { \
#define getIreg(reg, modXYZW) { \
MOV32ItoR(gprT1, mVU->iReg); \
SSE2_MOVD_R_to_XMM(reg, gprT1); \
if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2<vuIndex>(reg, xmmT1, 8); \
if (!_XYZW_SS) { mVUunpack_xyzw<vuIndex>(reg, reg, 0); } \
if (!((_XYZW_SS && modXYZW) || (_X_Y_Z_W == 8))) { mVUunpack_xyzw<vuIndex>(reg, reg, 0); } \
}
microVUt(void) mVUallocFMAC6a(int& Fd, int& Fs, int& Ft) {
@ -213,7 +213,7 @@ microVUt(void) mVUallocFMAC6a(int& Fd, int& Fs, int& Ft) {
Fs = xmmFs;
Ft = xmmFt;
Fd = xmmFs;
getIreg(Ft);
getIreg(Ft, 1);
getReg6(Fs, _Fs_);
}
@ -230,7 +230,7 @@ microVUt(void) mVUallocFMAC7a(int& ACC, int& Fs, int& Ft) {
ACC = xmmACC;
Fs = (_X_Y_Z_W == 15) ? xmmACC : xmmFs;
Ft = xmmFt;
getIreg(Ft);
getIreg(Ft, 0);
if (_X_Y_Z_W == 8) { getReg6(Fs, _Fs_); }
else if (!_Fs_) { getZero4(Fs); }
else { getReg4(Fs, _Fs_); }
@ -374,7 +374,7 @@ microVUt(void) mVUallocFMAC12a(int& Fd, int& ACC, int& Fs, int& Ft) {
Ft = xmmFt;
Fd = xmmFs;
ACC = xmmACC;
getIreg(Ft);
getIreg(Ft, 0);
if (_X_Y_Z_W == 8) { getReg6(Fs, _Fs_); }
else if (!_Fs_) { getZero4(Fs); }
else { getReg4(Fs, _Fs_); }
@ -395,7 +395,7 @@ microVUt(void) mVUallocFMAC13a(int& Fd, int& ACC, int& Fs, int& Ft) {
Fd = xmmT1;
ACC = xmmT1;
SSE_MOVAPS_XMM_to_XMM(ACC, xmmACC);
getIreg(Ft);
getIreg(Ft, 0);
if (_X_Y_Z_W == 8) { getReg6(Fs, _Fs_); }
else if (!_Fs_) { getZero4(Fs); }
else { getReg4(Fs, _Fs_); }
@ -480,7 +480,7 @@ microVUt(void) mVUallocFMAC16a(int& ACCw, int& ACCr, int& Fs, int& Ft) {
ACCw = xmmACC;
ACCr = ((_X_Y_Z_W == 15) || (_X_Y_Z_W == 8)) ? xmmACC : xmmT1;
SSE_MOVAPS_XMM_to_XMM(ACCr, xmmACC);
getIreg(Ft);
getIreg(Ft, 0);
if (_X_Y_Z_W == 8) { getReg6(Fs, _Fs_); }
else if (!_Fs_) { getZero4(Fs); }
else { getReg4(Fs, _Fs_); }
@ -708,19 +708,7 @@ microVUt(void) mVUallocCFLAGb(int reg, int fInstance) {
microVU* mVU = mVUx;
MOV32RtoM(mVU->clipFlag[fInstance], reg);
}
/*
microVUt(void) mVUallocDFLAGa(int reg) {
microVU* mVU = mVUx;
//if (!mVUdivFlag) { MOV32MtoR(reg, (uptr)&mVU->divFlag[readQ]); AND32ItoR(reg, 0xc00); }
//else if (mVUdivFlag & 1) { XOR32RtoR(reg, reg); }
//else { MOV32ItoR(reg, (u32)((mVUdivFlag << 9) & 0xc00)); }
}
microVUt(void) mVUallocDFLAGb(int reg) {
microVU* mVU = mVUx;
//MOV32RtoM((uptr)&mVU->divFlag[writeQ], reg);
}
*/
//------------------------------------------------------------------
// VI Reg Allocators
//------------------------------------------------------------------
@ -734,6 +722,12 @@ microVUt(void) mVUallocVIa(int GPRreg, int _reg_) {
microVUt(void) mVUallocVIb(int GPRreg, int _reg_) {
microVU* mVU = mVUx;
if (backupVI) { // Backs up reg to memory (used when VI is modified b4 a branch)
MOV32RtoM((uptr)&mVU->VIbackup[1], GPRreg);
mVUallocVIa<vuIndex>(GPRreg, _reg_);
MOV32RtoM((uptr)&mVU->VIbackup[0], GPRreg);
MOV32MtoR(GPRreg, (uptr)&mVU->VIbackup[1]);
}
if (_reg_ == 0) { return; }
else if (_reg_ < 9) { MOVD32RtoMMX(mmVI(_reg_), GPRreg); }
else { MOV16RtoM((uptr)&mVU->regs->VI[_reg_].UL, GPRreg); }
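// (The backupVI sequence above stashes the caller's new value in VIbackup[1],
// reloads the VI reg's current pre-write value through mVUallocVIa so the
// pending branch can compare against it from VIbackup[0], then restores the
// new value; see the analyzeBranchVI/_backupVI logic in the analysis pass.)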

View File

@ -102,6 +102,49 @@ microVUt(void) mVUanalyzeFMAC4(int Fs, int Ft) {
analyzeReg4(Ft);
}
//------------------------------------------------------------------
// IALU - IALU Opcodes
//------------------------------------------------------------------
#define analyzeVIreg1(reg) { if (reg) { mVUstall = aMax(mVUstall, mVUregs.VI[reg]); } }
#define analyzeVIreg2(reg, aCycles) { if (reg) { mVUregsTemp.VIreg = reg; mVUregsTemp.VI = aCycles; mVUinfo |= _writesVI; mVU->VIbackup[0] = reg; } }
microVUt(void) mVUanalyzeIALU1(int Id, int Is, int It) {
microVU* mVU = mVUx;
if (!Id) { mVUinfo |= _isNOP; }
analyzeVIreg1(Is);
analyzeVIreg1(It);
analyzeVIreg2(Id, 1);
}
microVUt(void) mVUanalyzeIALU2(int Is, int It) {
microVU* mVU = mVUx;
if (!It) { mVUinfo |= _isNOP; }
analyzeVIreg1(Is);
analyzeVIreg2(It, 1);
}
//------------------------------------------------------------------
// MR32 - MR32 Opcode
//------------------------------------------------------------------
// Flips xyzw stalls to yzwx
#define analyzeReg6(reg) { \
if (reg) { \
if (_X) { mVUstall = aMax(mVUstall, aReg(reg).y); } \
if (_Y) { mVUstall = aMax(mVUstall, aReg(reg).z); } \
if (_Z) { mVUstall = aMax(mVUstall, aReg(reg).w); } \
if (_W) { mVUstall = aMax(mVUstall, aReg(reg).x); } \
} \
}
microVUt(void) mVUanalyzeMR32(int Fs, int Ft) {
microVU* mVU = mVUx;
if (!Ft) { mVUinfo |= _isNOP; }
analyzeReg6(Fs);
analyzeReg2(Ft);
}
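// (MR32 writes Ft.x = Fs.y, Ft.y = Fs.z, Ft.z = Fs.w, Ft.w = Fs.x, so the
// stall lookup above consults the source's lanes rotated one step — hence
// "flips xyzw stalls to yzwx".)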
//------------------------------------------------------------------
// FDIV - DIV/SQRT/RSQRT Opcodes
//------------------------------------------------------------------
@ -143,4 +186,120 @@ microVUt(void) mVUanalyzeEFU2(int Fs, u8 xCycles) {
analyzePreg(xCycles);
}
//------------------------------------------------------------------
// MFP - MFP Opcode
//------------------------------------------------------------------
microVUt(void) mVUanalyzeMFP(int Ft) {
microVU* mVU = mVUx; // ToDo: Needs special info for P reg?
if (!Ft) { mVUinfo |= _isNOP; }
analyzeReg2(Ft);
}
//------------------------------------------------------------------
// LQx - LQ/LQD/LQI Opcodes
//------------------------------------------------------------------
microVUt(void) mVUanalyzeLQ(int Ft, int Is, bool writeIs) {
microVU* mVU = mVUx;
analyzeVIreg1(Is);
analyzeReg2(Ft);
if (!Ft) { mVUinfo |= (writeIs && Is) ? _noWriteVF : _isNOP; }
if (writeIs) { analyzeVIreg2(Is, 1); }
}
//------------------------------------------------------------------
// SQx - SQ/SQD/SQI Opcodes
//------------------------------------------------------------------
microVUt(void) mVUanalyzeSQ(int Fs, int It, bool writeIt) {
microVU* mVU = mVUx;
analyzeReg1(Fs);
analyzeVIreg1(It);
if (writeIt) { analyzeVIreg2(It, 1); }
}
//------------------------------------------------------------------
// R*** - R Reg Opcodes
//------------------------------------------------------------------
#define analyzeRreg() { mVUregsTemp.r = 1; }
microVUt(void) mVUanalyzeR1(int Fs, int Fsf) {
microVU* mVU = mVUx;
analyzeReg5(Fs, Fsf);
analyzeRreg();
}
microVUt(void) mVUanalyzeR2(int Ft, bool canBeNOP) {
microVU* mVU = mVUx;
if (!Ft) { mVUinfo |= ((canBeNOP) ? _isNOP : _noWriteVF); }
analyzeReg2(Ft);
analyzeRreg();
}
//------------------------------------------------------------------
// Sflag - Status Flag Opcodes
//------------------------------------------------------------------
microVUt(void) mVUanalyzeSflag(int It) {
microVU* mVU = mVUx;
if (!It) { mVUinfo |= _isNOP; }
else { mVUinfo |= _isSflag | _swapOps; } // ToDo: set s flag at right time
analyzeVIreg2(It, 1);
}
microVUt(void) mVUanalyzeFSSET() {
microVU* mVU = mVUx;
int i, curPC = iPC;
for (i = mVUcount; i > 0; i--) {
incPC2(-2);
if (isSflag) break;
mVUinfo &= ~_doStatus;
}
iPC = curPC;
}
//------------------------------------------------------------------
// XGkick
//------------------------------------------------------------------
#define analyzeXGkick1() { mVUstall = aMax(mVUstall, mVUregs.xgkick); }
#define analyzeXGkick2(x) { mVUregsTemp.xgkick = x; }
microVUt(void) mVUanalyzeXGkick(int Fs, int xCycles) {
microVU* mVU = mVUx;
analyzeVIreg1(Fs);
analyzeXGkick1();
analyzeXGkick2(xCycles);
}
//------------------------------------------------------------------
// Branches - Branch Opcodes
//------------------------------------------------------------------
#define analyzeBranchVI(reg, infoVal) { \
if (reg && (mVUcount > 0)) { /* Ensures branch is not first opcode in block */ \
incPC(-2); \
if (writesVI && (reg == mVU->VIbackup[0])) { /* If prev Op modified VI reg */ \
mVUinfo |= _backupVI; \
incPC(2); \
mVUinfo |= infoVal; \
} \
else { incPC(2); } \
} \
}
microVUt(void) mVUanalyzeBranch1(int Is) {
microVU* mVU = mVUx;
if (mVUregs.VI[Is]) { analyzeVIreg1(Is); }
else { analyzeBranchVI(Is, _memReadIs); }
}
microVUt(void) mVUanalyzeBranch2(int Is, int It) {
microVU* mVU = mVUx;
if (mVUregs.VI[Is] || mVUregs.VI[It]) { analyzeVIreg1(Is); analyzeVIreg1(It); }
else { analyzeBranchVI(Is, _memReadIs); analyzeBranchVI(It, _memReadIt);}
}
#endif //PCSX2_MICROVU

View File

@ -19,18 +19,6 @@
#pragma once
#ifdef PCSX2_MICROVU
#ifdef mVUdebug
#define mVUdebugStuff1() { \
if (curI & _Ibit_) { SysPrintf("microVU: I-bit set!\n"); } \
if (curI & _Ebit_) { SysPrintf("microVU: E-bit set!\n"); } \
if (curI & _Mbit_) { SysPrintf("microVU: M-bit set!\n"); } \
if (curI & _Dbit_) { SysPrintf("microVU: D-bit set!\n"); } \
if (curI & _Tbit_) { SysPrintf("microVU: T-bit set!\n"); } \
}
#else
#define mVUdebugStuff1() {}
#endif
#define createBlock(blockEndPtr) { \
block.pipelineState = pipelineState; \
block.x86ptrStart = x86ptrStart; \
@ -41,37 +29,89 @@
} \
}
#define curI mVUcurProg.data[iPC]
#define setCode() { mVU->code = curI; }
#define incPC(x) { iPC = ((iPC + x) & (mVU->progSize-1)); setCode(); }
#define startLoop() { mVUdebugStuff1(); mVUstall = 0; memset(&mVUregsTemp, 0, sizeof(mVUregsTemp)); }
#define branchCase(Xcmp) \
CMP16ItoM((uptr)&mVU->branch, 0); \
ajmp = Xcmp((uptr)0); \
break
#define branchCase2() { \
incPC(-2); \
MOV32ItoR(gprT1, (xPC + (2 * 8)) & ((vuIndex) ? 0x3fff:0xfff)); \
mVUallocVIb<vuIndex>(gprT1, _Ft_); \
incPC(+2); \
}
#define startLoop() { mVUdebug1(); mVUstall = 0; memset(&mVUregsTemp, 0, sizeof(mVUregsTemp)); }
#define calcCycles(reg, x) { reg = ((reg > x) ? (reg - x) : 0); }
#define incP() { mVU->p = (mVU->p+1) & 1; }
#define incQ() { mVU->q = (mVU->q+1) & 1; }
microVUt(void) mVUincCycles(int x) {
microVU* mVU = mVUx;
mVUcycles += x;
for (int z = 31; z > 0; z--) {
calcCycles(mVUregs.VF[z].x, x);
calcCycles(mVUregs.VF[z].y, x);
calcCycles(mVUregs.VF[z].z, x);
calcCycles(mVUregs.VF[z].w, x);
}
for (int z = 16; z > 0; z--) {
calcCycles(mVUregs.VI[z], x);
}
if (mVUregs.q) {
calcCycles(mVUregs.q, x);
if (!mVUregs.q) { incQ(); } // Do Status Flag Merging Stuff?
}
if (mVUregs.p) {
calcCycles(mVUregs.p, x);
if (!mVUregs.p) { incP(); }
}
calcCycles(mVUregs.r, x);
calcCycles(mVUregs.xgkick, x);
}
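// Worked example (illustrative): calcCycles is a saturating subtract, so a
// result that becomes ready in 3 cycles (e.g. mVUregs.VF[n].x == 3) drops to
// 1 after mVUincCycles(2) and pins at 0 on any later advance; a counter of 0
// is what the analyzeReg stall checks treat as "no stall needed".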
microVUt(void) mVUsetCycles() {
microVU* mVU = mVUx;
incCycles(mVUstall);
if (mVUregsTemp.VFreg[0] == mVUregsTemp.VFreg[1] && !mVUregsTemp.VFreg[0]) { // If upper Op && lower Op write to same VF reg
mVUinfo |= (mVUregsTemp.r || mVUregsTemp.VI) ? _noWriteVF : _isNOP; // If lower Op doesn't modify anything else, then make it a NOP
mVUregsTemp.VF[1].x = aMax(mVUregsTemp.VF[0].x, mVUregsTemp.VF[1].x); // Use max cycles from each vector
mVUregsTemp.VF[1].y = aMax(mVUregsTemp.VF[0].y, mVUregsTemp.VF[1].y);
mVUregsTemp.VF[1].z = aMax(mVUregsTemp.VF[0].z, mVUregsTemp.VF[1].z);
mVUregsTemp.VF[1].w = aMax(mVUregsTemp.VF[0].w, mVUregsTemp.VF[1].w);
}
mVUregs.VF[mVUregsTemp.VFreg[0]].reg = mVUregsTemp.VF[0].reg;
mVUregs.VF[mVUregsTemp.VFreg[1]].reg =(mVUregsTemp.VFreg[0] == mVUregsTemp.VFreg[1]) ? (aMax(mVUregsTemp.VF[0].reg, mVUregsTemp.VF[1].reg)) : (mVUregsTemp.VF[1].reg);
mVUregs.VF[mVUregsTemp.VFreg[1]].reg = mVUregsTemp.VF[1].reg;
mVUregs.VI[mVUregsTemp.VIreg] = mVUregsTemp.VI;
mVUregs.q = mVUregsTemp.q;
mVUregs.p = mVUregsTemp.p;
mVUregs.r = mVUregsTemp.r;
mVUregs.xgkick = mVUregsTemp.xgkick;
}
microVUx(void) mVUcompile(u32 startPC, u32 pipelineState, microRegInfo* pState, u8* x86ptrStart) {
//------------------------------------------------------------------
// Recompiler
//------------------------------------------------------------------
microVUx(void*) mVUcompile(u32 startPC, u32 pipelineState, microRegInfo* pState, u8* x86ptrStart) {
microVU* mVU = mVUx;
microBlock block;
int branch;
u8* thisPtr = mVUcurProg.x86ptr;
iPC = startPC / 4;
// Searches for Existing Compiled Block (if found, then returns; else, compile)
microBlock* pblock = mVUblock[iPC]->search(pipelineState, pState);
if (block) { x86SetPtr(pblock->x86ptrEnd); return; }
microBlock* pblock = mVUblock[iPC/2]->search(pipelineState, pState);
if (pblock) { return pblock->x86ptrStart; }
// First Pass
setCode();
branch = 0;
mVUbranch = 0;
mVUstartPC = iPC;
mVUcount = 0;
mVUcycles = 1; // Skips "M" phase, and starts counting cycles at "T" stage
for (;;) {
mVU->p = 0; // All blocks start at p index #0
mVU->q = 0; // All blocks start at q index #0
for (int branch = 0;; ) {
startLoop();
mVUopU<vuIndex, 0>();
if (curI & _Ebit_) { branch = 1; }
@ -79,34 +119,70 @@ microVUx(void) mVUcompile(u32 startPC, u32 pipelineState, microRegInfo* pState,
if (curI & _Ibit_) { incPC(1); mVUinfo |= _isNOP; }
else { incPC(1); mVUopL<vuIndex, 0>(); }
mVUsetCycles<vuIndex>();
if (mVU->p) { mVUinfo |= _readP; }
if (mVU->q) { mVUinfo |= _readQ; }
else { mVUinfo |= _writeQ; }
if (branch >= 2) { mVUinfo |= _isEOB | ((branch == 3) ? _isBdelay : 0); if (mVUbranch) { Console::Error("microVU Warning: Branch in E-bit/Branch delay slot!"); mVUinfo |= _isNOP; } break; }
else if (branch == 1) { branch = 2; }
if (mVUbranch) { branch = 3; mVUbranch = 0; mVUinfo |= _isBranch; }
incPC(1);
incCycles(1);
mVUcount++;
}
// Second Pass
iPC = startPC;
iPC = mVUstartPC;
setCode();
for (bool x = 1; x; ) {
//
// ToDo: status/mac flag stuff?
//
if (isEOB) { x = 0; }
else if (isBranch) { mVUopU<vuIndex, 1>(); incPC(2); }
//if (isBranch2) { mVUopU<vuIndex, 1>(); incPC(2); }
if (isNop) { mVUopU<vuIndex, 1>(); if (curI & _Ibit_) { incPC(1); mVU->iReg = curI; } else { incPC(1); } }
else if (!swapOps) { mVUopU<vuIndex, 1>(); incPC(1); mVUopL<vuIndex, 1>(); }
else { incPC(1); mVUopL<vuIndex, 1>(); incPC(-1); mVUopU<vuIndex, 1>(); incPC(1); }
mVUopU<vuIndex, 1>();
if (isNop) { if (curI & _Ibit_) { incPC(1); mVU->iReg = curI; } else { incPC(1); } }
else { incPC(1); mVUopL<vuIndex, 1>(); }
if (!isBdelay) { incPC(1); }
else {
incPC(-2); // Go back to Branch Opcode
mVUopL<vuIndex, 1>(); // Run Branch Opcode
u32* ajmp;
switch (mVUbranch) {
case 1: break;
case 2: break;
case 3: break;
case 3: branchCase(JZ32); // IBEQ
case 4: branchCase(JGE32); // IBGEZ
case 5: branchCase(JG32); // IBGTZ
case 6: branchCase(JLE32); // IBLEQ
case 7: branchCase(JL32); // IBLTZ
case 8: branchCase(JNZ32); // IBNEQ
case 2: branchCase2(); // BAL
case 1:
// search for block
ajmp = JMP32((uptr)0);
break; // B/BAL
case 9: branchCase2(); // JALR
case 10: break; // JR/JALR
//mVUcurProg.x86Ptr
}
break;
return thisPtr;
}
}
// Do E-bit end stuff here
incCycles(55); // Ensures Valid P/Q instances
mVUcycles -= 55;
if (mVU->q) { SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0xe5); }
SSE_MOVSS_XMM_to_M32((uptr)&mVU->regs->VI[REG_Q], xmmPQ);
SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, mVU->p ? 3 : 2);
SSE_MOVSS_XMM_to_M32((uptr)&mVU->regs->VI[REG_P], xmmPQ);
MOV32ItoM((uptr)&mVU->p, mVU->p);
MOV32ItoM((uptr)&mVU->q, mVU->q);
AND32ItoM((uptr)&microVU0.regs.VI[REG_VPU_STAT].UL, (vuIndex ? ~0x100 : ~0x001)); // VBS0/VBS1 flag
AND32ItoM((uptr)&mVU->regs->vifRegs->stat, ~0x4); // clears VEW (bit 2: VIF stalled waiting for the VU program to end); zerorecs do the same
MOV32ItoM((uptr)&mVU->regs->VI[REG_TPC], xPC);
JMP32((uptr)mVU->exitFunct - ((uptr)x86Ptr + 5));
return thisPtr;
}
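// (On the E-bit epilogue above: xmmPQ holds both pipeline instances packed as
// Q in the low two dwords and P in the high two — the dispatcher's SHUFPS sets
// this up — so the PSHUFD with 0xe5 rotates the second Q instance into lane x
// before the MOVSS writeback, and the (mVU->p ? 3 : 2) shuffle selects the
// current P instance the same way.)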
#endif //PCSX2_MICROVU

View File

@ -0,0 +1,164 @@
/* Pcsx2 - Pc Ps2 Emulator
* Copyright (C) 2009 Pcsx2-Playground Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#pragma once
#ifdef PCSX2_MICROVU
//------------------------------------------------------------------
// Dispatcher Functions
//------------------------------------------------------------------
// Generates the code for entering recompiled blocks
microVUt(void) mVUdispatcherA() {
static u32 PCSX2_ALIGNED16(vuMXCSR);
microVU* mVU = mVUx;
x86SetPtr(mVU->ptr);
mVU->startFunct = mVU->ptr;
// __fastcall = The first two DWORD or smaller arguments are passed in ECX and EDX registers; all other arguments are passed right to left.
if (!vuIndex) { CALLFunc((uptr)mVUexecuteVU0); }
else { CALLFunc((uptr)mVUexecuteVU1); }
// Backup cpu state
PUSH32R(EBX);
PUSH32R(EBP);
PUSH32R(ESI);
PUSH32R(EDI);
// Load VU's MXCSR state
vuMXCSR = g_sseVUMXCSR;
SSE_LDMXCSR((uptr)&vuMXCSR);
// Load Regs
MOV32MtoR(gprR, (uptr)&mVU->regs->VI[REG_R]);
MOV32MtoR(gprF0, (uptr)&mVU->regs->VI[REG_STATUS_FLAG]);
MOV32MtoR(gprF1, (uptr)&mVU->regs->VI[REG_MAC_FLAG]);
SHL32ItoR(gprF0, 16);
AND32ItoR(gprF1, 0xffff);
OR32RtoR (gprF0, gprF1);
MOV32RtoR(gprF1, gprF0);
MOV32RtoR(gprF2, gprF0);
MOV32RtoR(gprF3, gprF0);
for (int i = 0; i < 8; i++) {
MOVQMtoR(i, (uptr)&mVU->regs->VI[i+1]);
}
SSE_MOVAPS_M128_to_XMM(xmmACC, (uptr)&mVU->regs->ACC);
SSE_MOVAPS_M128_to_XMM(xmmMax, (uptr)mVU_maxvals);
SSE_MOVAPS_M128_to_XMM(xmmMin, (uptr)mVU_minvals);
SSE_MOVAPS_M128_to_XMM(xmmT1, (uptr)&mVU->regs->VI[REG_P]);
SSE_MOVAPS_M128_to_XMM(xmmPQ, (uptr)&mVU->regs->VI[REG_Q]);
SSE_SHUFPS_XMM_to_XMM(xmmPQ, xmmT1, 0); // wzyx = PPQQ
// Jump to Recompiled Code Block
JMPR(EAX);
mVU->ptr = x86Ptr;
}
// Generates the code to exit from recompiled blocks
microVUt(void) mVUdispatcherB() {
static u32 PCSX2_ALIGNED16(eeMXCSR);
microVU* mVU = mVUx;
x86SetPtr(mVU->ptr);
mVU->exitFunct = mVU->ptr;
// __fastcall = The first two DWORD or smaller arguments are passed in ECX and EDX registers; all other arguments are passed right to left.
if (!vuIndex) { CALLFunc((uptr)mVUcleanUpVU0); }
else { CALLFunc((uptr)mVUcleanUpVU1); }
// Load EE's MXCSR state
eeMXCSR = g_sseMXCSR;
SSE_LDMXCSR((uptr)&eeMXCSR);
// Save Regs
MOV32RtoR(gprT1, gprF0); // ToDo: Ensure Correct Flag instances
AND32ItoR(gprT1, 0xffff);
SHR32ItoR(gprF0, 16);
MOV32RtoM((uptr)&mVU->regs->VI[REG_R], gprR);
MOV32RtoM((uptr)&mVU->regs->VI[REG_STATUS_FLAG], gprT1);
MOV32RtoM((uptr)&mVU->regs->VI[REG_MAC_FLAG], gprF0);
for (int i = 0; i < 8; i++) {
MOVDMMXtoM((uptr)&mVU->regs->VI[i+1], i);
}
SSE_MOVAPS_XMM_to_M128((uptr)&mVU->regs->ACC, xmmACC);
//SSE_MOVSS_XMM_to_M32((uptr)&mVU->regs->VI[REG_Q], xmmPQ); // ToDo: Ensure Correct Q/P instances
//SSE2_PSHUFD_XMM_to_XMM(xmmPQ, xmmPQ, 0); // wzyx = PPPP
//SSE_MOVSS_XMM_to_M32((uptr)&mVU->regs->VI[REG_P], xmmPQ);
// Restore cpu state
POP32R(EDI);
POP32R(ESI);
POP32R(EBP);
POP32R(EBX);
EMMS();
RET();
mVU->ptr = x86Ptr;
mVUcachCheck(mVU->cache, 512);
}
//------------------------------------------------------------------
// Execution Functions
//------------------------------------------------------------------
// Executes for number of cycles
microVUt(void*) __fastcall mVUexecute(u32 startPC, u32 cycles) {
/*
Pseudocode: (ToDo: implement # of cycles)
1) Search for existing program
2) If program not found, goto 5
3) Search for recompiled block
4) If recompiled block found, goto 6
5) Recompile as much blocks as possible
6) Return start execution address of block
*/
microVU* mVU = mVUx;
mVUlog("microVU%x: startPC = 0x%x, cycles = 0x%x", params vuIndex, startPC, cycles);
if ( mVUsearchProg(mVU) ) { // Found Program
//microBlock* block = mVU->prog.prog[mVU->prog.cur].block[startPC]->search(mVU->prog.lastPipelineState);
//if (block) return block->x86ptrStart; // Found Block
}
// Recompile code
return NULL;
}
//------------------------------------------------------------------
// Cleanup Functions
//------------------------------------------------------------------
microVUt(void) mVUcleanUp() {
microVU* mVU = mVUx;
mVU->ptr = mVUcurProg.x86ptr;
mVUcachCheck(mVUcurProg.x86start, (uptr)(mVUcurProg.x86end - mVUcurProg.x86start));
}
//------------------------------------------------------------------
// Caller Functions
//------------------------------------------------------------------
void __fastcall startVU0(u32 startPC, u32 cycles) { ((mVUrecCall)microVU0.startFunct)(startPC, cycles); }
void __fastcall startVU1(u32 startPC, u32 cycles) { ((mVUrecCall)microVU1.startFunct)(startPC, cycles); }
void* __fastcall mVUexecuteVU0(u32 startPC, u32 cycles) { return mVUexecute<0>(startPC, cycles); }
void* __fastcall mVUexecuteVU1(u32 startPC, u32 cycles) { return mVUexecute<1>(startPC, cycles); }
void mVUcleanUpVU0() { mVUcleanUp<0>(); }
void mVUcleanUpVU1() { mVUcleanUp<1>(); }
#endif //PCSX2_MICROVU

View File

@ -23,6 +23,10 @@
// Micro VU Micromode Lower instructions
//------------------------------------------------------------------
//------------------------------------------------------------------
// DIV/SQRT/RSQRT
//------------------------------------------------------------------
#define testZero(xmmReg, xmmTemp, gprTemp) { \
SSE_XORPS_XMM_to_XMM(xmmTemp, xmmTemp); /* Clear xmmTemp (make it 0) */ \
SSE_CMPEQPS_XMM_to_XMM(xmmTemp, xmmReg); /* Set all F's if zero */ \
@ -128,6 +132,10 @@ microVUf(void) mVU_RSQRT() {
}
}
//------------------------------------------------------------------
// EATAN/EEXP/ELENG/ERCPR/ERLENG/ERSADD/ERSQRT/ESADD/ESIN/ESQRT/ESUM
//------------------------------------------------------------------
#define EATANhelper(addr) { \
SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \
SSE_MULSS_XMM_to_XMM(xmmT1, xmmFs); \
@ -401,6 +409,10 @@ microVUf(void) mVU_ESUM() {
}
}
//------------------------------------------------------------------
// FCAND/FCEQ/FCGET/FCOR/FCSET
//------------------------------------------------------------------
microVUf(void) mVU_FCAND() {
microVU* mVU = mVUx;
if (!recPass) {}
@ -456,6 +468,10 @@ microVUf(void) mVU_FCSET() {
}
}
//------------------------------------------------------------------
// FMAND/FMEQ/FMOR
//------------------------------------------------------------------
microVUf(void) mVU_FMAND() {
microVU* mVU = mVUx;
if (!recPass) {}
@ -491,9 +507,13 @@ microVUf(void) mVU_FMOR() {
}
}
//------------------------------------------------------------------
// FSAND/FSEQ/FSOR/FSSET
//------------------------------------------------------------------
microVUf(void) mVU_FSAND() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeSflag<vuIndex>(_Ft_); }
else {
mVUallocSFLAGa<vuIndex>(gprT1, fvsInstance);
AND16ItoR(gprT1, _Imm12_);
@ -503,7 +523,7 @@ microVUf(void) mVU_FSAND() {
microVUf(void) mVU_FSEQ() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeSflag<vuIndex>(_Ft_); }
else {
mVUallocSFLAGa<vuIndex>(gprT1, fvsInstance);
XOR16ItoR(gprT1, _Imm12_);
@ -515,7 +535,7 @@ microVUf(void) mVU_FSEQ() {
microVUf(void) mVU_FSOR() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeSflag<vuIndex>(_Ft_); }
else {
mVUallocSFLAGa<vuIndex>(gprT1, fvsInstance);
OR16ItoR(gprT1, _Imm12_);
@ -525,20 +545,22 @@ microVUf(void) mVU_FSOR() {
microVUf(void) mVU_FSSET() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeFSSET<vuIndex>(); }
else {
int flagReg;
getFlagReg(flagReg, fsInstance);
MOV16ItoR(gprT1, (_Imm12_ & 0xfc0));
//if (_Imm12_ & 0xc00) { mVUdivFlag = _Imm12_ >> 9; }
//else { mVUdivFlag = 1; }
//mVUdivFlagT = 4;
AND32ItoR(flagReg, 0x03f);
OR32ItoR(flagReg, (_Imm12_ & 0xfc0));
}
}
//------------------------------------------------------------------
// IADD/IADDI/IADDIU/IAND/IOR/ISUB/ISUBIU
//------------------------------------------------------------------
microVUf(void) mVU_IADD() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeIALU1<vuIndex>(_Fd_, _Fs_, _Ft_); }
else {
mVUallocVIa<vuIndex>(gprT1, _Fs_);
if (_Ft_ != _Fs_) {
@ -552,7 +574,7 @@ microVUf(void) mVU_IADD() {
microVUf(void) mVU_IADDI() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeIALU2<vuIndex>(_Fs_, _Ft_); }
else {
mVUallocVIa<vuIndex>(gprT1, _Fs_);
ADD16ItoR(gprT1, _Imm5_);
@ -562,7 +584,7 @@ microVUf(void) mVU_IADDI() {
microVUf(void) mVU_IADDIU() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeIALU2<vuIndex>(_Fs_, _Ft_); }
else {
mVUallocVIa<vuIndex>(gprT1, _Fs_);
ADD16ItoR(gprT1, _Imm12_);
@ -572,7 +594,7 @@ microVUf(void) mVU_IADDIU() {
microVUf(void) mVU_IAND() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeIALU1<vuIndex>(_Fd_, _Fs_, _Ft_); }
else {
mVUallocVIa<vuIndex>(gprT1, _Fs_);
if (_Ft_ != _Fs_) {
@ -585,7 +607,7 @@ microVUf(void) mVU_IAND() {
microVUf(void) mVU_IOR() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeIALU1<vuIndex>(_Fd_, _Fs_, _Ft_); }
else {
mVUallocVIa<vuIndex>(gprT1, _Fs_);
if (_Ft_ != _Fs_) {
@ -598,7 +620,7 @@ microVUf(void) mVU_IOR() {
microVUf(void) mVU_ISUB() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeIALU1<vuIndex>(_Fd_, _Fs_, _Ft_); }
else {
if (_Ft_ != _Fs_) {
mVUallocVIa<vuIndex>(gprT1, _Fs_);
@ -615,7 +637,7 @@ microVUf(void) mVU_ISUB() {
microVUf(void) mVU_ISUBIU() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeIALU2<vuIndex>(_Fs_, _Ft_); }
else {
mVUallocVIa<vuIndex>(gprT1, _Fs_);
SUB16ItoR(gprT1, _Imm12_);
@ -623,18 +645,13 @@ microVUf(void) mVU_ISUBIU() {
}
}
microVUf(void) mVU_MOVE() {
microVU* mVU = mVUx;
if (!recPass) { /*If (!_Ft_ || (_Ft_ == _Fs_)) nop();*/ }
else {
mVUloadReg<vuIndex>(xmmT1, (uptr)&mVU->regs->VF[_Fs_].UL[0], _X_Y_Z_W);
mVUsaveReg<vuIndex>(xmmT1, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W);
}
}
//------------------------------------------------------------------
// MFIR/MFP/MOVE/MR32/MTIR
//------------------------------------------------------------------
microVUf(void) mVU_MFIR() {
microVU* mVU = mVUx;
if (!recPass) { /*If (!_Ft_) nop();*/ }
if (!recPass) { if (!_Ft_) { mVUinfo |= _isNOP; } analyzeVIreg1(_Fs_); analyzeReg2(_Ft_); }
else {
mVUallocVIa<vuIndex>(gprT1, _Fs_);
MOVSX32R16toR(gprT1, gprT1);
@ -646,25 +663,25 @@ microVUf(void) mVU_MFIR() {
microVUf(void) mVU_MFP() {
microVU* mVU = mVUx;
if (!recPass) { /*If (!_Ft_) nop();*/ }
if (!recPass) { mVUanalyzeMFP<vuIndex>(_Ft_); }
else {
getPreg(xmmFt);
mVUsaveReg<vuIndex>(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W);
}
}
microVUf(void) mVU_MTIR() {
microVUf(void) mVU_MOVE() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { if (!_Ft_ || (_Ft_ == _Fs_)) { mVUinfo |= _isNOP; } analyzeReg1(_Fs_); analyzeReg2(_Ft_); }
else {
MOVZX32M16toR(gprT1, (uptr)&mVU->regs->VF[_Fs_].UL[_Fsf_]);
mVUallocVIb<vuIndex>(gprT1, _Ft_);
mVUloadReg<vuIndex>(xmmT1, (uptr)&mVU->regs->VF[_Fs_].UL[0], _X_Y_Z_W);
mVUsaveReg<vuIndex>(xmmT1, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W);
}
}
microVUf(void) mVU_MR32() {
microVU* mVU = mVUx;
if (!recPass) { /*If (!_Ft_) nop();*/ }
if (!recPass) { mVUanalyzeMR32<vuIndex>(_Fs_, _Ft_); }
else {
mVUloadReg<vuIndex>(xmmT1, (uptr)&mVU->regs->VF[_Fs_].UL[0], (_X_Y_Z_W == 8) ? 4 : 15);
if (_X_Y_Z_W != 8) { SSE2_PSHUFD_XMM_to_XMM(xmmT1, xmmT1, 0x39); }
@ -672,9 +689,22 @@ microVUf(void) mVU_MR32() {
}
}
microVUf(void) mVU_MTIR() {
microVU* mVU = mVUx;
if (!recPass) { if (!_Ft_) { mVUinfo |= _isNOP; } analyzeReg5(_Fs_, _Fsf_); analyzeVIreg2(_Ft_, 1); }
else {
MOVZX32M16toR(gprT1, (uptr)&mVU->regs->VF[_Fs_].UL[_Fsf_]);
mVUallocVIb<vuIndex>(gprT1, _Ft_);
}
}
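As a rough C model of the two int/float moves above (not the emitted x86; array layout assumed): MFIR sign-extends VI[is] into every enabled field of VF[ft], while MTIR copies the raw low 16 bits of the selected VF[fs] field into VI[it].
// Sketch: xyzw packs the dest mask as X=8, Y=4, Z=2, W=1.
static void vuMFIR(s16 vi_is, u32 vf_ft[4], int xyzw) {
	s32 val = vi_is;                            // sign-extend, as MOVSX32R16toR above
	for (int f = 0; f < 4; f++)
		if (xyzw & (8 >> f)) vf_ft[f] = (u32)val;
}
static u16 vuMTIR(const u32 vf_fs[4], int fsf) {
	return (u16)vf_fs[fsf];                     // raw bits, as MOVZX32M16toR above
}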
//------------------------------------------------------------------
// ILW/ILWR
//------------------------------------------------------------------
microVUf(void) mVU_ILW() {
microVU* mVU = mVUx;
if (!recPass) { /*If (!_Ft_) nop();*/ }
if (!recPass) { if (!_Ft_) { mVUinfo |= _isNOP; } analyzeVIreg1(_Fs_); analyzeVIreg2(_Ft_, 4); }
else {
if (!_Fs_) {
MOVZX32M16toR( gprT1, (uptr)mVU->regs->Mem + getVUmem(_Imm11_) + offsetSS );
@ -684,7 +714,7 @@ microVUf(void) mVU_ILW() {
mVUallocVIa<vuIndex>(gprT1, _Fs_);
ADD32ItoR(gprT1, _Imm11_);
mVUaddrFix<vuIndex>(gprT1);
MOV32RmSOffsettoR(gprT1, gprT1, (uptr)mVU->regs->Mem + offsetSS, 0); // ToDo: check if this works.
MOV32RmtoR(gprT1, gprT1, (uptr)mVU->regs->Mem + offsetSS);
if (isMMX(_Ft_)) AND32ItoR(gprT1, 0xffff);
mVUallocVIb<vuIndex>(gprT1, _Ft_);
}
@ -693,25 +723,29 @@ microVUf(void) mVU_ILW() {
microVUf(void) mVU_ILWR() {
microVU* mVU = mVUx;
if (!recPass) { /*If (!_Ft_) nop();*/ }
if (!recPass) { if (!_Ft_) { mVUinfo |= _isNOP; } analyzeVIreg1(_Fs_); analyzeVIreg2(_Ft_, 4); }
else {
if (!_Fs_) {
MOVZX32M16toR( gprT1, (uptr)mVU->regs->Mem + offsetSS );
MOVZX32M16toR(gprT1, (uptr)mVU->regs->Mem + offsetSS);
mVUallocVIb<vuIndex>(gprT1, _Ft_);
}
else {
mVUallocVIa<vuIndex>(gprT1, _Fs_);
mVUaddrFix<vuIndex>(gprT1);
MOV32RmSOffsettoR(gprT1, gprT1, (uptr)mVU->regs->Mem + offsetSS, 0); // ToDo: check if this works.
MOV32RmtoR(gprT1, gprT1, (uptr)mVU->regs->Mem + offsetSS);
if (isMMX(_Ft_)) AND32ItoR(gprT1, 0xffff);
mVUallocVIb<vuIndex>(gprT1, _Ft_);
}
}
}
//------------------------------------------------------------------
// ISW/ISWR
//------------------------------------------------------------------
microVUf(void) mVU_ISW() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { analyzeVIreg1(_Fs_); analyzeVIreg1(_Ft_); }
else {
if (!_Fs_) {
int imm = getVUmem(_Imm11_);
@ -726,17 +760,17 @@ microVUf(void) mVU_ISW() {
mVUallocVIa<vuIndex>(gprT2, _Ft_);
ADD32ItoR(gprT1, _Imm11_);
mVUaddrFix<vuIndex>(gprT1);
if (_X) MOV32RtoRmOffset(gprT1, gprT2, (uptr)mVU->regs->Mem);
if (_Y) MOV32RtoRmOffset(gprT1, gprT2, (uptr)mVU->regs->Mem+4);
if (_Z) MOV32RtoRmOffset(gprT1, gprT2, (uptr)mVU->regs->Mem+8);
if (_W) MOV32RtoRmOffset(gprT1, gprT2, (uptr)mVU->regs->Mem+12);
if (_X) MOV32RtoRm(gprT1, gprT2, (uptr)mVU->regs->Mem);
if (_Y) MOV32RtoRm(gprT1, gprT2, (uptr)mVU->regs->Mem+4);
if (_Z) MOV32RtoRm(gprT1, gprT2, (uptr)mVU->regs->Mem+8);
if (_W) MOV32RtoRm(gprT1, gprT2, (uptr)mVU->regs->Mem+12);
}
}
}
microVUf(void) mVU_ISWR() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { analyzeVIreg1(_Fs_); analyzeVIreg1(_Ft_); }
else {
if (!_Fs_) {
mVUallocVIa<vuIndex>(gprT1, _Ft_);
@ -749,17 +783,21 @@ microVUf(void) mVU_ISWR() {
mVUallocVIa<vuIndex>(gprT1, _Fs_);
mVUallocVIa<vuIndex>(gprT2, _Ft_);
mVUaddrFix<vuIndex>(gprT1);
if (_X) MOV32RtoRmOffset(gprT1, gprT2, (uptr)mVU->regs->Mem);
if (_Y) MOV32RtoRmOffset(gprT1, gprT2, (uptr)mVU->regs->Mem+4);
if (_Z) MOV32RtoRmOffset(gprT1, gprT2, (uptr)mVU->regs->Mem+8);
if (_W) MOV32RtoRmOffset(gprT1, gprT2, (uptr)mVU->regs->Mem+12);
if (_X) MOV32RtoRm(gprT1, gprT2, (uptr)mVU->regs->Mem);
if (_Y) MOV32RtoRm(gprT1, gprT2, (uptr)mVU->regs->Mem+4);
if (_Z) MOV32RtoRm(gprT1, gprT2, (uptr)mVU->regs->Mem+8);
if (_W) MOV32RtoRm(gprT1, gprT2, (uptr)mVU->regs->Mem+12);
}
}
}
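The integer loads/stores above all share one addressing scheme: VI[is] plus the signed 11-bit immediate gives a quadword index, which mVUaddrFix wraps to VU memory before the 32-bit field accesses. A hedged sketch for the VU1 case (16KB data RAM; VU0 differs, see the mVUaddrFix hunk further down):
// Sketch of the ILW path, assuming VU1's 16KB data RAM; offsetSS selects
// which of the X/Y/Z/W 32-bit fields the dest component maps to.
static u16 vu1ILW(const u8* vuMem, u16 vi_is, s16 imm11, u32 offsetSS) {
	u32 addr = ((u32)((vi_is + imm11) & 0x3ff)) * 16;  // quadword index -> byte address, wrapped
	return *(const u16*)(vuMem + addr + offsetSS);
}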
//------------------------------------------------------------------
// LQ/LQD/LQI
//------------------------------------------------------------------
microVUf(void) mVU_LQ() {
microVU* mVU = mVUx;
if (!recPass) { /*If (!_Ft_) nop();*/ }
if (!recPass) { mVUanalyzeLQ<vuIndex>(_Ft_, _Fs_, 0); }
else {
if (!_Fs_) {
mVUloadReg<vuIndex>(xmmFt, (uptr)mVU->regs->Mem + getVUmem(_Imm11_), _X_Y_Z_W);
@ -777,9 +815,9 @@ microVUf(void) mVU_LQ() {
microVUf(void) mVU_LQD() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeLQ<vuIndex>(_Ft_, _Fs_, 1); }
else {
if (!_Fs_ && _Ft_) {
if (!_Fs_ && !noWriteVF) {
mVUloadReg<vuIndex>(xmmFt, (uptr)mVU->regs->Mem, _X_Y_Z_W);
mVUsaveReg<vuIndex>(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W);
}
@ -787,7 +825,7 @@ microVUf(void) mVU_LQD() {
mVUallocVIa<vuIndex>(gprT1, _Fs_);
SUB16ItoR(gprT1, 1);
mVUallocVIb<vuIndex>(gprT1, _Fs_); // ToDo: Backup to memory check.
if (_Ft_) {
if (!noWriteVF) {
mVUaddrFix<vuIndex>(gprT1);
mVUloadReg2<vuIndex>(xmmFt, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W);
mVUsaveReg<vuIndex>(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W);
@ -798,15 +836,15 @@ microVUf(void) mVU_LQD() {
microVUf(void) mVU_LQI() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeLQ<vuIndex>(_Ft_, _Fs_, 1); }
else {
if (!_Fs_ && _Ft_) {
if (!_Fs_ && !noWriteVF) {
mVUloadReg<vuIndex>(xmmFt, (uptr)mVU->regs->Mem, _X_Y_Z_W);
mVUsaveReg<vuIndex>(xmmFt, (uptr)&mVU->regs->VF[_Ft_].UL[0], _X_Y_Z_W);
}
else {
mVUallocVIa<vuIndex>((_Ft_) ? gprT1 : gprT2, _Fs_);
if (_Ft_) {
if (!noWriteVF) {
MOV32RtoR(gprT2, gprT1);
mVUaddrFix<vuIndex>(gprT1);
mVUloadReg2<vuIndex>(xmmFt, gprT1, (uptr)mVU->regs->Mem, _X_Y_Z_W);
@ -818,9 +856,13 @@ microVUf(void) mVU_LQI() {
}
}
//------------------------------------------------------------------
// SQ/SQD/SQI
//------------------------------------------------------------------
microVUf(void) mVU_SQ() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeSQ<vuIndex>(_Fs_, _Ft_, 0); }
else {
if (!_Ft_) {
getReg7(xmmFs, _Fs_);
@ -838,7 +880,7 @@ microVUf(void) mVU_SQ() {
microVUf(void) mVU_SQD() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeSQ<vuIndex>(_Fs_, _Ft_, 1); }
else {
if (!_Ft_) {
getReg7(xmmFs, _Fs_);
@ -857,7 +899,7 @@ microVUf(void) mVU_SQD() {
microVUf(void) mVU_SQI() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeSQ<vuIndex>(_Fs_, _Ft_, 1); }
else {
if (!_Ft_) {
getReg7(xmmFs, _Fs_);
@ -875,9 +917,13 @@ microVUf(void) mVU_SQI() {
}
}
//------------------------------------------------------------------
// RINIT/RGET/RNEXT/RXOR
//------------------------------------------------------------------
microVUf(void) mVU_RINIT() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeR1<vuIndex>(_Fs_, _Fsf_); }
else {
if (_Fs_ || (_Fsf_ == 3)) {
getReg8(gprR, _Fs_, _Fsf_);
@ -890,7 +936,7 @@ microVUf(void) mVU_RINIT() {
microVUt(void) mVU_RGET_() {
microVU* mVU = mVUx;
if (_Ft_) {
if (!noWriteVF) {
if (_X) MOV32RtoM((uptr)&mVU->regs->VF[_Ft_].UL[0], gprR);
if (_Y) MOV32RtoM((uptr)&mVU->regs->VF[_Ft_].UL[1], gprR);
if (_Z) MOV32RtoM((uptr)&mVU->regs->VF[_Ft_].UL[2], gprR);
@ -900,13 +946,13 @@ microVUt(void) mVU_RGET_() {
microVUf(void) mVU_RGET() {
microVU* mVU = mVUx;
if (!recPass) { /*if (!_Ft_) nop();*/ }
if (!recPass) { mVUanalyzeR2<vuIndex>(_Ft_, 1); }
else { mVU_RGET_<vuIndex>(); }
}
microVUf(void) mVU_RNEXT() {
microVU* mVU = mVUx;
if (!recPass) { /*if (!_Ft_) nop();*/ }
if (!recPass) { mVUanalyzeR2<vuIndex>(_Ft_, 0); }
else {
// algorithm from www.project-fao.org
MOV32RtoR(gprT1, gprR);
@ -928,7 +974,7 @@ microVUf(void) mVU_RNEXT() {
microVUf(void) mVU_RXOR() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeR1<vuIndex>(_Fs_, _Fsf_); }
else {
if (_Fs_ || (_Fsf_ == 3)) {
getReg8(gprT1, _Fs_, _Fsf_);
@ -938,21 +984,27 @@ microVUf(void) mVU_RXOR() {
}
}
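The RNEXT body is cut off by the hunk above; the usual software model of the VU R-register sequence (as in VU interpreters, assumed here) is a small LFSR on bits 4 and 22, with the result forced into float range:
// Sketch (assumed model) of one RNEXT step on the 32-bit R register.
static u32 vuRNext(u32 R) {
	u32 x = (R >> 4) & 1;
	u32 y = (R >> 22) & 1;
	R <<= 1;
	R ^= x ^ y;
	return (R & 0x007fffff) | 0x3f800000;  // keep mantissa, force 1.0f <= R < 2.0f
}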
//------------------------------------------------------------------
// WaitP/WaitQ
//------------------------------------------------------------------
microVUf(void) mVU_WAITP() {
microVU* mVU = mVUx;
if (!recPass) {}
else {}
if (!recPass) { mVUstall = aMax(mVUstall, ((mVUregs.p) ? (mVUregs.p - 1) : 0)); }
}
microVUf(void) mVU_WAITQ() {
microVU* mVU = mVUx;
if (!recPass) {}
else {}
if (!recPass) { mVUstall = aMax(mVUstall, mVUregs.q); }
}
//------------------------------------------------------------------
// XTOP/XITOP
//------------------------------------------------------------------
microVUf(void) mVU_XTOP() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { if (!_Ft_) { mVUinfo |= _isNOP; } analyzeVIreg2(_Ft_, 1); }
else {
MOVZX32M16toR( gprT1, (uptr)&mVU->regs->vifRegs->top);
mVUallocVIb<vuIndex>(gprT1, _Ft_);
@ -961,13 +1013,17 @@ microVUf(void) mVU_XTOP() {
microVUf(void) mVU_XITOP() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { if (!_Ft_) { mVUinfo |= _isNOP; } analyzeVIreg2(_Ft_, 1); }
else {
MOVZX32M16toR( gprT1, (uptr)&mVU->regs->vifRegs->itop );
mVUallocVIb<vuIndex>(gprT1, _Ft_);
}
}
//------------------------------------------------------------------
// XGkick
//------------------------------------------------------------------
microVUt(void) __fastcall mVU_XGKICK_(u32 addr) {
microVU* mVU = mVUx;
u32 *data = (u32*)(mVU->regs->Mem + (addr&0x3fff));
@ -981,61 +1037,106 @@ void __fastcall mVU_XGKICK1(u32 addr) { mVU_XGKICK_<1>(addr); }
microVUf(void) mVU_XGKICK() {
microVU* mVU = mVUx;
if (!recPass) {}
if (!recPass) { mVUanalyzeXGkick<vuIndex>(_Fs_, 4); }
else {
mVUallocVIa<vuIndex>(gprT2, _Fs_); // gprT2 = ECX for __fastcall
PUSH32R(gprR); // gprR = EDX is volatile so backup
if (!vuIndex) CALLFunc((uptr)mVU_XGKICK0);
else CALLFunc((uptr)mVU_XGKICK1);
POP32R(gprR); // Restore
}
}
//------------------------------------------------------------------
// Branches
// Branches/Jumps
//------------------------------------------------------------------
microVUf(void) mVU_B() {
microVU* mVU = mVUx;
mVUbranch = 1;
if (!recPass) { /*mVUinfo |= _isBranch2;*/ }
}
microVUf(void) mVU_BAL() {
microVU* mVU = mVUx;
mVUbranch = 1;
if (recPass) {
MOV32ItoR(gprT1, (xPC + (2 * 8)) & 0xffff);
mVUallocVIb<vuIndex>(gprT1, _Ft_);
}
mVUbranch = 2;
if (!recPass) { /*mVUinfo |= _isBranch2;*/ analyzeVIreg2(_Ft_, 1); }
else {}
}
microVUf(void) mVU_IBEQ() {
microVU* mVU = mVUx;
mVUbranch = 2;
mVUbranch = 3;
if (!recPass) { mVUanalyzeBranch2<vuIndex>(_Fs_, _Ft_); }
else {
if (memReadIs) MOV32MtoR(gprT1, (uptr)mVU->VIbackup[0]);
else mVUallocVIa<vuIndex>(gprT1, _Fs_);
if (memReadIt) XOR32MtoR(gprT1, (uptr)mVU->VIbackup[0]);
else { mVUallocVIa<vuIndex>(gprT2, _Ft_); XOR32RtoR(gprT1, gprT2); }
MOV32RtoM((uptr)mVU->branch, gprT1);
}
}
microVUf(void) mVU_IBGEZ() {
microVU* mVU = mVUx;
mVUbranch = 2;
mVUbranch = 4;
if (!recPass) { mVUanalyzeBranch1<vuIndex>(_Fs_); }
else {
if (memReadIs) MOV32MtoR(gprT1, (uptr)mVU->VIbackup[0]);
else mVUallocVIa<vuIndex>(gprT1, _Fs_);
//SHR32ItoR(gprT1, 15);
MOV32RtoM((uptr)mVU->branch, gprT1);
}
}
microVUf(void) mVU_IBGTZ() {
microVU* mVU = mVUx;
mVUbranch = 2;
}
microVUf(void) mVU_IBLTZ() {
microVU* mVU = mVUx;
mVUbranch = 2;
mVUbranch = 5;
if (!recPass) { mVUanalyzeBranch1<vuIndex>(_Fs_); }
else {
if (memReadIs) MOV32MtoR(gprT1, (uptr)mVU->VIbackup[0]);
else mVUallocVIa<vuIndex>(gprT1, _Fs_);
MOV32RtoM((uptr)mVU->branch, gprT1);
}
}
microVUf(void) mVU_IBLEZ() {
microVU* mVU = mVUx;
mVUbranch = 2;
mVUbranch = 6;
if (!recPass) { mVUanalyzeBranch1<vuIndex>(_Fs_); }
else {
if (memReadIs) MOV32MtoR(gprT1, (uptr)mVU->VIbackup[0]);
else mVUallocVIa<vuIndex>(gprT1, _Fs_);
MOV32RtoM((uptr)mVU->branch, gprT1);
}
}
microVUf(void) mVU_IBLTZ() {
microVU* mVU = mVUx;
mVUbranch = 7;
if (!recPass) { mVUanalyzeBranch1<vuIndex>(_Fs_); }
else {
if (memReadIs) MOV32MtoR(gprT1, (uptr)mVU->VIbackup[0]);
else mVUallocVIa<vuIndex>(gprT1, _Fs_);
//SHR32ItoR(gprT1, 15);
MOV32RtoM((uptr)mVU->branch, gprT1);
}
}
microVUf(void) mVU_IBNE() {
microVU* mVU = mVUx;
mVUbranch = 2;
mVUbranch = 8;
if (!recPass) { mVUanalyzeBranch2<vuIndex>(_Fs_, _Ft_); }
else {
if (memReadIs) MOV32MtoR(gprT1, (uptr)mVU->VIbackup[0]);
else mVUallocVIa<vuIndex>(gprT1, _Fs_);
if (memReadIt) XOR32MtoR(gprT1, (uptr)mVU->VIbackup[0]);
else { mVUallocVIa<vuIndex>(gprT2, _Ft_); XOR32RtoR(gprT1, gprT2); }
MOV32RtoM((uptr)mVU->branch, gprT1);
}
}
microVUf(void) mVU_JR() {
microVU* mVU = mVUx;
mVUbranch = 3;
mVUbranch = 9;
if (!recPass) { mVUanalyzeBranch1<vuIndex>(_Fs_); }
}
microVUf(void) mVU_JALR() {
microVU* mVU = mVUx;
mVUbranch = 3;
mVUbranch = 10;
if (!recPass) { mVUanalyzeBranch1<vuIndex>(_Fs_); analyzeVIreg2(_Ft_, 1); }
}
#endif //PCSX2_MICROVU

View File

@ -66,9 +66,9 @@ declareAllVariables
//------------------------------------------------------------------
// Helper Macros
//------------------------------------------------------------------
#define _Ft_ ((mVU->code >> 16) & 0x1F) // The rt part of the instruction register
#define _Fs_ ((mVU->code >> 11) & 0x1F) // The rd part of the instruction register
#define _Fd_ ((mVU->code >> 6) & 0x1F) // The sa part of the instruction register
#define _Ft_ ((mVU->code >> 16) & 0x1F) // The ft/it part of the instruction register
#define _Fs_ ((mVU->code >> 11) & 0x1F) // The fs/is part of the instruction register
#define _Fd_ ((mVU->code >> 6) & 0x1F) // The fd/id part of the instruction register
#define _X ((mVU->code>>24) & 0x1)
#define _Y ((mVU->code>>23) & 0x1)
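The renamed comments use VU terminology: bits 16-20 hold ft/it, 11-15 fs/is, 6-10 fd/id, and bits 21-24 are the W/Z/Y/X dest-mask bits. A decode sketch of the same layout:
// Sketch: decoding the fields of a 32-bit VU instruction word.
static void vuDecode(u32 code) {
	u32 ft = (code >> 16) & 0x1F;  // ft (float) or it (integer) register
	u32 fs = (code >> 11) & 0x1F;  // fs / is
	u32 fd = (code >>  6) & 0x1F;  // fd / id
	u32 x  = (code >> 24) & 1;     // dest.x (y = bit 23, z = 22, w = 21)
	(void)ft; (void)fs; (void)fd; (void)x;
}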
@ -143,20 +143,25 @@ declareAllVariables
#define mVUallocInfo mVU->prog.prog[mVU->prog.cur].allocInfo
#define mVUbranch mVUallocInfo.branch
#define mVUcycles mVUallocInfo.cycles
#define mVUcount mVUallocInfo.count
#define mVUstall mVUallocInfo.maxStall
#define mVUdivFlag mVUallocInfo.divFlag
#define mVUdivFlagT mVUallocInfo.divFlagTimer
#define mVUregs mVUallocInfo.regs
#define mVUregsTemp mVUallocInfo.regsTemp
#define mVUinfo mVUallocInfo.info[mVUallocInfo.curPC / 2]
#define mVUstartPC mVUallocInfo.startPC
#define iPC mVUallocInfo.curPC
#define xPC ((iPC / 2) * 8)
#define incCycles(x) { mVUcycles += x; }
#define curI mVUcurProg.data[iPC]
#define setCode() { mVU->code = curI; }
#define incPC(x) { iPC = ((iPC + x) & (mVU->progSize-1)); setCode(); }
#define incPC2(x) { iPC = ((iPC + x) & (mVU->progSize-1)); }
#define incCycles(x) { mVUincCycles<vuIndex>(x); }
#define _isNOP (1<<0) // Skip Lower Instruction
#define _isBranch (1<<1) // Cur Instruction is a Branch
#define _isEOB (1<<2) // End of Block
#define _isBdelay (1<<3) // Cur Instruction in Branch Delay slot
#define _isSflag (1<<4) // Cur Instruction uses status flag
#define _writeQ (1<<5)
#define _readQ (1<<6)
#define _writeP (1<<7)
@ -166,17 +171,25 @@ declareAllVariables
#define _doStatus (1<<9)
#define _fmInstance (3<<10)
#define _fsInstance (3<<12)
#define _fcInstance (3<<14)
#define _fpmInstance (3<<10)
#define _fpsInstance (3<<12)
#define _fcInstance (3<<14)
#define _fpcInstance (3<<14)
#define _fvmInstance (3<<16)
#define _fvsInstance (3<<18)
#define _fvcInstance (3<<14)
#define _fvcInstance (3<<20)
#define _noWriteVF (1<<21) // Don't write back the result of a lower op to VF reg if upper op writes to same reg (or if VF = 0)
#define _backupVI (1<<22) // Backup VI reg to memory if modified before branch (branch uses old VI value unless opcode is ILW or ILWR)
#define _memReadIs (1<<23) // Read Is (VI reg) from memory (used by branches)
#define _memReadIt (1<<24) // Read It (VI reg) from memory (used by branches)
#define _writesVI (1<<25) // Current Instruction writes to VI
#define _swapOps (1<<26) // Runs Lower Instruction Before Upper Instruction
//#define _isBranch2 (1<<27) // Cur Instruction is a Branch that writes VI regs (BAL/JALR)
#define isNOP (mVUinfo & (1<<0))
#define isBranch (mVUinfo & (1<<1))
#define isEOB (mVUinfo & (1<<2))
#define isBdelay (mVUinfo & (1<<3))
#define isSflag (mVUinfo & (1<<4))
#define writeQ ((mVUinfo >> 5) & 1)
#define readQ ((mVUinfo >> 6) & 1)
#define writeP ((mVUinfo >> 7) & 1)
@ -192,11 +205,32 @@ declareAllVariables
#define fvmInstance ((mVUinfo >> 16) & 3)
#define fvsInstance ((mVUinfo >> 18) & 3)
#define fvcInstance ((mVUinfo >> 20) & 3)
//#define getFs (mVUinfo & (1<<13))
//#define getFt (mVUinfo & (1<<14))
//#define fpmInstance (((u8)((mVUinfo & (3<<10)) >> 10) - 1) & 0x3)
#define noWriteVF (mVUinfo & (1<<21))
#define backupVI (mVUinfo & (1<<22))
#define memReadIs (mVUinfo & (1<<23))
#define memReadIt (mVUinfo & (1<<24))
#define writesVI (mVUinfo & (1<<25))
#define swapOps (mVUinfo & (1<<26))
//#define isBranch2 (mVUinfo & (1<<27))
#define isMMX(_VIreg_) (_VIreg_ >= 1 && _VIreg_ <=9)
#define mmVI(_VIreg_) (_VIreg_ - 1)
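All of the flags above share one 32-bit mVUinfo word per instruction: single-bit flags are OR'd in during the analysis pass and masked back out in the recompile pass, while the 2-bit instance fields are read with a shift-and-mask. The pattern, sketched:
// Sketch of the packing pattern (using _fsInstance at bits 12-13 as the example).
static u32 getFsInst(u32 info)        { return (info >> 12) & 3; }
static u32 setFsInst(u32 info, u32 n) { return (info & ~(3u << 12)) | ((n & 3) << 12); }
// single-bit flags: set with info |= _isNOP; test with (info & _isNOP).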
#ifdef mVUdebug
#define mVUlog Console::Notice
#define mVUdebug1() { \
if (curI & _Ibit_) { SysPrintf("microVU: I-bit set!\n"); } \
if (curI & _Ebit_) { SysPrintf("microVU: E-bit set!\n"); } \
if (curI & _Mbit_) { SysPrintf("microVU: M-bit set!\n"); } \
if (curI & _Dbit_) { SysPrintf("microVU: D-bit set!\n"); } \
if (curI & _Tbit_) { SysPrintf("microVU: T-bit set!\n"); } \
}
#else
#define mVUlog 0&&
#define mVUdebug1() {}
#endif
#define mVUcachCheck(start, limit) { \
uptr diff = mVU->ptr - start; \
if (diff >= limit) { Console::Error("microVU Error: Program went over its cache limit. Size = %x", params diff); } \
}

View File

@ -85,11 +85,11 @@ microVUx(void) mVUloadReg(int reg, uptr offset, int xyzw) {
microVUx(void) mVUloadReg2(int reg, int gprReg, uptr offset, int xyzw) {
switch( xyzw ) {
case 8: SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset); break; // X
case 4: SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset+4); break; // Y
case 2: SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset+8); break; // Z
case 1: SSE_MOVSS_RmOffset_to_XMM(reg, gprReg, offset+12); break; // W
default: SSE_MOVAPSRmtoROffset(reg, gprReg, offset); break;
case 8: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset); break; // X
case 4: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset+4); break; // Y
case 2: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset+8); break; // Z
case 1: SSE_MOVSS_Rm_to_XMM(reg, gprReg, offset+12); break; // W
default: SSE_MOVAPSRmtoR(reg, gprReg, offset); break;
}
}
@ -142,44 +142,44 @@ microVUx(void) mVUsaveReg2(int reg, int gprReg, u32 offset, int xyzw) {
switch ( xyzw ) {
case 5: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xB1);
SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
SSE_MOVSS_XMM_to_RmOffset(gprReg, offset+4, reg);
SSE_MOVSS_XMM_to_RmOffset(gprReg, offset+12, xmmT1);
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+4);
SSE_MOVSS_XMM_to_Rm(gprReg, xmmT1, offset+12);
break; // YW
case 6: SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0xc9);
SSE_MOVLPS_XMM_to_RmOffset(gprReg, offset+4, xmmT1);
SSE_MOVLPS_XMM_to_Rm(gprReg, xmmT1, offset+4);
break; // YZ
case 7: SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x93); //ZYXW
SSE_MOVHPS_XMM_to_RmOffset(gprReg, offset+4, xmmT1);
SSE_MOVSS_XMM_to_RmOffset(gprReg, offset+12, xmmT1);
SSE_MOVHPS_XMM_to_Rm(gprReg, xmmT1, offset+4);
SSE_MOVSS_XMM_to_Rm(gprReg, xmmT1, offset+12);
break; // YZW
case 9: SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
SSE_MOVSS_XMM_to_RmOffset(gprReg, offset, reg);
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset);
if ( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSLDUP_XMM_to_XMM(xmmT1, xmmT1);
else SSE2_PSHUFD_XMM_to_XMM(xmmT1, xmmT1, 0x55);
SSE_MOVSS_XMM_to_RmOffset(gprReg, offset+12, xmmT1);
SSE_MOVSS_XMM_to_Rm(gprReg, xmmT1, offset+12);
break; // XW
case 10: SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
SSE_MOVSS_XMM_to_RmOffset(gprReg, offset, reg);
SSE_MOVSS_XMM_to_RmOffset(gprReg, offset+8, xmmT1);
SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset);
SSE_MOVSS_XMM_to_Rm(gprReg, xmmT1, offset+8);
break; //XZ
case 11: SSE_MOVSS_XMM_to_RmOffset(gprReg, offset, reg);
SSE_MOVHPS_XMM_to_RmOffset(gprReg, offset+8, reg);
case 11: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset);
SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset+8);
break; //XZW
case 13: SSE2_PSHUFD_XMM_to_XMM(xmmT1, reg, 0x4b); //YXZW
SSE_MOVHPS_XMM_to_RmOffset(gprReg, offset, xmmT1);
SSE_MOVSS_XMM_to_RmOffset(gprReg, offset+12, xmmT1);
SSE_MOVHPS_XMM_to_Rm(gprReg, xmmT1, offset);
SSE_MOVSS_XMM_to_Rm(gprReg, xmmT1, offset+12);
break; // XYW
case 14: SSE_MOVHLPS_XMM_to_XMM(xmmT1, reg);
SSE_MOVLPS_XMM_to_RmOffset(gprReg, offset, reg);
SSE_MOVSS_XMM_to_RmOffset(gprReg, offset+8, xmmT1);
SSE_MOVLPS_XMM_to_Rm(gprReg, reg, offset);
SSE_MOVSS_XMM_to_Rm(gprReg, xmmT1, offset+8);
break; // XYZ
case 8: SSE_MOVSS_XMM_to_RmOffset(gprReg, offset, reg); break; // X
case 4: SSE_MOVSS_XMM_to_RmOffset(gprReg, offset+4, reg); break; // Y
case 2: SSE_MOVSS_XMM_to_RmOffset(gprReg, offset+8, reg); break; // Z
case 1: SSE_MOVSS_XMM_to_RmOffset(gprReg, offset+12, reg); break; // W
case 12: SSE_MOVLPS_XMM_to_RmOffset(gprReg, offset, reg); break; // XY
case 3: SSE_MOVHPS_XMM_to_RmOffset(gprReg, offset+8, reg); break; // ZW
default: SSE_MOVAPSRtoRmOffset(gprReg, offset, reg); break; // XYZW
case 8: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset); break; // X
case 4: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+4); break; // Y
case 2: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+8); break; // Z
case 1: SSE_MOVSS_XMM_to_Rm(gprReg, reg, offset+12); break; // W
case 12: SSE_MOVLPS_XMM_to_Rm(gprReg, reg, offset); break; // XY
case 3: SSE_MOVHPS_XMM_to_Rm(gprReg, reg, offset+8); break; // ZW
default: SSE_MOVAPSRtoRm(gprReg, reg, offset); break; // XYZW
}
}
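All of these cases key off the same xyzw mask (X=8, Y=4, Z=2, W=1), so e.g. case 12 stores XY with a single MOVLPS and case 3 stores ZW with a single MOVHPS, while the shuffle-heavy cases cover non-contiguous field sets. A trivial decode sketch:
// Sketch: expand an xyzw mask into per-component enables.
static void decodeXYZW(int xyzw, bool on[4]) {
	for (int f = 0; f < 4; f++)
		on[f] = (xyzw >> (3 - f)) & 1;  // on[0]=X ... on[3]=W
}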
@ -251,7 +251,7 @@ microVUt(void) mVUaddrFix(int gprReg) {
u8 *jmpA, *jmpB;
CMP32ItoR(EAX, 0x400);
jmpA = JL8(0); // if addr >= 0x4000, reads VU1's VF regs and VI regs
AND32ItoR(EAX, 0x43f);
AND32ItoR(EAX, 0x43f); // ToDo: there's a potential problem if VU0 overrides VU1's VF0/VI0 regs!
jmpB = JMP8(0);
x86SetJ8(jmpA);
AND32ItoR(EAX, 0xff); // if addr < 0x4000, wrap around
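In C, the VU0 branch of mVUaddrFix above amounts to (a sketch; addr is a quadword index):
static u32 vu0AddrFix(u32 addr) {
	if (addr >= 0x400) return addr & 0x43f;  // 0x4000+ bytes: maps to VU1's VF/VI register space
	return addr & 0xff;                      // otherwise wrap inside VU0's 4KB data RAM
}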

View File

@ -26,7 +26,7 @@
#pragma once
#define PLUGIN_VERSION 14
#define PLUGIN_VERSION 15
#include "GSVector.h"

View File

@ -349,8 +349,6 @@ protected:
OverrideOutput();
m_tc->InvalidateTextures(context->FRAME, context->ZBUF);
if(s_dump)
{
CString str;
@ -360,6 +358,8 @@ protected:
if(s_savez) ds->m_texture.Save(str);
// if(s_savez) m_dev.SaveToFileD32S8X24(ds->m_texture, str); // TODO
}
m_tc->InvalidateTextures(context->FRAME, context->ZBUF);
}
virtual void Draw(int prim, Texture& rt, Texture& ds, typename GSTextureCache<Device>::GSTexture* tex) = 0;
@ -507,6 +507,35 @@ protected:
#pragma endregion
#pragma region GoW2 z buffer clear
if(m_game.title == CRC::GodOfWar2)
{
DWORD FBP = m_context->FRAME.Block();
DWORD FBW = m_context->FRAME.FBW;
DWORD FPSM = m_context->FRAME.PSM;
if((FBP == 0x00f00 || FBP == 0x00100) && FPSM == PSM_PSMZ24) // ntsc 0xf00, pal 0x100
{
GIFRegTEX0 TEX0;
TEX0.TBP0 = FBP;
TEX0.TBW = FBW;
TEX0.PSM = FPSM;
if(GSTextureCache<Device>::GSDepthStencil* ds = m_tc->GetDepthStencil(TEX0, m_width, m_height))
{
m_dev.ClearDepth(ds->m_texture, 0);
}
return false;
}
return true;
}
#pragma endregion
return true;
}

View File

@ -2081,6 +2081,27 @@ bool GSC_GodOfWar(const GSFrameInfo& fi, int& skip)
return true;
}
bool GSC_GodOfWar2(const GSFrameInfo& fi, int& skip)
{
if(skip == 0)
{
if(fi.TME && fi.FBP == 0x00100 && fi.FPSM == PSM_PSMCT16 && fi.TBP0 == 0x00100 && fi.TPSM == PSM_PSMCT16 // ntsc
|| fi.TME && fi.FBP == 0x02100 && fi.FPSM == PSM_PSMCT16 && fi.TBP0 == 0x02100 && fi.TPSM == PSM_PSMCT16) // pal
{
skip = 30; // shadows
}
else if(fi.TME && fi.FBP == 0x00500 && fi.FPSM == PSM_PSMCT24 && fi.TBP0 == 0x02100 && fi.TPSM == PSM_PSMCT32) // pal
{
// skip = 17; // only looks correct at native resolution
}
}
else
{
}
return true;
}
bool GSC_GiTS(const GSFrameInfo& fi, int& skip)
{
if(skip == 0)
@ -2172,7 +2193,7 @@ bool GSState::IsBadFrame(int& skip)
map[CRC::Tekken5] = GSC_Tekken5;
map[CRC::IkkiTousen] = GSC_IkkiTousen;
map[CRC::GodOfWar] = GSC_GodOfWar;
map[CRC::GodOfWar2] = GSC_GodOfWar;
map[CRC::GodOfWar2] = GSC_GodOfWar2;
map[CRC::GiTS] = GSC_GiTS;
map[CRC::Onimusha3] = GSC_Onimusha3;
map[CRC::TalesOfAbyss] = GSC_TalesOfAbyss;

View File

@ -140,7 +140,7 @@ void GSTexture9::Unmap()
bool GSTexture9::Save(CString fn, bool dds)
{
CComPtr<IDirect3DResource9> res;
CComPtr<IDirect3DSurface9> surface;
if(m_desc.Usage & D3DUSAGE_DEPTHSTENCIL)
{
@ -153,8 +153,6 @@ bool GSTexture9::Save(CString fn, bool dds)
if(desc.Format != D3DFMT_D32F_LOCKABLE)
return false;
CComPtr<IDirect3DSurface9> surface;
hr = m_dev->CreateOffscreenPlainSurface(desc.Width, desc.Height, D3DFMT_A8R8G8B8, D3DPOOL_SYSTEMMEM, &surface, NULL);
D3DLOCKED_RECT slr, dlr;
@ -175,24 +173,22 @@ bool GSTexture9::Save(CString fn, bool dds)
m_surface->UnlockRect();
surface->UnlockRect();
res = surface;
}
else
{
res = m_surface;
surface = m_surface;
}
if(CComQIPtr<IDirect3DSurface9> surface = res)
if(surface != NULL)
{
return SUCCEEDED(D3DXSaveSurfaceToFile(fn, dds ? D3DXIFF_DDS : D3DXIFF_BMP, surface, NULL, NULL));
}
if(CComQIPtr<IDirect3DTexture9> texture = res)
/*
if(CComQIPtr<IDirect3DTexture9> texture = surface)
{
return SUCCEEDED(D3DXSaveTextureToFile(fn, dds ? D3DXIFF_DDS : D3DXIFF_BMP, texture, NULL));
}
*/
return false;
}

View File

@ -1404,18 +1404,59 @@ INT_PTR CALLBACK DialogProc(HWND hWnd, unsigned int msg, WPARAM wParam, LPARAM l
if (i >= 0) {
unsigned int index = (unsigned int)SendMessage(GetDlgItem(hWnd, IDC_FORCEFEEDBACK), CB_GETITEMDATA, i, 0);
if (index < (unsigned int) dm->numDevices) {
Device *dev = dm->devices[index];
ForceFeedbackBinding *b;
int count = CreateEffectBinding(dm->devices[index], 0, port, slot, cmd-ID_BIG_MOTOR, &b);
int count = CreateEffectBinding(dev, 0, port, slot, cmd-ID_BIG_MOTOR, &b);
if (b) {
for (int j=0; j<2 && j <dm->devices[index]->numFFAxes; j++) {
int needSet = 1;
if (dev->api == XINPUT && dev->numFFAxes == 2) {
needSet = 0;
if (cmd == ID_BIG_MOTOR) {
b->axes[0].force = BASE_SENSITIVITY;
}
else {
b->axes[1].force = BASE_SENSITIVITY;
}
}
else if (dev->api == DI) {
int bigIndex=0, littleIndex=0;
int constantEffect = 0, squareEffect = 0;
int j;
for (j=0; j<dev->numFFAxes; j++) {
// DI object instance. 0 is x-axis, 1 is y-axis.
int instance = (dev->ffAxes[j].id>>8)&0xFFFF;
if (instance == 0) {
bigIndex = j;
}
else if (instance == 1) {
littleIndex = j;
}
}
for (j=0; j<dev->numFFEffectTypes; j++) {
if (!wcsicmp(L"13541C20-8E33-11D0-9AD0-00A0C9A06E35", dev->ffEffectTypes[j].effectID)) constantEffect = j;
if (!wcsicmp(L"13541C22-8E33-11D0-9AD0-00A0C9A06E35", dev->ffEffectTypes[j].effectID)) squareEffect = j;
}
needSet = 0;
if (cmd == ID_BIG_MOTOR) {
b->axes[bigIndex].force = BASE_SENSITIVITY;
b->axes[littleIndex].force = 1;
b->effectIndex = constantEffect;
}
else {
b->axes[bigIndex].force = 1;
b->axes[littleIndex].force = BASE_SENSITIVITY;
b->effectIndex = squareEffect;
}
}
if (needSet) {
for (int j=0; j<2 && j <dev->numFFAxes; j++) {
b->axes[j].force = BASE_SENSITIVITY;
}
}
if (count >= 0) {
PropSheet_Changed(hWndProp, hWnd);
UnselectAll(hWndList);
ListView_SetItemState(hWndList, count, LVIS_SELECTED, LVIS_SELECTED);
}
PropSheet_Changed(hWndProp, hWnd);
}
}
}
@ -1867,7 +1908,7 @@ INT_PTR CALLBACK GeneralDialogProc(HWND hWnd, unsigned int msg, WPARAM wParam, L
InsertMenuItemW(hMenu, index, 1, &info);
}
else {
info.wID = port2+2*slot2;
info.wID = port2+2*slot2+1;
wsprintfW(text, L"Swap with %s", pad);
InsertMenuItemW(hMenu, 0, 1, &info);
}
@ -1879,12 +1920,14 @@ INT_PTR CALLBACK GeneralDialogProc(HWND hWnd, unsigned int msg, WPARAM wParam, L
DestroyMenu(hMenu);
if (!res) break;
if (res > 0) {
res--;
slot2 = res / 2;
port2 = res&1;
PadConfig padCfgTemp = config.padConfigs[port1][slot1];
config.padConfigs[port1][slot1] = config.padConfigs[port2][slot2];
config.padConfigs[port2][slot2] = padCfgTemp;
for (int i=0; i<dm->numDevices; i++) {
if (dm->devices[i]->type == IGNORE) continue;
PadBindings bindings = dm->devices[i]->pads[port1][slot1];
dm->devices[i]->pads[port1][slot1] = dm->devices[i]->pads[port2][slot2];
dm->devices[i]->pads[port2][slot2] = bindings;
@ -1892,6 +1935,7 @@ INT_PTR CALLBACK GeneralDialogProc(HWND hWnd, unsigned int msg, WPARAM wParam, L
}
else {
for (int i=0; i<dm->numDevices; i++) {
if (dm->devices[i]->type == IGNORE) continue;
free(dm->devices[i]->pads[port1][slot1].bindings);
for (int j=0; j<dm->devices[i]->pads[port1][slot1].numFFBindings; j++) {
free(dm->devices[i]->pads[port1][slot1].ffBindings[j].axes);

View File

@ -1,13 +1,16 @@
// This is undoubtedly completely unnecessary.
#include "KeyboardQueue.h"
static int numQueuedEvents = 0;
static keyEvent queuedEvents[20];
// What MS calls a single process Mutex. Faster, supposedly.
// More importantly, can be abbreviated, amusingly, as cSection.
static CRITICAL_SECTION cSection;
static int csInitialized = 0;
static u8 csInitialized = 0;
#define EVENT_QUEUE_LEN 16
// Actually points one beyond the last queued event.
static u8 lastQueuedEvent = 0;
static u8 nextQueuedEvent = 0;
static keyEvent queuedEvents[EVENT_QUEUE_LEN];
void QueueKeyEvent(int key, int event) {
if (!csInitialized) {
@ -15,50 +18,42 @@ void QueueKeyEvent(int key, int event) {
InitializeCriticalSection(&cSection);
}
EnterCriticalSection(&cSection);
if (numQueuedEvents >= 15) {
// Generally shouldn't happen.
for (int i=0; i<15; i++) {
queuedEvents[i] = queuedEvents[i+5];
}
numQueuedEvents = 15;
}
int index = numQueuedEvents;
// Move escape to top of queue. May do something
// Don't queue events if escape is on top of queue. This is just for safety
// purposes when a game is killing the emulator for whatever reason.
if (nextQueuedEvent == lastQueuedEvent ||
queuedEvents[nextQueuedEvent].key != VK_ESCAPE ||
queuedEvents[nextQueuedEvent].evt != KEYPRESS) {
// Clear queue on escape down, bringing escape to front. May do something
// with shift/ctrl/alt and F-keys, later.
if (event == KEYPRESS && key == VK_ESCAPE) {
while (index) {
queuedEvents[index-1] = queuedEvents[index];
index--;
nextQueuedEvent = lastQueuedEvent;
}
queuedEvents[lastQueuedEvent].key = key;
queuedEvents[lastQueuedEvent].evt = event;
lastQueuedEvent = (lastQueuedEvent + 1) % EVENT_QUEUE_LEN;
// If queue wrapped around, remove last element.
if (nextQueuedEvent == lastQueuedEvent) {
nextQueuedEvent = (nextQueuedEvent + 1) % EVENT_QUEUE_LEN;
}
}
queuedEvents[index].key = key;
queuedEvents[index].evt = event;
numQueuedEvents ++;
LeaveCriticalSection(&cSection);
}
int GetQueuedKeyEvent(keyEvent *event) {
int out = 0;
if (numQueuedEvents) {
if (lastQueuedEvent == nextQueuedEvent) return 0;
EnterCriticalSection(&cSection);
// Shouldn't be 0, but just in case...
if (numQueuedEvents) {
*event = queuedEvents[0];
numQueuedEvents--;
out = 1;
for (int i=0; i<numQueuedEvents; i++) {
queuedEvents[i] = queuedEvents[i+1];
}
}
*event = queuedEvents[nextQueuedEvent];
nextQueuedEvent = (nextQueuedEvent + 1) % EVENT_QUEUE_LEN;
LeaveCriticalSection(&cSection);
}
return out;
return 1;
}
void ClearKeyQueue() {
if (numQueuedEvents) {
numQueuedEvents = 0;
}
lastQueuedEvent = nextQueuedEvent;
if (csInitialized) {
DeleteCriticalSection(&cSection);
csInitialized = 0;

View File

@ -23,7 +23,7 @@
#endif
// LilyPad version.
#define VERSION ((0<<8) | 9 | (11<<24))
#define VERSION ((0<<8) | 10 | (0<<24))
// Used to prevent reading input and cleaning up input devices at the same time.
// Only an issue when not reading input in GS thread and disabling devices due to
@ -115,7 +115,7 @@ struct ButtonSum {
Stick sticks[3];
};
// Freeze data, for a single pad.
struct PadFreezeData {
// Digital / Analog / DS2 Native
u8 mode;
@ -168,6 +168,12 @@ u8 Cap (int i) {
return (u8) i;
}
inline void ReleaseModifierKeys() {
QueueKeyEvent(VK_SHIFT, KEYRELEASE);
QueueKeyEvent(VK_MENU, KEYRELEASE);
QueueKeyEvent(VK_CONTROL, KEYRELEASE);
}
// RefreshEnabledDevices() enables everything that can potentially
// be bound to, as well as the "Ignore keyboard" device.
//
@ -677,9 +683,7 @@ s32 CALLBACK PADinit(u32 flags) {
query.numBytes = 0;
ClearKeyQueue();
// Just in case, when resuming emulation.
QueueKeyEvent(VK_SHIFT, KEYRELEASE);
QueueKeyEvent(VK_MENU, KEYRELEASE);
QueueKeyEvent(VK_CONTROL, KEYRELEASE);
ReleaseModifierKeys();
return 0;
}
@ -756,9 +760,7 @@ ExtraWndProcResult HackWndProc(HWND hWnd, UINT uMsg, WPARAM wParam, LPARAM lPara
case WM_ACTIVATEAPP:
// Release any buttons PCSX2 may think are down when
// losing/gaining focus.
QueueKeyEvent(VK_SHIFT, KEYRELEASE);
QueueKeyEvent(VK_MENU, KEYRELEASE);
QueueKeyEvent(VK_CONTROL, KEYRELEASE);
ReleaseModifierKeys();
// Need to do this when not reading input from gs thread.
// Checking for that case not worth the effort.
@ -1227,12 +1229,14 @@ DWORD WINAPI RenameWindowThreadProc(void *lpParameter) {
}
keyEvent* CALLBACK PADkeyEvent() {
// If running both pads, ignore every other call. So if two keys are pressed in the same interval...
static char eventCount = 0;
eventCount++;
if (eventCount < openCount) {
return 0;
}
eventCount = 0;
if (!config.GSThreadUpdates) {
Update(2, 0);
}
@ -1327,7 +1331,7 @@ s32 CALLBACK PADfreeze(int mode, freezeData *data) {
break;
}
// Note sure if the cast is strictly necessary, but feel safest with it there...
// Not sure if the cast is strictly necessary, but feel safest with it there...
*(PadFreezeData*)&pads[port][slot] = pdata.padData[slot];
}
if (pdata.slot < 4)

View File

@ -150,11 +150,10 @@ public:
}
void Deactivate() {
if (xInputVibration.wLeftMotorSpeed || xInputVibration.wRightMotorSpeed) {
memset(&xInputVibration, 0, sizeof(xInputVibration));
pXInputSetState(index, &xInputVibration);
}
memset(ps2Vibration, 0, sizeof(ps2Vibration));
pXInputSetState(index, &xInputVibration);
FreeState();
if (active) {
if (!--xInputActiveCount) {