pcsx2: Implemented Threaded VU1 :D

Threading VU1 took a lot of rewrites and new code to make possible (MTGS, microVU, gifUnit...), but we finally got to the point where it was feasible, and now we've done it! (so now everyone can stop complaining that pcsx2 only takes advantage of 2 cores :p).

The speedups in the games that benefit from it are great if you have a CPU with 3+ cores (generally a 10~45% speedup); however, games that are GS-limited can see a slowdown (especially on dual-core CPUs).

The option can be found in the speedhacks section as "MTVU (Multi-Threaded microVU1)". And when enabled it should show the VU thread-time percentage on the title bar window (like we currently do for the EE/GS/UI threads).

It is listed as a speedhack because in order for threading VU1 to be a speedup, we need to assume that games will not send gif packets containing Signal/Finish/Label commands from path 1 (vu1's xgkick). The good news is that very few games ever do this, so the compatibility of MTVU is very high (a game that does do this will likely hang).

Note: vs2010 builds and Linux builds need to be updated to include "MTVU.h" and "MTVU.cpp".


git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4865 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
cottonvibes 2011-08-12 02:31:49 +00:00
parent 60cec5a9b0
commit ac9bf45f98
60 changed files with 1186 additions and 434 deletions

View File

@ -342,8 +342,8 @@ protected:
struct _EXCEPTION_POINTERS;
extern int SysPageFaultExceptionFilter(struct _EXCEPTION_POINTERS* eps);
# define PCSX2_PAGEFAULT_PROTECT __try
# define PCSX2_PAGEFAULT_EXCEPT __except(SysPageFaultExceptionFilter(GetExceptionInformation())) {}
# define PCSX2_PAGEFAULT_PROTECT __try
# define PCSX2_PAGEFAULT_EXCEPT __except(SysPageFaultExceptionFilter(GetExceptionInformation())) {}
#else
# error PCSX2 - Unsupported operating system platform.
@ -352,5 +352,7 @@ extern int SysPageFaultExceptionFilter(struct _EXCEPTION_POINTERS* eps);
extern void pxInstallSignalHandler();
extern void _platform_InstallSignalHandler();
#include "Threading.h"
extern SrcType_PageFault* Source_PageFault;
extern Threading::Mutex PageFault_Mutex;

View File

@ -179,17 +179,20 @@ namespace Threading
// from these little beasties! (these are all implemented internally using cross-platform
// implementations of _InterlockedExchange and such)
extern u32 AtomicRead( volatile u32& Target );
extern s32 AtomicRead( volatile s32& Target );
extern u32 AtomicExchange( volatile u32& Target, u32 value );
extern u32 AtomicExchangeAdd( volatile u32& Target, u32 value );
extern u32 AtomicIncrement( volatile u32& Target );
extern u32 AtomicDecrement( volatile u32& Target );
extern s32 AtomicExchange( volatile s32& Target, s32 value );
extern u32 AtomicExchangeAdd( volatile u32& Target, u32 value );
extern s32 AtomicExchangeAdd( volatile s32& Target, s32 value );
extern s32 AtomicExchangeSub( volatile s32& Target, s32 value );
extern u32 AtomicIncrement( volatile u32& Target );
extern s32 AtomicIncrement( volatile s32& Target );
extern u32 AtomicDecrement( volatile u32& Target );
extern s32 AtomicDecrement( volatile s32& Target );
extern bool AtomicBitTestAndReset( volatile u32& bitset, u8 bit );
extern bool AtomicBitTestAndReset( volatile s32& bitset, u8 bit );
extern void* _AtomicExchangePointer( volatile uptr& target, uptr value );
extern void* _AtomicCompareExchangePointer( volatile uptr& target, uptr value, uptr comparand );
@ -393,5 +396,34 @@ namespace Threading
bool Failed() const { return !m_IsLocked; }
};
// --------------------------------------------------------------------------------------
// ScopedLockBool
// --------------------------------------------------------------------------------------
// A ScopedLock in which you specify an external bool to get updated on locks/unlocks.
// Note that the isLockedBool should only be used as an indicator for the locked status,
// and not actually depended on for thread synchronization...
// RAII helper that couples a ScopedLock with an externally supplied bool flag
// which mirrors the lock's held/released state.  The flag is only an
// *indicator* of the locked status and must not be relied upon for actual
// thread synchronization.
struct ScopedLockBool
{
	ScopedLock m_lock;                   // underlying scoped mutex lock
	volatile __aligned(4) bool& m_bool;  // external flag mirroring the lock state

	ScopedLockBool(Mutex& mutexToLock, volatile __aligned(4) bool& isLockedBool)
		: m_lock(mutexToLock)
		, m_bool(isLockedBool)
	{
		m_bool = m_lock.IsLocked();
	}

	virtual ~ScopedLockBool() throw()
	{
		m_bool = false;
	}

	// Re-acquires the mutex, then raises the indicator flag.
	void Acquire()
	{
		m_lock.Acquire();
		m_bool = m_lock.IsLocked();
	}

	// Lowers the indicator flag, then releases the mutex.
	void Release()
	{
		m_bool = false;
		m_lock.Release();
	}
};
}

View File

@ -35,10 +35,12 @@ enum XMMSSEType
// as a project option. The multithreaded emitter relies on native compiler support for
// TLS -- Macs are crap out of luck there (for now).
#include "Utilities/Threading.h"
#ifndef x86EMIT_MULTITHREADED
# define x86EMIT_MULTITHREADED 0
#else
# if !PCSX2_THREAD_LOCAL
# if PCSX2_THREAD_LOCAL
# define x86EMIT_MULTITHREADED 1
# else
// No TLS support? Force-clear the MT flag:
# pragma message("x86emitter: TLS not available, multithreaded emitter disabled.")
# undef x86EMIT_MULTITHREADED

View File

@ -46,6 +46,12 @@ static void SysPageFaultSignalFilter( int signal, siginfo_t *siginfo, void * )
// Note: Use of stdio functions isn't safe here. Avoid console logs,
// assertions, file logs, or just about anything else useful.
// Note: This signal can be accessed by the EE or MTVU thread
// Source_PageFault is a global variable with its own state information
// so for now we lock this exception code unless someone can fix this better...
Threading::ScopedLock lock(PageFault_Mutex);
Source_PageFault->Dispatch( PageFaultInfo( (uptr)siginfo->si_addr & ~m_pagemask ) );
// resumes execution right where we left off (re-executes instruction that

View File

@ -786,72 +786,70 @@ void Threading::WaitEvent::Wait()
// InterlockedExchanges / AtomicExchanges (PCSX2's Helper versions)
// --------------------------------------------------------------------------------------
// define some overloads for InterlockedExchanges for commonly used types, like u32 and s32.
// Note: For all of these atomic operations below to be atomic, the variables need to be 4-byte
// aligned. Read: http://msdn.microsoft.com/en-us/library/ms684122%28v=vs.85%29.aspx
__fi bool Threading::AtomicBitTestAndReset( volatile u32& bitset, u8 bit )
{
// Plain atomic load: relies on naturally-aligned 32-bit reads being atomic
// (see the 4-byte alignment note at the top of this section).
__fi u32 Threading::AtomicRead(volatile u32& Target) {
return Target; // Properly-aligned 32-bit reads are atomic
}
// Signed variant of the atomic load above.
__fi s32 Threading::AtomicRead(volatile s32& Target) {
return Target; // Properly-aligned 32-bit reads are atomic
}
// Atomically clears 'bit' in 'bitset'; returns true if the bit was previously set.
__fi bool Threading::AtomicBitTestAndReset( volatile u32& bitset, u8 bit ) {
return _interlockedbittestandreset( (volatile long*)& bitset, bit ) != 0;
}
// Signed variant of the atomic bit-test-and-reset above.
__fi bool Threading::AtomicBitTestAndReset( volatile s32& bitset, u8 bit ) {
return _interlockedbittestandreset( (volatile long*)& bitset, bit ) != 0;
}
__fi u32 Threading::AtomicExchange( volatile u32& Target, u32 value )
{
// Atomically stores 'value' into 'Target', returning the previous value.
__fi u32 Threading::AtomicExchange(volatile u32& Target, u32 value ) {
return _InterlockedExchange( (volatile long*)&Target, value );
}
// Signed variant of the atomic exchange above.
__fi s32 Threading::AtomicExchange( volatile s32& Target, s32 value ) {
return _InterlockedExchange( (volatile long*)&Target, value );
}
__fi u32 Threading::AtomicExchangeAdd( volatile u32& Target, u32 value )
{
// Atomically adds 'value' to 'Target', returning the value prior to the add.
__fi u32 Threading::AtomicExchangeAdd( volatile u32& Target, u32 value ) {
return _InterlockedExchangeAdd( (volatile long*)&Target, value );
}
// Signed variant of the atomic exchange-add above.
__fi s32 Threading::AtomicExchangeAdd( volatile s32& Target, s32 value ) {
return _InterlockedExchangeAdd( (volatile long*)&Target, value );
}
__fi u32 Threading::AtomicIncrement( volatile u32& Target )
{
return _InterlockedExchangeAdd( (volatile long*)&Target, 1 );
}
__fi u32 Threading::AtomicDecrement( volatile u32& Target )
{
return _InterlockedExchangeAdd( (volatile long*)&Target, -1 );
}
__fi s32 Threading::AtomicExchange( volatile s32& Target, s32 value )
{
return _InterlockedExchange( (volatile long*)&Target, value );
}
__fi s32 Threading::AtomicExchangeAdd( volatile s32& Target, s32 value )
{
return _InterlockedExchangeAdd( (volatile long*)&Target, value );
}
__fi s32 Threading::AtomicExchangeSub( volatile s32& Target, s32 value )
{
// Atomically subtracts 'value' from 'Target' (implemented as an add of the
// negation), returning the value prior to the subtraction.
__fi s32 Threading::AtomicExchangeSub( volatile s32& Target, s32 value ) {
return _InterlockedExchangeAdd( (volatile long*)&Target, -value );
}
__fi s32 Threading::AtomicIncrement( volatile s32& Target )
{
// Atomically increments 'Target', returning the pre-increment value.
__fi u32 Threading::AtomicIncrement( volatile u32& Target ) {
return _InterlockedExchangeAdd( (volatile long*)&Target, 1 );
}
// Signed variant of the atomic increment above.
__fi s32 Threading::AtomicIncrement( volatile s32& Target) {
return _InterlockedExchangeAdd( (volatile long*)&Target, 1 );
}
__fi s32 Threading::AtomicDecrement( volatile s32& Target )
{
// Atomically decrements 'Target', returning the pre-decrement value.
__fi u32 Threading::AtomicDecrement( volatile u32& Target ) {
return _InterlockedExchangeAdd( (volatile long*)&Target, -1 );
}
// Signed variant of the atomic decrement above.
__fi s32 Threading::AtomicDecrement(volatile s32& Target) {
return _InterlockedExchangeAdd((volatile long*)&Target, -1);
}
// Atomically swaps 'value' into 'target', returning the previous pointer value.
// Uses the 64-bit intrinsic on x64 builds and the 32-bit one elsewhere.
// (Fix: the diff residue left both the old and new copies of the signature and
// return statements in place; the duplicates are removed here.)
__fi void* Threading::_AtomicExchangePointer(volatile uptr& target, uptr value)
{
#ifdef _M_AMD64 // high-level atomic ops, please leave these 64 bit checks in place.
	return (void*)_InterlockedExchange64(&(volatile s64&)target, value);
#else
	return (void*)_InterlockedExchange((volatile long*)&target, value);
#endif
}
// Atomically stores 'value' into 'target' only if 'target' currently equals
// 'comparand'; returns the previous pointer value either way.
// Fix: the x64 branch called _InterlockedCompareExchange64 with only two
// arguments -- the intrinsic takes (Destination, Exchange, Comparand), so the
// comparand was missing.  (Duplicated diff-residue lines removed as well.)
__fi void* Threading::_AtomicCompareExchangePointer(volatile uptr& target, uptr value, uptr comparand)
{
#ifdef _M_AMD64 // high-level atomic ops, please leave these 64 bit checks in place.
	return (void*)_InterlockedCompareExchange64(&(volatile s64&)target, value, comparand);
#else
	return (void*)_InterlockedCompareExchange((volatile long*)&target, value, comparand);
#endif
}

View File

@ -26,11 +26,11 @@
template class EventSource< IEventListener_PageFault >;
SrcType_PageFault* Source_PageFault = NULL;
Threading::Mutex PageFault_Mutex;
void pxInstallSignalHandler()
{
if (!Source_PageFault)
{
if(!Source_PageFault) {
Source_PageFault = new SrcType_PageFault();
}

View File

@ -25,6 +25,10 @@ int SysPageFaultExceptionFilter( EXCEPTION_POINTERS* eps )
if( eps->ExceptionRecord->ExceptionCode != EXCEPTION_ACCESS_VIOLATION )
return EXCEPTION_CONTINUE_SEARCH;
// Note: This exception can be accessed by the EE or MTVU thread
// Source_PageFault is a global variable with its own state information
// so for now we lock this exception code unless someone can fix this better...
Threading::ScopedLock lock(PageFault_Mutex);
Source_PageFault->Dispatch( PageFaultInfo( (uptr)eps->ExceptionRecord->ExceptionInformation[1] ) );
return Source_PageFault->WasHandled() ? EXCEPTION_CONTINUE_EXECUTION : EXCEPTION_CONTINUE_SEARCH;
}

View File

@ -377,7 +377,8 @@ struct Pcsx2Config
IntcStat :1, // tells Pcsx2 to fast-forward through intc_stat waits.
WaitLoop :1, // enables constant loop detection and fast-forwarding
vuFlagHack :1, // microVU specific flag hack
vuBlockHack :1; // microVU specific block flag no-propagation hack
vuBlockHack :1, // microVU specific block flag no-propagation hack
vuThread :1; // Enable Threaded VU1
BITFIELD_END
u8 EECycleRate; // EE cycle rate selector (1.0, 1.5, 2.0)
@ -471,6 +472,7 @@ TraceLogFilters& SetTraceConfig();
// ------------ CPU / Recompiler Options ---------------
#define THREAD_VU1 (EmuConfig.Cpu.Recompiler.UseMicroVU1 && EmuConfig.Speedhacks.vuThread)
#define CHECK_MICROVU0 (EmuConfig.Cpu.Recompiler.UseMicroVU0)
#define CHECK_MICROVU1 (EmuConfig.Cpu.Recompiler.UseMicroVU1)
#define CHECK_EEREC (EmuConfig.Cpu.Recompiler.EnableEE && GetCpuProviders().IsRecAvailable_EE())

View File

@ -17,9 +17,8 @@
#include "PrecompiledHeader.h"
#include "Common.h"
#include "Gif.h"
#include "Gif_Unit.h"
#include "GS.h"
#include "Gif_Unit.h"
#include "Vif.h"
#include "Vif_Dma.h"
#include "IPU/IPU.h"

View File

@ -19,7 +19,6 @@
#include <list>
#include "GS.h"
#include "Gif.h"
#include "Gif_Unit.h"
#include "Counters.h"

View File

@ -245,6 +245,7 @@ enum MTGS_RingCommand
, GS_RINGTYPE_MODECHANGE // for issued mode changes.
, GS_RINGTYPE_CRC
, GS_RINGTYPE_GSPACKET
, GS_RINGTYPE_MTVU_GSPACKET
};
@ -263,8 +264,8 @@ class SysMtgsThread : public SysThreadBase
public:
// note: when m_ReadPos == m_WritePos, the fifo is empty
uint m_ReadPos; // cur pos gs is reading from
uint m_WritePos; // cur pos ee thread is writing to
__aligned(4) uint m_ReadPos; // cur pos gs is reading from
__aligned(4) uint m_WritePos; // cur pos ee thread is writing to
volatile bool m_RingBufferIsBusy;
volatile u32 m_SignalRingEnable;
@ -273,7 +274,9 @@ public:
volatile s32 m_QueuedFrameCount;
volatile u32 m_VsyncSignalListener;
Mutex m_mtx_RingBufferBusy;
Mutex m_mtx_RingBufferBusy; // Is obtained while processing ring-buffer data
Mutex m_mtx_RingBufferBusy2; // This one gets released on semaXGkick waiting...
Mutex m_mtx_WaitGS;
Semaphore m_sem_OnRingReset;
Semaphore m_sem_Vsync;
@ -304,8 +307,7 @@ public:
virtual ~SysMtgsThread() throw();
// Waits for the GS to empty out the entire ring buffer contents.
// Used primarily for plugin startup/shutdown.
void WaitGS();
void WaitGS(bool syncRegs=true, bool weakWait=false, bool isMTVU=false);
void ResetGS();
void PrepDataPacket( MTGS_RingCommand cmd, u32 size );

View File

@ -17,7 +17,6 @@
#include "Common.h"
#include "GS.h"
#include "Gif.h"
#include "Gif_Unit.h"
#include "Vif_Dma.h"
@ -87,6 +86,7 @@ __fi void gifInterrupt()
}
static u32 WRITERING_DMA(u32 *pMem, u32 qwc) {
//qwc = min(qwc, 1024u);
uint size = gifUnit.TransferGSPacketData(GIF_TRANS_DMA, (u8*)pMem, qwc*16) / 16;
incGifChAddr(size);
return size;

View File

@ -35,15 +35,17 @@ enum GIF_PATH {
enum GIF_TRANSFER_TYPE {
GIF_TRANS_INVALID = 0x000, // Invalid
GIF_TRANS_XGKICK = 0x100, // Path 1
GIF_TRANS_DIRECT = 0x201, // Path 2
GIF_TRANS_DIRECTHL = 0x301, // Path 2
GIF_TRANS_DMA = 0x402, // Path 3
GIF_TRANS_FIFO = 0x502 // Path 3
GIF_TRANS_MTVU = 0x200, // Path 1
GIF_TRANS_DIRECT = 0x301, // Path 2
GIF_TRANS_DIRECTHL = 0x401, // Path 2
GIF_TRANS_DMA = 0x502, // Path 3
GIF_TRANS_FIFO = 0x602 // Path 3
};
static const char Gif_TransferStr[6][32] = {
static const char Gif_TransferStr[7][32] = {
"Invalid Transfer Type",
"GIF_TRANS_XGKICK",
"GIF_TRANS_MTVU",
"GIF_TRANS_DIRECT",
"GIF_TRANS_DIRECTHL",
"GIF_TRANS_DMA",

View File

@ -15,7 +15,6 @@
#include "PrecompiledHeader.h"
#include "Common.h"
#include "Gif.h"
#include "Gif_Unit.h"
#define GIF_PARSE DevCon.WriteLn

View File

@ -19,6 +19,7 @@
#include "GS.h"
#include "Gif_Unit.h"
#include "Vif_Dma.h"
#include "MTVU.h"
Gif_Unit gifUnit;
@ -76,12 +77,32 @@ bool Gif_HandlerAD(u8* pMem) {
return false;
}
// Returns true if pcsx2 needed to process the packet...
// Debug-build helper: inspects an A+D write (register number read from byte 8
// of the qword pair) and reports registers that would have required pcsx2-side
// handling; warns on writes to unknown registers.
bool Gif_HandlerAD_Debug(u8* pMem) {
	u32 reg = pMem[8];
	switch (reg) {
		case 0x50: Console.Error("GIF Handler Debug - BITBLTBUF"); return 1;
		case 0x52: Console.Error("GIF Handler Debug - TRXREG");    return 1;
		case 0x53: Console.Error("GIF Handler Debug - TRXDIR");    return 1;
		case 0x60: Console.Error("GIF Handler Debug - SIGNAL");    return 1;
		case 0x61: Console.Error("GIF Handler Debug - FINISH");    return 1;
		case 0x62: Console.Error("GIF Handler Debug - LABEL");     return 1;
		default:
			if (reg >= 0x63 && reg != 0x7f)
				DevCon.Warning("GIF Handler Debug - Write to unknown register! [reg=%x]", reg);
			return 0;
	}
}
// Fires the GS irq when a FINISH event is pending (CSR.FINISH set) and the
// FINISH interrupt is not masked in GSIMR (bit 9, mask 0x200).
void Gif_FinishIRQ() {
	if (!CSRreg.FINISH) return; // no FINISH event pending
	if (GSIMR & 0x200)  return; // FINISH irq is masked
	gsIrq();
}
// Used in MTVU mode... MTVU will later complete a real packet
// Queues a placeholder MTVU-path-1 command on the MTGS ring (offset/size 0).
// Note: the gsPack argument is not read here -- the real packet data is
// produced by the MTVU thread and fetched by MTGS when the command executes.
void Gif_AddGSPacketMTVU(GS_Packet& gsPack, GIF_PATH path) {
GetMTGS().SendSimpleGSPacket(GS_RINGTYPE_MTVU_GSPACKET, 0, 0, path);
}
void Gif_AddCompletedGSPacket(GS_Packet& gsPack, GIF_PATH path) {
//DevCon.WriteLn("Adding Completed Gif Packet [size=%x]", gsPack.size);
if (COPY_GS_PACKET_TO_MTGS) {
@ -91,6 +112,7 @@ void Gif_AddCompletedGSPacket(GS_Packet& gsPack, GIF_PATH path) {
GetMTGS().SendDataPacket();
}
else {
pxAssertDev(!gsPack.readAmount, "Gif Unit - gsPack.readAmount only valid for MTVU path 1!");
AtomicExchangeAdd(gifUnit.gifPath[path].readAmount, gsPack.size);
GetMTGS().SendSimpleGSPacket(GS_RINGTYPE_GSPACKET, gsPack.offset, gsPack.size, path);
}
@ -102,35 +124,47 @@ void Gif_AddBlankGSPacket(u32 size, GIF_PATH path) {
GetMTGS().SendSimpleGSPacket(GS_RINGTYPE_GSPACKET, ~0u, size, path);
}
void Gif_MTGS_Wait() {
GetMTGS().WaitGS();
}
void Gif_Execute() {
gifUnit.Execute();
// Blocks until MTGS catches up.  Calls WaitGS with syncRegs=false and
// weakWait=true, so when isMTVU is set the wait may end as soon as MTGS has
// finished (or is pending on) a path-1 packet rather than draining the ring.
void Gif_MTGS_Wait(bool isMTVU) {
GetMTGS().WaitGS(false, true, isMTVU);
}
void SaveStateBase::gifPathFreeze(u32 path) {
Gif_Path& gifPath = gifUnit.gifPath[path];
pxAssertDev(gifPath.readAmount==0, "Gif Path readAmount should be 0!");
pxAssertDev(!gifPath.readAmount, "Gif Path readAmount should be 0!");
pxAssertDev(!gifPath.gsPack.readAmount, "GS Pack readAmount should be 0!");
pxAssertDev(!gifPath.GetPendingGSPackets(), "MTVU GS Pack Queue should be 0!");
if (IsSaving()) { // Move all the buffered data to the start of buffer
gifPath.RealignPacket(); // May add readAmount which we need to clear on load
}
u8* bufferPtr = gifPath.buffer; // Backup current buffer ptr
Freeze(gifPath);
Freeze(gifPath.mtvu.fakePackets);
FreezeMem(&gifPath, sizeof(gifPath) - sizeof(gifPath.mtvu));
FreezeMem(bufferPtr, gifPath.curSize);
gifPath.buffer = bufferPtr;
if (!IsSaving()) gifPath.readAmount = 0;
if(!IsSaving()) {
gifPath.readAmount = 0;
gifPath.gsPack.readAmount = 0;
}
}
void SaveStateBase::gifFreeze() {
Gif_MTGS_Wait();
bool mtvuMode = THREAD_VU1;
pxAssert(vu1Thread.IsDone());
GetMTGS().WaitGS();
FreezeTag("Gif Unit");
Freeze(mtvuMode);
Freeze(gifUnit.stat);
Freeze(gifUnit.gsSIGNAL);
Freeze(gifUnit.lastTranType);
gifPathFreeze(GIF_PATH_1);
gifPathFreeze(GIF_PATH_2);
gifPathFreeze(GIF_PATH_3);
if (!IsSaving()) {
if (mtvuMode != THREAD_VU1) {
DevCon.Warning("gifUnit: MTVU Mode has switched between save/load state");
// ToDo: gifUnit.SwitchMTVU(mtvuMode);
}
}
}

View File

@ -14,11 +14,16 @@
*/
#pragma once
#include <deque>
#include "System/SysThreads.h"
#include "Gif.h"
struct GS_Packet;
extern void Gif_MTGS_Wait();
extern void Gif_MTGS_Wait(bool isMTVU);
extern void Gif_FinishIRQ();
extern bool Gif_HandlerAD(u8* pMem);
extern bool Gif_HandlerAD_Debug(u8* pMem);
extern void Gif_AddBlankGSPacket(u32 size, GIF_PATH path);
extern void Gif_AddGSPacketMTVU (GS_Packet& gsPack, GIF_PATH path);
extern void Gif_AddCompletedGSPacket(GS_Packet& gsPack, GIF_PATH path);
extern void Gif_ParsePacket(u8* data, u32 size, GIF_PATH path);
extern void Gif_ParsePacket(GS_Packet& gsPack, GIF_PATH path);
@ -105,10 +110,11 @@ struct Gif_Tag {
};
struct GS_Packet {
u32 offset; // Path buffer offset for start of packet
u32 size; // Full size of GS-Packet
s32 cycles; // EE Cycles taken to process this GS packet
bool done; // 0 = Incomplete, 1 = Complete
u32 offset; // Path buffer offset for start of packet
u32 size; // Full size of GS-Packet
s32 cycles; // EE Cycles taken to process this GS packet
s32 readAmount; // Dummy read-amount data needed for proper buffer calculations
bool done; // 0 = Incomplete, 1 = Complete
GS_Packet() { Reset(); }
void Reset() { memzero(*this); }
};
@ -124,8 +130,16 @@ static __fi void incTag(u32& offset, u32& size, u32 incAmount) {
offset += incAmount;
}
// Per-path bookkeeping used only when MTVU (threaded VU1) is enabled:
// tracks placeholder packets owed to MTGS and the queue of completed
// XGkick GS packets produced by the VU1 thread.
struct Gif_Path_MTVU {
u32 fakePackets; // Fake packets pending to be sent to MTGS
Mutex gsPackMutex; // Used for atomic access to gsPackQueue
std::deque<GS_Packet> gsPackQueue; // VU1 programs' XGkick(s)
Gif_Path_MTVU() { Reset(); }
void Reset() { fakePackets = 0; gsPackQueue.clear(); }
};
struct Gif_Path {
volatile s32 __aligned(4) readAmount; // Amount of data MTGS still needs to read
__aligned(4) volatile s32 readAmount; // Amount of data MTGS still needs to read
u8* buffer; // Path packet buffer
u32 buffSize; // Full size of buffer
u32 buffLimit; // Cut off limit to wrap around
@ -135,6 +149,7 @@ struct Gif_Path {
GS_Packet gsPack; // Current GS Packet info
GIF_PATH idx; // Gif Path Index
GIF_PATH_STATE state; // Path State
Gif_Path_MTVU mtvu; // Must be last for saved states
Gif_Path() {}
~Gif_Path() { _aligned_free(buffer); }
@ -156,6 +171,7 @@ struct Gif_Path {
//curOffset = curSize;
return;
}
mtvu.Reset();
curSize = 0;
curOffset = 0;
readAmount = 0;
@ -163,32 +179,38 @@ struct Gif_Path {
gsPack.Reset();
}
bool isMTVU() { return !idx && THREAD_VU1; }
s32 getReadAmount() { return AtomicRead(readAmount) + gsPack.readAmount; }
bool hasDataRemaining() { return curOffset < curSize; }
bool isDone() { return !hasDataRemaining() && state == GIF_PATH_IDLE; }
bool isDone() { return isMTVU() ? !mtvu.fakePackets
: (!hasDataRemaining() && state == GIF_PATH_IDLE); }
// Waits on the MTGS to process gs packets
void mtgsReadWait() {
//pxAssertDev(AtomicExchangeAdd(readAmount, 0) != 0, "Gif Path Buffer Overflow!");
DevCon.WriteLn(Color_Red, "Gif Path[%d] - MTGS Wait! [r=0x%x]",
idx+1, AtomicExchangeAdd(readAmount, 0));
Gif_MTGS_Wait();
if (IsDevBuild) {
DevCon.WriteLn(Color_Red, "Gif Path[%d] - MTGS Wait! [r=0x%x]", idx+1, getReadAmount());
Gif_MTGS_Wait(isMTVU());
DevCon.WriteLn(Color_Green, "Gif Path[%d] - MTGS Wait! [r=0x%x]", idx+1, getReadAmount());
return;
}
Gif_MTGS_Wait(isMTVU());
}
// Moves packet data to start of buffer
void RealignPacket() {
extern void Gif_AddBlankGSPacket(u32 size, GIF_PATH path);
GUNIT_LOG("Path Buffer: Realigning packet!");
s32 offset = curOffset - gsPack.size;
s32 sizeToAdd = curSize - offset;
s32 intersect = sizeToAdd - offset;
if (intersect < 0) intersect = 0;
for(;;) {
s32 frontFree = offset - AtomicExchangeAdd(readAmount, 0);
s32 frontFree = offset - getReadAmount();
if (frontFree >= sizeToAdd - intersect) break;
mtgsReadWait();
}
if (offset < (s32)buffLimit) { // Needed for correct readAmount values
Gif_AddBlankGSPacket(buffLimit - offset, idx);
if (isMTVU()) gsPack.readAmount += buffLimit - offset;
else Gif_AddBlankGSPacket(buffLimit - offset, idx);
}
//DevCon.WriteLn("Realign Packet [%d]", curSize - offset);
if (intersect) memmove(buffer, &buffer[offset], curSize - offset);
@ -200,12 +222,12 @@ struct Gif_Path {
void CopyGSPacketData(u8* pMem, u32 size, bool aligned = false) {
if (curSize + size > buffSize) { // Move gsPack to front of buffer
DevCon.Warning("CopyGSPacketData: Realigning packet!");
GUNIT_LOG("CopyGSPacketData: Realigning packet!");
RealignPacket();
}
for(;;) {
s32 offset = curOffset - gsPack.size;
s32 readPos = offset - AtomicExchangeAdd(readAmount, 0);
s32 readPos = offset - getReadAmount();
if (readPos >= 0) break; // MTGS is reading in back of curOffset
if ((s32)buffLimit + readPos > (s32)curSize + (s32)size) break; // Enough free front space
mtgsReadWait(); // Let MTGS run to free up buffer space
@ -217,12 +239,21 @@ struct Gif_Path {
}
// If completed a GS packet (with EOP) then returned GS_Packet.done = 1
// MTVU: This function should only be called on the EE thread
GS_Packet ExecuteGSPacket() {
if (mtvu.fakePackets) { // For MTVU mode...
mtvu.fakePackets--;
GS_Packet fakePack;
fakePack.done = 1; // Fake packets don't get processed by pcsx2
fakePack.size =~0u; // Used to indicate that its a fake packet
return fakePack;
}
pxAssert(!isMTVU());
for(;;) {
if (!gifTag.isValid) { // Need new Gif Tag
// We don't have enough data for a Gif Tag
if (curOffset + 16 > curSize) {
GUNIT_LOG("Path Buffer: Not enough data for gif tag! [%d]", curSize-curOffset);
//GUNIT_LOG("Path Buffer: Not enough data for gif tag! [%d]", curSize-curOffset);
return gsPack;
}
@ -249,7 +280,7 @@ struct Gif_Path {
while(gifTag.nLoop && !dblSIGNAL) {
if (curOffset + 16 > curSize) return gsPack; // Exit Early
if (gifTag.curReg() == GIF_REG_A_D) {
dblSIGNAL = Gif_HandlerAD(&buffer[curOffset]);
if (!isMTVU()) dblSIGNAL = Gif_HandlerAD(&buffer[curOffset]);
}
incTag(curOffset, gsPack.size, 16); // 1 QWC
gifTag.packedStep();
@ -271,6 +302,84 @@ struct Gif_Path {
}
}
}
// MTVU: Gets called on VU XGkicks on MTVU thread
// Consumes the buffered XGkick data into gsPack.  In dev builds the gif tags
// are walked and A+D writes are checked (via Gif_HandlerAD_Debug) to verify
// that nothing actually required pcsx2-side processing; release builds just
// treat the whole buffered region as one complete GS packet.
void ExecuteGSPacketMTVU() {
// Move packet to start of buffer
if (curOffset > buffLimit) {
RealignPacket();
}
if (IsDevBuild) { // We check the packet to see if it actually
for(;;) { // needed to be processed by pcsx2...
if (curOffset + 16 > curSize) break;
gifTag.setTag(&buffer[curOffset], 1);
if(!gifTag.hasAD && curOffset + 16 + gifTag.len > curSize) break;
incTag(curOffset, gsPack.size, 16); // Tag Size
if (gifTag.hasAD) { // Only can be true if GIF_FLG_PACKED
while(gifTag.nLoop) {
if (curOffset + 16 > curSize) break; // Exit Early
if (gifTag.curReg() == GIF_REG_A_D) {
// Asserts if the register write would have needed pcsx2 handling
pxAssert(!Gif_HandlerAD_Debug(&buffer[curOffset]));
}
incTag(curOffset, gsPack.size, 16); // 1 QWC
gifTag.packedStep();
}
}
else incTag(curOffset, gsPack.size, gifTag.len); // Data length
if (curOffset >= curSize) break;
if (gifTag.tag.EOP) break;
}
// The walk above is expected to consume the buffered data exactly
pxAssert(curOffset == curSize);
gifTag.isValid = false;
}
else {
// We assume every packet is a full GS Packet
// And we don't process anything on pcsx2 side
gsPack.size += curSize - curOffset;
curOffset = curSize;
}
}
// MTVU: Gets called after VU1 execution on MTVU thread
// Publishes the finished gsPack to the MTGS-visible queue, then starts a
// fresh packet at the current buffer offset.
void FinishGSPacketMTVU() {
// if(1) scopes the lock so it is released before gsPack is reset below
if (1) {
ScopedLock lock(mtvu.gsPackMutex);
// Account for both real data and dummy read-amount under readAmount
AtomicExchangeAdd(readAmount, gsPack.size + gsPack.readAmount);
mtvu.gsPackQueue.push_back(gsPack);
}
gsPack.Reset();
gsPack.offset = curOffset;
}
// MTVU: Gets called by MTGS thread
// Returns (a copy of) the oldest queued XGkick packet.  The queue is expected
// to be non-empty here; if it isn't, an error is logged and an empty packet
// (size 0) is returned so the caller can skip the transfer.
GS_Packet GetGSPacketMTVU() {
ScopedLock lock(mtvu.gsPackMutex);
if (mtvu.gsPackQueue.size()) {
GS_Packet t = mtvu.gsPackQueue[0];
return t; // XGkick GS packet(s)
}
Console.Error("MTVU: Expected gsPackQueue to have elements!");
pxAssert(0);
return GS_Packet(); // gsPack.size will be 0
}
// MTVU: Gets called by MTGS thread
// Removes the oldest XGkick packet from the queue (no-op when empty).
void PopGSPacketMTVU() {
	ScopedLock lock(mtvu.gsPackMutex);
	if (!mtvu.gsPackQueue.empty())
		mtvu.gsPackQueue.pop_front();
}
// MTVU: Returns the number of queued GS packets
// that the MTGS thread has not processed yet.
u32 GetPendingGSPackets() {
	ScopedLock lock(mtvu.gsPackMutex);
	return (u32)mtvu.gsPackQueue.size();
}
};
struct Gif_Unit {
@ -280,8 +389,8 @@ struct Gif_Unit {
GIF_TRANSFER_TYPE lastTranType; // Last Transfer Type
Gif_Unit() : stat(gifRegs.stat) {
gifPath[0].Init(GIF_PATH_1, _1mb*8, _16kb + _1kb);
gifPath[1].Init(GIF_PATH_2, _1mb*8, _1mb + _1kb);
gifPath[0].Init(GIF_PATH_1, _1mb*9, _1mb + _1kb);
gifPath[1].Init(GIF_PATH_2, _1mb*9, _1mb + _1kb);
gifPath[2].Init(GIF_PATH_3, _1mb*9, _1mb + _1kb);
}
@ -307,24 +416,24 @@ struct Gif_Unit {
// Adds a finished GS Packet to the MTGS ring buffer
__fi void AddCompletedGSPacket(GS_Packet& gsPack, GIF_PATH path) {
Gif_AddCompletedGSPacket(gsPack, path);
if (gsPack.size==~0u) Gif_AddGSPacketMTVU (gsPack, path);
else Gif_AddCompletedGSPacket(gsPack, path);
if (PRINT_GIF_PACKET) Gif_ParsePacket(gsPack, path);
}
// Returns GS Packet Size in bytes
u32 GetGSPacketSize(GIF_PATH pathIdx, u8* pMem, u32 offset = 0) {
u32 memMask = pathIdx ? 0xffffffffu : 0x3fffu;
u32 size = 0;
u32 GetGSPacketSize(GIF_PATH pathIdx, u8* pMem, u32 offset = 0, u32 size = ~0u) {
u32 memMask = pathIdx ? ~0u : 0x3fffu;
u32 curSize = 0;
for(;;) {
Gif_Tag gifTag(&pMem[offset & memMask]);
incTag(offset, size, 16 + gifTag.len); // Tag + Data length
if (pathIdx == GIF_PATH_1 && size >= 0x4000) {
incTag(offset, curSize, 16 + gifTag.len); // Tag + Data length
if (pathIdx == GIF_PATH_1 && curSize >= 0x4000) {
Console.Warning("Gif Unit - GS packet size exceeded VU memory size!");
return 0; // Bios does this... (Fixed if you delay vu1's xgkick by 103 vu cycles)
}
if (gifTag.tag.EOP) {
return size;
}
if (curSize >= size) return size;
if (gifTag.tag.EOP) return curSize;
}
}
@ -332,8 +441,22 @@ struct Gif_Unit {
// The return value is the amount of data (in bytes) that was processed
// If transfer cannot take place at this moment the return value is 0
u32 TransferGSPacketData(GIF_TRANSFER_TYPE tranType, u8* pMem, u32 size, bool aligned=false) {
GIF_LOG("%s - [path=%d][size=%d]", Gif_TransferStr[(tranType>>8)&0xf], (tranType&3)+1, size);
if (THREAD_VU1) {
Gif_Path& path1 = gifPath[GIF_PATH_1];
if (tranType == GIF_TRANS_XGKICK) { // This is on the MTVU thread
path1.CopyGSPacketData(pMem, size, aligned);
path1.ExecuteGSPacketMTVU();
return size;
}
if (tranType == GIF_TRANS_MTVU) { // This is on the EE thread
path1.mtvu.fakePackets++;
if (CanDoGif()) Execute();
return 0;
}
}
GUNIT_LOG("%s - [path=%d][size=%d]", Gif_TransferStr[(tranType>>8)&0xf], (tranType&3)+1, size);
if (size == 0) { GUNIT_WARN("Gif Unit - Size == 0"); return 0; }
if(!CanDoGif()) { GUNIT_WARN("Gif Unit - Signal or PSE Set or Dir = GS to EE"); }
pxAssertDev((stat.APATH==0) || checkPaths(1,1,1), "Gif Unit - APATH wasn't cleared?");
@ -344,6 +467,7 @@ struct Gif_Unit {
}
if (tranType == GIF_TRANS_DMA) {
if(!CanDoPath3()) { if (!Path3Masked()) stat.P3Q = 1; return 0; } // DMA Stall
//if (stat.P2Q) DevCon.WriteLn("P2Q while path 3");
}
if (tranType == GIF_TRANS_XGKICK) {
if(!CanDoPath1()) { stat.P1Q = 1; } // We always buffer path1 packets
@ -404,7 +528,7 @@ struct Gif_Unit {
GS_Packet gsPack = path.ExecuteGSPacket();
if(!gsPack.done) {
if (stat.APATH == 3 && CanDoP3Slice() && !gsSIGNAL.queued) {
if(!didPath3 && checkPaths(1,1,0)) { // Path3 slicing
if(!didPath3 && /*!Path3Masked() &&*/ checkPaths(1,1,0)) { // Path3 slicing
didPath3 = true;
stat.APATH = 0;
stat.IP3 = 1;
@ -433,7 +557,7 @@ struct Gif_Unit {
}
if (!gsSIGNAL.queued && !gifPath[0].isDone()) { stat.APATH = 1; stat.P1Q = 0; }
elif (!gsSIGNAL.queued && !gifPath[1].isDone()) { stat.APATH = 2; stat.P2Q = 0; }
elif (!gsSIGNAL.queued && !gifPath[2].isDone() && !Path3Masked())
elif (!gsSIGNAL.queued && !gifPath[2].isDone() && !Path3Masked() /*&& !stat.P2Q*/)
{ stat.APATH = 3; stat.P3Q = 0; stat.IP3 = 0; }
else { stat.APATH = 0; stat.OPH = 0; break; }
}

View File

@ -19,7 +19,6 @@
#include "Hardware.h"
#include "newVif.h"
#include "IPU/IPUdma.h"
#include "Gif.h"
#include "Gif_Unit.h"
using namespace R5900;

View File

@ -17,7 +17,6 @@
#include "PrecompiledHeader.h"
#include "Common.h"
#include "Hardware.h"
#include "Gif.h"
#include "Gif_Unit.h"
#include "ps2/HwInternal.h"

View File

@ -21,6 +21,7 @@
#include "GS.h"
#include "Gif_Unit.h"
#include "MTVU.h"
#include "Elfheader.h"
#include "SamplProf.h"
@ -242,36 +243,29 @@ void SysMtgsThread::OpenPlugin()
GSsetGameCRC( ElfCRC, 0 );
}
class RingBufferLock : public ScopedLock
{
typedef ScopedLock _parent;
protected:
SysMtgsThread& m_mtgs;
struct RingBufferLock {
ScopedLock m_lock1;
ScopedLock m_lock2;
SysMtgsThread& m_mtgs;
public:
RingBufferLock( SysMtgsThread& mtgs )
: ScopedLock( mtgs.m_mtx_RingBufferBusy )
, m_mtgs( mtgs )
{
RingBufferLock(SysMtgsThread& mtgs)
: m_lock1(mtgs.m_mtx_RingBufferBusy),
m_lock2(mtgs.m_mtx_RingBufferBusy2),
m_mtgs(mtgs) {
m_mtgs.m_RingBufferIsBusy = true;
}
virtual ~RingBufferLock() throw()
{
virtual ~RingBufferLock() throw() {
m_mtgs.m_RingBufferIsBusy = false;
}
void Acquire()
{
_parent::Acquire();
void Acquire() {
m_lock1.Acquire();
m_lock2.Acquire();
m_mtgs.m_RingBufferIsBusy = true;
}
void Release()
{
void Release() {
m_mtgs.m_RingBufferIsBusy = false;
_parent::Release();
m_lock2.Release();
m_lock1.Release();
}
};
@ -281,10 +275,9 @@ void SysMtgsThread::ExecuteTaskInThread()
PacketTagType prevCmd;
#endif
RingBufferLock busy( *this );
RingBufferLock busy (*this);
while( true )
{
while(true) {
busy.Release();
// Performance note: Both of these perform cancellation tests, but pthread_testcancel
@ -299,8 +292,7 @@ void SysMtgsThread::ExecuteTaskInThread()
// ever be modified by this thread.
while( m_ReadPos != volatize(m_WritePos))
{
if( EmuConfig.GS.DisableOutput )
{
if (EmuConfig.GS.DisableOutput) {
m_ReadPos = m_WritePos;
continue;
}
@ -327,7 +319,7 @@ void SysMtgsThread::ExecuteTaskInThread()
switch( tag.command )
{
#if COPY_GS_PACKET_TO_MTGS == 1 // d
#if COPY_GS_PACKET_TO_MTGS == 1
case GS_RINGTYPE_P1:
{
uint datapos = (m_ReadPos+1) & RingBufferMask;
@ -412,6 +404,21 @@ void SysMtgsThread::ExecuteTaskInThread()
break;
}
case GS_RINGTYPE_MTVU_GSPACKET: {
MTVU_LOG("MTGS - Waiting on semaXGkick!");
vu1Thread.KickStart(true);
busy.m_lock2.Release();
// Wait for MTVU to complete vu1 program
vu1Thread.semaXGkick.WaitWithoutYield();
busy.m_lock2.Acquire();
Gif_Path& path = gifUnit.gifPath[GIF_PATH_1];
GS_Packet gsPack = path.GetGSPacketMTVU(); // Get vu1 program's xgkick packet(s)
if (gsPack.size) GSgifTransfer((u32*)&path.buffer[gsPack.offset], gsPack.size/16);
AtomicExchangeSub(path.readAmount, gsPack.size + gsPack.readAmount);
path.PopGSPacketMTVU(); // Should be done last, for proper Gif_MTGS_Wait()
break;
}
default:
{
switch( tag.command )
@ -572,27 +579,43 @@ void SysMtgsThread::OnCleanupInThread()
}
// Waits for the GS to empty out the entire ring buffer contents.
// Used primarily for plugin startup/shutdown.
void SysMtgsThread::WaitGS()
// If syncRegs, then writes pcsx2's gs regs to MTGS's internal copy
// If weakWait, then this function is allowed to exit after MTGS finished a path1 packet
// If isMTVU, then this implies this function is being called from the MTVU thread...
void SysMtgsThread::WaitGS(bool syncRegs, bool weakWait, bool isMTVU)
{
pxAssertDev( !IsSelf(), "This method is only allowed from threads *not* named MTGS." );
if( m_ExecMode == ExecMode_NoThreadYet || !IsRunning() ) return;
if( !pxAssertDev( IsOpen(), "MTGS Warning! WaitGS issued on a closed thread." ) ) return;
if( volatize(m_ReadPos) != m_WritePos )
{
Gif_Path& path = gifUnit.gifPath[GIF_PATH_1];
u32 startP1Packs = weakWait ? path.GetPendingGSPackets() : 0;
if (isMTVU || volatize(m_ReadPos) != m_WritePos) {
SetEvent();
RethrowException();
do {
m_mtx_RingBufferBusy.Wait();
for(;;) {
if (weakWait) m_mtx_RingBufferBusy2.Wait();
else m_mtx_RingBufferBusy .Wait();
RethrowException();
} while( volatize(m_ReadPos) != m_WritePos );
if(!isMTVU && volatize(m_ReadPos) == m_WritePos) break;
u32 curP1Packs = weakWait ? path.GetPendingGSPackets() : 0;
if (weakWait && ((startP1Packs-curP1Packs) || !curP1Packs)) break;
// On weakWait we will stop waiting on the MTGS thread if the
// MTGS thread has processed a vu1 xgkick packet, or is pending on
// its final vu1 xgkick packet (!curP1Packs)...
// Note: m_WritePos doesn't seem to have proper atomic write
// code, so reading it from the MTVU thread might be dangerous;
// hence it has been avoided...
}
}
// Completely synchronize GS and MTGS register states.
memcpy_fast( RingBuffer.Regs, PS2MEM_GS, sizeof(RingBuffer.Regs) );
if (syncRegs) {
ScopedLock lock(m_mtx_WaitGS);
// Completely synchronize GS and MTGS register states.
memcpy_fast(RingBuffer.Regs, PS2MEM_GS, sizeof(RingBuffer.Regs));
}
}
// Sets the gsEvent flag and releases a timeslice.

37
pcsx2/MTVU.cpp Normal file
View File

@ -0,0 +1,37 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2010 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "Common.h"
#include "MTVU.h"
#include "newVif.h"
__aligned16 VU_Thread vu1Thread(CpuVU1, VU1);
// Calls the vif unpack functions from the MTVU thread
void MTVU_Unpack(void* data, VIFregisters& vifRegs) {
bool isFill = vifRegs.cycle.cl < vifRegs.cycle.wl;
if (newVifDynaRec) dVifUnpack<1>((u8*)data, isFill);
else _nVifUnpack(1, (u8*)data, vifRegs.mode, isFill);
}
// Called on Saving/Loading states...
// Serializes MTVU state into/out of a savestate. Only the vu-cycle history
// (vuCycles/vuCycleIdx) is stored; the ring buffer itself is required to be
// fully drained (IsDone) before any state operation, hence the assert.
void SaveStateBase::mtvuFreeze() {
FreezeTag("MTVU");
// The VU worker must have no pending packets at savestate time.
pxAssert(vu1Thread.IsDone());
// On load, discard any leftover ring-buffer/vif state from the prior session.
if (!IsSaving()) vu1Thread.Reset();
Freeze(vu1Thread.vuCycles);
Freeze(vu1Thread.vuCycleIdx);
}

305
pcsx2/MTVU.h Normal file
View File

@ -0,0 +1,305 @@
#pragma once
#include "System/SysThreads.h"
#include "Vif.h"
#include "Vif_Dma.h"
#include "VUmicro.h"
#include "Gif_Unit.h"
extern void MTVU_Unpack(void* data, VIFregisters& vifRegs);
#define volatize(x) (*reinterpret_cast<volatile uint*>(&(x)))
#define size_u32(x) (((u32)x+3u)>>2) // Rounds up a size in bytes for size in u32's
#define MTVU_ALWAYS_KICK 0
#define MTVU_SYNC_MODE 0
#define MTVU_LOG(...) do{} while(0)
//#define MTVU_LOG DevCon.WriteLn
enum MTVU_EVENT {
MTVU_VU_EXECUTE, // Execute VU program
MTVU_VU_WRITE_MICRO, // Write to VU micro-mem
MTVU_VU_WRITE_DATA, // Write to VU data-mem
MTVU_VIF_WRITE_COL, // Write to Vif col reg
MTVU_VIF_WRITE_ROW, // Write to Vif row reg
MTVU_VIF_UNPACK, // Execute Vif Unpack
MTVU_NULL_PACKET, // Go back to beginning of buffer
MTVU_RESET
};
// Notes:
// - This class should only be accessed from the EE thread...
// - buffer_size must be power of 2
// - ring-buffer has no complete pending packets when read_pos==write_pos
// Single-producer/single-consumer command ring buffer driving the VU1 worker
// thread. The EE thread is the only writer (write_pos/write_offset) and the
// VU thread is the only reader (read_pos); cross-thread reads go through the
// Atomic* helpers. Packets are tagged with an MTVU_EVENT value followed by
// that event's payload.
struct VU_Thread : public pxThread {
static const u32 buffer_size = (_1mb * 16) / sizeof(u32);
static const u32 buffer_mask = buffer_size - 1;
__aligned(4) u32 buffer[buffer_size];
__aligned(4) volatile s32 read_pos; // Only modified by VU thread
__aligned(4) volatile bool isBusy; // Is thread processing data?
__aligned(4) s32 write_pos; // Only modified by EE thread
__aligned(4) s32 write_offset; // Only modified by EE thread
__aligned(4) Mutex mtxBusy; // Held by the worker while it drains packets
__aligned(4) Semaphore semaEvent; // Posted by EE to wake the worker
__aligned(4) Semaphore semaXGkick; // Posted per finished vu1 program (MTGS waits on it)
__aligned(4) BaseVUmicroCPU*& vuCPU;
__aligned(4) VURegs& vuRegs;
__aligned16 vifStruct vif; // Worker-local copy of vif state
__aligned16 VIFregisters vifRegs; // Worker-local copy of vif registers
__aligned(4) u32 vuCycles[4]; // Used for VU cycle stealing hack
__aligned(4) u32 vuCycleIdx; // Used for VU cycle stealing hack
VU_Thread(BaseVUmicroCPU*& _vuCPU, VURegs& _vuRegs) :
vuCPU(_vuCPU), vuRegs(_vuRegs) {
m_name = L"MTVU";
Reset();
}
virtual ~VU_Thread() throw() {
pxThread::Cancel();
}
void InitThread() {
Start(); // Starts the pxThread
}
// Clears ring-buffer positions and worker-local vif/cycle state.
// NOTE(review): only safe while the worker is idle — confirm at call sites.
void Reset() {
read_pos = 0;
write_pos = 0;
write_offset = 0;
vuCycleIdx = 0;
isBusy = false;
memzero(vif);
memzero(vifRegs);
memzero(vuCycles);
}
protected:
// Should only be called by ReserveSpace().
// Spins until the reader has freed 'size' u32's in front of write_pos,
// kicking the worker and briefly taking mtxBusy to yield to it.
__ri void WaitOnSize(s32 size) {
for(;;) {
s32 readPos = GetReadPos();
if (readPos <= write_pos) break; // MTVU is reading in back of write_pos
if (readPos > write_pos + size) break; // Enough free front space
if (1) { // Let MTVU run to free up buffer space
KickStart();
if (IsDevBuild) DevCon.WriteLn("WaitOnSize()");
// Blocks until the worker releases mtxBusy (i.e. has made progress).
ScopedLock lock(mtxBusy);
}
}
}
// Makes sure there's enough room in the ring buffer
// to write a continuous 'size * sizeof(u32)' bytes.
// If the packet would straddle the end of the buffer, a MTVU_NULL_PACKET
// is emitted so the reader wraps, and write_pos restarts at 0.
void ReserveSpace(s32 size) {
pxAssert(write_pos < buffer_size);
pxAssert(size < buffer_size);
pxAssert(size > 0);
pxAssert(write_offset == 0);
if (write_pos + size > buffer_size) {
pxAssert(write_pos > 0);
WaitOnSize(1); // Size of MTVU_NULL_PACKET
Write(MTVU_NULL_PACKET);
write_offset = 0;
AtomicExchange(volatize(write_pos), 0);
}
WaitOnSize(size);
}
// Use this when reading read_pos from ee thread
__fi volatile s32 GetReadPos() {
return AtomicRead(read_pos);
}
// Use this when reading write_pos from vu thread
__fi volatile s32 GetWritePos() {
return AtomicRead(volatize(write_pos));
}
// Gets the effective write pointer after adding write_offset
__fi u32* GetWritePtr() {
return &buffer[(write_pos + write_offset) & buffer_mask];
}
__fi void incReadPos(s32 offset) { // Offset in u32 sizes
s32 temp = (read_pos + offset) & buffer_mask;
AtomicExchange(read_pos, temp);
}
// Publishes the staged write_offset to write_pos in one atomic store,
// making the whole packet visible to the reader at once.
__fi void incWritePos() { // Adds write_offset
s32 temp = (write_pos + write_offset) & buffer_mask;
write_offset = 0;
AtomicExchange(volatize(write_pos), temp);
if (MTVU_ALWAYS_KICK) KickStart();
if (MTVU_SYNC_MODE) WaitVU();
}
// Reader-side: pop a single u32 tag/value.
__fi u32 Read() {
u32 ret = buffer[read_pos];
incReadPos(1);
return ret;
}
__fi void Read(void* dest, u32 size) { // Size in bytes
memcpy_fast(dest, &buffer[read_pos], size);
incReadPos(size_u32(size));
}
// Writer-side: stage data at write_pos+write_offset (not yet visible
// to the reader until incWritePos()).
__fi void Write(u32 val) {
GetWritePtr()[0] = val;
write_offset += 1;
}
__fi void Write(void* src, u32 size) { // Size in bytes
memcpy_fast(GetWritePtr(), src, size);
write_offset += size_u32(size);
}
// pxThread entry point; wraps the packet loop in page-fault protection.
void ExecuteTaskInThread() {
PCSX2_PAGEFAULT_PROTECT {
ExecuteRingBuffer();
} PCSX2_PAGEFAULT_EXCEPT;
}
// Worker main loop: sleep on semaEvent, then drain all pending packets
// while holding mtxBusy / isBusy so waiters can synchronize against us.
void ExecuteRingBuffer() {
for(;;) {
semaEvent.WaitWithoutYield();
ScopedLockBool lock(mtxBusy, isBusy);
while (read_pos != GetWritePos()) {
u32 tag = Read();
switch (tag) {
case MTVU_VU_EXECUTE: {
// Payload: start addr (-1 = resume), vif top, vif itop.
vuRegs.cycle = 0;
s32 addr = Read();
vifRegs.top = Read();
vifRegs.itop = Read();
if (addr != -1) vuRegs.VI[REG_TPC].UL = addr;
vuCPU->Execute(vu1RunCycles);
gifUnit.gifPath[GIF_PATH_1].FinishGSPacketMTVU();
semaXGkick.Post(); // Tell MTGS a path1 packet is complete
AtomicExchange(vuCycles[vuCycleIdx], vuRegs.cycle);
vuCycleIdx = (vuCycleIdx + 1) & 3;
break;
}
case MTVU_VU_WRITE_MICRO: {
// Payload: addr, size, then 'size' bytes of program data.
u32 vu_micro_addr = Read();
u32 size = Read();
vuCPU->Clear(vu_micro_addr, size); // Invalidate cached translations first
Read(&vuRegs.Micro[vu_micro_addr], size);
break;
}
case MTVU_VU_WRITE_DATA: {
// Payload: addr, size, then 'size' bytes of data memory.
u32 vu_data_addr = Read();
u32 size = Read();
Read(&vuRegs.Mem[vu_data_addr], size);
break;
}
case MTVU_VIF_WRITE_COL:
Read(&vif.MaskCol, sizeof(vif.MaskCol));
break;
case MTVU_VIF_WRITE_ROW:
Read(&vif.MaskRow, sizeof(vif.MaskRow));
break;
case MTVU_VIF_UNPACK: {
// Payload: vif struct copy, vifRegs copy, size, then raw data.
u32 vif_copy_size = (uptr)&vif.StructEnd - (uptr)&vif.tag;
Read(&vif.tag, vif_copy_size);
Read(&vifRegs, sizeof(vifRegs));
u32 size = Read();
// Unpack reads straight out of the ring buffer (no extra copy).
MTVU_Unpack(&buffer[read_pos], vifRegs);
incReadPos(size_u32(size));
break;
}
case MTVU_NULL_PACKET:
// Wrap marker: restart reads at the buffer's beginning.
AtomicExchange(read_pos, 0);
break;
jNO_DEFAULT;
}
}
}
}
// Returns Average number of vu Cycles from last 4 runs
u32 Get_vuCycles() { // Used for vu cycle stealing hack
return (AtomicRead(vuCycles[0]) + AtomicRead(vuCycles[1])
+ AtomicRead(vuCycles[2]) + AtomicRead(vuCycles[3])) >> 2;
}
public:
// Get MTVU to start processing its packets if it isn't already
void KickStart(bool forceKick = false) {
if ((forceKick && !semaEvent.Count())
|| (!isBusy && GetReadPos() != write_pos)) semaEvent.Post();
}
// Used for assertions...
bool IsDone() { return !isBusy && GetReadPos() == GetWritePos(); }
// Waits till MTVU is done processing every queued packet (EE-side barrier).
void WaitVU() {
MTVU_LOG("MTVU - WaitVU!");
for(;;) {
if (IsDone()) break;
//DevCon.WriteLn("WaitVU()");
pxAssert(THREAD_VU1);
KickStart();
// Yields until the worker drops mtxBusy, then re-checks IsDone().
ScopedLock lock(mtxBusy);
}
}
// Queues a vu1 program execution and charges the EE an estimated cycle
// cost (capped at 3000) scaled by the VU-cycle-steal speedhack setting.
void ExecuteVU(u32 vu_addr, u32 vif_top, u32 vif_itop) {
MTVU_LOG("MTVU - ExecuteVU!");
ReserveSpace(4);
Write(MTVU_VU_EXECUTE);
Write(vu_addr);
Write(vif_top);
Write(vif_itop);
incWritePos();
// Registers a placeholder path1 GS packet for the program's xgkicks.
gifUnit.TransferGSPacketData(GIF_TRANS_MTVU, NULL, 0);
KickStart();
u32 cycles = std::min(Get_vuCycles(), 3000u);
cpuRegs.cycle += cycles * EmuConfig.Speedhacks.VUCycleSteal;
}
// Queues a vif unpack: copies vif state, vifRegs, and the raw data.
void VifUnpack(vifStruct& _vif, VIFregisters& _vifRegs, u8* data, u32 size) {
MTVU_LOG("MTVU - VifUnpack!");
u32 vif_copy_size = (uptr)&_vif.StructEnd - (uptr)&_vif.tag;
ReserveSpace(1 + size_u32(vif_copy_size) + size_u32(sizeof(_vifRegs)) + 1 + size_u32(size));
Write(MTVU_VIF_UNPACK);
Write(&_vif.tag, vif_copy_size);
Write(&_vifRegs, sizeof(_vifRegs));
Write(size);
Write(data, size);
incWritePos();
KickStart();
}
// Writes to VU's Micro Memory (size in bytes)
void WriteMicroMem(u32 vu_micro_addr, void* data, u32 size) {
MTVU_LOG("MTVU - WriteMicroMem!");
ReserveSpace(3 + size_u32(size));
Write(MTVU_VU_WRITE_MICRO);
Write(vu_micro_addr);
Write(size);
Write(data, size);
incWritePos();
}
// Writes to VU's Data Memory (size in bytes)
void WriteDataMem(u32 vu_data_addr, void* data, u32 size) {
MTVU_LOG("MTVU - WriteDataMem!");
ReserveSpace(3 + size_u32(size));
Write(MTVU_VU_WRITE_DATA);
Write(vu_data_addr);
Write(size);
Write(data, size);
incWritePos();
}
// Queues a copy of the vif MaskCol register into the worker's vif state.
void WriteCol(vifStruct& _vif) {
MTVU_LOG("MTVU - WriteCol!");
ReserveSpace(1 + size_u32(sizeof(_vif.MaskCol)));
Write(MTVU_VIF_WRITE_COL);
Write(&_vif.MaskCol, sizeof(_vif.MaskCol));
incWritePos();
}
// Queues a copy of the vif MaskRow register into the worker's vif state.
void WriteRow(vifStruct& _vif) {
MTVU_LOG("MTVU - WriteRow!");
ReserveSpace(1 + size_u32(sizeof(_vif.MaskRow)));
Write(MTVU_VIF_WRITE_ROW);
Write(&_vif.MaskRow, sizeof(_vif.MaskRow));
incWritePos();
}
};
extern __aligned16 VU_Thread vu1Thread;

View File

@ -38,8 +38,9 @@ BIOS
#include <wx/file.h>
#include "IopCommon.h"
#include "VUmicro.h"
#include "GS.h"
#include "VUmicro.h"
#include "MTVU.h"
#include "ps2/HwInternal.h"
#include "ps2/BiosTools.h"
@ -102,6 +103,7 @@ static vtlbHandler
vu0_micro_mem,
vu1_micro_mem,
vu1_data_mem,
hw_by_page[0x10] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
@ -131,7 +133,11 @@ void memMapVUmicro()
// VU0/VU1 memory (data)
// VU0 is 4k, mirrored 4 times across a 16k area.
vtlb_MapBlock(VU0.Mem,0x11004000,0x00004000,0x1000);
vtlb_MapBlock(VU1.Mem,0x1100c000,0x00004000);
// Note: In order for the below conditional to work correctly
// support needs to be coded to reset the memMappings when MTVU is
// turned off/on. For now we just always use the vu data handlers...
if (1||THREAD_VU1) vtlb_MapHandler(vu1_data_mem,0x1100c000,0x00004000);
else vtlb_MapBlock (VU1.Mem, 0x1100c000,0x00004000);
}
void memMapPhy()
@ -431,127 +437,185 @@ static void __fastcall _ext_memWrite128(u32 mem, const mem128_t *value)
typedef void __fastcall ClearFunc_t( u32 addr, u32 qwc );
template<int vunum>
static __fi void ClearVuFunc( u32 addr, u32 size )
{
if( vunum==0 )
CpuVU0->Clear(addr,size);
else
CpuVU1->Clear(addr,size);
// Notifies the selected VU's cpu provider that program memory in
// [addr, addr+size) changed, so cached translations get cleared.
template<int vunum> static __fi void ClearVuFunc(u32 addr, u32 size) {
	BaseVUmicroCPU* const cpu = vunum ? CpuVU1 : CpuVU0;
	cpu->Clear(addr, size);
}
template<int vunum>
static mem8_t __fastcall vuMicroRead8(u32 addr)
{
addr&=(vunum==0)?0xfff:0x3fff;
VURegs* vu=(vunum==0)?&VU0:&VU1;
// VU Micro Memory Reads...
// Reads one byte of VU micro memory. When threaded VU1 is active, waits for
// the MTVU worker to drain first so the memory isn't read mid-program.
template<int vunum> static mem8_t __fc vuMicroRead8(u32 addr) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff; // VU1: 16k micro mem; VU0: 4k
if (vunum && THREAD_VU1) vu1Thread.WaitVU();
return vu->Micro[addr];
}
template<int vunum>
static mem16_t __fastcall vuMicroRead16(u32 addr)
{
addr&=(vunum==0)?0xfff:0x3fff;
VURegs* vu=(vunum==0)?&VU0:&VU1;
// 16-bit VU micro memory read; syncs with the MTVU worker when VU1 is threaded.
template<int vunum> static mem16_t __fc vuMicroRead16(u32 addr) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff; // VU1: 16k micro mem; VU0: 4k
if (vunum && THREAD_VU1) vu1Thread.WaitVU();
return *(u16*)&vu->Micro[addr];
}
template<int vunum>
static mem32_t __fastcall vuMicroRead32(u32 addr)
{
addr&=(vunum==0)?0xfff:0x3fff;
VURegs* vu=(vunum==0)?&VU0:&VU1;
// 32-bit VU micro memory read; syncs with the MTVU worker when VU1 is threaded.
template<int vunum> static mem32_t __fc vuMicroRead32(u32 addr) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff; // VU1: 16k micro mem; VU0: 4k
if (vunum && THREAD_VU1) vu1Thread.WaitVU();
return *(u32*)&vu->Micro[addr];
}
template<int vunum>
static void __fastcall vuMicroRead64(u32 addr,mem64_t* data)
{
addr&=(vunum==0)?0xfff:0x3fff;
VURegs* vu=(vunum==0)?&VU0:&VU1;
// 64-bit VU micro memory read (result via out-param); syncs with MTVU first.
template<int vunum> static void __fc vuMicroRead64(u32 addr,mem64_t* data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff; // VU1: 16k micro mem; VU0: 4k
if (vunum && THREAD_VU1) vu1Thread.WaitVU();
*data=*(u64*)&vu->Micro[addr];
}
template<int vunum>
static void __fastcall vuMicroRead128(u32 addr,mem128_t* data)
{
addr&=(vunum==0)?0xfff:0x3fff;
VURegs* vu=(vunum==0)?&VU0:&VU1;
// 128-bit (quadword) VU micro memory read; syncs with MTVU first.
template<int vunum> static void __fc vuMicroRead128(u32 addr,mem128_t* data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff; // VU1: 16k micro mem; VU0: 4k
if (vunum && THREAD_VU1) vu1Thread.WaitVU();
CopyQWC(data,&vu->Micro[addr]);
}
// Profiled VU writes: Happen very infrequently, with exception of BIOS initialization (at most twice per
// frame in-game, and usually none at all after BIOS), so cpu clears aren't much of a big deal.
template<int vunum>
static void __fastcall vuMicroWrite8(u32 addr,mem8_t data)
{
addr &= (vunum==0) ? 0xfff : 0x3fff;
VURegs& vu = (vunum==0) ? VU0 : VU1;
if (vu.Micro[addr]!=data)
{
ClearVuFunc<vunum>(addr&(~7), 8); // Clear before writing new data (clearing 8 bytes because an instruction is 8 bytes) (cottonvibes)
vu.Micro[addr]=data;
// VU micro memory writes (8/16/32/64/128-bit). When VU1 is threaded, the
// write is forwarded to the MTVU worker (WriteMicroMem) instead of touching
// VU1.Micro directly; otherwise the write happens in place, clearing cached
// translations first if the value actually changes.
template<int vunum> static void __fc vuMicroWrite8(u32 addr,mem8_t data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff; // VU1: 16k micro mem; VU0: 4k
if (vunum && THREAD_VU1) {
vu1Thread.WriteMicroMem(addr, &data, sizeof(u8));
return;
}
if (vu->Micro[addr]!=data) { // Clear before writing new data
ClearVuFunc<vunum>(addr, 8); //(clearing 8 bytes because an instruction is 8 bytes) (cottonvibes)
vu->Micro[addr] =data;
}
}
// 16-bit micro mem write; same threaded-forward / clear-on-change pattern.
template<int vunum> static void __fc vuMicroWrite16(u32 addr, mem16_t data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff;
if (vunum && THREAD_VU1) {
vu1Thread.WriteMicroMem(addr, &data, sizeof(u16));
return;
}
if (*(u16*)&vu->Micro[addr]!=data) {
ClearVuFunc<vunum>(addr, 8);
*(u16*)&vu->Micro[addr] =data;
}
}
// 32-bit micro mem write; same threaded-forward / clear-on-change pattern.
template<int vunum> static void __fc vuMicroWrite32(u32 addr, mem32_t data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff;
if (vunum && THREAD_VU1) {
vu1Thread.WriteMicroMem(addr, &data, sizeof(u32));
return;
}
if (*(u32*)&vu->Micro[addr]!=data) {
ClearVuFunc<vunum>(addr, 8);
*(u32*)&vu->Micro[addr] =data;
}
}
// 64-bit micro mem write; same threaded-forward / clear-on-change pattern.
template<int vunum> static void __fc vuMicroWrite64(u32 addr, const mem64_t* data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff;
if (vunum && THREAD_VU1) {
vu1Thread.WriteMicroMem(addr, (void*)data, sizeof(u64));
return;
}
if (*(u64*)&vu->Micro[addr]!=data[0]) {
ClearVuFunc<vunum>(addr, 8);
*(u64*)&vu->Micro[addr] =data[0];
}
}
// 128-bit micro mem write; clears a full 16 bytes (two instructions) on change.
template<int vunum> static void __fc vuMicroWrite128(u32 addr, const mem128_t* data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff;
if (vunum && THREAD_VU1) {
vu1Thread.WriteMicroMem(addr, (void*)data, sizeof(u128));
return;
}
if ((u128&)vu->Micro[addr]!=*data) {
ClearVuFunc<vunum>(addr, 16);
CopyQWC(&vu->Micro[addr],data);
}
}
template<int vunum>
static void __fastcall vuMicroWrite16(u32 addr,mem16_t data)
{
addr &= (vunum==0) ? 0xfff : 0x3fff;
VURegs& vu = (vunum==0) ? VU0 : VU1;
if (*(u16*)&vu.Micro[addr]!=data)
{
ClearVuFunc<vunum>(addr&(~7), 8);
*(u16*)&vu.Micro[addr]=data;
}
// VU Data Memory Reads...
// VU data memory reads (8/16/32/64/128-bit). When VU1 is threaded, waits
// for the MTVU worker to finish before reading so the EE sees settled data.
template<int vunum> static mem8_t __fc vuDataRead8(u32 addr) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff; // VU1: 16k data mem; VU0: 4k
if (vunum && THREAD_VU1) vu1Thread.WaitVU();
return vu->Mem[addr];
}
// 16-bit data mem read; same MTVU-sync pattern.
template<int vunum> static mem16_t __fc vuDataRead16(u32 addr) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff;
if (vunum && THREAD_VU1) vu1Thread.WaitVU();
return *(u16*)&vu->Mem[addr];
}
// 32-bit data mem read; same MTVU-sync pattern.
template<int vunum> static mem32_t __fc vuDataRead32(u32 addr) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff;
if (vunum && THREAD_VU1) vu1Thread.WaitVU();
return *(u32*)&vu->Mem[addr];
}
// 64-bit data mem read (out-param); same MTVU-sync pattern.
template<int vunum> static void __fc vuDataRead64(u32 addr, mem64_t* data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff;
if (vunum && THREAD_VU1) vu1Thread.WaitVU();
*data=*(u64*)&vu->Mem[addr];
}
// 128-bit (quadword) data mem read; same MTVU-sync pattern.
template<int vunum> static void __fc vuDataRead128(u32 addr, mem128_t* data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff;
if (vunum && THREAD_VU1) vu1Thread.WaitVU();
CopyQWC(data,&vu->Mem[addr]);
}
template<int vunum>
static void __fastcall vuMicroWrite32(u32 addr,mem32_t data)
{
addr &= (vunum==0) ? 0xfff : 0x3fff;
VURegs& vu = (vunum==0) ? VU0 : VU1;
if (*(u32*)&vu.Micro[addr]!=data)
{
ClearVuFunc<vunum>(addr&(~7), 8);
*(u32*)&vu.Micro[addr]=data;
// VU Data Memory Writes...
// VU data memory writes (8/16/32/64/128-bit). When VU1 is threaded the write
// is queued to the MTVU worker (WriteDataMem); otherwise it lands directly in
// vu->Mem. Unlike micro mem, no recompiler clear is needed for data memory.
template<int vunum> static void __fc vuDataWrite8(u32 addr, mem8_t data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff; // VU1: 16k data mem; VU0: 4k
if (vunum && THREAD_VU1) {
vu1Thread.WriteDataMem(addr, &data, sizeof(u8));
return;
}
vu->Mem[addr] = data;
}
// 16-bit data mem write; same threaded-forward pattern.
template<int vunum> static void __fc vuDataWrite16(u32 addr, mem16_t data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff;
if (vunum && THREAD_VU1) {
vu1Thread.WriteDataMem(addr, &data, sizeof(u16));
return;
}
*(u16*)&vu->Mem[addr] = data;
}
// 32-bit data mem write; same threaded-forward pattern.
template<int vunum> static void __fc vuDataWrite32(u32 addr, mem32_t data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff;
if (vunum && THREAD_VU1) {
vu1Thread.WriteDataMem(addr, &data, sizeof(u32));
return;
}
*(u32*)&vu->Mem[addr] = data;
}
// 64-bit data mem write; same threaded-forward pattern.
template<int vunum> static void __fc vuDataWrite64(u32 addr, const mem64_t* data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff;
if (vunum && THREAD_VU1) {
vu1Thread.WriteDataMem(addr, (void*)data, sizeof(u64));
return;
}
*(u64*)&vu->Mem[addr] = data[0];
}
// 128-bit (quadword) data mem write; same threaded-forward pattern.
template<int vunum> static void __fc vuDataWrite128(u32 addr, const mem128_t* data) {
VURegs* vu = vunum ? &VU1 : &VU0;
addr &= vunum ? 0x3fff: 0xfff;
if (vunum && THREAD_VU1) {
vu1Thread.WriteDataMem(addr, (void*)data, sizeof(u128));
return;
}
CopyQWC(&vu->Mem[addr], data);
}
template<int vunum>
static void __fastcall vuMicroWrite64(u32 addr,const mem64_t* data)
{
addr &= (vunum==0) ? 0xfff : 0x3fff;
VURegs& vu = (vunum==0) ? VU0 : VU1;
if (*(u64*)&vu.Micro[addr]!=data[0])
{
ClearVuFunc<vunum>(addr&(~7), 8);
*(u64*)&vu.Micro[addr]=data[0];
}
}
template<int vunum>
static void __fastcall vuMicroWrite128(u32 addr,const mem128_t* data)
{
addr &= (vunum==0) ? 0xfff : 0x3fff;
VURegs& vu = (vunum==0) ? VU0 : VU1;
if ((u128&)vu.Micro[addr] != *data)
{
ClearVuFunc<vunum>(addr&(~7), 16);
CopyQWC(&vu.Micro[addr],data);
}
}
void memSetPageAddr(u32 vaddr, u32 paddr)
{
@ -640,9 +704,8 @@ void eeMemoryReserve::Commit()
// Resets memory mappings, unmaps TLBs, reloads bios roms, etc.
void eeMemoryReserve::Reset()
{
if (!mmap_faultHandler)
{
pxAssume(Source_PageFault);
if(!mmap_faultHandler) {
pxAssert(Source_PageFault);
mmap_faultHandler = new mmap_PageFaultHandler();
}
@ -674,7 +737,8 @@ void eeMemoryReserve::Reset()
// Dynarec versions of VUs
vu0_micro_mem = vtlb_RegisterHandlerTempl1(vuMicro,0);
vu1_micro_mem = vtlb_RegisterHandlerTempl1(vuMicro,1);
vu1_data_mem = (1||THREAD_VU1) ? vtlb_RegisterHandlerTempl1(vuData,1) : NULL;
//////////////////////////////////////////////////////////////////////////////////////////
// IOP's "secret" Hardware Register mapping, accessible from the EE (and meant for use
// by debugging or BIOS only). The IOP's hw regs are divided into three main pages in

View File

@ -64,6 +64,7 @@ void Pcsx2Config::SpeedhackOptions::LoadSave( IniInterface& ini )
IniBitBool( WaitLoop );
IniBitBool( vuFlagHack );
IniBitBool( vuBlockHack );
IniBitBool( vuThread );
}
void Pcsx2Config::ProfilerOptions::LoadSave( IniInterface& ini )

View File

@ -21,6 +21,7 @@
#include "R3000A.h"
#include "VUmicro.h"
#include "COP0.h"
#include "MTVU.h"
#include "System/SysThreads.h"
#include "R5900Exceptions.h"
@ -54,6 +55,7 @@ extern SysMainMemory& GetVmMemory();
void cpuReset()
{
vu1Thread.WaitVU();
if (GetMTGS().IsOpen())
GetMTGS().WaitGS(); // GS better be done processing before we reset the EE, just in case.
@ -281,9 +283,6 @@ static __fi void _cpuTestInterrupts()
TESTINT(DMAC_GIF, gifInterrupt);
TESTINT(DMAC_SIF0, EEsif0Interrupt);
TESTINT(DMAC_SIF1, EEsif1Interrupt);
//extern void Gif_Execute();
//TESTINT(DMAC_GIF_UNIT, Gif_Execute);
// Profile-guided Optimization (sorta)
// The following ints are rarely called. Encasing them in a conditional

View File

@ -18,6 +18,7 @@
#include "SPR.h"
#include "VUmicro.h"
#include "MTVU.h"
extern void mfifoGIFtransfer(int);
@ -31,19 +32,23 @@ void sprInit()
{
}
static void TestClearVUs(u32 madr, u32 size)
static void TestClearVUs(u32 madr, u32 qwc)
{
if (madr >= 0x11000000)
{
if (madr < 0x11004000)
{
DbgCon.Warning("scratch pad clearing vu0");
CpuVU0->Clear(madr&0xfff, size);
CpuVU0->Clear(madr&0xfff, qwc * 16);
}
else if (madr >= 0x11008000 && madr < 0x1100c000)
{
DbgCon.Warning("scratch pad clearing vu1");
CpuVU1->Clear(madr&0x3fff, size);
if (THREAD_VU1) {
DevCon.Error("MTVU Warning: SPR Accessing VU1 Memory!!!");
vu1Thread.WaitVU();
}
CpuVU1->Clear(madr&0x3fff, qwc * 16);
}
}
}
@ -83,7 +88,7 @@ int _SPR0chain()
memcpy_qwc(pMem, &psSu128(spr0ch.sadr), partialqwc);
// clear VU mem also!
TestClearVUs(spr0ch.madr, partialqwc << 2); // Wtf is going on here? AFAIK, only VIF should affect VU micromem (cottonvibes)
TestClearVUs(spr0ch.madr, partialqwc);
spr0ch.madr += partialqwc << 4;
spr0ch.sadr += partialqwc << 4;
@ -135,7 +140,7 @@ void _SPR0interleave()
case NO_MFD:
case MFD_RESERVED:
// clear VU mem also!
TestClearVUs(spr0ch.madr, spr0ch.qwc << 2);
TestClearVUs(spr0ch.madr, spr0ch.qwc);
memcpy_qwc(pMem, &psSu128(spr0ch.sadr), spr0ch.qwc);
break;
}

View File

@ -21,6 +21,7 @@
#include "ps2/BiosTools.h"
#include "COP0.h"
#include "VUmicro.h"
#include "MTVU.h"
#include "Cache.h"
#include "AppConfig.h"
@ -150,10 +151,9 @@ static const uint MainMemorySizeInBytes =
SaveStateBase& SaveStateBase::FreezeMainMemory()
{
if (IsLoading())
PreLoadPrep();
else
m_memory->MakeRoomFor( m_idx + MainMemorySizeInBytes );
vu1Thread.WaitVU(); // Finish VU1 just in-case...
if (IsLoading()) PreLoadPrep();
else m_memory->MakeRoomFor( m_idx + MainMemorySizeInBytes );
// First Block - Memory Dumps
// ---------------------------
@ -175,8 +175,8 @@ SaveStateBase& SaveStateBase::FreezeMainMemory()
SaveStateBase& SaveStateBase::FreezeInternals()
{
if( IsLoading() )
PreLoadPrep();
vu1Thread.WaitVU(); // Finish VU1 just in-case...
if (IsLoading()) PreLoadPrep();
// Second Block - Various CPU Registers and States
// -----------------------------------------------

View File

@ -24,7 +24,7 @@
// the lower 16 bit value. IF the change is breaking of all compatibility with old
// states, increment the upper 16 bit value, and clear the lower 16 bits to 0.
static const u32 g_SaveVersion = (0x9A02 << 16) | 0x0000;
static const u32 g_SaveVersion = (0x9A03 << 16) | 0x0000;
// this function is meant to be used in the place of GSfreeze, and provides a safe layer
// between the GS saving function and the MTGS's needs. :)
@ -193,6 +193,7 @@ protected:
// Load/Save functions for the various components of our glorious emulator!
void mtvuFreeze();
void rcntFreeze();
void vuMicroFreeze();
void vif0Freeze();

View File

@ -154,7 +154,7 @@ protected:
// implemented by the provisioning interface.
extern SysCpuProviderPack& GetCpuProviders();
extern void SysLogMachineCaps(); // Detects cpu type and fills cpuInfo structs.
extern void SysLogMachineCaps(); // Detects cpu type and fills cpuInfo structs.
extern void SysClearExecutionCache(); // clears recompiled execution caches!
extern void SysOutOfMemory_EmergencyResponse(uptr blocksize);

View File

@ -19,10 +19,9 @@
#include "PrecompiledHeader.h"
#include "Common.h"
#include <cmath>
#include "VUmicro.h"
#include "MTVU.h"
#ifdef PCSX2_DEBUG
u32 vudump = 0;
@ -39,6 +38,10 @@ void vu1ResetRegs()
}
void vu1Finish() {
if (THREAD_VU1) {
if (VU0.VI[REG_VPU_STAT].UL & 0x100) DevCon.Error("MTVU: VU0.VI[REG_VPU_STAT].UL & 0x100");
return;
}
while (VU0.VI[REG_VPU_STAT].UL & 0x100) {
VUM_LOG("vu1ExecMicro > Stalling until current microprogram finishes");
CpuVU1->Execute(vu1RunCycles);
@ -47,10 +50,15 @@ void vu1Finish() {
void __fastcall vu1ExecMicro(u32 addr)
{
if (THREAD_VU1) {
vu1Thread.ExecuteVU(addr, vif1Regs.top, vif1Regs.itop);
vif1Regs.stat.VEW = false;
VU0.VI[REG_VPU_STAT].UL &= ~0xFF00;
return;
}
static int count = 0;
vu1Finish();
VUM_LOG("vu1ExecMicro %x", addr);
VUM_LOG("vu1ExecMicro %x (count=%d)", addr, count++);
VU0.VI[REG_VPU_STAT].UL &= ~0xFF00;

View File

@ -18,6 +18,7 @@
#include "Common.h"
#include "VUmicro.h"
#include "MTVU.h"
extern void _vuFlushAll(VURegs* VU);
@ -173,6 +174,14 @@ InterpVU1::InterpVU1()
IsInterpreter = true;
}
// Drain the MTVU worker before the interpreter is reset, so no vu1 program
// is still executing against state that is about to be reinitialized.
void InterpVU1::Reset() {
vu1Thread.WaitVU();
}
// Likewise, make sure the MTVU worker is idle before shutdown.
void InterpVU1::Shutdown() {
vu1Thread.WaitVU();
}
void InterpVU1::Step()
{
VU1.VI[REG_TPC].UL &= VU1_PROGMASK;

View File

@ -193,8 +193,8 @@ public:
wxString GetLongName() const { return L"VU1 Interpreter"; }
void Reserve() { }
void Shutdown() throw() { }
void Reset() { }
void Shutdown() throw();
void Reset();
void Step();
void Execute(u32 cycles);

View File

@ -54,7 +54,8 @@ void vuMemoryReserve::Reset()
pxAssert( VU0.Mem );
pxAssert( VU1.Mem );
memMapVUmicro();
// Below memMap is already called by "void eeMemoryReserve::Reset()"
//memMapVUmicro();
// === VU0 Initialization ===
memzero(VU0.ACC);

View File

@ -18,6 +18,7 @@
#include "VUops.h"
#include "GS.h"
#include "Gif_Unit.h"
#include "MTVU.h"
#include <cmath>
@ -2018,7 +2019,8 @@ static __ri void _vuEEXP(VURegs * VU) {
static __ri void _vuXITOP(VURegs * VU) {
if (_It_ == 0) return;
VU->VI[_It_].US[0] = VU->GetVifRegs().itop;
if (VU==&VU1 && THREAD_VU1) VU->VI[_It_].US[0] = vu1Thread.vifRegs.itop;
else VU->VI[_It_].US[0] = VU->GetVifRegs().itop;
}
static __ri void _vuXGKICK(VURegs * VU)
@ -2041,7 +2043,8 @@ static __ri void _vuXGKICK(VURegs * VU)
static __ri void _vuXTOP(VURegs * VU) {
if(_It_ == 0) return;
VU->VI[_It_].US[0] = (u16)VU->GetVifRegs().top;
if (VU==&VU1 && THREAD_VU1) VU->VI[_It_].US[0] = (u16)vu1Thread.vifRegs.top;
else VU->VI[_It_].US[0] = (u16)VU->GetVifRegs().top;
}
#define GET_VF0_FLAG(reg) (((reg)==0)?(1<<REG_VF0_FLAG):0)

View File

@ -20,6 +20,7 @@
#include "newVif.h"
#include "GS.h"
#include "Gif.h"
#include "MTVU.h"
__aligned16 vifStruct vif0, vif1;
@ -289,18 +290,18 @@ __fi void vif1STAT(u32 value) {
#define caseVif(x) (idx ? VIF1_##x : VIF0_##x)
_vifT __fi u32 vifRead32(u32 mem) {
vifStruct& vif = GetVifX;
vifStruct& vif = MTVU_VifX;
bool wait = idx && THREAD_VU1;
switch (mem) {
case caseVif(ROW0): return vif.MaskRow._u32[0];
case caseVif(ROW1): return vif.MaskRow._u32[1];
case caseVif(ROW2): return vif.MaskRow._u32[2];
case caseVif(ROW3): return vif.MaskRow._u32[3];
case caseVif(ROW0): if (wait) vu1Thread.WaitVU(); return vif.MaskRow._u32[0];
case caseVif(ROW1): if (wait) vu1Thread.WaitVU(); return vif.MaskRow._u32[1];
case caseVif(ROW2): if (wait) vu1Thread.WaitVU(); return vif.MaskRow._u32[2];
case caseVif(ROW3): if (wait) vu1Thread.WaitVU(); return vif.MaskRow._u32[3];
case caseVif(COL0): return vif.MaskCol._u32[0];
case caseVif(COL1): return vif.MaskCol._u32[1];
case caseVif(COL2): return vif.MaskCol._u32[2];
case caseVif(COL3): return vif.MaskCol._u32[3];
case caseVif(COL0): if (wait) vu1Thread.WaitVU(); return vif.MaskCol._u32[0];
case caseVif(COL1): if (wait) vu1Thread.WaitVU(); return vif.MaskCol._u32[1];
case caseVif(COL2): if (wait) vu1Thread.WaitVU(); return vif.MaskCol._u32[2];
case caseVif(COL3): if (wait) vu1Thread.WaitVU(); return vif.MaskCol._u32[3];
}
return psHu32(mem);
@ -334,15 +335,15 @@ _vifT __fi bool vifWrite32(u32 mem, u32 value) {
// standard register writes -- handled by caller.
break;
case caseVif(ROW0): vif.MaskRow._u32[0] = value; return false;
case caseVif(ROW1): vif.MaskRow._u32[1] = value; return false;
case caseVif(ROW2): vif.MaskRow._u32[2] = value; return false;
case caseVif(ROW3): vif.MaskRow._u32[3] = value; return false;
case caseVif(ROW0): vif.MaskRow._u32[0] = value; if (idx && THREAD_VU1) vu1Thread.WriteRow(vif); return false;
case caseVif(ROW1): vif.MaskRow._u32[1] = value; if (idx && THREAD_VU1) vu1Thread.WriteRow(vif); return false;
case caseVif(ROW2): vif.MaskRow._u32[2] = value; if (idx && THREAD_VU1) vu1Thread.WriteRow(vif); return false;
case caseVif(ROW3): vif.MaskRow._u32[3] = value; if (idx && THREAD_VU1) vu1Thread.WriteRow(vif); return false;
case caseVif(COL0): vif.MaskCol._u32[0] = value; return false;
case caseVif(COL1): vif.MaskCol._u32[1] = value; return false;
case caseVif(COL2): vif.MaskCol._u32[2] = value; return false;
case caseVif(COL3): vif.MaskCol._u32[3] = value; return false;
case caseVif(COL0): vif.MaskCol._u32[0] = value; if (idx && THREAD_VU1) vu1Thread.WriteCol(vif); return false;
case caseVif(COL1): vif.MaskCol._u32[1] = value; if (idx && THREAD_VU1) vu1Thread.WriteCol(vif); return false;
case caseVif(COL2): vif.MaskCol._u32[2] = value; if (idx && THREAD_VU1) vu1Thread.WriteCol(vif); return false;
case caseVif(COL3): vif.MaskCol._u32[3] = value; if (idx && THREAD_VU1) vu1Thread.WriteCol(vif); return false;
}
// fall-through case: issue standard writeback behavior.

View File

@ -106,6 +106,7 @@ union tVIF_STAT {
};
u32 _u32;
tVIF_STAT() {}
tVIF_STAT(u32 val) { _u32 = val; }
bool test(u32 flags) const { return !!(_u32 & flags); }
void set_flags (u32 flags) { _u32 |= flags; }
@ -145,6 +146,7 @@ union tVIF_ERR {
};
u32 _u32;
tVIF_ERR() {}
tVIF_ERR (u32 val) { _u32 = val; }
void write(u32 val) { _u32 = val; }
bool test (u32 flags) const { return !!(_u32 & flags); }
@ -221,6 +223,9 @@ static VIFregisters& vif1Regs = (VIFregisters&)eeHw[0x3C00];
#define vifXch (idx ? (vif1ch) : (vif0ch))
#define vifXRegs (idx ? (vif1Regs) : (vif0Regs))
#define MTVU_VifX (idx ? ((THREAD_VU1) ? vu1Thread.vif : vif1) : (vif0))
#define MTVU_VifXRegs (idx ? ((THREAD_VU1) ? vu1Thread.vifRegs : vif1Regs) : (vif0Regs))
extern void dmaVIF0();
extern void dmaVIF1();
extern void mfifoVIF1transfer(int qwc);

View File

@ -17,7 +17,6 @@
#include "Common.h"
#include "Vif_Dma.h"
#include "GS.h"
#include "Gif.h"
#include "Gif_Unit.h"
#include "VUmicro.h"
#include "newVif.h"

View File

@ -16,7 +16,6 @@
#include "PrecompiledHeader.h"
#include "Common.h"
#include "Vif.h"
#include "Gif.h"
#include "Gif_Unit.h"
#include "Vif_Dma.h"

View File

@ -16,11 +16,11 @@
#include "PrecompiledHeader.h"
#include "Common.h"
#include "GS.h"
#include "Gif.h"
#include "Gif_Unit.h"
#include "Vif_Dma.h"
#include "newVif.h"
#include "VUmicro.h"
#include "MTVU.h"
#define vifOp(vifCodeName) _vifT int __fastcall vifCodeName(int pass, const u32 *data)
#define pass1 if (pass == 0)
@ -36,7 +36,7 @@ vifOp(vifCode_Null);
static __fi void vifFlush(int idx) {
if (!idx) vif0FLUSH();
else vif1FLUSH();
else vif1FLUSH();
}
static __fi void vuExecMicro(int idx, u32 addr) {
@ -70,14 +70,16 @@ static __fi void vuExecMicro(int idx, u32 addr) {
}
}
if(!idx)startcycles = VU0.cycle;
else startcycles = VU1.cycle;
if (!idx) startcycles = VU0.cycle;
else startcycles = VU1.cycle;
if (!idx) vu0ExecMicro(addr);
else vu1ExecMicro(addr);
if(!idx) { g_vu0Cycles += (VU0.cycle-startcycles); g_packetsizeonvu = vif0.vifpacketsize; }
else { g_vu1Cycles += (VU1.cycle-startcycles); g_packetsizeonvu = vif1.vifpacketsize; }
if (!idx || !THREAD_VU1) {
if (!idx) { g_vu0Cycles += (VU0.cycle-startcycles); g_packetsizeonvu = vif0.vifpacketsize; }
else { g_vu1Cycles += (VU1.cycle-startcycles); g_packetsizeonvu = vif1.vifpacketsize; }
}
//DevCon.Warning("Ran VU%x, VU0 Cycles %x, VU1 Cycles %x, start %x cycle %x", idx, g_vu0Cycles, g_vu1Cycles, startcycles, VU1.cycle);
GetVifX.vifstalled = true;
}
@ -225,11 +227,14 @@ static __fi void _vifCode_MPG(int idx, u32 addr, const u32 *data, int size) {
VURegs& VUx = idx ? VU1 : VU0;
pxAssert(VUx.Micro > 0);
if (idx && THREAD_VU1) {
vu1Thread.WriteMicroMem(addr, (u8*)data, size*4);
return;
}
if (memcmp_mmx(VUx.Micro + addr, data, size*4)) {
// Clear VU memory before writing!
// (VUs expect size to be 32-bit scale, same as VIF's internal working sizes)
if (!idx) CpuVU0->Clear(addr, size);
else CpuVU1->Clear(addr, size);
if (!idx) CpuVU0->Clear(addr, size*4);
else CpuVU1->Clear(addr, size*4);
memcpy_fast(VUx.Micro + addr, data, size*4);
}
}
@ -387,7 +392,9 @@ vifOp(vifCode_STCol) {
return 1;
}
pass2 {
return _vifCode_STColRow<idx>(data, &vifX.MaskCol._u32[vifX.tag.addr]);
u32 ret = _vifCode_STColRow<idx>(data, &vifX.MaskCol._u32[vifX.tag.addr]);
if (idx && THREAD_VU1) { vu1Thread.WriteCol(vifX); }
return ret;
}
pass3 { VifCodeLog("STCol"); }
return 0;
@ -401,7 +408,9 @@ vifOp(vifCode_STRow) {
return 1;
}
pass2 {
return _vifCode_STColRow<idx>(data, &vifX.MaskRow._u32[vifX.tag.addr]);
u32 ret = _vifCode_STColRow<idx>(data, &vifX.MaskRow._u32[vifX.tag.addr]);
if (idx && THREAD_VU1) { vu1Thread.WriteRow(vifX); }
return ret;
}
pass3 { VifCodeLog("STRow"); }
return 0;
@ -447,7 +456,9 @@ vifOp(vifCode_Unpack) {
vifUnpackSetup<idx>(data);
return 1;
}
pass2 { return nVifUnpack<idx>((u8*)data); }
pass2 {
return nVifUnpack<idx>((u8*)data);
}
pass3 {
vifStruct& vifX = GetVifX;
VIFregisters& vifRegs = vifXRegs;

View File

@ -56,14 +56,18 @@ union tTRXREG {
// NOTE, if debugging vif stalls, use sega classics, spyro, gt4, and taito
struct vifStruct {
u128 MaskRow, MaskCol;
__aligned16 u128 MaskRow;
__aligned16 u128 MaskCol;
struct { // These must be together for MTVU
vifCode tag;
int cmd;
int cl;
u8 usn;
u8 StructEnd; // Address of this is used to calculate end of struct
};
vifCode tag;
int cmd;
int irq;
int cl;
int qwcalign;
u8 usn;
bool done;
bool vifstalled;
@ -72,17 +76,13 @@ struct vifStruct {
// GS registers used for calculating the size of the last local->host transfer initiated on the GS
// Transfer size calculation should be restricted to GS emulation in the future
tBITBLTBUF BITBLTBUF;
tTRXREG TRXREG;
u32 GSLastDownloadSize;
tTRXREG TRXREG;
u32 GSLastDownloadSize;
u8 irqoffset; // 32bit offset where next vif code is
u32 savedtag; // need this for backwards compat with save states
u8 irqoffset; // 32bit offset where next vif code is
u32 vifpacketsize;
u8 inprogress;
u32 lastcmd;
u8 dmamode;
u8 Unused_GifWaitState; // Only here for saved state compatibility
//u8 GifWaitState; // 0 = General PATH checking, 1 = Flush path 3, 2 == Wait for VU1
u8 inprogress;
u8 dmamode;
};
extern __aligned16 vifStruct vif0, vif1;

View File

@ -94,7 +94,6 @@ _vifT void vifTransferLoop(u32* &data) {
vifCmdHandler[idx][vifX.cmd & 0x7f](0, data);
data++; pSize--;
vifX.lastcmd = (vifXRegs.code >> 24) & 0x7f;
if (analyzeIbit<idx>(data, iBit)) break;
continue;
}

View File

@ -17,6 +17,7 @@
#include "Common.h"
#include "Vif.h"
#include "Vif_Dma.h"
#include "MTVU.h"
enum UnpackOffset {
OFFSET_X = 0,
@ -36,10 +37,10 @@ template< uint idx, uint mode, bool doMask >
static __ri void writeXYZW(u32 offnum, u32 &dest, u32 data) {
int n = 0;
vifStruct& vif = GetVifX;
vifStruct& vif = MTVU_VifX;
if (doMask) {
const VIFregisters& regs = vifXRegs;
const VIFregisters& regs = MTVU_VifXRegs;
switch (vif.cl) {
case 0: n = (regs.mask >> (offnum * 2)) & 0x3; break;
case 1: n = (regs.mask >> ( 8 + (offnum * 2))) & 0x3; break;

View File

@ -23,21 +23,24 @@
#endif
#include "GS.h"
#include "MTVU.h"
void AllThreeThreads::LoadWithCurrentTimes()
void AllPCSX2Threads::LoadWithCurrentTimes()
{
ee = GetCoreThread().GetCpuTime();
gs = GetMTGS().GetCpuTime();
vu = vu1Thread.GetCpuTime();
ui = GetThreadCpuTime();
update = GetCPUTicks();
}
AllThreeThreads AllThreeThreads::operator-( const AllThreeThreads& right ) const
AllPCSX2Threads AllPCSX2Threads::operator-( const AllPCSX2Threads& right ) const
{
AllThreeThreads retval;
AllPCSX2Threads retval;
retval.ee = ee - right.ee;
retval.gs = gs - right.gs;
retval.vu = vu - right.vu;
retval.ui = ui - right.ui;
retval.update = update - right.update;
@ -48,6 +51,7 @@ DefaultCpuUsageProvider::DefaultCpuUsageProvider()
{
m_pct_ee = 0;
m_pct_gs = 0;
m_pct_vu = 0;
m_pct_ui = 0;
m_writepos = 0;
@ -69,16 +73,17 @@ void DefaultCpuUsageProvider::UpdateStats()
{
// Measure deltas between the first and last positions in the ring buffer:
AllThreeThreads& newone( m_queue[m_writepos] );
AllPCSX2Threads& newone( m_queue[m_writepos] );
newone.LoadWithCurrentTimes();
m_writepos = (m_writepos+1) % QueueDepth;
const AllThreeThreads deltas( newone - m_queue[m_writepos] );
const AllPCSX2Threads deltas( newone - m_queue[m_writepos] );
// get the real time passed, scaled to the Thread's tick frequency.
u64 timepass = (deltas.update * GetThreadTicksPerSecond()) / GetTickFrequency();
m_pct_ee = (deltas.ee * 100) / timepass;
m_pct_gs = (deltas.gs * 100) / timepass;
m_pct_vu = (deltas.vu * 100) / timepass;
m_pct_ui = (deltas.ui * 100) / timepass;
}
@ -92,6 +97,11 @@ int DefaultCpuUsageProvider::GetGsPct() const
return m_pct_gs;
}
int DefaultCpuUsageProvider::GetVUPct() const
{
return m_pct_vu;
}
int DefaultCpuUsageProvider::GetGuiPct() const
{
return m_pct_ui;

View File

@ -27,6 +27,7 @@ public:
virtual void UpdateStats()=0;
virtual int GetEEcorePct() const=0;
virtual int GetGsPct() const=0;
virtual int GetVUPct() const=0;
virtual int GetGuiPct() const=0;
};
@ -44,16 +45,17 @@ public:
virtual void UpdateStats() { m_Implementation->UpdateStats(); }
virtual int GetEEcorePct() const { return m_Implementation->GetEEcorePct(); }
virtual int GetGsPct() const { return m_Implementation->GetGsPct(); }
virtual int GetVUPct() const { return m_Implementation->GetVUPct(); }
virtual int GetGuiPct() const { return m_Implementation->GetGuiPct(); }
};
struct AllThreeThreads
struct AllPCSX2Threads
{
u64 ee, gs, ui;
u64 ee, gs, vu, ui;
u64 update;
void LoadWithCurrentTimes();
AllThreeThreads operator-( const AllThreeThreads& right ) const;
AllPCSX2Threads operator-( const AllPCSX2Threads& right ) const;
};
class DefaultCpuUsageProvider :
@ -64,11 +66,12 @@ public:
static const uint QueueDepth = 4;
protected:
AllThreeThreads m_queue[QueueDepth];
AllPCSX2Threads m_queue[QueueDepth];
uint m_writepos;
u32 m_pct_ee;
u32 m_pct_gs;
u32 m_pct_vu;
u32 m_pct_ui;
public:
@ -80,6 +83,7 @@ public:
void UpdateStats();
int GetEEcorePct() const;
int GetGsPct() const;
int GetVUPct() const;
int GetGuiPct() const;
protected:

View File

@ -55,6 +55,7 @@ public:
void UpdateStats();
int GetEEcorePct() const;
int GetGsPct() const;
int GetVUPct() const;
int GetGuiPct() const;
};
@ -264,6 +265,11 @@ int CpuUsageProviderMSW::GetGsPct() const
return 0;
}
int CpuUsageProviderMSW::GetVUPct() const
{
return 0;
}
int CpuUsageProviderMSW::GetGuiPct() const
{
return 0;

View File

@ -533,10 +533,18 @@ void GSFrame::OnUpdateTitle( wxTimerEvent& evt )
}
FastFormatUnicode cpuUsage;
if( m_CpuUsage.IsImplemented() )
{
if (m_CpuUsage.IsImplemented()) {
m_CpuUsage.UpdateStats();
cpuUsage.Write( L" | EE: %3d%% | GS: %3d%% | UI: %3d%%", m_CpuUsage.GetEEcorePct(), m_CpuUsage.GetGsPct(), m_CpuUsage.GetGuiPct() );
if (THREAD_VU1) { // Display VU thread's usage
cpuUsage.Write(L" | EE: %3d%% | GS: %3d%% | VU: %3d%% | UI: %3d%%",
m_CpuUsage.GetEEcorePct(), m_CpuUsage.GetGsPct(),
m_CpuUsage.GetVUPct(), m_CpuUsage.GetGuiPct());
}
else {
cpuUsage.Write(L" | EE: %3d%% | GS: %3d%% | UI: %3d%%",
m_CpuUsage.GetEEcorePct(), m_CpuUsage.GetGsPct(),
m_CpuUsage.GetGuiPct());
}
}
const u64& smode2 = *(u64*)PS2GS_BASE(GS_SMODE2);

View File

@ -335,6 +335,7 @@ namespace Panels
pxCheckBox* m_check_fastCDVD;
pxCheckBox* m_check_vuFlagHack;
pxCheckBox* m_check_vuBlockHack;
pxCheckBox* m_check_vuThread;
public:
virtual ~SpeedHacksPanel() throw() {}

View File

@ -161,10 +161,13 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent )
wxPanelWithHelpers* vuHacksPanel = new wxPanelWithHelpers( right, wxVERTICAL, _("microVU Hacks") );
m_check_vuFlagHack = new pxCheckBox( vuHacksPanel, _("mVU Flag Hack"),
_("Good Speedup and High Compatibility; may cause garbage graphics, SPS, etc... [Recommended]") );
_("Good Speedup and High Compatibility; may cause bad graphics... [Recommended]" ) );
m_check_vuBlockHack = new pxCheckBox( vuHacksPanel, _("mVU Block Hack"),
_("Good Speedup and High Compatibility; may cause garbage graphics, SPS, etc...") );
_("Good Speedup and High Compatibility; may cause bad graphics, SPS, etc...") );
m_check_vuThread = new pxCheckBox( vuHacksPanel, _("MTVU (Multi-Threaded microVU1)"),
_("Good Speedup and High Compatibility; may cause hanging... [Recommended if 3+ cores]") );
m_check_vuFlagHack->SetToolTip( pxEt( "!ContextTip:Speedhacks:vuFlagHack",
L"Updates Status Flags only on blocks which will read them, instead of all the time. "
@ -176,6 +179,12 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent )
L"This should be pretty safe. It is unknown if this breaks any game..."
) );
m_check_vuThread->SetToolTip( pxEt( "!ContextTip:Speedhacks:vuThread",
L"Runs VU1 on its own thread (microVU1-only). Generally a speedup on CPUs with 3 or more cores. "
L"This is safe for most games, but a few games are incompatible and may hang. "
L"In the case of GS limited games, it may be a slowdown (especially on dual core CPUs)."
) );
// ------------------------------------------------------------------------
// All other hacks Section:
@ -226,7 +235,8 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent )
*vuHacksPanel += m_check_vuFlagHack;
*vuHacksPanel += m_check_vuBlockHack;
*vuHacksPanel += 57; // Aligns left and right boxes in default language and font size
*vuHacksPanel += m_check_vuThread;
//*vuHacksPanel += 57; // Aligns left and right boxes in default language and font size
*miscHacksPanel += m_check_intc;
*miscHacksPanel += m_check_waitloop;
@ -304,6 +314,7 @@ void Panels::SpeedHacksPanel::ApplyConfigToGui( AppConfig& configToApply, int fl
m_check_vuFlagHack ->SetValue(opts.vuFlagHack);
m_check_vuBlockHack ->SetValue(opts.vuBlockHack);
m_check_vuThread ->SetValue(opts.vuThread);
m_check_intc ->SetValue(opts.IntcStat);
m_check_waitloop ->SetValue(opts.WaitLoop);
m_check_fastCDVD ->SetValue(opts.fastCDVD);
@ -333,6 +344,7 @@ void Panels::SpeedHacksPanel::Apply()
opts.IntcStat = m_check_intc->GetValue();
opts.vuFlagHack = m_check_vuFlagHack->GetValue();
opts.vuBlockHack = m_check_vuBlockHack->GetValue();
opts.vuThread = m_check_vuThread->GetValue();
// If the user has a command line override specified, we need to disable it
// so that their changes take effect

View File

@ -17,6 +17,7 @@
#include "PrecompiledHeader.h"
#include "Common.h"
#include "Hardware.h"
#include "MTVU.h"
#include "IPU/IPUdma.h"
#include "ps2/HwInternal.h"
@ -91,7 +92,7 @@ __fi void setDmacStat(u32 num)
}
// Note: Dma addresses are guaranteed to be aligned to 16 bytes (128 bits)
__fi tDMA_TAG *SPRdmaGetAddr(u32 addr, bool write)
__fi tDMA_TAG* SPRdmaGetAddr(u32 addr, bool write)
{
// if (addr & 0xf) { DMA_LOG("*PCSX2*: DMA address not 128bit aligned: %8.8x", addr); }
@ -114,6 +115,10 @@ __fi tDMA_TAG *SPRdmaGetAddr(u32 addr, bool write)
}
else if ((addr >= 0x11004000) && (addr < 0x11010000))
{
if (THREAD_VU1) {
DevCon.Error("MTVU: SPRdmaGetAddr Accessing VU Memory!");
vu1Thread.WaitVU();
}
//Access for VU Memory
return (tDMA_TAG*)vtlb_GetPhyPtr(addr & 0x1FFFFFF0);
}

View File

@ -41,7 +41,7 @@
using namespace R5900;
using namespace vtlb_private;
#define verify pxAssume
#define verify pxAssert
namespace vtlb_private
{
@ -512,14 +512,14 @@ void vtlb_MapBlock(void* base, u32 start, u32 size, u32 blocksize)
{
verify(0==(start&VTLB_PAGE_MASK));
verify(0==(size&VTLB_PAGE_MASK) && size>0);
if (!blocksize)
if(!blocksize)
blocksize = size;
verify(0==(blocksize&VTLB_PAGE_MASK) && blocksize>0);
verify(0==(size%blocksize));
s32 baseint = (s32)base;
u32 end = start + (size - VTLB_PAGE_SIZE);
pxAssume( (end>>VTLB_PAGE_BITS) < ArraySize(vtlbdata.pmap) );
verify((end>>VTLB_PAGE_BITS) < ArraySize(vtlbdata.pmap));
while (start <= end)
{
@ -544,7 +544,7 @@ void vtlb_Mirror(u32 new_region,u32 start,u32 size)
verify(0==(size&VTLB_PAGE_MASK) && size>0);
u32 end = start + (size-VTLB_PAGE_SIZE);
pxAssume( (end>>VTLB_PAGE_BITS) < ArraySize(vtlbdata.pmap) );
verify((end>>VTLB_PAGE_BITS) < ArraySize(vtlbdata.pmap));
while(start <= end)
{

View File

@ -1322,6 +1322,14 @@
<Filter
Name="VU"
>
<File
RelativePath="..\..\MTVU.cpp"
>
</File>
<File
RelativePath="..\..\MTVU.h"
>
</File>
<File
RelativePath="..\..\VU.h"
>

View File

@ -99,6 +99,7 @@ void mVUreset(microVU& mVU, bool resetReserve) {
mVU.prog.x86start = z;
mVU.prog.x86ptr = z;
mVU.prog.x86end = z + ((mVU.cacheSize - mVUcacheSafeZone) * _1mb);
//memset(mVU.prog.x86start, 0xcc, mVU.cacheSize*_1mb);
for(u32 i = 0; i < (mVU.progSize / 2); i++) {
if(!mVU.prog.prog[i]) {
@ -279,7 +280,6 @@ _mVUt __fi void* mVUsearchProg(u32 startPC, uptr pState) {
//------------------------------------------------------------------
// recMicroVU0 / recMicroVU1
//------------------------------------------------------------------
recMicroVU0::recMicroVU0() { m_Idx = 0; IsInterpreter = false; }
recMicroVU1::recMicroVU1() { m_Idx = 1; IsInterpreter = false; }
void recMicroVU0::Vsync() throw() { mVUvsyncUpdate(microVU0); }
@ -290,8 +290,10 @@ void recMicroVU0::Reserve() {
mVUinit(microVU0, 0);
}
void recMicroVU1::Reserve() {
if (AtomicExchange(m_Reserved, 1) == 0)
if (AtomicExchange(m_Reserved, 1) == 0) {
mVUinit(microVU1, 1);
vu1Thread.InitThread();
}
}
void recMicroVU0::Shutdown() throw() {
@ -299,8 +301,10 @@ void recMicroVU0::Shutdown() throw() {
mVUclose(microVU0);
}
void recMicroVU1::Shutdown() throw() {
if (AtomicExchange(m_Reserved, 0) == 1)
if (AtomicExchange(m_Reserved, 0) == 1) {
vu1Thread.WaitVU();
mVUclose(microVU1);
}
}
void recMicroVU0::Reset() {
@ -309,6 +313,7 @@ void recMicroVU0::Reset() {
}
void recMicroVU1::Reset() {
if(!pxAssertDev(m_Reserved, "MicroVU1 CPU Provider has not been reserved prior to reset!")) return;
vu1Thread.WaitVU();
mVUreset(microVU1, true);
}
@ -325,8 +330,10 @@ void recMicroVU0::Execute(u32 cycles) {
void recMicroVU1::Execute(u32 cycles) {
pxAssert(m_Reserved); // please allocate me first! :|
if(!(VU0.VI[REG_VPU_STAT].UL & 0x100)) return;
((mVUrecCall)microVU1.startFunct)(VU1.VI[REG_TPC].UL, vu1RunCycles);
if (!THREAD_VU1) {
if(!(VU0.VI[REG_VPU_STAT].UL & 0x100)) return;
}
((mVUrecCall)microVU1.startFunct)(VU1.VI[REG_TPC].UL, cycles);
}
void recMicroVU0::Clear(u32 addr, u32 size) {

View File

@ -24,8 +24,8 @@ using namespace x86Emitter;
#include <algorithm>
#include "Common.h"
#include "VU.h"
#include "MTVU.h"
#include "GS.h"
#include "Gif.h"
#include "Gif_Unit.h"
#include "iR5900.h"
#include "R5900OpcodeTables.h"
@ -217,9 +217,11 @@ struct microVU {
VURegs& regs() const { return ::vuRegs[index]; }
__fi VIFregisters& getVifRegs() const { return regs().GetVifRegs(); }
__fi REG_VI& getVI(uint reg) const { return regs().VI[reg]; }
__fi VECTOR& getVF(uint reg) const { return regs().VF[reg]; }
__fi REG_VI& getVI(uint reg) const { return regs().VI[reg]; }
__fi VECTOR& getVF(uint reg) const { return regs().VF[reg]; }
__fi VIFregisters& getVifRegs() const {
return (index && THREAD_VU1) ? vu1Thread.vifRegs : regs().GetVifRegs();
}
};
// microVU rec structs

View File

@ -77,8 +77,10 @@ void mVUendProgram(mV, microFlagCycles* mFC, int isEbit) {
xMOV(ptr32[&mVU.regs().VI[REG_CLIP_FLAG].UL], gprT2);
if (isEbit || isVU1) { // Clear 'is busy' Flags
xAND(ptr32[&VU0.VI[REG_VPU_STAT].UL], (isVU1 ? ~0x100 : ~0x001)); // VBS0/VBS1 flag
xAND(ptr32[&mVU.getVifRegs().stat], ~VIF1_STAT_VEW); // Clear VU 'is busy' signal for vif
if (!mVU.index || !THREAD_VU1) {
xAND(ptr32[&VU0.VI[REG_VPU_STAT].UL], (isVU1 ? ~0x100 : ~0x001)); // VBS0/VBS1 flag
xAND(ptr32[&mVU.getVifRegs().stat], ~VIF1_STAT_VEW); // Clear VU 'is busy' signal for vif
}
}
if (isEbit != 2) { // Save PC, and Jump to Exit Point

View File

@ -199,7 +199,10 @@ _mVUt void mVUcleanUp() {
mVU.cycles = mVU.totalCycles - mVU.cycles;
mVU.regs().cycle += mVU.cycles;
cpuRegs.cycle += ((mVU.cycles < 3000) ? mVU.cycles : 3000) * EmuConfig.Speedhacks.VUCycleSteal;
if (!vuIndex || !THREAD_VU1) {
cpuRegs.cycle += std::min(mVU.cycles, 3000u) * EmuConfig.Speedhacks.VUCycleSteal;
}
//static int ax = 0; ax++;
//if (!(ax % 100000)) {
// for (u32 i = 0; i < (mVU.progSize / 2); i++) {

View File

@ -239,7 +239,14 @@ __fi void mVUrestoreRegs(microVU& mVU, bool fromMemory = false)
}
// Gets called by mVUaddrFix at execution-time
static void __fastcall mVUwarningRegAccess(u32 prog, u32 pc) { Console.Error("microVU0 Warning: Accessing VU1 Regs! [%04x] [%x]", pc, prog); }
static void __fc mVUwarningRegAccess(u32 prog, u32 pc) {
Console.Error("microVU0 Warning: Accessing VU1 Regs! [%04x] [%x]", pc, prog);
}
static void __fc mVUwaitMTVU() {
if (IsDevBuild) DevCon.WriteLn("microVU0: Waiting on VU1 thread to access VU1 regs!");
if (THREAD_VU1) vu1Thread.WaitVU();
}
// Transforms the Address in gprReg to valid VU0/VU1 Address
__fi void mVUaddrFix(mV, const x32& gprReg)
@ -249,28 +256,31 @@ __fi void mVUaddrFix(mV, const x32& gprReg)
xSHL(gprReg, 4);
}
else {
if (IsDevBuild && !isCOP2) mVUbackupRegs(mVU, true);
xTEST(gprReg, 0x400);
xForwardJNZ8 jmpA; // if addr & 0x4000, reads VU1's VF regs and VI regs
xAND(gprReg, 0xff); // if !(addr & 0x4000), wrap around
xForwardJump8 jmpB;
xForwardJump32 jmpB;
jmpA.SetTarget();
if (IsDevBuild && !isCOP2) { // Lets see which games do this!
xPUSH(gprT1); // Note: Kernel does it via COP2 to initialize VU1!
xPUSH(gprT2); // So we don't spam console, we'll only check micro-mode...
if (THREAD_VU1 || (IsDevBuild && !isCOP2)) {
mVUbackupRegs(mVU, true);
xPUSH(gprT1);
xPUSH(gprT2);
xPUSH(gprT3);
xMOV (gprT2, mVU.prog.cur->idx);
xMOV (gprT3, xPC);
xCALL(mVUwarningRegAccess);
if (IsDevBuild && !isCOP2) { // Lets see which games do this!
xMOV (gprT2, mVU.prog.cur->idx); // Note: Kernel does it via COP2 to initialize VU1!
xMOV (gprT3, xPC); // So we don't spam console, we'll only check micro-mode...
xCALL(mVUwarningRegAccess);
}
xCALL(mVUwaitMTVU);
xPOP (gprT3);
xPOP (gprT2);
xPOP (gprT1);
mVUrestoreRegs(mVU, true);
}
xAND(gprReg, 0x3f); // ToDo: theres a potential problem if VU0 overrides VU1's VF0/VI0 regs!
xADD(gprReg, (u128*)VU1.VF - (u128*)VU0.Mem);
jmpB.SetTarget();
xSHL(gprReg, 4); // multiply by 16 (shift left by 4)
if (IsDevBuild && !isCOP2) mVUrestoreRegs(mVU, true);
}
}

View File

@ -57,7 +57,6 @@ _vifT extern void dVifUnpack (const u8* data, bool isFill);
// nVifBlock - Ordered for Hashing; the 'num' field and the lower 6 bits of upkType are
// used as the hash bucket selector.
//
struct __aligned16 nVifBlock {
u8 num; // [00] Num Field
u8 upkType; // [01] Unpack Type [usn*1:mask*1:upk*4]
@ -74,6 +73,8 @@ struct __aligned16 nVifBlock {
#define _tParams nVifBlock, _hSize, _cmpS
struct nVifStruct {
__aligned16 nVifBlock block;
// Buffer for partial transfers (should always be first to ensure alignment)
// Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword)
__aligned16 u8 buffer[256*16];

View File

@ -19,30 +19,28 @@
#include "PrecompiledHeader.h"
#include "newVif_UnpackSSE.h"
#include "MTVU.h"
static __aligned16 nVifBlock _vBlock = {0};
void dVifReserve(int idx)
{
if (!nVif[idx].recReserve)
void dVifReserve(int idx) {
if(!nVif[idx].recReserve)
nVif[idx].recReserve = new RecompiledCodeReserve(pxsFmt(L"VIF%u Unpack Recompiler Cache", idx));
nVif[idx].recReserve->Reserve( nVif[idx].recReserveSizeMB * _1mb, idx ? HostMemoryMap::VIF1rec : HostMemoryMap::VIF0rec );
}
void dVifReset(int idx) {
pxAssertDev(nVif[idx].recReserve, "Dynamic VIF recompiler reserve must be created prior to VIF use or reset!");
if (!nVif[idx].vifBlocks)
if(!nVif[idx].vifBlocks)
nVif[idx].vifBlocks = new HashBucket<_tParams>();
else
nVif[idx].vifBlocks->clear();
nVif[idx].recReserve->Reset();
nVif[idx].numBlocks = 0;
nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
nVif[idx].numBlocks = 0;
nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
//memset(nVif[idx].recWritePtr, 0xcc, nVif[idx].recReserveSizeMB * _1mb);
}
void dVifClose(int idx) {
@ -74,7 +72,8 @@ VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlo
}
__fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const {
const vifStruct& vif = v.idx ? vif1 : vif0;
const int idx = v.idx;
const vifStruct& vif = MTVU_VifX;
u32 m0 = vB.mask;
u32 m1 = m0 & 0xaaaaaaaa;
@ -126,7 +125,8 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
}
void VifUnpackSSE_Dynarec::writeBackRow() const {
xMOVAPS(ptr128[&((v.idx ? vif1 : vif0).MaskRow)], xmmRow);
const int idx = v.idx;
xMOVAPS(ptr128[&(MTVU_VifX.MaskRow)], xmmRow);
DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
// ToDo: Do we need to write back to vifregs.rX too!? :/
}
@ -208,25 +208,25 @@ void VifUnpackSSE_Dynarec::CompileRoutine() {
}
_vifT static __fi u8* dVifsetVUptr(uint cl, uint wl, bool isFill) {
vifStruct& vif = GetVifX;
const VURegs& VU = vuRegs[idx];
const uint vuMemLimit = idx ? 0x4000 : 0x1000;
nVifStruct& v = nVif[idx];
vifStruct& vif = MTVU_VifX;
const VURegs& VU = vuRegs[idx];
const uint vuMemLimit = idx ? 0x4000 : 0x1000;
u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10));
u8* endmem = VU.Mem + vuMemLimit;
uint length = (_vBlock.num > 0) ? (_vBlock.num * 16) : 4096; // 0 = 256
u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10));
u8* endmem = VU.Mem + vuMemLimit;
uint length = (v.block.num > 0) ? (v.block.num * 16) : 4096; // 0 = 256
if (!isFill) {
// Accounting for skipping mode: Subtract the last skip cycle, since the skipped part of the run
// shouldn't count as wrapped data. Otherwise, a trailing skip can cause the emu to drop back
// to the interpreter. -- Refraction (test with MGS3)
uint skipSize = (cl - wl) * 16;
uint blocks = _vBlock.num / wl;
uint blocks = v.block.num / wl;
length += (blocks-1) * skipSize;
}
if ( (startmem+length) <= endmem ) {
if ((startmem + length) <= endmem) {
return startmem;
}
//Console.WriteLn("nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x", v.idx, vif.tag.addr, vif.tag.addr + (_vBlock.num * 16), _vBlock.num, wl, cl);
@ -245,12 +245,12 @@ static __fi void dVifRecLimit(int idx) {
}
}
_vifT static __fi bool dVifExecuteUnpack(const u8* data, bool isFill)
_vifT static __ri bool dVifExecuteUnpack(const u8* data, bool isFill)
{
const nVifStruct& v = nVif[idx];
VIFregisters& vifRegs = vifXRegs;
nVifStruct& v = nVif[idx];
VIFregisters& vifRegs = MTVU_VifXRegs;
if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) {
if (nVifBlock* b = v.vifBlocks->find(&v.block)) {
if (u8* dest = dVifsetVUptr<idx>(vifRegs.cycle.cl, vifRegs.cycle.wl, isFill)) {
//DevCon.WriteLn("Running Recompiled Block!");
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
@ -266,39 +266,37 @@ _vifT static __fi bool dVifExecuteUnpack(const u8* data, bool isFill)
_vifT __fi void dVifUnpack(const u8* data, bool isFill) {
const nVifStruct& v = nVif[idx];
vifStruct& vif = GetVifX;
VIFregisters& vifRegs = vifXRegs;
nVifStruct& v = nVif[idx];
vifStruct& vif = MTVU_VifX;
VIFregisters& vifRegs = MTVU_VifXRegs;
const u8 upkType = (vif.cmd & 0x1f) | (vif.usn << 5);
const int doMask = isFill? 1 : (vif.cmd & 0x10);
const u8 upkType = (vif.cmd & 0x1f) | (vif.usn << 5);
const int doMask = isFill? 1 : (vif.cmd & 0x10);
_vBlock.upkType = upkType;
_vBlock.num = (u8&)vifRegs.num;
_vBlock.mode = (u8&)vifRegs.mode;
_vBlock.cl = vifRegs.cycle.cl;
_vBlock.wl = vifRegs.cycle.wl;
v.block.upkType = upkType;
v.block.num = (u8&)vifRegs.num;
v.block.mode = (u8&)vifRegs.mode;
v.block.cl = vifRegs.cycle.cl;
v.block.wl = vifRegs.cycle.wl;
// Zero out the mask parameter if it's unused -- games leave random junk
// values here which cause false recblock cache misses.
_vBlock.mask = doMask ? vifRegs.mask : 0;
v.block.mask = doMask ? vifRegs.mask : 0;
//DevCon.WriteLn("nVif%d: Recompiled Block! [%d]", idx, nVif[idx].numBlocks++);
//DevCon.WriteLn(L"[num=% 3d][upkType=0x%02x][scl=%d][cl=%d][wl=%d][mode=%d][m=%d][mask=%s]",
// _vBlock.num, _vBlock.upkType, _vBlock.scl, _vBlock.cl, _vBlock.wl, _vBlock.mode,
// doMask >> 4, doMask ? wxsFormat( L"0x%08x", _vBlock.mask ).c_str() : L"ignored"
// v.Block.num, v.Block.upkType, v.Block.scl, v.Block.cl, v.Block.wl, v.Block.mode,
// doMask >> 4, doMask ? wxsFormat( L"0x%08x", v.Block.mask ).c_str() : L"ignored"
//);
if (dVifExecuteUnpack<idx>(data, isFill)) return;
xSetPtr(v.recWritePtr);
_vBlock.startPtr = (uptr)xGetAlignedCallTarget();
v.vifBlocks->add(_vBlock);
VifUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine();
v.block.startPtr = (uptr)xGetAlignedCallTarget();
v.vifBlocks->add(v.block);
VifUnpackSSE_Dynarec(v, v.block).CompileRoutine();
nVif[idx].recWritePtr = xGetPtr();
// [TODO] : Ideally we should test recompile buffer limits prior to each instruction,
// which would be safer and more memory efficient than using an 0.25 meg recEnd marker.
dVifRecLimit(idx);
// Run the block we just compiled. Various conditions may force us to still use

View File

@ -21,6 +21,7 @@
#include "Common.h"
#include "Vif_Dma.h"
#include "newVif.h"
#include "MTVU.h"
__aligned16 nVifStruct nVif[2];
@ -75,7 +76,7 @@ nVifStruct::nVifStruct()
vifBlocks = NULL;
numBlocks = 0;
recReserveSizeMB = 8;
recReserveSizeMB = 8;
}
void reserveNewVif(int idx)
@ -87,8 +88,8 @@ void resetNewVif(int idx)
// Safety Reset : Reassign all VIF structure info, just in case the VU1 pointers have
// changed for some reason.
nVif[idx].idx = idx;
nVif[idx].bSize = 0;
nVif[idx].idx = idx;
nVif[idx].bSize = 0;
memzero(nVif[idx].buffer);
if (newVifDynaRec) dVifReset(idx);
@ -106,8 +107,8 @@ static __fi u8* getVUptr(uint idx, int offset) {
_vifT int nVifUnpack(const u8* data) {
nVifStruct& v = nVif[idx];
vifStruct& vif = GetVifX;
nVifStruct& v = nVif[idx];
vifStruct& vif = GetVifX;
VIFregisters& vifRegs = vifXRegs;
const uint ret = aMin(vif.vifpacketsize, vif.tag.size);
@ -118,6 +119,7 @@ _vifT int nVifUnpack(const u8* data) {
if (v.bSize) { // Last transfer was partial
memcpy_fast(&v.buffer[v.bSize], data, size);
v.bSize += size;
size = v.bSize;
data = v.buffer;
vif.cl = 0;
@ -125,8 +127,11 @@ _vifT int nVifUnpack(const u8* data) {
if (!vifRegs.num) vifRegs.num = 256;
}
if (newVifDynaRec) dVifUnpack<idx>(data, isFill);
else _nVifUnpack(idx, data, vifRegs.mode, isFill);
if (!idx || !THREAD_VU1) {
if (newVifDynaRec) dVifUnpack<idx>(data, isFill);
else _nVifUnpack(idx, data, vifRegs.mode, isFill);
}
else vu1Thread.VifUnpack(vif, vifRegs, (u8*)data, size);
vif.tag.size = 0;
vif.cmd = 0;
@ -147,12 +152,10 @@ _vifT int nVifUnpack(const u8* data) {
// We can optimize the calculation either way as some games have big partial chunks (Guitar Hero).
// Skipping writes are easy, filling is a bit more complex, so for now until we can
// be sure its right (if it happens) it just prints debug stuff and processes the old way.
if(!isFill)
{
vifRegs.num -= (size / vSize);
if (!isFill) {
vifRegs.num -= (size / vSize);
}
else
{
else {
int guessedsize = (size / vSize);
guessedsize = vifRegs.num - (((guessedsize / vifRegs.cycle.cl) * (vifRegs.cycle.wl - vifRegs.cycle.cl)) + guessedsize);
@ -164,14 +167,11 @@ _vifT int nVifUnpack(const u8* data) {
if (vif.cl <= vifRegs.cycle.cl) size -= vSize;
else if (vif.cl == vifRegs.cycle.wl) vif.cl = 0;
}
else
{
else {
size -= vSize;
if (vif.cl >= vifRegs.cycle.wl) vif.cl = 0;
}
}
DevCon.Warning("Fill!! Partial num left = %x, guessed %x", vifRegs.num, guessedsize);
}
}
@ -236,8 +236,8 @@ static void setMasks(const vifStruct& vif, const VIFregisters& v) {
template< int idx, bool doMode, bool isFill >
__ri void __fastcall _nVifUnpackLoop(const u8* data) {
vifStruct& vif = GetVifX;
VIFregisters& vifRegs = vifXRegs;
vifStruct& vif = MTVU_VifX;
VIFregisters& vifRegs = MTVU_VifXRegs;
// skipSize used for skipping writes only
const int skipSize = (vifRegs.cycle.cl - vifRegs.cycle.wl) * 16;
@ -253,8 +253,8 @@ __ri void __fastcall _nVifUnpackLoop(const u8* data) {
//uint vn = (vif.cmd >> 2) & 0x3;
//uint vSize = ((32 >> vl) * (vn+1)) / 8; // size of data (in bytes) used for each write cycle
const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + upkNum) * (4*1) ];
const UNPACKFUNCTYPE ft = VIFfuncTable[idx][doMode ? vifRegs.mode : 0][ ((usn*2*16) + upkNum) ];
const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + upkNum) * (4*1) ];
const UNPACKFUNCTYPE ft = VIFfuncTable[idx][doMode ? vifRegs.mode : 0][ ((usn*2*16) + upkNum) ];
pxAssume (vif.cl == 0);
pxAssume (vifRegs.cycle.wl > 0);

View File

@ -26,7 +26,6 @@
#include "sVU_Micro.h"
#include "sVU_Debug.h"
#include "sVU_zerorec.h"
#include "Gif.h"
#include "Gif_Unit.h"
using namespace x86Emitter;

View File

@ -32,6 +32,7 @@
#include "GS.h"
#include "Gif.h"
#include "VU.h"
#include "MTVU.h"
#include "R5900.h"
#include "iR5900.h"
@ -456,15 +457,14 @@ void SuperVUReset(int vuindex)
s_recVUPtr[vuindex] = *s_recVUMem[vuindex];
}
// clear the block and any joining blocks
// clear the block and any joining blocks (size given in bytes)
static void __fastcall SuperVUClear(u32 startpc, u32 size, int vuindex)
{
vector<VuFunctionHeader::RANGE>::iterator itrange;
list<VuFunctionHeader*>::iterator it = s_listVUHeaders[vuindex].begin();
u32 endpc = startpc + ((size * 4 + 7) & ~7); // Adding this code to ensure size is always a multiple of 8, it can be simplified to startpc+size if size is always a multiple of 8 (cottonvibes)
u32 endpc = startpc + ((size + 7) & ~7); // Ensure size is a multiple of u64 (round up)
while (it != s_listVUHeaders[vuindex].end())
{
// for every fn, check if it has code in the range
for(itrange = (*it)->ranges.begin(); itrange != (*it)->ranges.end(); itrange++)
{
@ -4641,11 +4641,13 @@ void recSuperVU1::Reserve()
void recSuperVU1::Shutdown() throw()
{
vu1Thread.WaitVU();
SuperVUDestroy( 1 );
}
void recSuperVU1::Reset()
{
vu1Thread.WaitVU();
SuperVUReset( 1 );
}