diff --git a/pcsx2/FiFo.cpp b/pcsx2/FiFo.cpp
index 2b0a022404..1523702820 100644
--- a/pcsx2/FiFo.cpp
+++ b/pcsx2/FiFo.cpp
@@ -173,6 +173,7 @@ void __fastcall WriteFIFO_page_6(u32 mem, const mem128_t *value)
 	psHu64(0x6000) = value[0];
 	psHu64(0x6008) = value[1];
 
+	FreezeRegs(1);
 	if( mtgsThread != NULL )
 	{
 		const uint count = mtgsThread->PrepDataPacket( GIF_PATH_3, value, 1 );
@@ -184,10 +185,9 @@ void __fastcall WriteFIFO_page_6(u32 mem, const mem128_t *value)
 	}
 	else
 	{
-		FreezeRegs(1);
 		GSGIFTRANSFER3((u32*)value, 1);
-		FreezeRegs(0);
 	}
+	FreezeRegs(0);
 }
 		
 void __fastcall WriteFIFO_page_7(u32 mem, const mem128_t *value)
diff --git a/pcsx2/GS.h b/pcsx2/GS.h
index 39c2221844..7fd14d8561 100644
--- a/pcsx2/GS.h
+++ b/pcsx2/GS.h
@@ -176,9 +176,14 @@ protected:
 	uint m_RingPos;		// cur pos gs is reading from
 	uint m_WritePos;	// cur pos ee thread is writing to
 
-	Threading::Semaphore m_post_InitDone;	// used to regulate thread startup and gsInit
-	Threading::MutexLock m_lock_RingRestart;
+	// used to regulate thread startup and gsInit
+	Threading::Semaphore m_post_InitDone;
 
+	Threading::MutexLock m_lock_RingRestart;
+	
+	// used to keep multiple threads from sending packets to the ringbuffer concurrently.
+	Threading::MutexLock m_PacketLocker;
+	
 	// Used to delay the sending of events.  Performance is better if the ringbuffer
 	// has more than one command in it when the thread is kicked.
 	int m_CopyCommandTally;
diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp
index 4b18c5d001..4463e75b07 100644
--- a/pcsx2/MTGS.cpp
+++ b/pcsx2/MTGS.cpp
@@ -197,6 +197,7 @@ mtgsThreadObject::mtgsThreadObject() :
 
 ,	m_post_InitDone()
 ,	m_lock_RingRestart()
+,	m_PacketLocker( true )		// true - makes it a recursive lock
 
 ,	m_CopyCommandTally( 0 )
 ,	m_CopyDataTally( 0 )
@@ -712,14 +713,12 @@ int mtgsThreadObject::Callback()
 void mtgsThreadObject::WaitGS()
 {
 	// Freeze registers because some kernel code likes to destroy them
-	FreezeRegs(1);
 	SetEvent();
 	while( volatize(m_RingPos) != volatize(m_WritePos) )
 	{
 		Timeslice();
 		//SpinWait();
 	}
-	FreezeRegs(0);
 }
 
 // Sets the gsEvent flag and releases a timeslice.
@@ -733,8 +732,6 @@ void mtgsThreadObject::SetEvent()
 
 void mtgsThreadObject::PrepEventWait()
 {
-	// Freeze registers because some kernel code likes to destroy them
-	FreezeRegs(1);
 	//Console::Notice( "MTGS Stall!  EE waits for nothing! ... except your GPU sometimes." );
 	SetEvent();
 	Timeslice();
@@ -742,7 +739,6 @@ void mtgsThreadObject::PrepEventWait()
 
 void mtgsThreadObject::PostEventWait() const
 {
-	FreezeRegs(0);
 }
 
 u8* mtgsThreadObject::GetDataPacketPtr() const
@@ -784,29 +780,29 @@ void mtgsThreadObject::SendDataPacket()
 
 	m_packet_size = 0;
 
-	if( m_RingBufferIsBusy ) return;
-
-	// The ringbuffer is current in a resting state, so if enough copies have
-	// queued up then go ahead and initiate the GS thread..
-	
-	// Optimization notes:  What we're doing here is initiating a "burst" mode on
-	// the thread, which improves its cache hit performance and makes it more friendly
-	// to other threads in Pcsx2 and such.  Primary is the Command Tally, and then a 
-	// secondary data size threshold for games that do lots of texture swizzling.
-	
-	// 16 was the best value I found so far.
-	// tested values:
-	//  24 - very slow on HT machines (+5% drop in fps)
-	//  8 - roughly 2% slower on HT machines.
-
-	m_CopyDataTally += m_packet_size;
-	if( ( m_CopyDataTally > 0x8000 ) || ( ++m_CopyCommandTally > 16 ) )
+	if( !m_RingBufferIsBusy )
 	{
-		FreezeRegs(1);
-		//Console::Status( "MTGS Kick! DataSize : 0x%5.8x, CommandTally : %d", m_CopyDataTally, m_CopyCommandTally );
-		SetEvent();
-		FreezeRegs(0);
+		// The ringbuffer is currently in a resting state, so if enough copies have
+		// queued up then go ahead and initiate the GS thread..
+		
+		// Optimization notes:  What we're doing here is initiating a "burst" mode on
+		// the thread, which improves its cache hit performance and makes it more friendly
+		// to other threads in Pcsx2 and such.  Primary is the Command Tally, and then a 
+		// secondary data size threshold for games that do lots of texture swizzling.
+		
+		// 16 was the best value I found so far.
+		// tested values:
+		//  24 - very slow on HT machines (+5% drop in fps)
+		//  8 - roughly 2% slower on HT machines.
+
+		m_CopyDataTally += m_packet_size;
+		if( ( m_CopyDataTally > 0x8000 ) || ( ++m_CopyCommandTally > 16 ) )
+		{
+			//Console::Status( "MTGS Kick! DataSize : 0x%5.8x, CommandTally : %d", m_CopyDataTally, m_CopyCommandTally );
+			SetEvent();
+		}
 	}
+	//m_PacketLocker.Unlock();
 }
 
 int mtgsThreadObject::PrepDataPacket( GIF_PATH pathidx, const u64* srcdata, u32 size )
@@ -840,6 +836,8 @@ static u32 GSRingBufCopySz = 0;
 //  size - size of the packet data, in smd128's
 int mtgsThreadObject::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size )
 {
+	//m_PacketLocker.Lock();
+
 #ifdef PCSX2_GSRING_TX_STATS
 	ringtx_s += size;
 	ringtx_s_ulg += size&0x7F;
@@ -1064,6 +1062,8 @@ __forceinline void mtgsThreadObject::_FinishSimplePacket( uint future_writepos )
 
 void mtgsThreadObject::SendSimplePacket( GS_RINGTYPE type, int data0, int data1, int data2 )
 {
+	//ScopedLock locker( m_PacketLocker );
+
 	const uint thefuture = _PrepForSimplePacket();
 	PacketTagType& tag = (PacketTagType&)m_RingBuffer[m_WritePos];
 
@@ -1072,11 +1072,13 @@ void mtgsThreadObject::SendSimplePacket( GS_RINGTYPE type, int data0, int data1,
 	tag.data[1] = data1;
 	tag.data[2] = data2;
 
-	_FinishSimplePacket( thefuture );	
+	_FinishSimplePacket( thefuture );
 }
 
 void mtgsThreadObject::SendPointerPacket( GS_RINGTYPE type, u32 data0, void* data1 )
 {
+	//ScopedLock locker( m_PacketLocker );
+
 	const uint thefuture = _PrepForSimplePacket();
 	PacketTagType& tag = (PacketTagType&)m_RingBuffer[m_WritePos];
 
diff --git a/pcsx2/ThreadTools.cpp b/pcsx2/ThreadTools.cpp
index 665ad569bc..dcb0bead91 100644
--- a/pcsx2/ThreadTools.cpp
+++ b/pcsx2/ThreadTools.cpp
@@ -38,6 +38,7 @@ namespace Threading
 
 	void Thread::Start()
 	{
+		m_terminated = false;
 		if( pthread_create( &m_thread, NULL, _internal_callback, this ) != 0 )
 			throw Exception::ThreadCreationError();
 	}
@@ -135,6 +136,26 @@ namespace Threading
 		err = pthread_mutex_init( &mutex, NULL );
 	}
 
+	// Initializes the mutex.  When isRecursive is true the mutex is created with the
+	// PTHREAD_MUTEX_RECURSIVE type; otherwise default attributes are used.
+	MutexLock::MutexLock( bool isRecursive )
+	{
+		pthread_mutexattr_t attr;
+		// Use the attribute object only when it was initialized successfully.
+		const bool haveAttr = isRecursive && pthread_mutexattr_init( &attr ) == 0;
+		// If the recursive type cannot be set, fall back to default attributes
+		// rather than handing pthread_mutex_init a half-configured object.
+		const bool useAttr = haveAttr &&
+			pthread_mutexattr_settype( &attr, PTHREAD_MUTEX_RECURSIVE ) == 0;
+
+		// NOTE(review): the init result is still unchecked, as in the original code.
+		pthread_mutex_init( &mutex, useAttr ? &attr : NULL );
+
+		// POSIX allows destroying the attribute object once the mutex is initialized.
+		if( haveAttr )
+			pthread_mutexattr_destroy( &attr );
+	}
+
 	MutexLock::~MutexLock()
 	{
 		pthread_mutex_destroy( &mutex );
@@ -149,7 +170,7 @@ namespace Threading
 	{
 		pthread_mutex_unlock( &mutex );
 	}
-
+	
 	//////////////////////////////////////////////////////////////////////
 	// define some overloads for InterlockedExchanges
 	// for commonly used types, like u32 and s32.
diff --git a/pcsx2/Threading.h b/pcsx2/Threading.h
index f75d3e4262..f731676757 100644
--- a/pcsx2/Threading.h
+++ b/pcsx2/Threading.h
@@ -61,6 +61,7 @@ namespace Threading
 		pthread_mutex_t mutex;
 
 		MutexLock();
+		MutexLock( bool isRecursive );
 		~MutexLock();
 
 		void Lock();
diff --git a/pcsx2/VU0.cpp b/pcsx2/VU0.cpp
index a017527406..823d4827a2 100644
--- a/pcsx2/VU0.cpp
+++ b/pcsx2/VU0.cpp
@@ -178,8 +178,7 @@ void CTC2() {
 			break;
 		case REG_CMSAR1: // REG_CMSAR1
 			if (!(VU0.VI[REG_VPU_STAT].UL & 0x100) ) {
-				VU1.VI[REG_TPC].UL = cpuRegs.GPR.r[_Rt_].US[0];
-				vu1ExecMicro(VU1.VI[REG_TPC].UL);	// Execute VU1 Micro SubRoutine
+				vu1ExecMicro(cpuRegs.GPR.r[_Rt_].US[0]);	// Execute VU1 Micro SubRoutine
 			}
 			break;
 		default:
diff --git a/pcsx2/VifDma.cpp b/pcsx2/VifDma.cpp
index 9ae6a768e7..797be0c452 100644
--- a/pcsx2/VifDma.cpp
+++ b/pcsx2/VifDma.cpp
@@ -1896,6 +1896,7 @@ static int __fastcall Vif1TransDirectHL(u32 *data)
 			}
 		}
 
+		FreezeRegs(1);
 		if (mtgsThread != NULL)
 		{
 			// copy 16 bytes the fast way:
@@ -1910,10 +1911,9 @@ static int __fastcall Vif1TransDirectHL(u32 *data)
 		}
 		else
 		{
-			FreezeRegs(1);
 			GSGIFTRANSFER2((u32*)splittransfer[0], 1);
-			FreezeRegs(0);
 		}
+		FreezeRegs(0);
 
 		if (vif1.tag.size == 0) vif1.cmd = 0;
 		splitptr = 0;
@@ -1945,6 +1945,7 @@ static int __fastcall Vif1TransDirectHL(u32 *data)
 
 	//TODO: ret is guaranteed to be qword aligned ?
 
+	FreezeRegs(1);
 	if (mtgsThread != NULL)
 	{
 		//unaligned copy.VIF handling is -very- messy, so i'l use this code til i fix it :)
@@ -1955,10 +1956,9 @@ static int __fastcall Vif1TransDirectHL(u32 *data)
 	}
 	else
 	{
-		FreezeRegs(1);
 		GSGIFTRANSFER2(data, (ret >> 2));
-		FreezeRegs(0);
 	}
+	FreezeRegs(0);
 
 	return ret;
 }