diff --git a/common/PS2Etypes.h b/common/PS2Etypes.h
index d980530d66..4c62b47f0f 100644
--- a/common/PS2Etypes.h
+++ b/common/PS2Etypes.h
@@ -156,6 +156,56 @@ typedef s32 sptr;
 #endif
 #endif
 
+// A rough-and-ready cross platform 128-bit datatype, Non-SSE style.
+#ifdef __cplusplus
+struct u128
+{
+	u64 lo;
+	u64 hi;
+
+	// Implicit conversion from u64: src fills the low half, the high half is zeroed.
+	u128( u64 src ) :
+		lo( src )
+	,	hi( 0 ) {}
+
+	// Implicit conversion from u32: src is widened into the low half, the high half is zeroed.
+	u128( u32 src ) :
+		lo( src )
+	,	hi( 0 ) {}
+};
+
+struct s128
+{
+	s64 lo;
+	s64 hi;
+
+	// Implicit conversion from s64, sign-extended into the high half.
+	s128( s64 src ) :
+		lo( src )
+	,	hi( (src < 0) ? -1 : 0 ) {}
+
+	// Implicit conversion from s32, sign-extended into the high half.
+	s128( s32 src ) :
+		lo( src )
+	,	hi( (src < 0) ? -1 : 0 ) {}
+};
+
+#else
+
+typedef struct _u128_t	// struct, not union: lo and hi must occupy separate storage
+{
+	u64 lo;
+	u64 hi;
+} u128;
+
+typedef struct _s128_t	// struct, not union: lo and hi must occupy separate storage
+{
+	s64 lo;
+	s64 hi;
+} s128;
+
+#endif
+
 typedef struct {
 	int size;
 	s8 *data;
diff --git a/pcsx2/Cache.h b/pcsx2/Cache.h
index da4e542335..62d513430a 100644
--- a/pcsx2/Cache.h
+++ b/pcsx2/Cache.h
@@ -26,14 +26,14 @@ struct _u8bit_128 {
 
 };
 
-struct u128 {
+struct u8bit_128 {
 	_u8bit_128 b8;
 
 };
 
 struct _cacheS {
 	u32 tag[2];
-	u128 data[2][4];
+	u8bit_128 data[2][4];
 };
 
 extern _cacheS pCache[64];
diff --git a/pcsx2/Exceptions.h b/pcsx2/Exceptions.h
index 411500f0ce..bd1a66007f 100644
--- a/pcsx2/Exceptions.h
+++ b/pcsx2/Exceptions.h
@@ -104,7 +104,7 @@ namespace Exception
 			RuntimeError( msg ) {}
 	};
 
-	// This exception  exception thrown any time an operation is attempted when an object
+	// This exception is thrown any time an operation is attempted when an object
 	// is in an uninitialized state.
 	class InvalidOperation : public LogicError
 	{
@@ -114,6 +114,16 @@ namespace Exception
 			LogicError( msg ) {}
 	};
 
+	// Reported for out-of-bounds indexing of the SafeArray type.  Keep those array
+	// indexers in bounds, or you'll be seeing these.
+	class IndexBoundsFault : public LogicError
+	{
+	public:
+		virtual ~IndexBoundsFault() throw() {}
+		explicit IndexBoundsFault( const std::string& msg="Array index is outside the bounds of an array." ) :
+			LogicError( msg ) {}
+	};
+
 	class HardwareDeficiency : public RuntimeError
 	{
 	public:
diff --git a/pcsx2/FiFo.cpp b/pcsx2/FiFo.cpp
index 739320c833..e69338a09b 100644
--- a/pcsx2/FiFo.cpp
+++ b/pcsx2/FiFo.cpp
@@ -120,8 +120,8 @@ void WriteFIFO(u32 mem, const u64 *value) {
 
 		if( mtgsThread != NULL )
 		{
-			const uint count = mtgsThread->PrepDataPacket( GIF_PATH_3, value, 16 );
-			jASSUME( count == 16 );
+			const uint count = mtgsThread->PrepDataPacket( GIF_PATH_3, value, 1 );
+			jASSUME( count == 1 );
 			u64* data = (u64*)mtgsThread->GetDataPacketPtr();
 			data[0] = value[0];
 			data[1] = value[1];
diff --git a/pcsx2/GS.cpp b/pcsx2/GS.cpp
index 8fef4146ea..c4673385e0 100644
--- a/pcsx2/GS.cpp
+++ b/pcsx2/GS.cpp
@@ -552,7 +552,7 @@ static void WRITERING_DMA(u32 *pMem, u32 qwc)
 	if( mtgsThread != NULL )
 	{ 
 		int sizetoread = (qwc)<<4; 
-		sizetoread = mtgsThread->PrepDataPacket( GIF_PATH_3, pMem, sizetoread );
+		sizetoread = mtgsThread->PrepDataPacket( GIF_PATH_3, pMem, qwc );
 		u8* pgsmem = mtgsThread->GetDataPacketPtr();
 
 		/* check if page of endmem is valid (dark cloud2) */
@@ -579,7 +579,7 @@ static void WRITERING_DMA(u32 *pMem, u32 qwc)
 		}
 		else
 #endif
-		memcpy_aligned(pgsmem, pMem, sizetoread); 
+		memcpy_aligned(pgsmem, pMem, sizetoread<<4); 
 		
 		mtgsThread->SendDataPacket();
 	} 
diff --git a/pcsx2/GS.h b/pcsx2/GS.h
index d60afa4e06..d1c2e29149 100644
--- a/pcsx2/GS.h
+++ b/pcsx2/GS.h
@@ -112,7 +112,9 @@ struct GIFPath
 /////////////////////////////////////////////////////////////////////////////
 // MTGS Threaded Class Declaration
 
-#define MTGS_RINGBUFFERSIZE	0x00300000 // 3Mb
+// Uncomment this to enable the MTGS debug stack, which tracks to ensure reads
+// and writes stay synchronized.  Warning: the debug stack is VERY slow.
+//#define RINGBUF_DEBUG_STACK
 
 enum GIF_PATH
 {
@@ -143,18 +145,29 @@ enum GS_RINGTYPE
 ,	GS_RINGTYPE_STARTTIME	// special case for min==max fps frameskip settings
 };
 
-
 class mtgsThreadObject : public Threading::Thread
 {
 	friend class SaveState;
 
 protected:
-	// note: when g_pGSRingPos == g_pGSWritePos, the fifo is empty
-	const u8* m_RingPos;		// cur pos gs is reading from
-	u8* m_WritePos;				// cur pos ee thread is writing to
-	const u8* const m_RingBufferEnd;	// pointer to the end of the ringbuffer (used to detect buffer wraps)
+	// Size of the ringbuffer as a power of 2 -- size is a multiple of simd128s.
+	// (actual size is 1<<m_RingBufferSizeFactor simd vectors [128-bit values])
+	// A value of 17 is a 2meg ring buffer.  16 would be 1 meg, and 18 would be 4 megs.
+	static const uint m_RingBufferSizeFactor = 17;
 
-	Threading::WaitEvent m_wait_InitDone;	// used to regulate thread startup and gsInit
+	// size of the ringbuffer in simd128's.
+	static const uint m_RingBufferSize = 1<<m_RingBufferSizeFactor;
+
+	// Mask to apply to ring buffer indices to wrap the pointer from end to
+	// start (the wrapping is what makes it a ringbuffer, yo!)
+	static const uint m_RingBufferMask = m_RingBufferSize - 1;
+
+protected:
+	// note: when m_RingPos == m_WritePos, the fifo is empty
+	uint m_RingPos;		// cur pos gs is reading from
+	uint m_WritePos;	// cur pos ee thread is writing to
+
+	Threading::Semaphore m_post_InitDone;	// used to regulate thread startup and gsInit
 	Threading::MutexLock m_lock_RingRestart;
 
 	// Used to delay the sending of events.  Performance is better if the ringbuffer
@@ -167,20 +180,23 @@ protected:
 	// Only one data packet can be constructed and uploaded at a time.
 
 	uint m_packet_size;		// size of the packet (data only, ie. not including the 16 byte command!)
-	u8* m_packet_data;		// pointer to the data location in the ringbuffer.
+	uint m_packet_ringpos;	// index of the data location in the ringbuffer.
 
 #ifdef RINGBUF_DEBUG_STACK
-	MutexLock m_lock_Stack;
+	Threading::MutexLock m_lock_Stack;
 #endif
 
 	// the MTGS "dummy" GIFtag info!
+	// 16 byte alignment isn't "critical" here, so if GCC ignores the alignment directive
+	// it shouldn't cause any issues.
 	PCSX2_ALIGNED16( GIFPath m_path[3] );
 
+	// contains aligned memory allocations for gs and Ringbuffer.
+	SafeAlignedArray<u128,16> m_RingBuffer;
+
 	// mtgs needs its own memory space separate from the PS2.  The PS2 memory is in
 	// synch with the EE while this stays in sync with the GS (ie, it lags behind)
-	PCSX2_ALIGNED16( u8 m_gsMem[0x2000] );
-
-	PCSX2_ALIGNED( 4096, u8 m_RingBuffer[MTGS_RINGBUFFERSIZE] );
+	u8* const m_gsMem;
 
 public:
 	mtgsThreadObject();
@@ -225,8 +241,8 @@ protected:
 	u32 _gifTransferDummy( GIF_PATH pathidx, const u8 *pMem, u32 size );
 
 	// Used internally by SendSimplePacket type functions
-	const u8* _PrepForSimplePacket();
-	void _FinishSimplePacket( const u8* future_writepos );
+	uint _PrepForSimplePacket();
+	void _FinishSimplePacket( uint future_writepos );
 
 	int Callback();
 };
diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp
index a367df92d6..690bdd3308 100644
--- a/pcsx2/MTGS.cpp
+++ b/pcsx2/MTGS.cpp
@@ -49,8 +49,8 @@ using namespace std;
 // This allows us to delacre the vars as non-volatile and only use
 // them as volatile when appropriate (more optimized).
 
-#define volatize(x) (*(u8* volatile*)&(x))		// for writepos
-#define volatize_c(x) (*(u8 * volatile*)&(x))	// for readpos
+#define volatize(x) (*reinterpret_cast<volatile uint*>(&(x)))		// for writepos
+//#define volatize_c(x) (*(volatile u32*)&(x))	// for readpos
 
 /////////////////////////////////////////////////////////////////////////////
 //   BEGIN  --  MTGS GIFtag Parse Implementation
@@ -164,27 +164,23 @@ static void RegHandlerLABEL(const u32* data)
 	GSSIGLBLID->LBLID = (GSSIGLBLID->LBLID&~data[1])|(data[0]&data[1]);
 }
 
-//   END  --  MTGS GIFtag Parse Implementation
+//  END  --  MTGS GIFtag Parse Implementation
 /////////////////////////////////////////////////////////////////////////////
 
 /////////////////////////////////////////////////////////////////////////////
-// MTGS Threaded Class Implementation
+//  MTGS Threaded Class Implementation
 
 mtgsThreadObject* mtgsThread = NULL;
 
-// Uncomment this to enable the MTGS debug stack, which tracks to ensure reads
-// and writes stay synchronized.  Warning: the debug stack is VERY slow.
-//#define RINGBUF_DEBUG_STACK
 #ifdef RINGBUF_DEBUG_STACK
 #include <list>
-std::list<uptr> ringposStack;
-mutex_t stackLock;
+std::list<uint> ringposStack;
 #endif
 
 #ifdef _DEBUG
 // debug variable used to check for bad code bits where copies are started
 // but never closed, or closed without having been started.  (GSRingBufCopy calls
-// should always be followed by acall to GSRINGBUF_DONECOPY)
+// should always be followed by a call to GSRINGBUF_DONECOPY)
 static int copyLock = 0;
 #endif
 
@@ -192,27 +188,29 @@ typedef void (*GIFRegHandler)(const u32* data);
 static GIFRegHandler s_GSHandlers[3] = { RegHandlerSIGNAL, RegHandlerFINISH, RegHandlerLABEL };
 
 mtgsThreadObject::mtgsThreadObject() :
-	m_RingPos( m_RingBuffer )
-,	m_WritePos( m_RingBuffer )
-,	m_RingBufferEnd( m_RingBuffer + sizeof( m_RingBuffer ) )
+	m_RingPos( 0 )
+,	m_WritePos( 0 )
 
-,	m_wait_InitDone()
+,	m_post_InitDone()
 ,	m_lock_RingRestart()
 
 ,	m_CopyCommandTally( 0 )
 ,	m_CopyDataTally( 0 )
 ,	m_RingBufferIsBusy( 0 )
-,	m_packet_size()
-,	m_packet_data( NULL )
+,	m_packet_size( 0 )
+,	m_packet_ringpos( 0 )
 
 #ifdef RINGBUF_DEBUG_STACK
 ,	m_lock_Stack()
 #endif
+,	m_RingBuffer( m_RingBufferSize + (Ps2MemSize::GSregs/sizeof(u128)) )
+,	m_gsMem( (u8*)m_RingBuffer.GetPtr( m_RingBufferSize ) )
 {
 	// Wait for the thread to finish initialization (it runs GSinit, which can take
 	// some time since it's creating a new window and all), and then check for errors.
 
-	m_wait_InitDone.Wait();
+	m_post_event.Post();	// tell MTGS we're done here
+	m_post_InitDone.Wait();	// and wait for MTGS to be done there!
 
 	if( m_returncode != 0 )	// means the thread failed to init the GS plugin
 		throw Exception::PluginFailure( "GS", "The GS plugin failed to open/initialize." );
@@ -233,7 +231,7 @@ void mtgsThreadObject::Reset()
 	//  * Signal a reset.
 	//  * clear the path and byRegs structs (used by GIFtagDummy)
 
-	AtomicExchangePointer( m_RingPos, m_WritePos );
+	AtomicExchange( m_RingPos, m_WritePos );
 
 	MTGS_LOG( "MTGS > Sending Reset...\n" );
 	SendSimplePacket( GS_RINGTYPE_RESET, 0, 0, 0 );
@@ -406,14 +404,15 @@ __forceinline u32 mtgsThreadObject::_gifTransferDummy( GIF_PATH pathidx, const u
 		}
 	}
 
-	// FIXME: dq8, pcsx2 error probably
-
 	if(pathidx == 0)
 	{
 		if(!path.tag.eop && path.tag.nloop > 0)
 		{
 			path.tag.nloop = 0;
 			DevCon::Write( "path1 hack! " );
+
+			// This means that the giftag data got screwy somewhere
+			// along the way (often means curreg was in a bad state or something)
 		}
 	}
 #ifdef PCSX2_GSRING_SAMPLING_STATS
@@ -426,119 +425,142 @@ __forceinline u32 mtgsThreadObject::_gifTransferDummy( GIF_PATH pathidx, const u
 	return size;
 }
 
+struct PacketTagType	// view of the command tag stored in the ringbuffer slot at the head of each packet
+{
+	u32 command;	// command id the GS thread switches on (a GS_RINGTYPE value)
+	u32 data[3];	// command-specific arguments; data[0] carries the packet size for path transfers
+};
+
 int mtgsThreadObject::Callback()
 {
 	Console::WriteLn("MTGS > Thread Started, Opening GS Plugin...");
 
+	// Wait for the MTGS to initialize structures.
+	m_post_event.Wait();
+
 	memcpy_aligned( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) );
 	GSsetBaseMem( m_gsMem );
 
 	m_returncode = GSopen((void *)&pDsp, "PCSX2", 1);
 	GSCSRr = 0x551B400F; // 0x55190000
-	m_wait_InitDone.Set();
+	m_post_InitDone.Post();
 	if (m_returncode != 0) { return m_returncode; }		// error msg will be issued to the user by Plugins.c
 	Console::WriteLn("MTGS > GSopen Finished.");
 
 #ifdef RINGBUF_DEBUG_STACK
-	u32 prevCmd=0;
+	PacketTagType prevCmd;
 #endif
 
 	while( !m_sigterm )
 	{
-		m_wait_event.Wait();
+		m_post_event.Wait();
+		//if( m_sigterm ) break;
+
 		AtomicExchange( m_RingBufferIsBusy, 1 );
 
 		// note: m_RingPos is intentionally not volatile, because it should only
 		// ever be modified by this thread.
 		while( m_RingPos != volatize(m_WritePos))
 		{
-			assert( m_RingPos < m_RingBufferEnd );
+			assert( m_RingPos < m_RingBufferSize );
 
-			u32 tag = *(u32*)m_RingPos;
-			u32 ringposinc = 16;
+			const PacketTagType& tag = (PacketTagType&)m_RingBuffer[m_RingPos];
+			u32 ringposinc = 1;
 
 #ifdef RINGBUF_DEBUG_STACK
 			// pop a ringpos off the stack.  It should match this one!
 
-			EnterCriticalSection( &stackLock );
+			m_lock_Stack.Lock();
 			uptr stackpos = ringposStack.back();
-			if( stackpos != (uptr)m_RingPos )
+			if( stackpos != m_RingPos )
 			{
-				Console::Error( "MTGS Ringbuffer Critical Failure ---> %x to %x (prevCmd: %x)\n", stackpos, (long)m_RingPos, prevCmd );
+				Console::Error( "MTGS Ringbuffer Critical Failure ---> %x to %x (prevCmd: %x)\n", params stackpos, m_RingPos, prevCmd.command );
 			}
-			assert( stackpos == (long)m_RingPos );
+			assert( stackpos == m_RingPos );
 			prevCmd = tag;
 			ringposStack.pop_back();
-			LeaveCriticalSection( &stackLock );
+			m_lock_Stack.Unlock();
 #endif
 
-			switch( tag&0xffff )
+			switch( tag.command )
 			{
 				case GS_RINGTYPE_RESTART:
-					AtomicExchangePointer(m_RingPos, m_RingBuffer);
+					AtomicExchange(m_RingPos, 0);
 					
 					// stall for a bit to let the MainThread have time to update the g_pGSWritePos. 
 					m_lock_RingRestart.Lock();
 					m_lock_RingRestart.Unlock();
-					continue;
+				continue;
 
 				case GS_RINGTYPE_P1:
 				{
-					int qsize = (tag>>16);
+					const int qsize = tag.data[0];
+					const u128* data = m_RingBuffer.GetPtr( m_RingPos+1 );
+
 					// make sure that tag>>16 is the MAX size readable
-					GSgifTransfer1((u32*)(m_RingPos+16) - 0x1000 + 4*qsize, 0x4000-qsize*16);
-					ringposinc += qsize<<4;
-					break;
+					//GSgifTransfer1(((u32*)data) - 0x1000 + 4*qsize, 0x4000-qsize*16);
+					GSgifTransfer1((u32*)(data - 0x400 + qsize), 0x4000-qsize*16);
+					ringposinc += qsize;
 				}
+				break;
+
 				case GS_RINGTYPE_P2:
-					GSgifTransfer2((u32*)(m_RingPos+16), tag>>16);
-					ringposinc += (tag>>16)<<4;
-					break;
+				{
+					const int qsize = tag.data[0];
+					const u128* data = m_RingBuffer.GetPtr( m_RingPos+1 );
+					GSgifTransfer2((u32*)data, qsize);
+					ringposinc += qsize;
+				}
+				break;
+
 				case GS_RINGTYPE_P3:
-					GSgifTransfer3((u32*)(m_RingPos+16), tag>>16);
-					ringposinc += (tag>>16)<<4;
-					break;
+				{
+					const int qsize = tag.data[0];
+					const u128* data = m_RingBuffer.GetPtr( m_RingPos+1 );
+					GSgifTransfer3((u32*)data, qsize);
+					ringposinc += qsize;
+				}
+				break;
+
 				case GS_RINGTYPE_VSYNC:
 				{
-					GSvsync(*(u32*)(m_RingPos+4));
+					GSvsync(tag.data[0]);
 
-					gsFrameSkip( !( *(u32*)(m_RingPos+8) ) );
+					gsFrameSkip( !tag.data[1] );
 
 					if( PAD1update != NULL ) PAD1update(0);
 					if( PAD2update != NULL ) PAD2update(1);
-
-					break;
 				}
+				break;
 
 				case GS_RINGTYPE_FRAMESKIP:
 					_gs_ResetFrameskip();
-					break;
+				break;
 
 				case GS_RINGTYPE_MEMWRITE8:
-					m_gsMem[*(u32*)(m_RingPos+4)] = *(u8*)(m_RingPos+8);
-					break;
+					m_gsMem[tag.data[0]] = (u8)tag.data[1];
+				break;
 				case GS_RINGTYPE_MEMWRITE16:
-					*(u16*)(m_gsMem+*(u32*)(m_RingPos+4)) = *(u16*)(m_RingPos+8);
-					break;
+					*(u16*)(m_gsMem+tag.data[0]) = (u16)tag.data[1];
+				break;
 				case GS_RINGTYPE_MEMWRITE32:
-					*(u32*)(m_gsMem+*(u32*)(m_RingPos+4)) = *(u32*)(m_RingPos+8);
-					break;
+					*(u32*)(m_gsMem+tag.data[0]) = tag.data[1];
+				break;
 				case GS_RINGTYPE_MEMWRITE64:
-					*(u64*)(m_gsMem+*(u32*)(m_RingPos+4)) = *(u64*)(m_RingPos+8);
-					break;
+					*(u64*)(m_gsMem+tag.data[0]) = *(u64*)&tag.data[1];
+				break;
 
 				case GS_RINGTYPE_FREEZE:
 				{
-					//SaveState* f = (SaveState*)(*(uptr*)(m_RingPos+8));
-					freezeData* data = (freezeData*)(*(uptr*)(m_RingPos+8));
-					int mode = *(s32*)(m_RingPos+4);
+					freezeData* data = (freezeData*)(*(uptr*)&tag.data[1]);
+					int mode = tag.data[0];
 					GSfreeze( mode, data );
 					break;
 				}
 
 				case GS_RINGTYPE_RECORD:
 				{
-					int record = *(u32*)(m_RingPos+4);
+					int record = tag.data[0];
 					if( GSsetupRecording != NULL ) GSsetupRecording(record, NULL);
 					if( SPU2setupRecording != NULL ) SPU2setupRecording(record, NULL);
 					break;
@@ -551,27 +573,27 @@ int mtgsThreadObject::Callback()
 
 				case GS_RINGTYPE_SOFTRESET:
 				{
-					int mask = *(u32*)(m_RingPos+4);
+					int mask = tag.data[0];
 					MTGS_LOG( "MTGS > Receiving GIF Soft Reset (mask: %d)\n", mask );
 					GSgifSoftReset( mask );
 					break;
 				}
 
 				case GS_RINGTYPE_WRITECSR:
-					GSwriteCSR( *(u32*)(m_RingPos+4) );
+					GSwriteCSR( tag.data[0] );
 				break;
 
 				case GS_RINGTYPE_MODECHANGE:
-					_gs_ChangeTimings( *(u32*)(m_RingPos+4), *(u32*)(m_RingPos+8) );
+					_gs_ChangeTimings( tag.data[0], tag.data[1] );
 				break;
 
 				case GS_RINGTYPE_STARTTIME:
-					m_iSlowStart += *(u32*)(m_RingPos+4);
+					m_iSlowStart += tag.data[0];
 				break;
 
 #ifdef PCSX2_DEVBUILD
 				default:
-					Console::Error("GSThreadProc, bad packet (%x) at m_RingPos: %x, m_WritePos: %x", params tag, m_RingPos, m_WritePos);
+					Console::Error("GSThreadProc, bad packet (%x) at m_RingPos: %x, m_WritePos: %x", params tag.command, m_RingPos, m_WritePos);
 					assert(0);
 					m_RingPos = m_WritePos;
 					continue;
@@ -581,12 +603,10 @@ int mtgsThreadObject::Callback()
 #endif
 			}
 
-			const u8* newringpos = m_RingPos + ringposinc;
-			assert( newringpos <= m_RingBufferEnd );
-			if( newringpos == m_RingBufferEnd )
-				newringpos = m_RingBuffer;
-
-			AtomicExchangePointer( m_RingPos, newringpos );
+			uint newringpos = m_RingPos + ringposinc;
+			assert( newringpos <= m_RingBufferSize );
+			newringpos &= m_RingBufferMask;
+			AtomicExchange( m_RingPos, newringpos );
 		}
 		AtomicExchange( m_RingBufferIsBusy, 0 );
 	}
@@ -616,7 +636,7 @@ void mtgsThreadObject::WaitGS()
 // For use in loops that wait on the GS thread to do certain things.
 void mtgsThreadObject::SetEvent()
 {
-	m_wait_event.Set();
+	m_post_event.Post();
 	m_CopyCommandTally = 0;
 	m_CopyDataTally = 0;
 }
@@ -635,30 +655,28 @@ void mtgsThreadObject::SetEventWait()
 
 u8* mtgsThreadObject::GetDataPacketPtr() const
 {
-	return m_packet_data;
+	return (u8*)m_RingBuffer.GetPtr( m_packet_ringpos );
 }
 
 // Closes the data packet send command, and initiates the gs thread (if needed).
 void mtgsThreadObject::SendDataPacket()
 {
 	// make sure a previous copy block has been started somewhere.
-	jASSUME( m_packet_data != NULL );
+	jASSUME( m_packet_size != 0 );
 
-	const u8* temp = m_packet_data + m_packet_size;
-
-	jASSUME( temp <= m_RingBufferEnd );
-	if( temp == m_RingBufferEnd )
-		temp = m_RingBuffer; 
+	uint temp = m_packet_ringpos + m_packet_size;
+	jASSUME( temp <= m_RingBufferSize );
+	temp &= m_RingBufferMask;
 
 #ifdef _DEBUG
-	else
+	if( m_packet_ringpos + m_packet_size < m_RingBufferSize )
 	{
-		const u8* readpos = volatize(m_RingPos);
+		uint readpos = volatize(m_RingPos);
 		if( readpos != m_WritePos )
 		{
 			// The writepos should never leapfrog the readpos
 			// since that indicates a bad write.
-			if( m_packet_data < readpos )
+			if( m_packet_ringpos < readpos )
 				assert( temp < readpos );
 		}
 
@@ -669,9 +687,9 @@ void mtgsThreadObject::SendDataPacket()
 	}
 #endif
 
-	AtomicExchangePointer( m_WritePos, temp );
+	AtomicExchange( m_WritePos, temp );
 
-	m_packet_data = NULL;
-
+	const uint sentPacketSize = m_packet_size;	// saved: the data tally below needs the size after the reset
+	m_packet_size = 0;
 	if( m_RingBufferIsBusy ) return;
 
@@ -689,7 +707,7 @@ void mtgsThreadObject::SendDataPacket()
 	//  8 - roughly 2% slower on HT machines.
 
-	m_CopyDataTally += m_packet_size;
+	m_CopyDataTally += sentPacketSize;
-	if( ( m_CopyDataTally > 0x40000 ) || ( ++m_CopyCommandTally > 16 ) )
+	if( ( m_CopyDataTally > 0x4000 ) || ( ++m_CopyCommandTally > 16 ) )
 	{
 		FreezeXMMRegs(1); 
 		FreezeMMXRegs(1);
@@ -727,6 +745,8 @@ static u32 GSRingBufCopySz = 0;
 // returns the amount of giftag data not processed (in simd128 values).
 // Return value is used by VU1 XGKICK to hack-fix data packets which are too
 // large for VU1 memory.
+// Parameters:
+//  size - size of the packet data, in simd128's
 int mtgsThreadObject::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size )
 {
 #ifdef PCSX2_GSRING_TX_STATS
@@ -777,34 +797,31 @@ int mtgsThreadObject::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 s
 	// interlocked exchanges when we modify it, however, since the GS thread
 	// is reading it.
 
-	const u8 *writepos = m_WritePos;
+	uint writepos = m_WritePos;
 	
 	// Checks if a previous copy was started without an accompanying call to GSRINGBUF_DONECOPY
-	jASSUME( m_packet_data == NULL );
+	jASSUME( m_packet_size == 0 );
 
 	// Sanity checks! (within the confines of our ringbuffer please!)
-	jASSUME( size < MTGS_RINGBUFFERSIZE );
-	jASSUME( writepos < m_RingBufferEnd );
-
-	// Alignment checks! (16 bytes please!)
-	jASSUME( ((uptr)writepos & 15) == 0 );
-	//jASSUME( (size&15) == 0);
+	jASSUME( size < m_RingBufferSize );
+	jASSUME( writepos < m_RingBufferSize );
 
 	//fixme: Vif sometimes screws up and size is unaligned, try this then (rama)
-	if( (size&15) != 0){
+	// Is this still a problem?  It should be fixed on the specific VIF command now. (air)
+	/*if( (size&15) != 0){
 		Console::Error( "MTGS problem, size unaligned"); 
 		size = (size+15)&(~15);
-	}
+	}*/
 
 	// retval has the amount of data *not* processed, so we only need to reserve
 	// enough room for size - retval:
-	int retval = _gifTransferDummy( pathidx, srcdata, size>>4 );
+	int retval = _gifTransferDummy( pathidx, srcdata, size );
 
-	size = size - (retval<<4);
+	size = size - retval;
 	m_packet_size = size;
-	size += 16;		// takes into account our command qword.
+	size++;			// takes into account our command qword.
 
-	if( writepos + size < m_RingBufferEnd )
+	if( writepos + size < m_RingBufferSize )
 	{
 		// generic gs wait/stall.
 		// Waits until the readpos is outside the scope of the write area.
@@ -812,7 +829,7 @@ int mtgsThreadObject::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 s
 		{
 			// two conditionals in the following while() loop, so precache
 			// the readpos for more efficient behavior:
-			const u8* readpos = volatize_c(m_RingPos);
+			uint readpos = volatize(m_RingPos);
 
 			// if the writepos is past the readpos then we're safe:
 			if( writepos >= readpos ) break;
@@ -824,7 +841,7 @@ int mtgsThreadObject::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 s
 			SetEventWait();
 		}
 	}
-	else if( writepos + size > m_RingBufferEnd )
+	else if( writepos + size > m_RingBufferSize )
 	{
 		// If the incoming packet doesn't fit, then start over from
 		// the start of the ring buffer (it's a lot easier than trying
@@ -836,7 +853,7 @@ int mtgsThreadObject::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 s
 
 		while( true )
 		{
-			const u8* readpos = volatize(m_RingPos);
+			uint readpos = volatize(m_RingPos);
 
 			// is the buffer empty?
 			if( readpos == writepos ) break;
@@ -844,22 +861,22 @@ int mtgsThreadObject::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 s
 			// Also: Wait for the readpos to go past the start of the buffer
 			// Otherwise it'll stop dead in its tracks when we set the new write
 			// position below (bad!)
-			if( readpos < writepos && readpos != m_RingBuffer ) break;
+			if( readpos < writepos && readpos != 0 ) break;
 
 			SetEventWait();
 		}
 
 		m_lock_RingRestart.Lock();
 		SendSimplePacket( GS_RINGTYPE_RESTART, 0, 0, 0 );
-		writepos = m_RingBuffer;
-		AtomicExchangePointer( m_WritePos, writepos );
+		writepos = 0;
+		AtomicExchange( m_WritePos, writepos );
 		m_lock_RingRestart.Unlock();
 
 		// stall until the read position is past the end of our incoming block,
 		// or until it reaches the current write position (signals an empty buffer).
 		while( true )
 		{
-			const u8* readpos = volatize(m_RingPos);
+			uint readpos = volatize(m_RingPos);
 
 			if( readpos == m_WritePos ) break;
 			if( writepos+size < readpos ) break;
@@ -874,48 +891,48 @@ int mtgsThreadObject::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 s
 		//SysPrintf( "MTGS > Perfect Fit!\n");
 		while( true )
 		{
-			const u8* readpos = volatize(m_RingPos);
+			uint readpos = volatize(m_RingPos);
 
 			// is the buffer empty?  Don't wait...
 			if( readpos == writepos ) break;
 
 			// Copy is ready so long as readpos is less than writepos and *not*
 			// equal to the base of the ringbuffer (otherwise the buffer will stop)
-			if( readpos < writepos && readpos != m_RingBuffer ) break;
+			if( readpos < writepos && readpos != 0 ) break;
 
 			SetEventWait();
 		}
     }
 
 #ifdef RINGBUF_DEBUG_STACK
-	mutex_lock( stackLock );
-	ringposStack.push_front( (uptr)writepos );
-	mutex_unlock( stackLock );
+	m_lock_Stack.Lock();
+	ringposStack.push_front( writepos );
+	m_lock_Stack.Unlock();
 #endif
 
 	// Command qword: Low word is the command, and the high word is the packet
 	// length in SIMDs (128 bits).
 
-	const uint simd_size = (m_packet_size>>4);		// minus the command byte!
-	*(u32*)m_WritePos = (pathidx+1) | (simd_size<<16);
-	m_packet_data = m_WritePos + 16;
+	PacketTagType& tag = (PacketTagType&)m_RingBuffer[m_WritePos];
+	tag.command = pathidx+1;
+	tag.data[0] = m_packet_size;
+	m_packet_ringpos = m_WritePos + 1;
 
 	return m_packet_size;
 }
 
-__forceinline const u8* mtgsThreadObject::_PrepForSimplePacket()
+__forceinline uint mtgsThreadObject::_PrepForSimplePacket()
 {
 #ifdef RINGBUF_DEBUG_STACK
 	m_lock_Stack.Lock();
-	ringposStack.push_front( (uptr)m_WritePos );
+	ringposStack.push_front( m_WritePos );
 	m_lock_Stack.Unlock();
 #endif
 
-	const u8* future_writepos = m_WritePos+16;
-	jASSUME( future_writepos <= m_RingBufferEnd );
+	uint future_writepos = m_WritePos+1;
+	jASSUME( future_writepos <= m_RingBufferSize );
 
-    if( future_writepos >= m_RingBufferEnd )
-        future_writepos = m_RingBuffer;
+    future_writepos &= m_RingBufferMask;
 
 	while( future_writepos == volatize(m_RingPos) )
 		SetEventWait();
@@ -923,31 +940,33 @@ __forceinline const u8* mtgsThreadObject::_PrepForSimplePacket()
 	return future_writepos;
 }
 
-__forceinline void mtgsThreadObject::_FinishSimplePacket( const u8* future_writepos )
+__forceinline void mtgsThreadObject::_FinishSimplePacket( uint future_writepos )
 {
 	assert( future_writepos != volatize(m_RingPos) );
-	AtomicExchangePointer( m_WritePos, future_writepos );
+	AtomicExchange( m_WritePos, future_writepos );
 }
 
 void mtgsThreadObject::SendSimplePacket( GS_RINGTYPE type, int data0, int data1, int data2 )
 {
-	const u8* const thefuture = _PrepForSimplePacket();
+	const uint thefuture = _PrepForSimplePacket();
+	PacketTagType& tag = (PacketTagType&)m_RingBuffer[m_WritePos];
 
-	*(u32*)m_WritePos = type;
-	*(u32*)(m_WritePos+4) = data0;
-	*(u32*)(m_WritePos+8) = data1;
-	*(u32*)(m_WritePos+12) = data2;
+	tag.command = type;
+	tag.data[0] = data0;
+	tag.data[1] = data1;
+	tag.data[2] = data2;
 
 	_FinishSimplePacket( thefuture );	
 }
 
 void mtgsThreadObject::SendPointerPacket( GS_RINGTYPE type, u32 data0, void* data1 )
 {
-	const u8* const thefuture = _PrepForSimplePacket();
+	const uint thefuture = _PrepForSimplePacket();
+	PacketTagType& tag = (PacketTagType&)m_RingBuffer[m_WritePos];
 
-	*(u32*)m_WritePos = type;
-	*(u32*)(m_WritePos+4) = data0;
-	*(uptr*)(m_WritePos+8) = (uptr)data1;
+	tag.command = type;
+	tag.data[0] = data0;
+	*(uptr*)&tag.data[1] = (uptr)data1;
 
 	_FinishSimplePacket( thefuture );	
 }
@@ -1004,4 +1023,4 @@ void mtgsThreadObject::Freeze( SaveState& state )
 void mtgsRingBufSimplePacket( s32 command, u32 data0, u32 data1, u32 data2 )
 {
 	mtgsThread->SendSimplePacket( (GS_RINGTYPE)command, data0, data1, data2 );
-}
\ No newline at end of file
+}
diff --git a/pcsx2/Memory.cpp b/pcsx2/Memory.cpp
index 1213232f4b..d4cd06dfe7 100644
--- a/pcsx2/Memory.cpp
+++ b/pcsx2/Memory.cpp
@@ -174,6 +174,9 @@ void memMapVUmicro()
 {
 	vtlb_MapHandler(vu0_micro_mem[CHECK_VU0REC ? 0 : 1],0x11000000,0x00004000);
 	vtlb_MapHandler(vu1_micro_mem[CHECK_VU1REC ? 0 : 1],0x11008000,0x00004000);
+
+	vtlb_MapBlock(VU0.Mem,0x11004000,0x00004000,0x1000);
+	vtlb_MapBlock(VU1.Mem,0x1100c000,0x00004000);
 }
 
 void memMapPhy()
@@ -193,9 +196,6 @@ void memMapPhy()
 	//IOP mem
 	vtlb_MapBlock(psxM,0x1c000000,0x00800000);
 
-	vtlb_MapBlock(VU0.Mem,0x11004000,0x00004000,0x1000);
-	vtlb_MapBlock(VU1.Mem,0x1100c000,0x00004000);
-
 	//These fallback to mem* stuff ...
 	vtlb_MapHandler(tlb_fallback_1,0x10000000,0x10000);
 	vtlb_MapHandler(tlb_fallback_6,0x12000000,0x10000);
@@ -455,11 +455,22 @@ void __fastcall _ext_memWrite128(u32 mem, const u64 *value)
 typedef void __fastcall ClearFunc_t( u32 addr, u32 qwc );
 
 template<int vunum, bool dynarec>
-static __forceinline ClearFunc_t& GetClearFunc()
+static __forceinline void ClearVuFunc( u32 addr, u32 size )
 {
-	return dynarec ?
-		(( vunum==0 ) ? VU0micro::recClear : VU1micro::recClear)
-		:	(( vunum==0 ) ? VU0micro::intClear : VU1micro::intClear);
+	if( dynarec )
+	{
+		if( vunum==0 )
+			VU0micro::recClear(addr,size);
+		else
+			VU1micro::recClear(addr,size);
+	}
+	else
+	{
+		if( vunum==0 )
+			VU0micro::intClear(addr,size);
+		else
+			VU1micro::intClear(addr,size);
+	}
 }
 
 template<int vunum>
@@ -521,7 +532,7 @@ void __fastcall vuMicroWrite8(u32 addr,mem8_t data)
 	{
 		vu.Micro[addr]=data;
 
-		GetClearFunc<vunum, dynrec>()(addr&(~7),1);
+		ClearVuFunc<vunum, dynrec>(addr&(~7),1);
 	}
 }
 
@@ -535,7 +546,7 @@ void __fastcall vuMicroWrite16(u32 addr,mem16_t data)
 	{
 		*(u16*)&vu.Micro[addr]=data;
 
-		GetClearFunc<vunum, dynrec>()(addr&(~7),1);
+		ClearVuFunc<vunum, dynrec>(addr&(~7),1);
 	}
 }
 
@@ -549,7 +560,7 @@ void __fastcall vuMicroWrite32(u32 addr,mem32_t data)
 	{
 		*(u32*)&vu.Micro[addr]=data;
 
-		GetClearFunc<vunum, dynrec>()(addr&(~7),1);
+		ClearVuFunc<vunum, dynrec>(addr&(~7),1);
 	}
 }
 
@@ -563,7 +574,7 @@ void __fastcall vuMicroWrite64(u32 addr,const mem64_t* data)
 	{
 		*(u64*)&vu.Micro[addr]=data[0];
 
-		GetClearFunc<vunum, dynrec>()(addr,1);
+		ClearVuFunc<vunum, dynrec>(addr,1);
 	}
 }
 
@@ -578,7 +589,7 @@ void __fastcall vuMicroWrite128(u32 addr,const mem128_t* data)
 		*(u64*)&vu.Micro[addr]=data[0];
 		*(u64*)&vu.Micro[addr+8]=data[1];
 
-		GetClearFunc<vunum, dynrec>()(addr,2);
+		ClearVuFunc<vunum, dynrec>(addr,2);
 	}
 }
 
@@ -696,7 +707,6 @@ void memReset()
 	vtlb_Init();
 
 	tlb_fallback_0=vtlb_RegisterHandlerTempl1(_ext_mem,0);
-	//tlb_fallback_1=vtlb_RegisterHandlerTempl1(_ext_mem,1);
 	tlb_fallback_2=vtlb_RegisterHandlerTempl1(_ext_mem,2);
 	tlb_fallback_3=vtlb_RegisterHandlerTempl1(_ext_mem,3);
 	tlb_fallback_4=vtlb_RegisterHandlerTempl1(_ext_mem,4);
diff --git a/pcsx2/Memory.h b/pcsx2/Memory.h
index 07d14cba17..1064e247b3 100644
--- a/pcsx2/Memory.h
+++ b/pcsx2/Memory.h
@@ -33,14 +33,16 @@ namespace Ps2MemSize
 {
 	static const uint Base	= 0x02000000;		// 32 MB main memory!
 	static const uint Rom	= 0x00400000;		// 4 MB main rom
-	static const uint Rom1	= 0x00040000;		// fixme - TLB allocates 0x00080000 ?
-	static const uint Rom2	= 0x00080000;
-	static const uint ERom	= 0x001C0000;
+	static const uint Rom1	= 0x00040000;		// DVD player
+	static const uint Rom2	= 0x00080000;		// Chinese rom extension (?)
+	static const uint ERom	= 0x001C0000;		// DVD player extensions (?)
 	static const uint Hardware = 0x00010000;
-	static const uint Scratch = 0x00004000;	// fixme - VM allocates 0x10000 ?
+	static const uint Scratch = 0x00004000;
 
-	static const uint IopRam = 0x200000;	// 2MB main ram on the IOP.
+	static const uint IopRam = 0x00200000;	// 2MB main ram on the IOP.
 	static const uint IopHardware = 0x00010000;
+
+	static const uint GSregs = 0x00002000;		// 8k for the GS registers and stuff.
 }
 
 #ifdef PCSX2_VIRTUAL_MEM
diff --git a/pcsx2/Plugins.cpp b/pcsx2/Plugins.cpp
index a96a6cc140..b31345c5a2 100644
--- a/pcsx2/Plugins.cpp
+++ b/pcsx2/Plugins.cpp
@@ -622,14 +622,28 @@ void ShutdownPlugins()
 		OpenStatus.GS = false;
 	}
 
-	GSshutdown();
-	PAD1shutdown();
-	PAD2shutdown();
-	SPU2shutdown();
-	CDVDshutdown();
-	DEV9shutdown();
-	USBshutdown();
-    FWshutdown();
+	if( GSshutdown != NULL )
+		GSshutdown();
+	
+	if( PAD1shutdown != NULL )
+		PAD1shutdown();
+	if( PAD2shutdown != NULL )
+		PAD2shutdown();
+
+	if( SPU2shutdown != NULL )
+		SPU2shutdown();
+
+	if( CDVDshutdown != NULL )
+		CDVDshutdown();
+
+	if( DEV9shutdown != NULL )
+		DEV9shutdown();
+
+	if( USBshutdown != NULL )
+		USBshutdown();
+
+	if( FWshutdown != NULL )
+		FWshutdown();
 }
 
 int LoadPlugins() {
@@ -867,4 +881,4 @@ void PluginsResetGS()
 
 	int ret = GSinit();
 	if (ret != 0) { Msgbox::Alert("GSinit error: %d", params ret);  }
-}
\ No newline at end of file
+}
diff --git a/pcsx2/System.h b/pcsx2/System.h
index 220c7ac1f9..538b34dd92 100644
--- a/pcsx2/System.h
+++ b/pcsx2/System.h
@@ -283,6 +283,25 @@ protected:
 	int m_size;	// size of the allocation of memory
 
 	const static std::string m_str_Unnamed;
+protected:
+	// Internal constructor for use by derived classes.  This allows a derived class to
+	// use its own memory allocation (with aligned memory, for example).
+	// Throws:
+	//   Exception::OutOfMemory if the allocated_mem pointer is NULL.
+	explicit MemoryAlloc( const std::string& name, T* allocated_mem, int initSize ) : 
+	  Name( name )
+	, ChunkSize( DefaultChunkSize )
+	, m_ptr( allocated_mem )
+	, m_size( initSize )
+	{
+		if( m_ptr == NULL )
+			throw Exception::OutOfMemory();
+	}
+
+	virtual T* _virtual_realloc( int newsize )	// newsize is a count of T elements, not bytes
+	{
+		return (T*)realloc( m_ptr, newsize * sizeof(T) );
+	}
 
 public:
 	virtual ~MemoryAlloc()
@@ -322,7 +341,7 @@ public:
 		if( blockSize > m_size )
 		{
 			const uint newalloc = blockSize + ChunkSize;
-			m_ptr = (T*)realloc( m_ptr, newalloc * sizeof(T) );
+			m_ptr = _virtual_realloc( newalloc );
 			if( m_ptr == NULL )
 			{
 				throw Exception::OutOfMemory(
@@ -353,19 +372,69 @@ public:
 	}
 
 protected:
+	// A safe array index fetcher.  In PCSX2_DEVBUILD builds it throws
+	// Exception::IndexBoundsFault if the index is outside the bounds of the array;
+	// other builds skip the check.  Performance Considerations: the check adds
+	// overhead to array indexing, so use it infrequently in time-critical code.
+	// Instead of calling it from inside loops, cache the pointer into a local
+	// variable and use standard (unsafe) C indexes.
 	T* _getPtr( uint i ) const
 	{
+#ifdef PCSX2_DEVBUILD
 		if( i >= (uint)m_size )
 		{
-			throw std::out_of_range(
+			throw Exception::IndexBoundsFault(
 				"Index out of bounds on MemoryAlloc: " + Name + 
 				" (index=" + to_string(i) + 
 				", size=" + to_string(m_size) + ")"
 			);
 		}
+#endif
 		return &m_ptr[i];
 	}
 
 };
 
+// A MemoryAlloc variant whose buffer is obtained from _aligned_malloc /
+// _aligned_realloc with the byte alignment given by the Alignment parameter.
+template< typename T, uint Alignment >
+class SafeAlignedArray : public MemoryAlloc<T>
+{
+protected:
+	// Resizes the buffer to hold newsize elements of T, keeping Alignment.
+	// m_ptr lives in a dependent base class, so it must be reached via this->.
+	T* _virtual_realloc( int newsize )
+	{
+		// TODO : aligned_realloc will need a linux implementation now. -_-
+		return (T*)_aligned_realloc( this->m_ptr, newsize * sizeof(T), Alignment );
+	}
+
+	// Appends "(align:xx)" to the name of the allocation in devel builds.
+	// Static because the constructor calls it before the base class is built.
+	static std::string _getName( const std::string& src )
+	{
+#ifdef PCSX2_DEVBUILD
+		return src + "(align:" + to_string(Alignment) + ")";
+#endif
+		return src;
+	}
+
+public:
+	virtual ~SafeAlignedArray()
+	{
+		// NOTE(review): presumably safe_aligned_free nulls m_ptr so the parent
+		// class's destructor won't re-free it -- confirm against its definition.
+		safe_aligned_free( this->m_ptr );
+	}
+
+	explicit SafeAlignedArray( const std::string& name="Unnamed" ) :
+		MemoryAlloc<T>( name ) {}
+
+	// Allocates initialSize elements up front; the base constructor throws
+	// Exception::OutOfMemory when the aligned allocation returns NULL.
+	explicit SafeAlignedArray( int initialSize, const std::string& name="Unnamed" ) :
+		MemoryAlloc<T>( _getName(name),
+			(T*)_aligned_malloc( initialSize * sizeof(T), Alignment ), initialSize ) {}
+};
+
 #endif /* __SYSTEM_H__ */
diff --git a/pcsx2/ThreadTools.cpp b/pcsx2/ThreadTools.cpp
index c3fcb177ce..a82bf8db9e 100644
--- a/pcsx2/ThreadTools.cpp
+++ b/pcsx2/ThreadTools.cpp
@@ -28,7 +28,7 @@ namespace Threading
 	,	m_returncode( 0 )
 	,	m_terminated( false )
 	,	m_sigterm( 0 )
-	,	m_wait_event()
+	,	m_post_event()
 	{
 		if( pthread_create( &m_thread, NULL, _internal_callback, this ) != 0 )
 			throw Exception::ThreadCreationError();
@@ -42,7 +42,7 @@ namespace Threading
 	void Thread::Close()
 	{
 		AtomicExchange( m_sigterm, 1 );
-		m_wait_event.Set();
+		m_post_event.Post();
 		pthread_join( m_thread, NULL );
 	}
 
@@ -82,6 +82,38 @@ namespace Threading
 		pthread_mutex_unlock( &mutex );
 	}
 
+	Semaphore::Semaphore()
+	{
+		sem_init( &sema, false, 0 );	// pshared=false (not shared across processes), initial count 0
+	}
+
+	Semaphore::~Semaphore()
+	{
+		sem_destroy( &sema );	// POSIX: undefined if threads are still blocked on the semaphore
+	}
+
+	void Semaphore::Post()
+	{
+		sem_post( &sema );	// raises the count by one; the return code is not checked
+	}
+
+	void Semaphore::Post( int multiple )
+	{
+		sem_post_multiple( &sema, multiple );	// NOTE(review): not in POSIX <semaphore.h>; looks like a pthreads-win32 extension -- confirm portability
+	}
+
+	void Semaphore::Wait()
+	{
+		sem_wait( &sema );	// blocks until the count can be decremented; the return code (e.g. EINTR) is not checked
+	}
+
+	int Semaphore::Count()
+	{
+		int retval = 0;	// reported as 0 if sem_getvalue fails, rather than an indeterminate value
+		sem_getvalue( &sema, &retval );
+		return retval;
+	}
+
 	MutexLock::MutexLock()
 	{
 		int err = 0;
diff --git a/pcsx2/Threading.h b/pcsx2/Threading.h
index 94aeb7630c..11cb15f321 100644
--- a/pcsx2/Threading.h
+++ b/pcsx2/Threading.h
@@ -20,6 +20,7 @@
 #define _THREADING_H_
 
 #include <errno.h> // EBUSY
+#include <semaphore.h>
 
 #include "PS2Etypes.h"
 #include "Exceptions.h"
@@ -41,6 +42,19 @@ namespace Threading
 		void Wait();
 	};
 
+	struct Semaphore	// thin wrapper around a sem_t; member bodies are in ThreadTools.cpp
+	{
+		sem_t sema;
+
+		Semaphore();				// sem_init with an initial count of 0
+		~Semaphore();				// sem_destroy
+
+		void Post();				// sem_post: raise the count by one
+		void Post( int multiple );	// sem_post_multiple: raise the count by 'multiple'
+		void Wait();				// sem_wait: block until the count can be decremented
+		int Count();				// current count as reported by sem_getvalue
+	};
+
 	struct MutexLock
 	{
 		pthread_mutex_t mutex;
@@ -70,7 +84,7 @@ namespace Threading
 		int m_returncode;		// value returned from the thread on close.
 		bool m_terminated;		// set true after the thread has been closed.
 		u32 m_sigterm;			// set to true(1) when the thread has been requested to exit.
-		WaitEvent m_wait_event;	// general wait event that's needed by most threads.
+		Semaphore m_post_event;	// general wait event that's needed by most threads.
 
 	public:
 		virtual ~Thread();
diff --git a/pcsx2/VifDma.cpp b/pcsx2/VifDma.cpp
index c709895ef2..38a54ee172 100644
--- a/pcsx2/VifDma.cpp
+++ b/pcsx2/VifDma.cpp
@@ -1587,8 +1587,8 @@ static int Vif1TransDirectHL(u32 *data){
 		{
 			// copy 16 bytes the fast way:
 			const u64* src = (u64*)splittransfer[0];
-			const uint count = mtgsThread->PrepDataPacket( GIF_PATH_2, src, 16);
-			jASSUME( count == 16 );
+			const uint count = mtgsThread->PrepDataPacket( GIF_PATH_2, src, 1);
+			jASSUME( count == 1 );
 			u64* dst = (u64*)mtgsThread->GetDataPacketPtr();
 			dst[0] = src[0];
 			dst[1] = src[1];
@@ -1633,8 +1633,9 @@ static int Vif1TransDirectHL(u32 *data){
 	if( mtgsThread != NULL )
 	{
 		//unaligned copy.VIF handling is -very- messy, so i'l use this code til i fix it :)
-		const uint count = mtgsThread->PrepDataPacket( GIF_PATH_2, data, ret<<2 );
-		memcpy_fast( mtgsThread->GetDataPacketPtr(), data, count );
+		// Round ret up, just in case it's not 128bit aligned.
+		const uint count = mtgsThread->PrepDataPacket( GIF_PATH_2, data, (ret+3)>>2 );
+		memcpy_fast( mtgsThread->GetDataPacketPtr(), data, count<<4 );
 		mtgsThread->SendDataPacket();
 	}
 	else {
diff --git a/pcsx2/x86/iFPU.cpp b/pcsx2/x86/iFPU.cpp
index 5e70b9aaea..da296ff6d0 100644
--- a/pcsx2/x86/iFPU.cpp
+++ b/pcsx2/x86/iFPU.cpp
@@ -681,7 +681,6 @@ void recBC1T( void ) {
 	SaveBranchState();
 	recompileNextInstruction(1);
 	SetBranchImm(branchTo);
-	//j32Ptr[1] = JMP32(0);
 
 	x86SetJ32(j32Ptr[0]);
 
@@ -691,7 +690,6 @@ void recBC1T( void ) {
 	recompileNextInstruction(1);
 
 	SetBranchImm(pc);
-	//x86SetJ32(j32Ptr[1]);	
 }
 
 void recBC1FL( void ) {
diff --git a/pcsx2/x86/iVU0micro.cpp b/pcsx2/x86/iVU0micro.cpp
index c8a5131a2d..28a5031e20 100644
--- a/pcsx2/x86/iVU0micro.cpp
+++ b/pcsx2/x86/iVU0micro.cpp
@@ -45,7 +45,7 @@ namespace VU0micro
 	{
 		SuperVUReset(0);
 
-		// these shouldn't be needed, but shouldn't hurt anythign either.
+		// these shouldn't be needed, but shouldn't hurt anything either.
 		x86FpuState = FPU_STATE;
 		iCWstate = 0;
 	}
diff --git a/pcsx2/x86/iVUmicroLower.cpp b/pcsx2/x86/iVUmicroLower.cpp
index ca68eb3bc0..99ed50d218 100644
--- a/pcsx2/x86/iVUmicroLower.cpp
+++ b/pcsx2/x86/iVUmicroLower.cpp
@@ -1970,15 +1970,14 @@ void VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
 	// Chances are this should be a "loops around memory" situation, and the packet
 	// should be continued starting at addr zero (0).
 
-	size = mtgsThread->PrepDataPacket( GIF_PATH_1, data, (0x4000-(addr&0x3fff)));
-	//size = 0x4000-(size<<4)-(addr&0x3fff);
+	size = mtgsThread->PrepDataPacket( GIF_PATH_1, data, (0x4000-(addr&0x3fff)) >> 4);
     jASSUME( size > 0 );
 	
     //if( size > 0 )
 	{
 		u8* pmem = mtgsThread->GetDataPacketPtr();
-		memcpy_aligned(pmem, (u8*)pMem+addr, size);
+		memcpy_aligned(pmem, (u8*)pMem+addr, size<<4);
 		mtgsThread->SendDataPacket();
 	}
 }
-//------------------------------------------------------------------
\ No newline at end of file
+//------------------------------------------------------------------
diff --git a/pcsx2/x86/iVUzerorec.cpp b/pcsx2/x86/iVUzerorec.cpp
index 62250154fc..20786ecfb0 100644
--- a/pcsx2/x86/iVUzerorec.cpp
+++ b/pcsx2/x86/iVUzerorec.cpp
@@ -443,7 +443,7 @@ void SuperVUReset(int vuindex)
 }
 
 // clear the block and any joining blocks
-__forceinline void SuperVUClear(u32 startpc, u32 size, int vuindex)
+void __fastcall SuperVUClear(u32 startpc, u32 size, int vuindex)
 {
 	vector<VuFunctionHeader::RANGE>::iterator itrange;
 	list<VuFunctionHeader*>::iterator it = s_listVUHeaders[vuindex].begin();
diff --git a/pcsx2/x86/iVUzerorec.h b/pcsx2/x86/iVUzerorec.h
index 9af1d42dbe..1f451fd58f 100644
--- a/pcsx2/x86/iVUzerorec.h
+++ b/pcsx2/x86/iVUzerorec.h
@@ -23,7 +23,7 @@
 
 #include "iVUmicro.h"
 
-extern void SuperVUAlloc(int vuindex); // global VU resources aare automatically allocated if necessary.
+extern void SuperVUAlloc(int vuindex); // global VU resources are automatically allocated if necessary.
 extern void SuperVUDestroy(int vuindex); // if vuindex is -1, destroys everything
 extern void SuperVUReset(int vuindex); // if vuindex is -1, resets everything
 
@@ -37,7 +37,7 @@ extern void svudispfntemp();
 #ifdef __LINUX__
 }
 #endif
-extern void SuperVUClear(u32 startpc, u32 size, int vuindex);
+extern void __fastcall SuperVUClear(u32 startpc, u32 size, int vuindex);
 
 // read = 0, will write to reg
 // read = 1, will read from reg