diff --git a/common/include/PS2Edefs.h b/common/include/PS2Edefs.h index f394cf5025..64a98e1016 100644 --- a/common/include/PS2Edefs.h +++ b/common/include/PS2Edefs.h @@ -248,6 +248,7 @@ void CALLBACK GSsetSettingsDir( const char* dir ); void CALLBACK GSsetLogDir( const char* dir ); void CALLBACK GSvsync(int field); +void CALLBACK GSgifTransfer(u32 *pMem, u32 addr); void CALLBACK GSgifTransfer1(u32 *pMem, u32 addr); void CALLBACK GSgifTransfer2(u32 *pMem, u32 size); void CALLBACK GSgifTransfer3(u32 *pMem, u32 size); diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h index 2ff39cbe84..800c1071b6 100644 --- a/common/include/Utilities/MemcpyFast.h +++ b/common/include/Utilities/MemcpyFast.h @@ -22,12 +22,14 @@ extern "C" void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes); extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize); extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize); + extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes); #else # include "win_memzero.h" extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes); + extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes); extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize); extern void memxor_mmx(void* dst, const void* src1, int cmpsize); @@ -40,9 +42,12 @@ void _memset16_unaligned( void* dest, u16 data, size_t size ); extern void memcpy_vibes(void * dest, const void * src, int size); extern void gen_memcpy_vibes(); -#define memcpy_fast memcpy_amd_ // Fast memcpy -#define memcpy_aligned memcpy_amd_ // Memcpy with 16-byte Aligned addresses -#define memcpy_const memcpy_amd_ // Memcpy with constant size -#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned -#define memcpy_qwc_ memcpy_vibes // Memcpy in aligned qwc increments, with 0x400 qwc or less -#define memcpy_qwc(x,y,z) memcpy_amd_(x, y, z*16) // Memcpy in aligned qwc increments +#define memcpy_fast memcpy_amd_ // Fast memcpy +#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c) // Memcpy with 16-byte Aligned addresses +#define memcpy_const memcpy_amd_ // Memcpy with constant size +#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned +#define memcpy_qwc_ memcpy_vibes // Memcpy in aligned qwc increments, with 0x400 qwc or less +#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c) + +// Useful alternative if we think memcpy_amd_qwc is buggy +//#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16) diff --git a/common/include/Utilities/Threading.h b/common/include/Utilities/Threading.h index 5df1b80621..a3fa5261fa 100644 --- a/common/include/Utilities/Threading.h +++ b/common/include/Utilities/Threading.h @@ -129,6 +129,10 @@ namespace Threading // For use in spin/wait loops. extern void SpinWait(); + + // Use prior to committing data to another thread (internal memcpy_qwc does not use fencing, + // so that many memcpys can be issued in a row more efficiently) + extern void StoreFence(); // Optional implementation to enable hires thread/process scheduler for the operating system. // Needed by Windows, but might not be relevant to other platforms. 
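The memcpy_qwc/StoreFence pairing above implies a two-step protocol on the producer side. The following is a minimal illustrative sketch only (CommitToRing and its parameters are hypothetical, not part of this patch): batch one or more quadword copies, then issue a single fence before the data is handed to the consuming thread.

	// Hypothetical producer-side helper, sketched against the declarations above.
	void CommitToRing( u128* dest, const u128* src, uint qwc )
	{
		memcpy_qwc( dest, src, qwc );	// copies in 128-bit units; no fence is guaranteed internally
		Threading::StoreFence();		// one sfence commits any pending streaming stores
		// ...now safe to publish the new write position / signal the consumer thread...
	}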
diff --git a/common/src/Utilities/Exceptions.cpp b/common/src/Utilities/Exceptions.cpp index 7ff4440f7f..6e4869fc5e 100644 --- a/common/src/Utilities/Exceptions.cpp +++ b/common/src/Utilities/Exceptions.cpp @@ -71,7 +71,7 @@ wxString DiagnosticOrigin::ToString( const wxChar* msg ) const bool pxAssertImpl_LogIt( const DiagnosticOrigin& origin, const wxChar *msg ) { - wxLogError( origin.ToString( msg ) ); + wxLogError( L"%s", origin.ToString( msg ) ); return false; } diff --git a/common/src/Utilities/Windows/WinThreads.cpp b/common/src/Utilities/Windows/WinThreads.cpp index 0133f89e38..22cdfb21d6 100644 --- a/common/src/Utilities/Windows/WinThreads.cpp +++ b/common/src/Utilities/Windows/WinThreads.cpp @@ -36,6 +36,11 @@ __forceinline void Threading::SpinWait() __asm pause; } +__forceinline void Threading::StoreFence() +{ + __asm sfence; +} + __forceinline void Threading::EnableHiresScheduler() { // This improves accuracy of Sleep() by some amount, and only adds a negligible amount of diff --git a/common/src/Utilities/x86/MemcpyFast.cpp b/common/src/Utilities/x86/MemcpyFast.cpp index 40caf98308..0c8af9e63e 100644 --- a/common/src/Utilities/x86/MemcpyFast.cpp +++ b/common/src/Utilities/x86/MemcpyFast.cpp @@ -41,12 +41,10 @@ MEMCPY_AMD.CPP ******************************************************************************/ -// Very optimized memcpy() routine for AMD Athlon and Duron family. -// This code uses any of FOUR different basic copy methods, depending -// on the transfer size. // NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or // "Streaming Store"), and also uses the software prefetch instructions, -// be sure you're running on Athlon/Duron or other recent CPU before calling! +// be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before +// calling! #define TINY_BLOCK_COPY 64 // upper limit for movsd type copy // The smallest copy uses the X86 "movsd" instruction, in an optimized @@ -68,10 +66,8 @@ MEMCPY_AMD.CPP #if defined(_MSC_VER) -// -------------------------------------------------------------------------------------- -// Fast memcpy as coded by AMD, and then improved by air. -// -------------------------------------------------------------------------------------- +// Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs. __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n) { __asm @@ -92,6 +88,7 @@ __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_ jbe $memcpy_do_align ; it appears to be slower cmp eax, 64*1024 jbe $memcpy_align_done + $memcpy_do_align: mov eax, 8 ; a trick that's faster than rep movsb... sub eax, edi ; align destination to qword @@ -146,7 +143,7 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy add esi, 64 ; update source pointer add edi, 64 ; update destination pointer - dec eax ; count down + sub eax, 1 jnz $memcpy_ic_1 ; last 64-byte block? $memcpy_ic_2: @@ -189,64 +186,15 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy movq mm1,[esi-8] movntq [edi-24], mm2 movntq [edi-16], mm0 - dec eax movntq [edi-8], mm1 + + sub eax, 1 jnz $memcpy_uc_1 ; last 64-byte block? jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed) -// For the largest size blocks, a special technique called Block Prefetch -// can be used to accelerate the read operations. Block Prefetch reads -// one address per cache line, for a series of cache lines, in a short loop. -// This is faster than using software prefetch. 
The technique is great for -// getting maximum read bandwidth, especially in DDR memory systems. - -// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to -// help keep the code cache footprint of memcpy_fast to a minimum. -/* -$memcpy_bp_1: ; large blocks, block prefetch copy - - cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop? - jl $memcpy_64_test ; no, back to regular uncached copy - - mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X - add esi, CACHEBLOCK * 64 ; move to the top of the block -align 16 -$memcpy_bp_2: - mov edx, [esi-64] ; grab one address per cache line - mov edx, [esi-128] ; grab one address per cache line - sub esi, 128 ; go reverse order to suppress HW prefetcher - dec eax ; count down the cache lines - jnz $memcpy_bp_2 ; keep grabbing more lines into cache - - mov eax, CACHEBLOCK ; now that it's in cache, do the copy -align 16 -$memcpy_bp_3: - movq mm0, [esi ] ; read 64 bits - movq mm1, [esi+ 8] - movq mm2, [esi+16] - movq mm3, [esi+24] - movq mm4, [esi+32] - movq mm5, [esi+40] - movq mm6, [esi+48] - movq mm7, [esi+56] - add esi, 64 ; update source pointer - movntq [edi ], mm0 ; write 64 bits, bypassing cache - movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU - movntq [edi+16], mm2 ; from READING the destination address - movntq [edi+24], mm3 ; into the cache, only to be over-written, - movntq [edi+32], mm4 ; so that also helps performance - movntq [edi+40], mm5 - movntq [edi+48], mm6 - movntq [edi+56], mm7 - add edi, 64 ; update dest pointer - - dec eax ; count down - - jnz $memcpy_bp_3 ; keep copying - sub ecx, CACHEBLOCK ; update the 64-byte block count - jmp $memcpy_bp_1 ; keep processing chunks -*/ +// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been +// disabled to help keep the code cache footprint of memcpy_fast to a minimum. // The smallest copy uses the X86 "movsd" instruction, in an optimized // form which is an "unrolled loop". Then it handles the last few bytes. @@ -274,17 +222,99 @@ $memcpy_last_few: ; dword aligned from before movsd's rep movsb ; the last 1, 2, or 3 bytes $memcpy_final: + pop esi + pop edi + emms ; clean up the MMX state sfence ; flush the write buffer //mov eax, [dest] ; ret value = destination pointer - pop esi - pop edi - ret 4 } } +// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest needs to be aligned. +__forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) +{ + // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM + // registers will improve copy performance, because they won't. Use of XMMs is only + // warranted in situations where both source and dest are guaranteed aligned to 16 bytes, + // and even then the benefits are typically minimal (sometimes slower depending on the + // amount of data being copied). + // + // Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them. + // --air + + // Linux Conversion note: + // This code would benefit nicely from having inline-able GAS syntax, since it should + // allow GCC to optimize the first 3 instructions out of existence in many scenarios. + // And it's called enough times to probably merit the extra effort to ensure proper + // optimization. --air + + __asm + { + mov ecx, dest + mov edx, src + mov eax, qwc ; keep a copy of count + shr eax, 1 + jz $memcpy_qwc_1 ; only one 16 byte block to copy?
+ + cmp eax, IN_CACHE_COPY/32 + jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air) + +$memcpy_qwc_loop2: ; 32-byte blocks, uncached copy + prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air) + + movq mm0,[edx+0] ; read 64 bits + movq mm1,[edx+8] + movq mm2,[edx+16] + movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache + movntq [ecx+8], mm1 + movq mm3,[edx+24] + movntq [ecx+16], mm2 + movntq [ecx+24], mm3 + + add edx,32 ; update source pointer + add ecx,32 ; update destination pointer + sub eax,1 + jnz $memcpy_qwc_loop2 ; last 32-byte block? + sfence ; flush the write buffer + jmp $memcpy_qwc_1 + +; 32-byte blocks, cached! +; This *is* important. Removing this and using exclusively non-temporal stores +; results in noticeable speed loss! + +$memcpy_qwc_loop1: + prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air) + + movq mm0,[edx+0] ; read 64 bits + movq mm1,[edx+8] + movq mm2,[edx+16] + movq [ecx+0], mm0 ; write 64 bits, cached + movq [ecx+8], mm1 + movq mm3,[edx+24] + movq [ecx+16], mm2 + movq [ecx+24], mm3 + + add edx,32 ; update source pointer + add ecx,32 ; update destination pointer + sub eax,1 + jnz $memcpy_qwc_loop1 ; last 32-byte block? + +$memcpy_qwc_1: + test qwc,1 + jz $memcpy_qwc_final + movq mm0,[edx] + movq mm1,[edx+8] + movq [ecx], mm0 + movq [ecx+8], mm1 + +$memcpy_qwc_final: + emms ; clean up the MMX state + } +} + // mmx mem-compare implementation, size has to be a multiple of 8 // returns 0 if equal, nonzero value if not equal // ~10 times faster than standard memcmp diff --git a/common/src/Utilities/x86/MemcpyVibes.cpp b/common/src/Utilities/x86/MemcpyVibes.cpp index 7efcd83f39..b154cd2847 100644 --- a/common/src/Utilities/x86/MemcpyVibes.cpp +++ b/common/src/Utilities/x86/MemcpyVibes.cpp @@ -156,3 +156,95 @@ __forceinline void memcpy_vibes(void * dest, const void * src, int size) { #endif #endif + +// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment, +// to get around compilation issues with having it in the headers. +#ifdef __LINUX__ + + // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now. + // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest needs to be aligned. + __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) + { + // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM + // registers will improve copy performance, because they won't. Use of XMMs is only + // warranted in situations where both source and dest are guaranteed aligned to 16 bytes, + // and even then the benefits are typically minimal (sometimes slower depending on the + // amount of data being copied). + // + // Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them. + // --air + + // Linux Conversion note: + // This code would benefit nicely from having inline-able GAS syntax, since it should + // allow GCC to optimize the first 3 instructions out of existence in many scenarios. + // And it's called enough times to probably merit the extra effort to ensure proper + // optimization. --air + + __asm__ + ( + ".intel_syntax noprefix\n" + "mov eax, %[qwc]\n" // keep a copy of count for looping + "shr eax, 1\n" + "jz memcpy_qwc_1\n" // only one 16 byte block to copy?
+ + "cmp eax, 64\n" // "IN_CACHE_COPY/32" + "jb memcpy_qwc_loop1\n" // small copies should be cached (definite speedup --air) + + "memcpy_qwc_loop2:\n" // 32-byte blocks, uncached copy + "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) + + "movq mm0,[%[src]+0]\n" // read 64 bits + "movq mm1,[%[src]+8]\n" + "movq mm2,[%[src]+16]\n" + "movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache + "movntq [%[dest]+8], mm1\n" + "movq mm3,[%[src]+24]\n" + "movntq [%[dest]+16], mm2\n" + "movntq [%[dest]+24], mm3\n" + + "add %[src],32\n" // update source pointer + "add %[dest],32\n" // update destination pointer + "sub eax,1\n" + "jnz memcpy_qwc_loop2\n" // last 64-byte block? + "sfence\n" // flush the write buffer + "jmp memcpy_qwc_1\n" + + // 32-byte blocks, cached! + // This *is* important. Removing this and using exclusively non-temporal stores + // results in noticeable speed loss! + + "memcpy_qwc_loop1:\n" + "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) + + "movq mm0,[%[src]+0]\n" // read 64 bits + "movq mm1,[%[src]+8]\n" + "movq mm2,[%[src]+16]\n" + "movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache + "movq [%[dest]+8], mm1\n" + "movq mm3,[%[src]+24]\n" + "movq [%[dest]+16], mm2\n" + "movq [%[dest]+24], mm3\n" + + "add %[src],32\n" // update source pointer + "add %[dest],32\n" // update destination pointer + "sub eax,1\n" + "jnz memcpy_qwc_loop1\n" // last 64-byte block? + + "memcpy_qwc_1:\n" + "test %[qwc],1\n" + "jz memcpy_qwc_final\n" + "movq mm0,[%[src]]\n" + "movq mm1,[%[src]+8]\n" + "movq [%[dest]], mm0\n" + "movq [%[dest]+8], mm1\n" + + "memcpy_qwc_final:\n" + "emms\n" // clean up the MMX state + ".att_syntax\n" + : "=&r"(dest), "=&r"(src), "=&r"(qwc) + : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc) + : "memory", "eax", "mm0", "mm1", "mm2", "mm3" + ); + } +#endif + diff --git a/pcsx2/Config.h b/pcsx2/Config.h index a81f51d8e8..cc83918e92 100644 --- a/pcsx2/Config.h +++ b/pcsx2/Config.h @@ -395,6 +395,7 @@ struct Pcsx2Config // style. Useful for debugging potential bugs in the MTGS pipeline. 
bool SynchronousMTGS; bool DisableOutput; + int VsyncQueueSize; bool FrameLimitEnable; bool FrameSkipEnable; @@ -420,6 +421,8 @@ struct Pcsx2Config return OpEqu( SynchronousMTGS ) && OpEqu( DisableOutput ) && + OpEqu( VsyncQueueSize ) && + OpEqu( FrameSkipEnable ) && OpEqu( FrameLimitEnable ) && OpEqu( VsyncEnable ) && diff --git a/pcsx2/FiFo.cpp b/pcsx2/FiFo.cpp index ed87881d03..282eb68aaa 100644 --- a/pcsx2/FiFo.cpp +++ b/pcsx2/FiFo.cpp @@ -164,7 +164,6 @@ void __fastcall WriteFIFO_page_5(u32 mem, const mem128_t *value) if(GSTransferStatus.PTH2 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH2) { - if(gifRegs->stat.DIR == 0)gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; if(gifRegs->stat.P1Q) gsPath1Interrupt(); } @@ -195,14 +194,12 @@ void __fastcall WriteFIFO_page_6(u32 mem, const mem128_t *value) nloop0_packet[1] = psHu32(GIF_FIFO + 4); nloop0_packet[2] = psHu32(GIF_FIFO + 8); nloop0_packet[3] = psHu32(GIF_FIFO + 12); - GetMTGS().PrepDataPacket(GIF_PATH_3, (u8*)nloop0_packet, 1); - u64* data = (u64*)GetMTGS().GetDataPacketPtr(); - data[0] = value[0]; - data[1] = value[1]; + GetMTGS().PrepDataPacket(GIF_PATH_3, 1); + //u64* data = (u64*)GetMTGS().GetDataPacketPtr(); + GIFPath_CopyTag( GIF_PATH_3, (u128*)nloop0_packet, 1 ); GetMTGS().SendDataPacket(); if(GSTransferStatus.PTH3 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH3 ) { - if(gifRegs->stat.DIR == 0)gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; if(gifRegs->stat.P1Q) gsPath1Interrupt(); } diff --git a/pcsx2/GS.cpp b/pcsx2/GS.cpp index b9028de2ee..4dc0251393 100644 --- a/pcsx2/GS.cpp +++ b/pcsx2/GS.cpp @@ -282,14 +282,19 @@ void __fastcall gsWrite64_page_01( u32 mem, const mem64_t* value ) { case 0x12001040: //busdir - //This is probably a complete hack, however writing to BUSDIR "should" start a transfer (Bleach Blade Battlers) - //Only problem is it kills killzone :( leaving it commented out for now. + //This is probably a complete hack, however writing to BUSDIR "should" start a transfer + //(Bleach Blade Battlers, Growlanser 2 and 3, Wizardry) + //Only problem is it kills killzone :(. // (yes it *is* a complete hack; both lines here in fact --air) //========================================================================= - //gifRegs->stat.OPH = true; + //Console.Warning("BUSDIR write! Setting OPH and DIR to = %x",(u32)value[0]); + if ((u32)value[0] == 1) + gifRegs->stat.OPH = true; + else + gifRegs->stat.OPH = false; + + gifRegs->stat.DIR = (u32)value[0]; //========================================================================= - gifRegs->stat.DIR = (u32)value; - // BUSDIR INSANITY !! MTGS FLUSH NEEDED // // Yup folks. BUSDIR is evil. 
The only safe way to handle it is to flush the whole MTGS diff --git a/pcsx2/GS.h b/pcsx2/GS.h index 3d1dc74d78..8162149218 100644 --- a/pcsx2/GS.h +++ b/pcsx2/GS.h @@ -229,7 +229,8 @@ enum GIF_PATH GIF_PATH_3, }; -extern int GIFPath_ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size); +extern void GIFPath_Initialize(); +extern int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size); extern int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size); extern void GIFPath_Reset(); extern void GIFPath_Clear( GIF_PATH pathidx ); @@ -248,7 +249,6 @@ enum MTGS_RingCommand GS_RINGTYPE_P1 , GS_RINGTYPE_P2 , GS_RINGTYPE_P3 -, GS_RINGTYPE_RESTART , GS_RINGTYPE_VSYNC , GS_RINGTYPE_FRAMESKIP , GS_RINGTYPE_FREEZE @@ -273,19 +273,20 @@ class SysMtgsThread : public SysThreadBase typedef SysThreadBase _parent; public: - // note: when m_RingPos == m_WritePos, the fifo is empty - uint m_RingPos; // cur pos gs is reading from + // note: when m_ReadPos == m_WritePos, the fifo is empty + uint m_ReadPos; // cur pos gs is reading from uint m_WritePos; // cur pos ee thread is writing to volatile bool m_RingBufferIsBusy; volatile u32 m_SignalRingEnable; volatile s32 m_SignalRingPosition; - int m_QueuedFrameCount; - u32 m_RingWrapSpot; + volatile s32 m_QueuedFrameCount; + volatile u32 m_VsyncSignalListener; - Mutex m_lock_RingBufferBusy; + Mutex m_mtx_RingBufferBusy; Semaphore m_sem_OnRingReset; + Semaphore m_sem_Vsync; // used to keep multiple threads from sending packets to the ringbuffer concurrently. // (currently not used or implemented -- is a planned feature for a future threaded VU1) @@ -301,8 +302,9 @@ public: // These vars maintain instance data for sending Data Packets. // Only one data packet can be constructed and uploaded at a time. + uint m_packet_startpos; // ringbuffer index of the current packet's command tag (filled in when the packet is sent) uint m_packet_size; // size of the packet (data only, ie. not including the 16 byte command!) - uint m_packet_ringpos; // index of the data location in the ringbuffer. + uint m_packet_writepos; // index of the data location in the ringbuffer. #ifdef RINGBUF_DEBUG_STACK Threading::Mutex m_lock_Stack; #endif @@ -317,14 +319,13 @@ public: void WaitGS(); void ResetGS(); - int PrepDataPacket( MTGS_RingCommand cmd, u32 size ); - int PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size ); + void PrepDataPacket( MTGS_RingCommand cmd, u32 size ); + void PrepDataPacket( GIF_PATH pathidx, u32 size ); void SendDataPacket(); void SendGameCRC( u32 crc ); void WaitForOpen(); void Freeze( int mode, MTGS_FreezeData& data ); - void RestartRingbuffer( uint packsize=0 ); void SendSimplePacket( MTGS_RingCommand type, int data0, int data1, int data2 ); void SendPointerPacket( MTGS_RingCommand type, u32 data0, void* data1 ); @@ -346,9 +347,10 @@ protected: void OnResumeInThread( bool IsSuspended ); void OnCleanupInThread(); + void GenericStall( uint size ); + // Used internally by SendSimplePacket type functions - uint _PrepForSimplePacket(); - void _FinishSimplePacket( uint future_writepos ); + void _FinishSimplePacket(); void ExecuteTaskInThread(); }; @@ -416,3 +418,36 @@ extern int g_nLeftGSFrames; #endif +// Size of the ringbuffer as a power of 2 -- size is a multiple of simd128s.
+// (actual size is 1<<RingBufferSizeFactor simd vectors [128-bit values]) +static const uint RingBufferSizeFactor = 19; + +// size of the ringbuffer in simd128's. +static const uint RingBufferSize = 1<<RingBufferSizeFactor; + +// Mask to apply to ring buffer indices to wrap the pointer from end to +// start (the wrapping is what makes it a ringbuffer, yo!) +static const uint RingBufferMask = RingBufferSize - 1; + +struct MTGS_BufferedData +{ + u128 m_Ring[RingBufferSize]; + u8 Regs[Ps2MemSize::GSregs]; + + MTGS_BufferedData() {} + + u128& operator[]( uint idx ) + { + pxAssert( idx < RingBufferSize ); + return m_Ring[idx]; + } +}; + +extern __aligned(32) MTGS_BufferedData RingBuffer; diff --git a/pcsx2/Gif.cpp b/pcsx2/Gif.cpp --- a/pcsx2/Gif.cpp +++ b/pcsx2/Gif.cpp @@ ... @@ void gsPath1Interrupt() if((gifRegs->stat.APATH <= GIF_APATH1 || (gifRegs->stat.IP3 == true && gifRegs->stat.APATH == GIF_APATH3)) && Path1WritePos > 0 && !gifRegs->stat.PSE) { gifRegs->stat.P1Q = false; - while(Path1WritePos > 0) - { - u32 size = GetMTGS().PrepDataPacket(GIF_PATH_1, Path1Buffer + (Path1ReadPos * 16), (Path1WritePos - Path1ReadPos)); - u8* pDest = GetMTGS().GetDataPacketPtr(); - //DevCon.Warning("Flush Size = %x", size); - - memcpy_aligned(pDest, Path1Buffer + (Path1ReadPos * 16), size * 16); - GetMTGS().SendDataPacket(); - - Path1ReadPos += size; - - if(GSTransferStatus.PTH1 == STOPPED_MODE) + if (uint size = (Path1WritePos - Path1ReadPos)) + { + GetMTGS().PrepDataPacket(GIF_PATH_1, size); + //DevCon.Warning("Flush Size = %x", size); + while(size > 0) { - gifRegs->stat.OPH = false; - gifRegs->stat.APATH = GIF_APATH_IDLE; + uint count = GIFPath_CopyTag(GIF_PATH_1, ((u128*)Path1Buffer) + Path1ReadPos, size); + Path1ReadPos += count; + size -= count; + + if(GSTransferStatus.PTH1 == STOPPED_MODE) + { + gifRegs->stat.APATH = GIF_APATH_IDLE; + } } + GetMTGS().SendDataPacket(); if(Path1ReadPos == Path1WritePos) { @@ -105,7 +105,6 @@ __forceinline void gsInterrupt() if(GSTransferStatus.PTH3 >= PENDINGSTOP_MODE && gifRegs->stat.APATH == GIF_APATH3 ) { - gifRegs->stat.OPH = false; GSTransferStatus.PTH3 = STOPPED_MODE; gifRegs->stat.APATH = GIF_APATH_IDLE; if(gifRegs->stat.P1Q) gsPath1Interrupt(); @@ -150,11 +149,8 @@ __forceinline void gsInterrupt() static u32 WRITERING_DMA(u32 *pMem, u32 qwc) { - int size = GetMTGS().PrepDataPacket(GIF_PATH_3, (u8*)pMem, qwc); - u8* pgsmem = GetMTGS().GetDataPacketPtr(); - - memcpy_aligned(pgsmem, pMem, size<<4); - + GetMTGS().PrepDataPacket(GIF_PATH_3, qwc); + uint size = GIFPath_CopyTag(GIF_PATH_3, (u128*)pMem, qwc ); GetMTGS().SendDataPacket(); return size; } @@ -167,7 +163,6 @@ static u32 WRITERING_DMA(tDMA_TAG *pMem, u32 qwc) int _GIFchain() { tDMA_TAG *pMem; - int qwc = 0; pMem = dmaGetAddr(gif->madr, false); if (pMem == NULL) { return -1; } - //in Intermittent Mode it enabled, IMAGE_MODE transfers are sliced. - ///(gifRegs->stat.IMT && GSTransferStatus.PTH3 <= IMAGE_MODE) qwc = min((int)gif->qwc, 8); - /*else qwc = gif->qwc;*/ - return WRITERING_DMA(pMem, gif->qwc); } @@ -327,7 +317,7 @@ void GIFdma() - //gifRegs->stat.OPH = true; + //gifRegs->stat.OPH = true; // why set the GS output path flag here? (rama) gifRegs->stat.FQC = min((u16)0x10, gif->qwc);// FQC=31, hack ;) (for values of 31 that equal 16) [ used to be 0xE00; // APATH=3] //Check with Path3 masking games if (gif->qwc > 0) { @@ -346,7 +336,7 @@ } - //gifRegs->stat.OPH = true; + //gifRegs->stat.OPH = true; // why set the GS output path flag here? (rama) // Transfer Dn_QWC from Dn_MADR to GIF if ((gif->chcr.MOD == NORMAL_MODE) || (gif->qwc > 0)) // Normal Mode { @@ -450,42 +440,44 @@ static __forceinline bool mfifoGIFrbTransfer() u32 mfifoqwc = min(gifqwc, (u32)gif->qwc); u32 *src; + GetMTGS().PrepDataPacket(GIF_PATH_3, mfifoqwc); + + // TODO (minor optimization): The new GIFpath parser can do rather efficient wrapping of + // its own internally now. We just need to groom a version of it that can wrap around MFIFO + // memory similarly to how it wraps VU1 memory on PATH1.
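+	// Sketch of the wrap math used below, for reference ('ringEnd' is illustrative shorthand,
+	// not a real variable): the MFIFO ring occupies [rbor.ADDR, rbor.ADDR + rbsr.RMSK + 16),
+	// so a transfer of mfifoqwc qwords starting at gif->madr splits into:
+	//    uint ringEnd = dmacRegs->rbor.ADDR + dmacRegs->rbsr.RMSK + 16;  // one past the last ring byte
+	//    uint s1 = (ringEnd - gif->madr) >> 4;   // qwords that fit before the wrap point
+	//    uint s2 = mfifoqwc - s1;                // qwords copied from rbor.ADDR after wrapping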
+ /* Check if the transfer should wrap around the ring buffer */ if ((gif->madr + mfifoqwc * 16) > (dmacRegs->rbor.ADDR + dmacRegs->rbsr.RMSK + 16)) { uint s1 = ((dmacRegs->rbor.ADDR + dmacRegs->rbsr.RMSK + 16) - gif->madr) >> 4; uint s2 = (mfifoqwc - s1); - // fixme - I don't think these should use WRITERING_DMA, since our source - // isn't the DmaGetAddr(gif->madr) address that WRITERING_DMA expects. /* it does (wrap around), so first copy 's1' bytes from 'addr' to 'data' */ + /* and second copy 's2' bytes from 'maddr' to '&data[s1]' */ + src = (u32*)PSM(gif->madr); if (src == NULL) return false; - s1 = WRITERING_DMA(src, s1); + uint copied = GIFPath_CopyTag(GIF_PATH_3, (u128*)src, s1); - if (s1 == (mfifoqwc - s2)) + if (copied == s1) // but only copy second if first didn't abort prematurely for some reason. { - /* and second copy 's2' bytes from 'maddr' to '&data[s1]' */ src = (u32*)PSM(dmacRegs->rbor.ADDR); if (src == NULL) return false; - s2 = WRITERING_DMA(src, s2); - } - else - { - s2 = 0; + copied += GIFPath_CopyTag(GIF_PATH_3, (u128*)src, s2); } - mfifoqwc = s1 + s2; + mfifoqwc = copied; } else { /* it doesn't, so just transfer 'qwc*16' words from 'gif->madr' to GS */ src = (u32*)PSM(gif->madr); if (src == NULL) return false; - mfifoqwc = WRITERING_DMA(src, mfifoqwc); + mfifoqwc = GIFPath_CopyTag(GIF_PATH_3, (u128*)src, mfifoqwc); gif->madr = dmacRegs->rbor.ADDR + (gif->madr & dmacRegs->rbsr.RMSK); } + GetMTGS().SendDataPacket(); gifqwc -= mfifoqwc; return true; @@ -571,36 +563,36 @@ void mfifoGIFtransfer(int qwc) switch (ptag->ID) { - case TAG_REFE: // Refe - Transfer Packet According to ADDR field + case TAG_REFE: // Refe - Transfer Packet According to ADDR field gif->tadr = qwctag(gif->tadr + 16); gifstate = GIF_STATE_DONE; //End Transfer break; - case TAG_CNT: // CNT - Transfer QWC following the tag. + case TAG_CNT: // CNT - Transfer QWC following the tag. gif->madr = qwctag(gif->tadr + 16); //Set MADR to QW after Tag - gif->tadr = qwctag(gif->madr + (gif->qwc << 4)); //Set TADR to QW following the data + gif->tadr = qwctag(gif->madr + (gif->qwc << 4)); //Set TADR to QW following the data gifstate = GIF_STATE_READY; break; - case TAG_NEXT: // Next - Transfer QWC following tag. TADR = ADDR + case TAG_NEXT: // Next - Transfer QWC following tag. 
TADR = ADDR { - u32 temp = gif->madr; //Temporarily Store ADDR - gif->madr = qwctag(gif->tadr + 16); //Set MADR to QW following the tag - gif->tadr = temp; //Copy temporarily stored ADDR to Tag + u32 temp = gif->madr; //Temporarily Store ADDR + gif->madr = qwctag(gif->tadr + 16); //Set MADR to QW following the tag + gif->tadr = temp; //Copy temporarily stored ADDR to Tag gifstate = GIF_STATE_READY; break; } - case TAG_REF: // Ref - Transfer QWC from ADDR field - case TAG_REFS: // Refs - Transfer QWC from ADDR field (Stall Control) + case TAG_REF: // Ref - Transfer QWC from ADDR field + case TAG_REFS: // Refs - Transfer QWC from ADDR field (Stall Control) gif->tadr = qwctag(gif->tadr + 16); //Set TADR to next tag gifstate = GIF_STATE_READY; break; - case TAG_END: // End - Transfer QWC following the tag - gif->madr = qwctag(gif->tadr + 16); //Set MADR to data following the tag - gif->tadr = qwctag(gif->madr + (gif->qwc << 4)); //Set TADR to QW following the data - gifstate = GIF_STATE_DONE; //End Transfer + case TAG_END: // End - Transfer QWC following the tag + gif->madr = qwctag(gif->tadr + 16); //Set MADR to data following the tag + gif->tadr = qwctag(gif->madr + (gif->qwc << 4)); //Set TADR to QW following the data + gifstate = GIF_STATE_DONE; //End Transfer break; } @@ -638,7 +630,6 @@ void gifMFIFOInterrupt() if(GSTransferStatus.PTH3 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH3 ) { - gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; if(gifRegs->stat.P1Q) gsPath1Interrupt(); } diff --git a/pcsx2/Gif.h b/pcsx2/Gif.h index 8533cdc98d..097cb03532 100644 --- a/pcsx2/Gif.h +++ b/pcsx2/Gif.h @@ -290,7 +290,7 @@ extern void gifMFIFOInterrupt(); //Just some temporary bits to store Path1 transfers if another is in progress. extern void gsPath1Interrupt(); -extern u8 Path1Buffer[0x1000000]; +extern __aligned16 u8 Path1Buffer[0x1000000]; extern u32 Path1WritePos; extern u32 Path1ReadPos; #endif diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index a6905b9788..115e08c3c5 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -29,7 +29,7 @@ using namespace Threading; -#if 0 // PCSX2_DEBUG +#if 0 //PCSX2_DEBUG # define MTGS_LOG Console.WriteLn #else # define MTGS_LOG 0&& @@ -46,34 +46,7 @@ using namespace Threading; // MTGS Threaded Class Implementation // ===================================================================================================== -// Size of the ringbuffer as a power of 2 -- size is a multiple of simd128s. -// (actual size is 1<<RingBufferSizeFactor simd vectors [128-bit values]) -static const uint RingBufferSizeFactor = 19; - -// size of the ringbuffer in simd128's. -static const uint RingBufferSize = 1<<RingBufferSizeFactor; - -// Mask to apply to ring buffer indices to wrap the pointer from end to -// start (the wrapping is what makes it a ringbuffer, yo!) -static const uint RingBufferMask = RingBufferSize - 1; - -struct MTGS_BufferedData -{ - u128 m_Ring[RingBufferSize]; - u8 Regs[Ps2MemSize::GSregs]; - - MTGS_BufferedData() {} - - u128& operator[]( uint idx ) - { - pxAssert( idx < RingBufferSize ); - return m_Ring[idx]; - } -}; - __aligned(32) MTGS_BufferedData RingBuffer; @@ ... @@ void SysMtgsThread::PostVsyncEnd() - if( m_QueuedFrameCount > 0 ) - RestartRingbuffer(); - else - { - m_QueuedFrameCount++; - SetEvent(); - } + // If the MTGS is allowed to queue a lot of frames in advance, it creates input lag. + // Use the Queued FrameCount to stall the EE if another vsync (or two) are already queued + // in the ringbuffer. The queue limit is disabled when both FrameLimiting and Vsync are + // disabled, since the queue can have perverse effects on framerate benchmarking.
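+	// For reference, the consumer half of this handshake is in ExecuteTaskInThread's VSYNC
+	// case below: the MTGS thread calls AtomicDecrement(m_QueuedFrameCount) and, if
+	// m_VsyncSignalListener is set, posts m_sem_Vsync to wake the sleeping EE core.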
+ + if ((AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize) || (!EmuConfig.GS.VsyncEnable && !EmuConfig.GS.FrameLimitEnable)) return; + + m_VsyncSignalListener = true; + //Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\t\tringpos=0x%06x, writepos=0x%06x", volatize(m_ReadPos), m_WritePos ); + m_sem_Vsync.WaitNoCancel(); } struct PacketTagType @@ -261,12 +239,14 @@ void SysMtgsThread::OpenPlugin() class RingBufferLock : public ScopedLock { + typedef ScopedLock _parent; + protected: SysMtgsThread& m_mtgs; public: RingBufferLock( SysMtgsThread& mtgs ) - : ScopedLock( mtgs.m_lock_RingBufferBusy ) + : ScopedLock( mtgs.m_mtx_RingBufferBusy ) , m_mtgs( mtgs ) { m_mtgs.m_RingBufferIsBusy = true; @@ -276,6 +256,18 @@ public: { m_mtgs.m_RingBufferIsBusy = false; } + + void Acquire() + { + _parent::Acquire(); + m_mtgs.m_RingBufferIsBusy = true; + } + + void Release() + { + m_mtgs.m_RingBufferIsBusy = false; + _parent::Release(); + } }; void SysMtgsThread::ExecuteTaskInThread() @@ -284,31 +276,33 @@ void SysMtgsThread::ExecuteTaskInThread() PacketTagType prevCmd; #endif + RingBufferLock busy( *this ); + while( true ) { + busy.Release(); + // Performance note: Both of these perform cancellation tests, but pthread_testcancel // is very optimized (only 1 instruction test in most cases), so no point in trying // to avoid it. m_sem_event.WaitWithoutYield(); StateCheckInThread(); + busy.Acquire(); - { - RingBufferLock busy( *this ); - - // note: m_RingPos is intentionally not volatile, because it should only + // note: m_ReadPos is intentionally not volatile, because it should only // ever be modified by this thread. - while( m_RingPos != volatize(m_WritePos)) + while( m_ReadPos != volatize(m_WritePos)) { if( EmuConfig.GS.DisableOutput ) { - m_RingPos = m_WritePos; + m_ReadPos = m_WritePos; continue; } - pxAssert( m_RingPos < RingBufferSize ); + pxAssert( m_ReadPos < RingBufferSize ); - const PacketTagType& tag = (PacketTagType&)RingBuffer[m_RingPos]; + const PacketTagType& tag = (PacketTagType&)RingBuffer[m_ReadPos]; u32 ringposinc = 1; #ifdef RINGBUF_DEBUG_STACK @@ -316,11 +310,11 @@ void SysMtgsThread::ExecuteTaskInThread() m_lock_Stack.Lock(); uptr stackpos = ringposStack.back(); - if( stackpos != m_RingPos ) + if( stackpos != m_ReadPos ) { - Console.Error( "MTGS Ringbuffer Critical Failure ---> %x to %x (prevCmd: %x)\n", stackpos, m_RingPos, prevCmd.command ); + Console.Error( "MTGS Ringbuffer Critical Failure ---> %x to %x (prevCmd: %x)\n", stackpos, m_ReadPos, prevCmd.command ); } - pxAssert( stackpos == m_RingPos ); + pxAssert( stackpos == m_ReadPos ); prevCmd = tag; ringposStack.pop_back(); m_lock_Stack.Release(); @@ -330,38 +324,75 @@ void SysMtgsThread::ExecuteTaskInThread() { case GS_RINGTYPE_P1: { + uint datapos = (m_ReadPos+1) & RingBufferMask; const int qsize = tag.data[0]; - const u128* data = &RingBuffer[m_RingPos+1]; + const u128* data = &RingBuffer[datapos]; MTGS_LOG( "(MTGS Packet Read) ringtype=P1, qwc=%u", qsize ); - // make sure that tag>>16 is the MAX size readable - GSgifTransfer1((u32*)(data - 0x400 + qsize), 0x4000-qsize*16); - //GSgifTransfer1((u32*)data, qsize); + uint endpos = datapos + qsize; + if( endpos >= RingBufferSize ) + { + uint firstcopylen = RingBufferSize - datapos; + GSgifTransfer( (u32*)data, firstcopylen ); + datapos = endpos & RingBufferMask; + GSgifTransfer( (u32*)RingBuffer.m_Ring, datapos ); + } + else + { + GSgifTransfer( (u32*)data, qsize ); + } + ringposinc += qsize; } break; case GS_RINGTYPE_P2: { + uint datapos = (m_ReadPos+1) & 
RingBufferMask; const int qsize = tag.data[0]; - const u128* data = &RingBuffer[m_RingPos+1]; + const u128* data = &RingBuffer[datapos]; MTGS_LOG( "(MTGS Packet Read) ringtype=P2, qwc=%u", qsize ); - GSgifTransfer2((u32*)data, qsize); + uint endpos = datapos + qsize; + if( endpos >= RingBufferSize ) + { + uint firstcopylen = RingBufferSize - datapos; + GSgifTransfer2( (u32*)data, firstcopylen ); + datapos = endpos & RingBufferMask; + GSgifTransfer2( (u32*)RingBuffer.m_Ring, datapos ); + } + else + { + GSgifTransfer2( (u32*)data, qsize ); + } + ringposinc += qsize; } break; case GS_RINGTYPE_P3: { + uint datapos = (m_ReadPos+1) & RingBufferMask; const int qsize = tag.data[0]; - const u128* data = &RingBuffer[m_RingPos+1]; + const u128* data = &RingBuffer[datapos]; MTGS_LOG( "(MTGS Packet Read) ringtype=P3, qwc=%u", qsize ); - GSgifTransfer3((u32*)data, qsize); + uint endpos = datapos + qsize; + if( endpos >= RingBufferSize ) + { + uint firstcopylen = RingBufferSize - datapos; + GSgifTransfer3( (u32*)data, firstcopylen ); + datapos = endpos & RingBufferMask; + GSgifTransfer3( (u32*)RingBuffer.m_Ring, datapos ); + } + else + { + GSgifTransfer3( (u32*)data, qsize ); + } + ringposinc += qsize; } break; @@ -370,25 +401,25 @@ void SysMtgsThread::ExecuteTaskInThread() { switch( tag.command ) { - case GS_RINGTYPE_RESTART: - //MTGS_LOG( "(MTGS Packet Read) ringtype=Restart" ); - m_RingPos = 0; - continue; - case GS_RINGTYPE_VSYNC: { const int qsize = tag.data[0]; ringposinc += qsize; - MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", tag.data[0], tag.data[1] ? "true" : "false" ); - + MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", !!(((u32&)RingBuffer.Regs[0x1000]) & 0x2000) ? 0 : 1, tag.data[1] ? "true" : "false" ); + // Mail in the important GS registers. - RingCmdPacket_Vsync& local((RingCmdPacket_Vsync&)RingBuffer[m_RingPos+1]); - memcpy_fast( RingBuffer.Regs, local.regset1, sizeof(local.regset1)); - ((u32&)RingBuffer.Regs[0x1000]) = local.csr; - ((u32&)RingBuffer.Regs[0x1010]) = local.imr; - ((GSRegSIGBLID&)RingBuffer.Regs[0x1080]) = local.siglblid; - + // This seemingly obtuse system is needed in order to handle cases where the vsync data wraps + // around the edge of the ringbuffer. If not for that I'd just use a struct. >_< + + uint datapos = (m_ReadPos+1) & RingBufferMask; + MemCopy_WrappedSrc( RingBuffer.m_Ring, datapos, RingBufferSize, (u128*)RingBuffer.Regs, 0xf ); + + u32* remainder = (u32*)&RingBuffer[datapos]; + ((u32&)RingBuffer.Regs[0x1000]) = remainder[0]; + ((u32&)RingBuffer.Regs[0x1010]) = remainder[1]; + ((GSRegSIGBLID&)RingBuffer.Regs[0x1080]) = (GSRegSIGBLID&)remainder[2]; + // CSR & 0x2000; is the pageflip id. GSvsync(((u32&)RingBuffer.Regs[0x1000]) & 0x2000); gsFrameSkip(); @@ -398,7 +429,13 @@ void SysMtgsThread::ExecuteTaskInThread() if( (GSopen2 == NULL) && (PADupdate != NULL) ) PADupdate(0); + AtomicDecrement( m_QueuedFrameCount ); + if (!!AtomicExchange(m_VsyncSignalListener, false)) + m_sem_Vsync.Post(); + + busy.Release(); StateCheckInThread(); + busy.Acquire(); } break; @@ -438,9 +475,9 @@ void SysMtgsThread::ExecuteTaskInThread() #ifdef PCSX2_DEVBUILD default: - Console.Error("GSThreadProc, bad packet (%x) at m_RingPos: %x, m_WritePos: %x", tag.command, m_RingPos, m_WritePos); + Console.Error("GSThreadProc, bad packet (%x) at m_ReadPos: %x, m_WritePos: %x", tag.command, m_ReadPos, m_WritePos); pxFail( "Bad packet encountered in the MTGS Ringbuffer." 
); - m_RingPos = m_WritePos; + m_ReadPos = m_WritePos; continue; #else // Optimized performance in non-Dev builds. @@ -450,23 +487,29 @@ void SysMtgsThread::ExecuteTaskInThread() } } - uint newringpos = m_RingPos + ringposinc; - pxAssert( newringpos <= RingBufferSize ); - m_RingPos = newringpos & RingBufferMask; + uint newringpos = (m_ReadPos + ringposinc) & RingBufferMask; + + if( EmuConfig.GS.SynchronousMTGS ) + { + pxAssert( m_WritePos == newringpos ); + } + + m_ReadPos = newringpos; if( m_SignalRingEnable != 0 ) { // The EEcore has requested a signal after some amount of processed data. if( AtomicExchangeSub( m_SignalRingPosition, ringposinc ) <= 0 ) { - // Make sure to post the signal after the m_RingPos has been updated... + // Make sure to post the signal after the m_ReadPos has been updated... AtomicExchange( m_SignalRingEnable, 0 ); m_sem_OnRingReset.Post(); continue; } } } - } + + busy.Release(); // Safety valve in case standard signals fail for some reason -- this ensures the EEcore // won't sleep the eternity, even if SignalRingPosition didn't reach 0 for some reason. @@ -479,7 +522,10 @@ void SysMtgsThread::ExecuteTaskInThread() m_sem_OnRingReset.Post(); } - //Console.Warning( "(MTGS Thread) Nothing to do! ringpos=0x%06x", m_RingPos ); + if (!!AtomicExchange(m_VsyncSignalListener, false)) + m_sem_Vsync.Post(); + + //Console.Warning( "(MTGS Thread) Nothing to do! ringpos=0x%06x", m_ReadPos ); } } @@ -519,15 +565,15 @@ void SysMtgsThread::WaitGS() if( m_ExecMode == ExecMode_NoThreadYet || !IsRunning() ) return; if( !pxAssertDev( IsOpen(), "MTGS Warning! WaitGS issued on a closed thread." ) ) return; - if( volatize(m_RingPos) != m_WritePos ) + if( volatize(m_ReadPos) != m_WritePos ) { SetEvent(); RethrowException(); do { - m_lock_RingBufferBusy.Wait(); + m_mtx_RingBufferBusy.Wait(); RethrowException(); - } while( volatize(m_RingPos) != m_WritePos ); + } while( volatize(m_ReadPos) != m_WritePos ); } // Completely synchronize GS and MTGS register states. @@ -546,7 +592,7 @@ void SysMtgsThread::SetEvent() u8* SysMtgsThread::GetDataPacketPtr() const { - return (u8*)&RingBuffer[m_packet_ringpos]; + return (u8*)&RingBuffer[m_packet_writepos & RingBufferMask]; } // Closes the data packet send command, and initiates the gs thread (if needed). @@ -555,31 +601,14 @@ void SysMtgsThread::SendDataPacket() // make sure a previous copy block has been started somewhere. pxAssert( m_packet_size != 0 ); - uint temp = m_packet_ringpos + m_packet_size; - pxAssert( temp <= RingBufferSize ); - temp &= RingBufferMask; + uint actualSize = ((m_packet_writepos - m_packet_startpos) & RingBufferMask)-1; + pxAssert( actualSize <= m_packet_size ); + pxAssert( m_packet_writepos < RingBufferSize ); - if( IsDebugBuild ) - { - if( m_packet_ringpos + m_packet_size < RingBufferSize ) - { - uint readpos = volatize(m_RingPos); - if( readpos != m_WritePos ) - { - // The writepos should never leapfrog the readpos - // since that indicates a bad write. 
- if( m_packet_ringpos < readpos ) - pxAssert( temp < readpos ); + PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos]; + tag.data[0] = actualSize; - // Updating the writepos should never make it equal the readpos, since - // that would stop the buffer prematurely (and indicates bad code in the - // ringbuffer manager) - pxAssert( readpos != temp ); - } - } - - m_WritePos = temp; + m_WritePos = m_packet_writepos; if( EmuConfig.GS.SynchronousMTGS ) { @@ -596,142 +625,95 @@ void SysMtgsThread::SendDataPacket() //m_PacketLocker.Release(); } -int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) +void SysMtgsThread::GenericStall( uint size ) { // Note on volatiles: m_WritePos is not modified by the GS thread, so there's no need // to use volatile reads here. We do cache it though, since we know it never changes // for the duration of this call. - uint writepos = m_WritePos; - - // Checks if a previous copy was started without an accompanying call to GSRINGBUF_DONECOPY - pxAssert( m_packet_size == 0 ); + const uint writepos = m_WritePos; // Sanity checks! (within the confines of our ringbuffer please!) pxAssert( size < RingBufferSize ); pxAssert( writepos < RingBufferSize ); + // generic gs wait/stall. + // if the writepos is past the readpos then we're safe. + // But if not then we need to make sure the readpos is outside the scope of + // the block about to be written (writepos + size) + + uint readpos = volatize(m_ReadPos); + uint freeroom; + + if (writepos < readpos) + freeroom = readpos - writepos; + else + freeroom = RingBufferSize - (writepos - readpos); + + if (freeroom <= size) + { + // writepos will overlap readpos if we commit the data, so we need to wait until + // readpos is out past the end of the future write pos, or until it wraps around + // (in which case writepos will be >= readpos). + + // Ideally though we want to wait longer, because if we just toss in this packet + // the next packet will likely stall up too. So let's set a condition for the MTGS + // thread to wake up the EE once there's a sizable chunk of the ringbuffer emptied. + + uint somedone = (RingBufferSize - freeroom) / 4; + if( somedone < size+1 ) somedone = size + 1; + + // FMV Optimization: FMVs typically send *very* little data to the GS, in some cases + // every other frame is nothing more than a page swap. Sleeping the EEcore is a + // waste of time, and we get better results using a spinwait. + + if( somedone > 0x80 ) + { + pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" ); + m_SignalRingPosition = somedone; + + //Console.WriteLn( Color_Blue, "(EEcore Sleep) PrepDataPacket \tringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", readpos, writepos, m_SignalRingPosition ); + + while(true) { + AtomicExchange( m_SignalRingEnable, 1 ); + SetEvent(); + m_sem_OnRingReset.WaitWithoutYield(); + readpos = volatize(m_ReadPos); + //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); + + if (writepos < readpos) + freeroom = readpos - writepos; + else + freeroom = RingBufferSize - (writepos - readpos); + + if (freeroom > size) break; + } + + pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); + } + else + { + //Console.WriteLn( Color_StrongGray, "(EEcore Spin) PrepDataPacket!"
); + SetEvent(); + while(true) { + SpinWait(); + readpos = volatize(m_ReadPos); + + if (writepos < readpos) + freeroom = readpos - writepos; + else + freeroom = RingBufferSize - (writepos - readpos); + + if (freeroom > size) break; + } + } + } +} + +void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) +{ m_packet_size = size; ++size; // takes into account our RingCommand QWC. - - if( writepos + size < RingBufferSize ) - { - // generic gs wait/stall. - // if the writepos is past the readpos then we're safe. - // But if not then we need to make sure the readpos is outside the scope of - // the block about to be written (writepos + size) - - uint readpos = volatize(m_RingPos); - if( (writepos < readpos) && (writepos+size >= readpos) ) - { - // writepos is behind the readpos and will overlap it if we commit the data, - // so we need to wait until readpos is out past the end of the future write pos, - // or until it wraps around (in which case writepos will be >= readpos). - - // Ideally though we want to wait longer, because if we just toss in this packet - // the next packet will likely stall up too. So lets set a condition for the MTGS - // thread to wake up the EE once there's a sizable chunk of the ringbuffer emptied. - - uint totalAccum = (m_RingWrapSpot - readpos) + writepos; - uint somedone = totalAccum / 4; - if( somedone < size+1 ) somedone = size + 1; - - // FMV Optimization: FMVs typically send *very* little data to the GS, in some cases - // every other frame is nothing more than a page swap. Sleeping the EEcore is a - // waste of time, and we get better results using a spinwait. - - if( somedone > 0x80 ) - { - pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" ); - m_SignalRingPosition = somedone; - - //Console.WriteLn( Color_Blue, "(EEcore Sleep) GenStall \tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", readpos, writepos, m_RingWrapSpot, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); - } while( (writepos < readpos) && (writepos+size >= readpos) ); - - pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); - } - else - { - SetEvent(); - do { - SpinWait(); - readpos = volatize(m_RingPos); - } while( (writepos < readpos) && (writepos+size >= readpos) ); - } - } - } - else if( writepos + size > RingBufferSize ) - { - pxAssert( writepos != 0 ); - - // If the incoming packet doesn't fit, then start over from the start of the ring - // buffer (it's a lot easier than trying to wrap the packet around the end of the - // buffer). - - //Console.WriteLn( "MTGS > Ringbuffer Got Filled!"); - RestartRingbuffer( size ); - writepos = m_WritePos; - } - else // always true - if( writepos + size == MTGS_RINGBUFFEREND ) - { - // Yay. Perfect fit. What are the odds? - // Copy is ready so long as readpos is less than writepos and *not* equal to the - // base of the ringbuffer (otherwise the buffer will stop when the writepos is - // wrapped around to zero later-on in SendDataPacket). - - uint readpos = volatize(m_RingPos); - //Console.WriteLn( "MTGS > Perfect Fit!\tringpos=0x%06x, writepos=0x%06x", readpos, writepos ); - if( readpos > writepos || readpos == 0 ) - { - uint totalAccum = (readpos == 0) ? 
RingBufferSize : ((m_RingWrapSpot - readpos) + writepos); - uint somedone = totalAccum / 4; - if( somedone < size+1 ) somedone = size + 1; - - // FMV Optimization: (see above) This condition of a perfect fit is so rare that optimizing - // for it is pointless -- but it was also mindlessly simple copy-paste. So there. :p - - if( somedone > 0x80 ) - { - m_SignalRingPosition = somedone; - - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Sleep!\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Post-sleep Report!\tringpos=0x%06x", readpos ); - } while( (writepos < readpos) || (readpos==0) ); - - pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); - } - else - { - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Spin!" ); - SetEvent(); - do { - SpinWait(); - readpos = volatize(m_RingPos); - } while( (writepos < readpos) || (readpos==0) ); - } - - m_QueuedFrameCount = 0; - m_RingWrapSpot = RingBufferSize; - } - -#ifdef RINGBUF_DEBUG_STACK - m_lock_Stack.Lock(); - ringposStack.push_front( writepos ); - m_lock_Stack.Release(); -#endif + GenericStall(size); // Command qword: Low word is the command, and the high word is the packet // length in SIMDs (128 bits). @@ -739,9 +721,8 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; tag.command = cmd; tag.data[0] = m_packet_size; - m_packet_ringpos = m_WritePos + 1; - - return m_packet_size; + m_packet_startpos = m_WritePos; + m_packet_writepos = (m_WritePos + 1) & RingBufferMask; } // Returns the amount of giftag data processed (in simd128 values). // Return value is used by VU1's XGKICK instruction to wrap the data // around VU memory instead of having buffer overflow... // Parameters: // size - size of the packet data, in simd128's -int SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size ) +void SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, u32 size ) { //m_PacketLocker.Acquire(); - return PrepDataPacket( (MTGS_RingCommand)pathidx, GIFPath_ParseTag(pathidx, srcdata, size) ); + PrepDataPacket( (MTGS_RingCommand)pathidx, size ); } -void SysMtgsThread::RestartRingbuffer( uint packsize ) +__forceinline void SysMtgsThread::_FinishSimplePacket() { - if( m_WritePos == 0 ) return; - const uint thefuture = packsize; - - //Console.WriteLn( Color_Magenta, "**** Ringbuffer Restart!!" ); - // Always kick the MTGS into action for a ringbuffer restart. - SetEvent(); - - uint readpos = volatize(m_RingPos); - - if( (readpos > m_WritePos) || (readpos <= thefuture) ) - { - // We have to be careful not to leapfrog our read-position, which would happen if - // it's greater than the current write position (since wrapping writepos to 0 would - // be the act of skipping PAST readpos). Stall until it loops around to the - // beginning of the buffer, and past the size of our packet allocation.
- - uint somedone; - - if( readpos > m_WritePos ) - somedone = (m_RingWrapSpot - readpos) + packsize + 1; - else - somedone = (packsize + 1) - readpos; - - if( somedone > 0x80 ) - { - m_SignalRingPosition = somedone; - //Console.WriteLn( Color_Blue, "(EEcore Sleep) Restart!\tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", - // readpos, m_WritePos, m_RingWrapSpot, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); - } while( (readpos > m_WritePos) || (readpos <= thefuture) ); - } - else - { - SetEvent(); - do { - SpinWait(); - readpos = volatize(m_RingPos); - } while( (readpos > m_WritePos) || (readpos <= thefuture) ); - } - } - - PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; - - tag.command = GS_RINGTYPE_RESTART; - - m_RingWrapSpot = m_WritePos; - m_WritePos = 0; - m_QueuedFrameCount = 0; - - if( EmuConfig.GS.SynchronousMTGS ) - WaitGS(); -} - -__forceinline uint SysMtgsThread::_PrepForSimplePacket() -{ -#ifdef RINGBUF_DEBUG_STACK - m_lock_Stack.Lock(); - ringposStack.push_front( m_WritePos ); - m_lock_Stack.Release(); -#endif - - uint future_writepos = m_WritePos+1; - pxAssert( future_writepos <= RingBufferSize ); - - future_writepos &= RingBufferMask; - if( future_writepos == 0 ) - { - m_QueuedFrameCount = 0; - m_RingWrapSpot = RingBufferSize; - } - - uint readpos = volatize(m_RingPos); - if( future_writepos == readpos ) - { - // The ringbuffer read pos is blocking the future write position, so stall out - // until the read position has moved. - - uint totalAccum = (m_RingWrapSpot - readpos) + future_writepos; - uint somedone = totalAccum / 4; - - if( somedone > 0x80 ) - { - m_SignalRingPosition = somedone; - - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Simple Sleep!\t\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Simple Post-sleep Report!\tringpos=0x%06x", readpos ); - } while( future_writepos == readpos ); - - pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); - } - else - { - SetEvent(); - do { - SpinWait(); - } while( future_writepos == volatize(m_RingPos) ); - } - } - - return future_writepos; -} - -__forceinline void SysMtgsThread::_FinishSimplePacket( uint future_writepos ) -{ - pxAssert( future_writepos != volatize(m_RingPos) ); + uint future_writepos = (m_WritePos+1) & RingBufferMask; + pxAssert( future_writepos != volatize(m_ReadPos) ); m_WritePos = future_writepos; if( EmuConfig.GS.SynchronousMTGS ) @@ -887,7 +753,7 @@ void SysMtgsThread::SendSimplePacket( MTGS_RingCommand type, int data0, int data { //ScopedLock locker( m_PacketLocker ); - const uint thefuture = _PrepForSimplePacket(); + GenericStall(1); PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; tag.command = type; @@ -895,21 +761,21 @@ void SysMtgsThread::SendSimplePacket( MTGS_RingCommand type, int data0, int data tag.data[1] = data1; tag.data[2] = data2; - _FinishSimplePacket( thefuture ); + _FinishSimplePacket(); } void SysMtgsThread::SendPointerPacket( MTGS_RingCommand type, u32 data0, void* data1 ) { //ScopedLock locker( m_PacketLocker ); - const uint thefuture = 
_PrepForSimplePacket(); + GenericStall(1); PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; tag.command = type; tag.data[0] = data0; *(uptr*)&tag.data[1] = (uptr)data1; - _FinishSimplePacket( thefuture ); + _FinishSimplePacket(); } void SysMtgsThread::SendGameCRC( u32 crc ) diff --git a/pcsx2/Pcsx2Config.cpp b/pcsx2/Pcsx2Config.cpp index efa01f4c59..a1aa88f307 100644 --- a/pcsx2/Pcsx2Config.cpp +++ b/pcsx2/Pcsx2Config.cpp @@ -217,6 +217,7 @@ Pcsx2Config::GSOptions::GSOptions() SynchronousMTGS = false; DisableOutput = false; + VsyncQueueSize = 2; DefaultRegionMode = Region_NTSC; FramesToDraw = 2; @@ -234,6 +235,7 @@ void Pcsx2Config::GSOptions::LoadSave( IniInterface& ini ) IniEntry( SynchronousMTGS ); IniEntry( DisableOutput ); + IniEntry( VsyncQueueSize ); IniEntry( FrameLimitEnable ); IniEntry( FrameSkipEnable ); diff --git a/pcsx2/PluginManager.cpp b/pcsx2/PluginManager.cpp index 080f1f5e9d..558a12180f 100644 --- a/pcsx2/PluginManager.cpp +++ b/pcsx2/PluginManager.cpp @@ -144,6 +144,7 @@ static s32 CALLBACK fallback_test() { return 0; } _GSvsync GSvsync; _GSopen GSopen; _GSopen2 GSopen2; +_GSgifTransfer GSgifTransfer; _GSgifTransfer1 GSgifTransfer1; _GSgifTransfer2 GSgifTransfer2; _GSgifTransfer3 GSgifTransfer3; @@ -309,7 +310,8 @@ static const LegacyApi_ReqMethod s_MethMessReq_GS[] = { { "GSopen", (vMeth**)&GSopen, NULL }, { "GSvsync", (vMeth**)&GSvsync, NULL }, - { "GSgifTransfer1", (vMeth**)&GSgifTransfer1, NULL }, + { "GSgifTransfer", (vMeth**)&GSgifTransfer, NULL }, + //{ "GSgifTransfer1", (vMeth**)&GSgifTransfer1, NULL }, { "GSgifTransfer2", (vMeth**)&GSgifTransfer2, NULL }, { "GSgifTransfer3", (vMeth**)&GSgifTransfer3, NULL }, { "GSreadFIFO2", (vMeth**)&GSreadFIFO2, NULL }, diff --git a/pcsx2/VUops.cpp b/pcsx2/VUops.cpp index 6baaacebad..0172fd1d9f 100644 --- a/pcsx2/VUops.cpp +++ b/pcsx2/VUops.cpp @@ -2057,21 +2057,8 @@ void _vuXGKICK(VURegs * VU) u8* data = ((u8*)VU->Mem + ((VU->VI[_Is_].US[0]*16) & 0x3fff)); u32 size; - size = GetMTGS().PrepDataPacket( GIF_PATH_1, data, (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4); - u8* pmem = GetMTGS().GetDataPacketPtr(); - - if((size << 4) > (u32)(0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff))) - { - //DevCon.Warning("addr + Size = 0x%x, transferring %x then doing %x", ((VU->VI[_Is_].US[0]*16) & 0x3fff) + (size << 4), (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4, size - (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff) >> 4)); - memcpy_aligned(pmem, (u8*)VU->Mem+((VU->VI[_Is_].US[0]*16) & 0x3fff), 0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)); - size -= (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4; - //DevCon.Warning("Size left %x", size); - pmem += 0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff); - memcpy_aligned(pmem, (u8*)VU->Mem, size<<4); - } - else { - memcpy_aligned(pmem, (u8*)VU->Mem+((VU->VI[_Is_].US[0]*16) & 0x3fff), size<<4); - } + GetMTGS().PrepDataPacket( GIF_PATH_1, 0x400 ); + size = GIFPath_CopyTag( GIF_PATH_1, (u128*)data, (0x400-(VU->VI[_Is_].US[0] & 0x3ff)) ); GetMTGS().SendDataPacket(); } diff --git a/pcsx2/Vif1_Dma.cpp b/pcsx2/Vif1_Dma.cpp index c7e42ad814..8f738a590e 100644 --- a/pcsx2/Vif1_Dma.cpp +++ b/pcsx2/Vif1_Dma.cpp @@ -345,7 +345,6 @@ __forceinline void vif1Interrupt() if(GSTransferStatus.PTH2 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH2) { - gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; if(gifRegs->stat.P1Q) gsPath1Interrupt(); } @@ -440,11 +439,6 @@ __forceinline void vif1Interrupt() if (vif1.cmd != 0) Console.WriteLn("vif1.cmd still set %x tag size %x", vif1.cmd, 
vif1.tag.size); #endif - - if((vif1ch->chcr.DIR == VIF_NORMAL_TO_MEM_MODE) && vif1.GSLastDownloadSize <= 16) - { //Reverse fifo has finished and nothing is left, so lets clear the outputting flag - gifRegs->stat.OPH = false; - } vif1ch->chcr.STR = false; vif1.vifstalled = false; g_vifCycles = 0; diff --git a/pcsx2/Vif1_MFIFO.cpp b/pcsx2/Vif1_MFIFO.cpp index 64ff291b90..86f8008e6f 100644 --- a/pcsx2/Vif1_MFIFO.cpp +++ b/pcsx2/Vif1_MFIFO.cpp @@ -239,7 +239,6 @@ void vifMFIFOInterrupt() if(GSTransferStatus.PTH2 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH2) { GSTransferStatus.PTH2 = STOPPED_MODE; - if(gifRegs->stat.DIR == 0)gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; if(gifRegs->stat.P1Q) gsPath1Interrupt(); /*gifRegs->stat.APATH = GIF_APATH_IDLE; diff --git a/pcsx2/Vif_Codes.cpp b/pcsx2/Vif_Codes.cpp index fb2fb3a9f9..d7208d1976 100644 --- a/pcsx2/Vif_Codes.cpp +++ b/pcsx2/Vif_Codes.cpp @@ -167,10 +167,16 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { return 0; } - - + // HACK ATTACK! + // we shouldn't be clearing the queue flag here at all. Ideally, the queue statuses + // should be checked, handled, and cleared from the EOP check in GIFPath only. --air gifRegs->stat.clear_flags(GIF_STAT_P2Q); + // the tag size should ALWAYS be 128 bits (qwc). If it isn't, it means there's a serious bug + // somewhere in the VIF (likely relating to +/-'ing the tag.size during processing). + // NOTE: ICO [PAL] exploits this during bootup. Needs investigation. --air + //pxAssumeMsg( (vif1.tag.size & 3) == 0, "Invalid Vif1 DIRECT packet size detected!" ); + nVifStruct& v = nVif[1]; const int ret = aMin(vif1.vifpacketsize, vif1.tag.size); u32 size = ret << 2; @@ -184,8 +190,6 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { if(vif1.vifpacketsize < 4 && v.bSize < 16) { - nVifStruct& v = nVif[idx]; - memcpy(&v.buffer[v.bPtr], data, vif1.vifpacketsize << 2); v.bSize += vif1.vifpacketsize << 2; v.bPtr += vif1.vifpacketsize << 2; @@ -199,7 +203,6 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { } else { - nVifStruct& v = nVif[idx]; if(v.bSize) { int ret = 0; @@ -213,8 +216,8 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { v.bSize = 0; v.bPtr = 0; } - const uint count = GetMTGS().PrepDataPacket(GIF_PATH_2, v.buffer, 1); - memcpy_fast(GetMTGS().GetDataPacketPtr(), v.buffer, count << 4); + GetMTGS().PrepDataPacket(GIF_PATH_2, 1); + GIFPath_CopyTag(GIF_PATH_2, (u128*)v.buffer, 1); GetMTGS().SendDataPacket(); if(vif1.tag.size == 0) @@ -226,16 +229,17 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { } else { - const uint count = GetMTGS().PrepDataPacket(GIF_PATH_2, data, size >> 4); - memcpy_fast(GetMTGS().GetDataPacketPtr(), data, count << 4); + GetMTGS().PrepDataPacket(GIF_PATH_2, size/16); + uint count = GIFPath_CopyTag(GIF_PATH_2, (u128*)data, size/16) * 4; GetMTGS().SendDataPacket(); - vif1.tag.size -= count << 2; + + vif1.tag.size -= count; if(vif1.tag.size == 0) { vif1.cmd = 0; } vif1.vifstalled = true; - return count << 2; + return count; } } diff --git a/pcsx2/Vif_Transfer.cpp b/pcsx2/Vif_Transfer.cpp index a5f80db686..21aafa5529 100644 --- a/pcsx2/Vif_Transfer.cpp +++ b/pcsx2/Vif_Transfer.cpp @@ -36,16 +36,8 @@ _vifT bool analyzeIbit(u32* &data, int iBit) { if (iBit && !vifX.cmd && !vifXRegs->err.MII) { //DevCon.WriteLn("Vif I-Bit IRQ"); vifX.irq++; - // On i-bit, the command is run, vif stalls etc, - // however if the vifcode is MARK, you do NOT stall, just 
send IRQ. - Max Payne shows this up. - //if(((vifXRegs->code >> 24) & 0x7f) == 0x7) return 0; - // If we have a vifcode with i-bit, the following instruction - // should stall unless its MARK?.. we test that case here... - // Not 100% sure if this is the correct behavior, so printing - // a console message to see games that use this. (cottonvibes) - - // Okay did some testing with Max Payne, it does this + // Okay did some testing with Max Payne, it does this: // VifMark value = 0x666 (i know, evil!) // NOP with I Bit // VifMark value = 0 @@ -53,6 +45,23 @@ _vifT bool analyzeIbit(u32* &data, int iBit) { // If you break after the 2nd Mark has run, the game reports invalid mark 0 and the game dies. // So it has to occur here, testing a theory that it only doesn't stall if the command with // the iBit IS mark, but still sends the IRQ to let the cpu know the mark is there. (Refraction) + // + // -------------------------- + // + // This is how it probably works: i-bit sets the IRQ flag, and VIF keeps running until it encounters + // a non-MARK instruction. This includes the *current* instruction. ie, execution only continues + // unimpeded if MARK[i] is specified, and keeps executing unimpeded until any non-MARK command. + // Any other command with an I bit should stall immediately. + // Example: + // + // VifMark[i] value = 0x321 (with I bit) + // VifMark value = 0 + // VifMark value = 0x333 + // NOP + // + // ... the VIF should not stall and raise the interrupt until after the NOP is processed. + // So the final value for MARK as the game sees it will be 0x333. --air + return runMark(data); } return 0; diff --git a/pcsx2/gui/AppAssert.cpp b/pcsx2/gui/AppAssert.cpp index 6d33082872..87f55d2449 100644 --- a/pcsx2/gui/AppAssert.cpp +++ b/pcsx2/gui/AppAssert.cpp @@ -134,10 +134,10 @@ bool AppDoAssert( const DiagnosticOrigin& origin, const wxChar *msg ) wxString trace( pxGetStackTrace(origin.function) ); wxString dbgmsg( origin.ToString( msg ) ); - wxMessageOutputDebug().Printf( dbgmsg ); + wxMessageOutputDebug().Printf( L"%s", dbgmsg ); - Console.Error( dbgmsg ); - Console.WriteLn( trace ); + Console.Error( L"%s", dbgmsg ); + Console.WriteLn( L"%s", trace ); wxString windowmsg( L"Assertion failed: " ); if( msg != NULL ) diff --git a/pcsx2/gui/AppInit.cpp b/pcsx2/gui/AppInit.cpp index 70985ad576..7d36249b39 100644 --- a/pcsx2/gui/AppInit.cpp +++ b/pcsx2/gui/AppInit.cpp @@ -189,13 +189,13 @@ void Pcsx2App::DetectCpuAndUserMode() x86caps.CountCores(); x86caps.SIMD_EstablishMXCSRmask(); - if( !x86caps.hasMultimediaExtensions ) + if( !x86caps.hasMultimediaExtensions || !x86caps.hasStreamingSIMDExtensions ) { - // Note: due to memcpy_fast, we need minimum MMX even for interpreters. This will - // hopefully change later once we have a dynamically recompiled memcpy. + // Note: Due to optimizations to GIFpath parsers, memcpy, and possibly other things, we need + // a bare minimum of SSE supported by the CPU. throw Exception::HardwareDeficiency() - .SetDiagMsg(L"Critical Failure: MMX Extensions not available.") - .SetUserMsg(_("MMX extensions are not available. PCSX2 requires cpu with MMX extension support to run.")); + .SetDiagMsg(L"Critical Failure: SSE Extensions not available.") + .SetUserMsg(_("SSE extensions are not available. 
PCSX2 requires a cpu that supports the SSE instruction set.")); } ReadUserModeSettings(); diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp index b311361c84..1c9366ec21 100644 --- a/pcsx2/ps2/GIFpath.cpp +++ b/pcsx2/ps2/GIFpath.cpp @@ -19,6 +19,7 @@ #include "Gif.h" #include "Vif_Dma.h" #include "Vif.h" +#include <xmmintrin.h> // -------------------------------------------------------------------------------------- // GIFpath -- the GIFtag Parser // -------------------------------------------------------------------------------------- @@ -92,12 +93,16 @@ struct GIFPath void Reset(); void PrepPackedRegs(); - void SetTag(const void* mem); bool StepReg(); u8 GetReg(); bool IsActive() const; - int ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size); + template< bool Aligned > + void SetTag(const void* mem); + + template< GIF_PATH pathidx, bool Aligned > + int CopyTag(const u128* pMem, u32 size); + int ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size); }; @@ -285,9 +290,11 @@ __forceinline void GIFPath::PrepPackedRegs() } } + +template< bool Aligned > __forceinline void GIFPath::SetTag(const void* mem) { - const_cast<GIFTAG&>(tag) = *((GIFTAG*)mem); + _mm_store_ps( (float*)&tag, Aligned ? _mm_load_ps((const float*)mem) : _mm_loadu_ps((const float*)mem) ); nloop = tag.NLOOP; curreg = 0; @@ -350,7 +357,8 @@ static __forceinline void gsHandler(const u8* pMem) // qwords, rounded down; any extra bits are lost // games must take care to ensure transfer rectangles are exact multiples of a qword vif1.GSLastDownloadSize = vif1.TRXREG.RRW * vif1.TRXREG.RRH * bpp >> 7; - gifRegs->stat.OPH = true; + //DevCon.Warning("GS download in progress. OPH = %x", gifRegs->stat.OPH); + //gifRegs->stat.OPH = true; // Too early to set it here. It should be done on a BUSDIR call (rama) } } if (reg >= 0x60) @@ -371,10 +379,9 @@ static __forceinline void gsHandler(const u8* pMem) #define aMin(x, y) std::min(x, y) // Parameters: -// size (path1) - difference between the end of VU memory and pMem. -// size (path2/3) - max size of incoming data stream, in qwc (simd128) - - +// size - max size of incoming data stream, in qwc (simd128). If the path is PATH1, and the +// path does not terminate (EOP) within the specified size, it is assumed that the path must +// loop around to the start of VU memory and continue processing. 
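For reference, the new templated SetTag above reduces to a single 128-bit SSE copy of the GIFtag, with source alignment decided at compile time. A minimal stand-alone sketch of the same idiom (the function name here is illustrative, not part of the patch):

#include <xmmintrin.h>

// Copy one 16-byte GIFtag via SSE. 'Aligned' is a compile-time promise about
// the source pointer: _mm_load_ps requires 16-byte alignment, while
// _mm_loadu_ps tolerates any address. The destination (the tag member) is
// always 16-byte aligned, so a plain _mm_store_ps suffices.
template< bool Aligned >
void CopyGifTag128(void* dest, const void* src)
{
	__m128 v = Aligned ? _mm_load_ps((const float*)src)
	                   : _mm_loadu_ps((const float*)src);
	_mm_store_ps((float*)dest, v);
}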
__forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size) { u32 startSize = size; // Start Size @@ -382,7 +389,7 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s while (size > 0) { if (!nloop) { - SetTag(pMem); + SetTag<false>(pMem); incTag(1); } else @@ -509,6 +516,7 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s Console.Warning("GIFTAG error, size exceeded VU memory size %x", startSize); nloop = 0; + const_cast<GIFTAG&>(tag).EOP = 1; } } } @@ -521,15 +529,65 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s return size; } -__forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) +__forceinline void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len ) { + uint endpos = destStart + len; + if( endpos < destSize ) + { + memcpy_qwc(&destBase[destStart], src, len ); + destStart += len; + } + else + { + uint firstcopylen = destSize - destStart; + memcpy_qwc(&destBase[destStart], src, firstcopylen ); + + destStart = endpos % destSize; + memcpy_qwc(destBase, src+firstcopylen, destStart ); + } +} + +__forceinline void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len ) +{ + uint endpos = srcStart + len; + if( endpos < srcSize ) + { + memcpy_qwc(dest, &srcBase[srcStart], len ); + srcStart += len; + } + else + { + uint firstcopylen = srcSize - srcStart; + memcpy_qwc(dest, &srcBase[srcStart], firstcopylen ); + + srcStart = endpos % srcSize; + memcpy_qwc(dest+firstcopylen, srcBase, srcStart ); + } +} + +#define copyTag() do { \ + _mm_store_ps( (float*)&RingBuffer.m_Ring[ringpos], Aligned ? _mm_load_ps((float*)pMem128) : _mm_loadu_ps((float*)pMem128)); \ + ++pMem128; --size; \ + ringpos = (ringpos+1)&RingBufferMask; \ +} while(false) + +// Parameters: +// size - max size of incoming data stream, in qwc (simd128). If the path is PATH1, and the +// path does not terminate (EOP) within the specified size, it is assumed that the path must +// loop around to the start of VU memory and continue processing. +template< GIF_PATH pathidx, bool Aligned > +__forceinline int GIFPath::CopyTag(const u128* pMem128, u32 size) +{ + uint& ringpos = GetMTGS().m_packet_writepos; + const uint original_ringpos = ringpos; + u32 startSize = size; // Start Size while (size > 0) { if (!nloop) { - SetTag(pMem); - incTag(1); + SetTag<Aligned>((u8*)pMem128); + copyTag(); if(nloop > 0) { @@ -560,7 +618,7 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) } if(GSTransferStatus.PTH3 < PENDINGSTOP_MODE || pathidx != 2) { - gifRegs->stat.OPH = true; + //gifRegs->stat.OPH = true; // why set the GS output path flag here? (rama) gifRegs->stat.APATH = pathidx + 1; } @@ -588,7 +646,7 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) break; } gifRegs->stat.APATH = pathidx + 1; - gifRegs->stat.OPH = true; + //gifRegs->stat.OPH = true; // why set the GS output path flag here? 
(rama) switch(tag.FLG) { case GIF_FLG_PACKED: @@ -599,9 +657,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) { do { if (GetReg() == 0xe) { - gsHandler(pMem); + gsHandler((u8*)pMem128); } - incTag(1); + copyTag(); } while(StepReg() && size > 0 && SIGNAL_IMR_Pending == false); } else @@ -644,11 +702,14 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) curreg = 0; nloop = 0; } - incTag(len); + + MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len ); + pMem128 += len; + size -= len; } break; case GIF_FLG_REGLIST: - { + { GIF_LOG("Reglist Mode EOP %x", tag.EOP); // In reglist mode, the GIF packs 2 registers into each QWC. The nloop however @@ -687,8 +748,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) nloop = 0; } - incTag(len); - + MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len ); + pMem128 += len; + size -= len; } break; case GIF_FLG_IMAGE: @@ -696,13 +758,15 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) { GIF_LOG("IMAGE Mode EOP %x", tag.EOP); int len = aMin(size, nloop); - incTag(len); + + MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len ); + + pMem128 += len; + size -= len; nloop -= len; } break; } - - } if(pathidx == GIF_PATH_1) @@ -713,11 +777,11 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) { size = 0x3ff - startSize; startSize = 0x3ff; - pMem -= 0x4000; + pMem128 -= 0x400; } else { - // Note: The BIOS does an XGKICK on the VU1 and lets yt DMA to the GS without an EOP + // Note: The BIOS does an XGKICK on the VU1 and lets it DMA to the GS without an EOP + // (seemingly to loop forever), only to write an EOP later on. No other game is known to // do anything of the sort. // So lets just cap the DMA at 16k, and force it to "look" like it's terminated for now. @@ -727,6 +791,12 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) Console.Warning("GIFTAG error, size exceeded VU memory size %x", startSize); nloop = 0; + const_cast<GIFTAG&>(tag).EOP = 1; + + // Don't send the packet to the GS -- it's incomplete and might cause the GS plugin + // to get confused and die. >_< + + ringpos = original_ringpos; } } } @@ -749,6 +819,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) gsIrq(); } } + + // [TODO] : DMAC Arbitration rights should select the next queued GIF transfer here. + break; } if(SIGNAL_IMR_Pending == true) @@ -793,47 +866,40 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) gif->qwc -= size; } } - - return size; } -// Processes a GIFtag & packet, and throws out some gsIRQs as needed. -// Used to keep interrupts in sync with the EE, while the GS itself -// runs potentially several frames behind. // Parameters: -// size - max size of incoming data stream, in qwc (simd128) -__forceinline int GIFPath_ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) +// size - max size of incoming data stream, in qwc (simd128). If the path is PATH1, and the +// path does not terminate (EOP) within the specified size, it is assumed that the path must +// loop around to the start of VU memory and continue processing. 
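The MemCopy_WrappedDest/MemCopy_WrappedSrc helpers introduced above split a qwc copy into at most two memcpy_qwc calls whenever the transfer crosses the end of the ring. A small usage sketch, with hypothetical sizes:

// Copy 8 qwords into a 16-qword ring starting at slot 12: the first 4 qwords
// fill slots 12..15, the remaining 4 wrap around into slots 0..3, and the
// write position is left at 4 afterwards.
u128 ring[16];
u128 src[8];
uint writepos = 12;
MemCopy_WrappedDest( src, ring, writepos, 16, 8 );
// writepos == (12 + 8) % 16 == 4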
+__forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size) { -#ifdef PCSX2_GSRING_SAMPLING_STATS - static uptr profStartPtr = 0; - static uptr profEndPtr = 0; - if (profStartPtr == 0) { - __asm - { - __beginfunc: - mov profStartPtr, offset __beginfunc; - mov profEndPtr, offset __endfunc; - } - ProfilerRegisterSource( "GSRingBufCopy", (void*)profStartPtr, profEndPtr - profStartPtr ); - } -#endif - - int retSize = s_gifPath[pathidx].ParseTag(pathidx, pMem, size); - -#ifdef PCSX2_GSRING_SAMPLING_STATS - __asm + switch( pathidx ) { - __endfunc: - nop; + case GIF_PATH_1: + pxAssertMsg(!s_gifPath[GIF_PATH_2].IsActive(), "GIFpath conflict: Attempted to start PATH1 while PATH2 is already active."); + pxAssertMsg(!s_gifPath[GIF_PATH_3].IsActive() || (GSTransferStatus.PTH3 == IMAGE_MODE), "GIFpath conflict: Attempted to start PATH1 while PATH3 is already active."); + return s_gifPath[GIF_PATH_1].CopyTag<GIF_PATH_1,true>(pMem, size); + case GIF_PATH_2: + pxAssertMsg(!s_gifPath[GIF_PATH_1].IsActive(), "GIFpath conflict: Attempted to start PATH2 while PATH1 is already active."); + pxAssertMsg(!s_gifPath[GIF_PATH_3].IsActive() || (GSTransferStatus.PTH3 == IMAGE_MODE), "GIFpath conflict: Attempted to start PATH2 while PATH3 is already active."); + return s_gifPath[GIF_PATH_2].CopyTag<GIF_PATH_2,false>(pMem, size); + case GIF_PATH_3: + pxAssertMsg(!s_gifPath[GIF_PATH_1].IsActive(), "GIFpath conflict: Attempted to start PATH3 while PATH1 is already active."); + pxAssertMsg(!s_gifPath[GIF_PATH_2].IsActive(), "GIFpath conflict: Attempted to start PATH3 while PATH2 is already active."); + return s_gifPath[GIF_PATH_3].CopyTag<GIF_PATH_3,true>(pMem, size); + + jNO_DEFAULT; } -#endif - return retSize; + + return 0; // unreachable } -//Quick version for queueing PATH1 data - +// Quick version for queuing PATH1 data. +// This version calculates the real length of the packet data only. It does not process +// IRQs or DMA status updates. 
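With the parser now writing directly into the MTGS ring, the call sites touched by this patch (VUops.cpp, Vif_Codes.cpp, microVU_Lower.inl, sVU_Lower.cpp) converge on one three-step pattern, roughly sketched as:

// Reserve worst-case ring space, let the GIFtag parser copy (and wrap) the
// data itself, then submit the packet to the MTGS thread. Sizes are in qwc.
// 'data' and 'qwcToVuMemEnd' stand in for the caller-computed source pointer
// and the qwc count remaining before the end of VU memory.
GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400);
uint copied = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, qwcToVuMemEnd);
GetMTGS().SendDataPacket();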
__forceinline int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size) { int retSize = s_gifPath[pathidx].ParseTagQuick(pathidx, pMem, size); diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp index fe4510b4a5..87dbeb055c 100644 --- a/pcsx2/x86/ix86-32/iR5900-32.cpp +++ b/pcsx2/x86/ix86-32/iR5900-32.cpp @@ -1258,11 +1258,11 @@ void recompileNextInstruction(int delayslot) // Calling of this function can be enabled or disabled through the use of EmuConfig.Recompiler.PreBlockChecks static void __fastcall PreBlockCheck( u32 blockpc ) { - static int lastrec = 0; + /*static int lastrec = 0; static int curcount = 0; const int skip = 0; - /*if( blockpc != 0x81fc0 ) {//&& lastrec != g_lastpc ) { + if( blockpc != 0x81fc0 ) {//&& lastrec != g_lastpc ) { curcount++; if( curcount > skip ) { diff --git a/pcsx2/x86/microVU_Lower.inl b/pcsx2/x86/microVU_Lower.inl index 7b1cb1edbb..41a38861fd 100644 --- a/pcsx2/x86/microVU_Lower.inl +++ b/pcsx2/x86/microVU_Lower.inl @@ -1097,7 +1097,6 @@ void __fastcall mVU_XGKICK_(u32 addr) { u8* data = microVU1.regs->Mem + (addr*16); u32 diff = 0x400 - addr; u32 size; - u8* pDest; if(gifRegs->stat.APATH <= GIF_APATH1 || (gifRegs->stat.APATH == GIF_APATH3 && gifRegs->stat.IP3 == true) && SIGNAL_IMR_Pending == false) { @@ -1106,23 +1105,12 @@ void __fastcall mVU_XGKICK_(u32 addr) { //Flush any pending transfers so things dont go up in the wrong order while(gifRegs->stat.P1Q == true) gsPath1Interrupt(); } - size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff); - pDest = GetMTGS().GetDataPacketPtr(); - - if (size > diff) { - //DevCon.WriteLn("XGkick Wrap!"); - memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff); - size -= diff; - pDest += diff*16; - memcpy_qwc(pDest, microVU1.regs->Mem, size); - } - else { - memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size); - } + GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400); + size = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, diff); GetMTGS().SendDataPacket(); + if(GSTransferStatus.PTH1 == STOPPED_MODE) { - gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; } } @@ -1130,17 +1118,16 @@ void __fastcall mVU_XGKICK_(u32 addr) { { //DevCon.Warning("GIF APATH busy %x Holding for later W %x, R %x", gifRegs->stat.APATH, Path1WritePos, Path1ReadPos); size = GIFPath_ParseTagQuick(GIF_PATH_1, data, diff); - pDest = &Path1Buffer[Path1WritePos*16]; + u8* pDest = &Path1Buffer[Path1WritePos*16]; - pxAssumeMsg((Path1WritePos+size < sizeof(Path1Buffer)), "XGKick Buffer Overflow detected on Path1Buffer!"); + Path1WritePos += size; + + pxAssumeMsg((Path1WritePos < sizeof(Path1Buffer)), "XGKick Buffer Overflow detected on Path1Buffer!"); //DevCon.Warning("Storing size %x PATH 1", size); if (size > diff) { - // fixme: one of these days the following *16's will get cleaned up when we introduce - // a special qwc/simd16 optimized version of memcpy_aligned. 
:) //DevCon.Status("XGkick Wrap!"); memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff); - Path1WritePos += size; size -= diff; pDest += diff*16; memcpy_qwc(pDest, microVU1.regs->Mem, size); diff --git a/pcsx2/x86/sVU_Lower.cpp b/pcsx2/x86/sVU_Lower.cpp index c8d103477b..5191748f78 100644 --- a/pcsx2/x86/sVU_Lower.cpp +++ b/pcsx2/x86/sVU_Lower.cpp @@ -1988,24 +1988,12 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) //Flush any pending transfers so things dont go up in the wrong order while(gifRegs->stat.P1Q == true) gsPath1Interrupt(); } - size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff); - pDest = GetMTGS().GetDataPacketPtr(); - if (size > diff) { - // fixme: one of these days the following *16's will get cleaned up when we introduce - // a special qwc/simd16 optimized version of memcpy_aligned. :) - - memcpy_aligned(pDest, VU1.Mem + addr, diff*16); - size -= diff; - pDest += diff*16; - memcpy_aligned(pDest, VU1.Mem, size*16); - } - else { - memcpy_aligned(pDest, VU1.Mem + addr, size*16); - } + GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400); + size = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, diff); GetMTGS().SendDataPacket(); + if(GSTransferStatus.PTH1 == STOPPED_MODE ) { - gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; } } @@ -2015,8 +2003,6 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) size = GIFPath_ParseTagQuick(GIF_PATH_1, data, diff); pDest = &Path1Buffer[Path1WritePos*16]; - - pxAssumeMsg((Path1WritePos+size < sizeof(Path1Buffer)), "XGKick Buffer Overflow detected on Path1Buffer!"); //DevCon.Warning("Storing size %x PATH 1", size); @@ -2024,14 +2010,14 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) // fixme: one of these days the following *16's will get cleaned up when we introduce // a special qwc/simd16 optimized version of memcpy_aligned. 
:) //DevCon.Status("XGkick Wrap!"); - memcpy_aligned(pDest, VU1.Mem + addr, diff*16); + memcpy_aligned(pDest, VU1.Mem + addr, diff); Path1WritePos += size; size -= diff; pDest += diff*16; - memcpy_aligned(pDest, VU1.Mem, size*16); + memcpy_aligned(pDest, VU1.Mem, size); } else { - memcpy_aligned(pDest, VU1.Mem + addr, size*16); + memcpy_aligned(pDest, VU1.Mem + addr, size); Path1WritePos += size; } //if(!gifRegs->stat.P1Q) CPU_INT(28, 128); diff --git a/plugins/spu2-x/src/Linux/SPU2-X.cbp b/plugins/spu2-x/src/Linux/SPU2-X.cbp index c262b9f674..fa27c73506 100644 --- a/plugins/spu2-x/src/Linux/SPU2-X.cbp +++ b/plugins/spu2-x/src/Linux/SPU2-X.cbp @@ -195,8 +195,6 @@ - - diff --git a/plugins/zzogl-pg/opengl/GS.h b/plugins/zzogl-pg/opengl/GS.h index 3ac73bde9b..8ef2d0175b 100644 --- a/plugins/zzogl-pg/opengl/GS.h +++ b/plugins/zzogl-pg/opengl/GS.h @@ -635,7 +635,7 @@ typedef struct int imageTransfer; int imageWnew, imageHnew, imageX, imageY, imageEndX, imageEndY; - pathInfo path[3]; + pathInfo path[4]; GIFRegDIMX dimx; void setRGBA(u32 r, u32 g, u32 b, u32 a) { diff --git a/plugins/zzogl-pg/opengl/GifTransfer.cpp b/plugins/zzogl-pg/opengl/GifTransfer.cpp index d8776eff13..4939f53dd9 100644 --- a/plugins/zzogl-pg/opengl/GifTransfer.cpp +++ b/plugins/zzogl-pg/opengl/GifTransfer.cpp @@ -265,8 +265,17 @@ void CALLBACK GSgifTransfer3(u32 *pMem, u32 size) _GSgifTransfer<2>(pMem, size); } -void InitPath() +void CALLBACK GSgifTransfer(u32 *pMem, u32 size) { - gs.path[0].mode = gs.path[1].mode = gs.path[2].mode = 0; + FUNCLOG + + //ZZLog::GS_Log("GSgifTransfer3 size = %lx (mode %d, gs.path3.tag.nloop = %d).", size, gs.path[2].mode, gs.path[2].tag.nloop); + + _GSgifTransfer<3>(pMem, size); +} + +void InitPath() +{ + gs.path[0].mode = gs.path[1].mode = gs.path[2].mode = gs.path[3].mode = 0; }
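Note that PluginManager above now lists the unified GSgifTransfer as a required method in place of GSgifTransfer1, which is why zzogl grows a fourth path state. Other GS plugins presumably need the same entry point; a minimal sketch mirroring zzogl's approach, assuming a plugin with an equivalent per-path transfer template:

// Route the unified callback through a dedicated path-state slot (zzogl's new
// gs.path[3]) so the legacy GSgifTransfer2/GSgifTransfer3 entry points, which
// keep using slots 1 and 2, are unaffected.
void CALLBACK GSgifTransfer(u32 *pMem, u32 size)
{
	_GSgifTransfer<3>(pMem, size);
}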