From a9084741bc1dc9a427a9125ac927e5d487726a8a Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Sun, 11 Jul 2010 04:53:50 +0000 Subject: [PATCH 02/26] ReorderingMTGS: * Implemented GIFPath_CopyTag, which performs a "copy-in-place" while parsing tags (big speedup over the old parse-then-copy strategy, especially with the SSE intrinsics I've included for kicks). * Removed the old ringbuffer 'restart' mechanism and replaced it with a truly free-flowing wrapping mechanism. Utilizes the ringbuffer more efficiently, and removes quite a bit of overhead from the MTGS's PrepDataPacket call. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3458 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/include/PS2Edefs.h | 2 + common/include/Utilities/MemcpyFast.h | 8 +- pcsx2/FiFo.cpp | 5 +- pcsx2/GS.h | 37 ++- pcsx2/Gif.cpp | 23 +- pcsx2/MTGS.cpp | 329 +++++++++++++------------- pcsx2/PluginManager.cpp | 4 +- pcsx2/VUops.cpp | 17 +- pcsx2/Vif_Codes.cpp | 13 +- pcsx2/ps2/GIFpath.cpp | 118 +++++---- pcsx2/x86/microVU_Lower.inl | 24 +- pcsx2/x86/sVU_Lower.cpp | 25 +- 12 files changed, 304 insertions(+), 301 deletions(-) diff --git a/common/include/PS2Edefs.h b/common/include/PS2Edefs.h index 5496df0587..f394cf5025 100644 --- a/common/include/PS2Edefs.h +++ b/common/include/PS2Edefs.h @@ -564,6 +564,7 @@ typedef void (CALLBACK* _PS2EsetEmuVersion)(const char* emuId, u32 version); // typedef s32 (CALLBACK* _GSopen)(void *pDsp, char *Title, int multithread); typedef s32 (CALLBACK* _GSopen2)( void *pDsp, u32 flags ); typedef void (CALLBACK* _GSvsync)(int field); +typedef void (CALLBACK* _GSgifTransfer)(u32 *pMem, u32 size); typedef void (CALLBACK* _GSgifTransfer1)(u32 *pMem, u32 addr); typedef void (CALLBACK* _GSgifTransfer2)(u32 *pMem, u32 size); typedef void (CALLBACK* _GSgifTransfer3)(u32 *pMem, u32 size); @@ -723,6 +724,7 @@ typedef void (CALLBACK* _FWirqCallback)(void (*callback)()); extern _GSopen GSopen; extern _GSopen2 GSopen2; extern _GSvsync GSvsync; +extern _GSgifTransfer GSgifTransfer; extern _GSgifTransfer1 GSgifTransfer1; extern _GSgifTransfer2 GSgifTransfer2; extern _GSgifTransfer3 GSgifTransfer3; diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h index 5c74d55087..76526a5eed 100644 --- a/common/include/Utilities/MemcpyFast.h +++ b/common/include/Utilities/MemcpyFast.h @@ -36,7 +36,7 @@ // Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason. 
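// [Editor's note -- illustration, not part of the patch] The hunk below redefines
// memcpy_aligned to take its count in 128-bit quadwords rather than bytes, and call
// sites elsewhere in this patch are updated to match (e.g. OpenPlugin now passes
// sizeof(PS2MEM_GS)/16). In effect:
//
//   memcpy_aligned(dest, src, qwc);   // after this patch: copies qwc * 16 bytes
//   // ...expands to:
//   memcpy_amd_(dest, src, qwc * 16);
//
// Patch 06 later in this series reverts memcpy_aligned to byte counts and adds a
// dedicated memcpy_qwc for quadword copies instead.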
void _memset16_unaligned( void* dest, u16 data, size_t size ); -#define memcpy_fast memcpy_amd_ // Fast memcpy -#define memcpy_aligned memcpy_amd_ // Memcpy with 16-byte Aligned addresses -#define memcpy_const memcpy_amd_ // Memcpy with constant size -#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned +#define memcpy_fast memcpy_amd_ // Fast memcpy +#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c*16) // Memcpy with 16-byte Aligned addresses +#define memcpy_const memcpy_amd_ // Memcpy with constant size +#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned diff --git a/pcsx2/FiFo.cpp b/pcsx2/FiFo.cpp index ed87881d03..f93b80242b 100644 --- a/pcsx2/FiFo.cpp +++ b/pcsx2/FiFo.cpp @@ -195,10 +195,9 @@ void __fastcall WriteFIFO_page_6(u32 mem, const mem128_t *value) nloop0_packet[1] = psHu32(GIF_FIFO + 4); nloop0_packet[2] = psHu32(GIF_FIFO + 8); nloop0_packet[3] = psHu32(GIF_FIFO + 12); - GetMTGS().PrepDataPacket(GIF_PATH_3, (u8*)nloop0_packet, 1); + GetMTGS().PrepDataPacket(GIF_PATH_3, 1); u64* data = (u64*)GetMTGS().GetDataPacketPtr(); - data[0] = value[0]; - data[1] = value[1]; + GIFPath_CopyTag( GIF_PATH_3, (u128*)nloop0_packet, 1 ); GetMTGS().SendDataPacket(); if(GSTransferStatus.PTH3 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH3 ) { diff --git a/pcsx2/GS.h b/pcsx2/GS.h index 3d1dc74d78..d3232ef2aa 100644 --- a/pcsx2/GS.h +++ b/pcsx2/GS.h @@ -229,7 +229,7 @@ enum GIF_PATH GIF_PATH_3, }; -extern int GIFPath_ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size); +extern int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size); extern int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size); extern void GIFPath_Reset(); extern void GIFPath_Clear( GIF_PATH pathidx ); @@ -282,7 +282,6 @@ public: volatile s32 m_SignalRingPosition; int m_QueuedFrameCount; - u32 m_RingWrapSpot; Mutex m_lock_RingBufferBusy; Semaphore m_sem_OnRingReset; @@ -301,6 +300,7 @@ public: // These vars maintain instance data for sending Data Packets. // Only one data packet can be constructed and uploaded at a time. + uint m_packet_startpos; // size of the packet (data only, ie. not including the 16 byte command!) uint m_packet_size; // size of the packet (data only, ie. not including the 16 byte command!) uint m_packet_ringpos; // index of the data location in the ringbuffer. @@ -317,14 +317,13 @@ public: void WaitGS(); void ResetGS(); - int PrepDataPacket( MTGS_RingCommand cmd, u32 size ); - int PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size ); + void PrepDataPacket( MTGS_RingCommand cmd, u32 size ); + void PrepDataPacket( GIF_PATH pathidx, u32 size ); void SendDataPacket(); void SendGameCRC( u32 crc ); void WaitForOpen(); void Freeze( int mode, MTGS_FreezeData& data ); - void RestartRingbuffer( uint packsize=0 ); void SendSimplePacket( MTGS_RingCommand type, int data0, int data1, int data2 ); void SendPointerPacket( MTGS_RingCommand type, u32 data0, void* data1 ); @@ -416,3 +415,31 @@ extern int g_nLeftGSFrames; #endif +// Size of the ringbuffer as a power of 2 -- size is a multiple of simd128s. 
+// (actual size is 1<stat.P1Q = false; while(Path1WritePos > 0) { - u32 size = GetMTGS().PrepDataPacket(GIF_PATH_1, Path1Buffer + (Path1ReadPos * 16), (Path1WritePos - Path1ReadPos)); - u8* pDest = GetMTGS().GetDataPacketPtr(); + uint size = (Path1WritePos - Path1ReadPos); + GetMTGS().PrepDataPacket(GIF_PATH_1, size); //DevCon.Warning("Flush Size = %x", size); - - memcpy_aligned(pDest, Path1Buffer + (Path1ReadPos * 16), size * 16); - GetMTGS().SendDataPacket(); - - Path1ReadPos += size; - + uint count = GIFPath_CopyTag(GIF_PATH_1, ((u128*)Path1Buffer) + Path1ReadPos, size); + GetMTGS().SendDataPacket(); + pxAssume( count == size ); + Path1ReadPos += count; + if(GSTransferStatus.PTH1 == STOPPED_MODE) { gifRegs->stat.OPH = false; @@ -150,11 +149,9 @@ __forceinline void gsInterrupt() static u32 WRITERING_DMA(u32 *pMem, u32 qwc) { - int size = GetMTGS().PrepDataPacket(GIF_PATH_3, (u8*)pMem, qwc); - u8* pgsmem = GetMTGS().GetDataPacketPtr(); - - memcpy_aligned(pgsmem, pMem, size<<4); - + GetMTGS().PrepDataPacket(GIF_PATH_3, qwc); + //uint len1 = GIFPath_ParseTag(GIF_PATH_3, (u8*)pMem, qwc ); + uint size = GIFPath_CopyTag(GIF_PATH_3, (u128*)pMem, qwc ); GetMTGS().SendDataPacket(); return size; } diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index a6905b9788..04a3a42db1 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -29,7 +29,7 @@ using namespace Threading; -#if 0 // PCSX2_DEBUG +#if 0 //PCSX2_DEBUG # define MTGS_LOG Console.WriteLn #else # define MTGS_LOG 0&& @@ -46,34 +46,7 @@ using namespace Threading; // MTGS Threaded Class Implementation // ===================================================================================================== -// Size of the ringbuffer as a power of 2 -- size is a multiple of simd128s. -// (actual size is 1< 0 ) - RestartRingbuffer(); + if( AtomicIncrement(m_QueuedFrameCount) == 0 ) return; + + uint readpos = volatize(m_RingPos); + uint freeroom; + + if (m_WritePos < readpos) + freeroom = readpos - m_WritePos; else - { - m_QueuedFrameCount++; - SetEvent(); - } + freeroom = RingBufferSize - (m_WritePos - readpos); + + uint totalAccum = RingBufferSize - freeroom; + uint somedone = totalAccum / 4; + + m_SignalRingPosition = totalAccum; + + //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Vsync Sleep!\t\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition ); + + AtomicExchange( m_SignalRingEnable, 1 ); + SetEvent(); + m_sem_OnRingReset.WaitWithoutYield(); + readpos = volatize(m_RingPos); + + pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); } struct PacketTagType @@ -197,7 +190,7 @@ void SysMtgsThread::OpenPlugin() { if( m_PluginOpened ) return; - memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) ); + memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS)/16 ); GSsetBaseMem( RingBuffer.Regs ); GSirqCallback( dummyIrqCallback ); @@ -330,38 +323,75 @@ void SysMtgsThread::ExecuteTaskInThread() { case GS_RINGTYPE_P1: { + uint datapos = (m_RingPos+1) & RingBufferMask; const int qsize = tag.data[0]; - const u128* data = &RingBuffer[m_RingPos+1]; + const u128* data = &RingBuffer[datapos]; MTGS_LOG( "(MTGS Packet Read) ringtype=P1, qwc=%u", qsize ); - // make sure that tag>>16 is the MAX size readable - GSgifTransfer1((u32*)(data - 0x400 + qsize), 0x4000-qsize*16); - //GSgifTransfer1((u32*)data, qsize); + uint endpos = datapos + qsize; + if( endpos >= RingBufferSize ) + { + uint firstcopylen = RingBufferSize - datapos; + GSgifTransfer( 
(u32*)data, firstcopylen ); + datapos = endpos & RingBufferMask; + GSgifTransfer( (u32*)RingBuffer.m_Ring, datapos ); + } + else + { + GSgifTransfer( (u32*)data, qsize ); + } + ringposinc += qsize; } break; case GS_RINGTYPE_P2: { + uint datapos = (m_RingPos+1) & RingBufferMask; const int qsize = tag.data[0]; - const u128* data = &RingBuffer[m_RingPos+1]; + const u128* data = &RingBuffer[datapos]; MTGS_LOG( "(MTGS Packet Read) ringtype=P2, qwc=%u", qsize ); - GSgifTransfer2((u32*)data, qsize); + uint endpos = datapos + qsize; + if( endpos >= RingBufferSize ) + { + uint firstcopylen = RingBufferSize - datapos; + GSgifTransfer2( (u32*)data, firstcopylen ); + datapos = endpos & RingBufferMask; + GSgifTransfer2( (u32*)RingBuffer.m_Ring, datapos ); + } + else + { + GSgifTransfer2( (u32*)data, qsize ); + } + ringposinc += qsize; } break; case GS_RINGTYPE_P3: { + uint datapos = (m_RingPos+1) & RingBufferMask; const int qsize = tag.data[0]; - const u128* data = &RingBuffer[m_RingPos+1]; + const u128* data = &RingBuffer[datapos]; MTGS_LOG( "(MTGS Packet Read) ringtype=P3, qwc=%u", qsize ); - GSgifTransfer3((u32*)data, qsize); + uint endpos = datapos + qsize; + if( endpos >= RingBufferSize ) + { + uint firstcopylen = RingBufferSize - datapos; + GSgifTransfer3( (u32*)data, firstcopylen ); + datapos = endpos & RingBufferMask; + GSgifTransfer3( (u32*)RingBuffer.m_Ring, datapos ); + } + else + { + GSgifTransfer3( (u32*)data, qsize ); + } + ringposinc += qsize; } break; @@ -380,7 +410,7 @@ void SysMtgsThread::ExecuteTaskInThread() const int qsize = tag.data[0]; ringposinc += qsize; - MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", tag.data[0], tag.data[1] ? "true" : "false" ); + MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", !!(((u32&)RingBuffer.Regs[0x1000]) & 0x2000) ? 0 : 1, tag.data[1] ? "true" : "false" ); // Mail in the important GS registers. RingCmdPacket_Vsync& local((RingCmdPacket_Vsync&)RingBuffer[m_RingPos+1]); @@ -398,6 +428,7 @@ void SysMtgsThread::ExecuteTaskInThread() if( (GSopen2 == NULL) && (PADupdate != NULL) ) PADupdate(0); + AtomicDecrement( m_QueuedFrameCount ); StateCheckInThread(); } break; @@ -450,9 +481,14 @@ void SysMtgsThread::ExecuteTaskInThread() } } - uint newringpos = m_RingPos + ringposinc; - pxAssert( newringpos <= RingBufferSize ); - m_RingPos = newringpos & RingBufferMask; + uint newringpos = (m_RingPos + ringposinc) & RingBufferMask; + + if( EmuConfig.GS.SynchronousMTGS ) + { + pxAssert( m_WritePos == newringpos ); + } + + m_RingPos = newringpos; if( m_SignalRingEnable != 0 ) { @@ -546,7 +582,7 @@ void SysMtgsThread::SetEvent() u8* SysMtgsThread::GetDataPacketPtr() const { - return (u8*)&RingBuffer[m_packet_ringpos]; + return (u8*)&RingBuffer[m_packet_ringpos & RingBufferMask]; } // Closes the data packet send command, and initiates the gs thread (if needed). @@ -555,6 +591,7 @@ void SysMtgsThread::SendDataPacket() // make sure a previous copy block has been started somewhere. 
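// [Editor's sketch -- illustration, not part of the patch] The key change in this
// function: GIFPath_CopyTag advances m_packet_ringpos as it copies, and may store
// fewer quadwords than were reserved (early EOP, or a reverted PATH1 packet), so
// the committed size is recomputed from how far the ring position actually moved.
// From the actualSize lines later in this hunk (the -1 excludes the 16-byte command
// tag itself, and the mask makes the subtraction wrap-safe):
//
//   uint actualSize = ((m_packet_ringpos - m_packet_startpos) & RingBufferMask) - 1;
//   tag.data[0] = actualSize;        // patch the tag written by PrepDataPacket
//   m_WritePos  = m_packet_ringpos;  // publish the packet to the GS thread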
pxAssert( m_packet_size != 0 ); + #if 0 uint temp = m_packet_ringpos + m_packet_size; pxAssert( temp <= RingBufferSize ); temp &= RingBufferMask; @@ -578,8 +615,16 @@ void SysMtgsThread::SendDataPacket() pxAssert( readpos != temp ); } } + #endif - m_WritePos = temp; + uint actualSize = ((m_packet_ringpos - m_packet_startpos) & RingBufferMask)-1; + pxAssert( actualSize <= m_packet_size ); + pxAssert( m_packet_ringpos < RingBufferSize ); + + PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos]; + tag.data[0] = actualSize; + + m_WritePos = m_packet_ringpos; if( EmuConfig.GS.SynchronousMTGS ) { @@ -596,7 +641,7 @@ void SysMtgsThread::SendDataPacket() //m_PacketLocker.Release(); } -int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) +void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) { // Note on volatiles: m_WritePos is not modified by the GS thread, so there's no need // to use volatile reads here. We do cache it though, since we know it never changes, @@ -613,119 +658,63 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) m_packet_size = size; ++size; // takes into account our RingCommand QWC. - if( writepos + size < RingBufferSize ) + // generic gs wait/stall. + // if the writepos is past the readpos then we're safe. + // But if not then we need to make sure the readpos is outside the scope of + // the block about to be written (writepos + size) + + uint readpos = volatize(m_RingPos); + uint endpos = writepos+size; + uint freeroom; + + if (writepos < readpos) + freeroom = readpos - writepos; + else + freeroom = RingBufferSize - (writepos - readpos); + + if (freeroom < size) { - // generic gs wait/stall. - // if the writepos is past the readpos then we're safe. - // But if not then we need to make sure the readpos is outside the scope of - // the block about to be written (writepos + size) + // writepos will overlap readpos if we commit the data, so we need to wait until + // readpos is out past the end of the future write pos, or until it wraps around + // (in which case writepos will be >= readpos). - uint readpos = volatize(m_RingPos); - if( (writepos < readpos) && (writepos+size >= readpos) ) + // Ideally though we want to wait longer, because if we just toss in this packet + // the next packet will likely stall up too. So lets set a condition for the MTGS + // thread to wake up the EE once there's a sizable chunk of the ringbuffer emptied. + + uint somedone = (RingBufferSize - freeroom) / 4; + if( somedone < size+1 ) somedone = size + 1; + + // FMV Optimization: FMVs typically send *very* little data to the GS, in some cases + // every other frame is nothing more than a page swap. Sleeping the EEcore is a + // waste of time, and we get better results using a spinwait. + + if( somedone > 0x80 ) { - // writepos is behind the readpos and will overlap it if we commit the data, - // so we need to wait until readpos is out past the end of the future write pos, - // or until it wraps around (in which case writepos will be >= readpos). + pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" ); + m_SignalRingPosition = somedone; - // Ideally though we want to wait longer, because if we just toss in this packet - // the next packet will likely stall up too. So lets set a condition for the MTGS - // thread to wake up the EE once there's a sizable chunk of the ringbuffer emptied. 
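// [Editor's sketch -- illustration, not part of the patch] The wrap-aware free-space
// test that replaces the old RestartRingbuffer logic, assuming RingBufferSize is a
// power of two and all positions stay within [0, RingBufferSize):
//
//   uint freeroom = (writepos < readpos)
//       ? readpos - writepos                        // reader ahead: gap is direct
//       : RingBufferSize - (writepos - readpos);    // otherwise the gap wraps
//
// When freeroom < size the EE must stall; `somedone` above then picks how much of
// the ring the GS thread should drain before waking it (at least size+1, typically
// a quarter of the backlog), with small stalls (<= 0x80 qwc) spin-waiting instead
// of sleeping -- the FMV optimization described above.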
+ //Console.WriteLn( Color_Blue, "(EEcore Sleep) GenStall \tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", readpos, writepos, m_RingWrapSpot, m_SignalRingPosition ); - uint totalAccum = (m_RingWrapSpot - readpos) + writepos; - uint somedone = totalAccum / 4; - if( somedone < size+1 ) somedone = size + 1; - - // FMV Optimization: FMVs typically send *very* little data to the GS, in some cases - // every other frame is nothing more than a page swap. Sleeping the EEcore is a - // waste of time, and we get better results using a spinwait. - - if( somedone > 0x80 ) - { - pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" ); - m_SignalRingPosition = somedone; - - //Console.WriteLn( Color_Blue, "(EEcore Sleep) GenStall \tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", readpos, writepos, m_RingWrapSpot, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); - } while( (writepos < readpos) && (writepos+size >= readpos) ); - - pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); - } - else - { + do { + AtomicExchange( m_SignalRingEnable, 1 ); SetEvent(); - do { - SpinWait(); - readpos = volatize(m_RingPos); - } while( (writepos < readpos) && (writepos+size >= readpos) ); - } + m_sem_OnRingReset.WaitWithoutYield(); + readpos = volatize(m_RingPos); + //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); + } while( (writepos < readpos) && (writepos+size >= readpos) ); + + pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); + } + else + { + SetEvent(); + do { + SpinWait(); + readpos = volatize(m_RingPos); + } while( (writepos < readpos) && (writepos+size >= readpos) ); } } - else if( writepos + size > RingBufferSize ) - { - pxAssert( writepos != 0 ); - - // If the incoming packet doesn't fit, then start over from the start of the ring - // buffer (it's a lot easier than trying to wrap the packet around the end of the - // buffer). - - //Console.WriteLn( "MTGS > Ringbuffer Got Filled!"); - RestartRingbuffer( size ); - writepos = m_WritePos; - } - else // always true - if( writepos + size == MTGS_RINGBUFFEREND ) - { - // Yay. Perfect fit. What are the odds? - // Copy is ready so long as readpos is less than writepos and *not* equal to the - // base of the ringbuffer (otherwise the buffer will stop when the writepos is - // wrapped around to zero later-on in SendDataPacket). - - uint readpos = volatize(m_RingPos); - //Console.WriteLn( "MTGS > Perfect Fit!\tringpos=0x%06x, writepos=0x%06x", readpos, writepos ); - if( readpos > writepos || readpos == 0 ) - { - uint totalAccum = (readpos == 0) ? RingBufferSize : ((m_RingWrapSpot - readpos) + writepos); - uint somedone = totalAccum / 4; - if( somedone < size+1 ) somedone = size + 1; - - // FMV Optimization: (see above) This condition of a perfect fit is so rare that optimizing - // for it is pointless -- but it was also mindlessly simple copy-paste. So there. 
:p - - if( somedone > 0x80 ) - { - m_SignalRingPosition = somedone; - - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Sleep!\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Post-sleep Report!\tringpos=0x%06x", readpos ); - } while( (writepos < readpos) || (readpos==0) ); - - pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); - } - else - { - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Spin!" ); - SetEvent(); - do { - SpinWait(); - readpos = volatize(m_RingPos); - } while( (writepos < readpos) || (readpos==0) ); - } - } - - m_QueuedFrameCount = 0; - m_RingWrapSpot = RingBufferSize; - } #ifdef RINGBUF_DEBUG_STACK m_lock_Stack.Lock(); @@ -739,9 +728,8 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; tag.command = cmd; tag.data[0] = m_packet_size; - m_packet_ringpos = m_WritePos + 1; - - return m_packet_size; + m_packet_startpos = m_WritePos; + m_packet_ringpos = (m_WritePos + 1) & RingBufferMask; } // Returns the amount of giftag data processed (in simd128 values). @@ -749,13 +737,14 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) // around VU memory instead of having buffer overflow... // Parameters: // size - size of the packet data, in smd128's -int SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size ) +void SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, u32 size ) { //m_PacketLocker.Acquire(); - return PrepDataPacket( (MTGS_RingCommand)pathidx, GIFPath_ParseTag(pathidx, srcdata, size) ); + PrepDataPacket( (MTGS_RingCommand)pathidx, size ); } +#if 0 void SysMtgsThread::RestartRingbuffer( uint packsize ) { if( m_WritePos == 0 ) return; @@ -816,6 +805,7 @@ void SysMtgsThread::RestartRingbuffer( uint packsize ) if( EmuConfig.GS.SynchronousMTGS ) WaitGS(); } +#endif __forceinline uint SysMtgsThread::_PrepForSimplePacket() { @@ -830,10 +820,7 @@ __forceinline uint SysMtgsThread::_PrepForSimplePacket() future_writepos &= RingBufferMask; if( future_writepos == 0 ) - { m_QueuedFrameCount = 0; - m_RingWrapSpot = RingBufferSize; - } uint readpos = volatize(m_RingPos); if( future_writepos == readpos ) @@ -841,7 +828,15 @@ __forceinline uint SysMtgsThread::_PrepForSimplePacket() // The ringbuffer read pos is blocking the future write position, so stall out // until the read position has moved. 
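// [Editor's sketch -- illustration, not part of the patch] The sleep half of the
// stall protocol used here and in PrepDataPacket, reduced to its steps. The EE
// publishes how much the GS thread must consume before waking it, then sleeps:
//
//   m_SignalRingPosition = somedone;           // qwc the GS must drain first
//   AtomicExchange( m_SignalRingEnable, 1 );   // arm the wakeup signal
//   SetEvent();                                // make sure the GS thread is running
//   m_sem_OnRingReset.WaitWithoutYield();      // sleep until the GS posts it
//
// On the GS side, ExecuteTaskInThread subtracts retired packets from
// m_SignalRingPosition and posts m_sem_OnRingReset once it goes non-positive,
// which is why the callers assert m_SignalRingPosition <= 0 after waking.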
- uint totalAccum = (m_RingWrapSpot - readpos) + future_writepos; + uint freeroom; + + if (future_writepos < readpos) + freeroom = readpos - future_writepos; + else + freeroom = RingBufferSize - (future_writepos - readpos); + + uint totalAccum = RingBufferSize - freeroom; + uint somedone = totalAccum / 4; if( somedone > 0x80 ) diff --git a/pcsx2/PluginManager.cpp b/pcsx2/PluginManager.cpp index 080f1f5e9d..558a12180f 100644 --- a/pcsx2/PluginManager.cpp +++ b/pcsx2/PluginManager.cpp @@ -144,6 +144,7 @@ static s32 CALLBACK fallback_test() { return 0; } _GSvsync GSvsync; _GSopen GSopen; _GSopen2 GSopen2; +_GSgifTransfer GSgifTransfer; _GSgifTransfer1 GSgifTransfer1; _GSgifTransfer2 GSgifTransfer2; _GSgifTransfer3 GSgifTransfer3; @@ -309,7 +310,8 @@ static const LegacyApi_ReqMethod s_MethMessReq_GS[] = { { "GSopen", (vMeth**)&GSopen, NULL }, { "GSvsync", (vMeth**)&GSvsync, NULL }, - { "GSgifTransfer1", (vMeth**)&GSgifTransfer1, NULL }, + { "GSgifTransfer", (vMeth**)&GSgifTransfer, NULL }, + //{ "GSgifTransfer1", (vMeth**)&GSgifTransfer1, NULL }, { "GSgifTransfer2", (vMeth**)&GSgifTransfer2, NULL }, { "GSgifTransfer3", (vMeth**)&GSgifTransfer3, NULL }, { "GSreadFIFO2", (vMeth**)&GSreadFIFO2, NULL }, diff --git a/pcsx2/VUops.cpp b/pcsx2/VUops.cpp index 6baaacebad..0172fd1d9f 100644 --- a/pcsx2/VUops.cpp +++ b/pcsx2/VUops.cpp @@ -2057,21 +2057,8 @@ void _vuXGKICK(VURegs * VU) u8* data = ((u8*)VU->Mem + ((VU->VI[_Is_].US[0]*16) & 0x3fff)); u32 size; - size = GetMTGS().PrepDataPacket( GIF_PATH_1, data, (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4); - u8* pmem = GetMTGS().GetDataPacketPtr(); - - if((size << 4) > (u32)(0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff))) - { - //DevCon.Warning("addr + Size = 0x%x, transferring %x then doing %x", ((VU->VI[_Is_].US[0]*16) & 0x3fff) + (size << 4), (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4, size - (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff) >> 4)); - memcpy_aligned(pmem, (u8*)VU->Mem+((VU->VI[_Is_].US[0]*16) & 0x3fff), 0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)); - size -= (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4; - //DevCon.Warning("Size left %x", size); - pmem += 0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff); - memcpy_aligned(pmem, (u8*)VU->Mem, size<<4); - } - else { - memcpy_aligned(pmem, (u8*)VU->Mem+((VU->VI[_Is_].US[0]*16) & 0x3fff), size<<4); - } + GetMTGS().PrepDataPacket( GIF_PATH_1, 0x400 ); + size = GIFPath_CopyTag( GIF_PATH_1, (u128*)data, (0x400-(VU->VI[_Is_].US[0] & 0x3ff)) ); GetMTGS().SendDataPacket(); } diff --git a/pcsx2/Vif_Codes.cpp b/pcsx2/Vif_Codes.cpp index fb2fb3a9f9..f7af604502 100644 --- a/pcsx2/Vif_Codes.cpp +++ b/pcsx2/Vif_Codes.cpp @@ -213,8 +213,8 @@ template _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { v.bSize = 0; v.bPtr = 0; } - const uint count = GetMTGS().PrepDataPacket(GIF_PATH_2, v.buffer, 1); - memcpy_fast(GetMTGS().GetDataPacketPtr(), v.buffer, count << 4); + GetMTGS().PrepDataPacket(GIF_PATH_2, 1); + GIFPath_CopyTag(GIF_PATH_2, (u128*)v.buffer, 1); GetMTGS().SendDataPacket(); if(vif1.tag.size == 0) @@ -226,16 +226,17 @@ template _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { } else { - const uint count = GetMTGS().PrepDataPacket(GIF_PATH_2, data, size >> 4); - memcpy_fast(GetMTGS().GetDataPacketPtr(), data, count << 4); + GetMTGS().PrepDataPacket(GIF_PATH_2, size/16); + uint count = GIFPath_CopyTag(GIF_PATH_2, (u128*)data, size/16) * 4; GetMTGS().SendDataPacket(); - vif1.tag.size -= count << 2; + + vif1.tag.size -= count; if(vif1.tag.size == 0) { vif1.cmd = 0; } vif1.vifstalled 
= true; - return count << 2; + return count; } } diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp index 02b6551e4f..3b5f477e0e 100644 --- a/pcsx2/ps2/GIFpath.cpp +++ b/pcsx2/ps2/GIFpath.cpp @@ -97,7 +97,7 @@ struct GIFPath u8 GetReg(); bool IsActive() const; - int ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size); + int CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size); int ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size); }; @@ -287,7 +287,8 @@ __forceinline void GIFPath::PrepPackedRegs() __forceinline void GIFPath::SetTag(const void* mem) { - const_cast(tag) = *((GIFTAG*)mem); + _mm_store_ps( (float*)&tag, _mm_loadu_ps((float*)mem) ); + //const_cast(tag) = *((GIFTAG*)mem); nloop = tag.NLOOP; curreg = 0; @@ -521,15 +522,50 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s return size; } -__forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) +void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len ) { + uint endpos = destStart + len; + if( endpos >= destSize ) + { + uint firstcopylen = RingBufferSize - destStart; + memcpy_aligned(&destBase[destStart], src, firstcopylen ); + + destStart = endpos & RingBufferMask; + memcpy_aligned(destBase, src+firstcopylen, destStart ); + } + else + { + memcpy_aligned(&destBase[destStart], src, len ); + destStart += len; + } +} + +// [TODO] optimization: If later templated, we can have Paths 1 and 3 use aligned SSE movs, +// since only PATH2 can feed us unaligned source data. +#define copyTag() do { \ + /*RingBuffer.m_Ring[ringpos] = *pMem128;*/ \ + _mm_store_ps( (float*)&RingBuffer.m_Ring[ringpos], _mm_loadu_ps((float*)pMem128)); \ + ++pMem128; --size; \ + ringpos = (ringpos+1)&RingBufferMask; \ +} while(false) + +__forceinline int GIFPath::CopyTag(GIF_PATH pathidx, const u128* pMem128, u32 size) +{ + uint& ringpos = GetMTGS().m_packet_ringpos; + const uint original_ringpos = ringpos; + u32 startSize = size; // Start Size while (size > 0) { if (!nloop) { - SetTag(pMem); - incTag(1); + // [TODO] Optimization: Use MMX intrinsics for SetTag and CopyTag, which both currently + // produce a series of mov eax,[src]; mov [dest],eax instructions to copy these + // individual qwcs. Warning: Path2 transfers are not always QWC-aligned, but they are + // always aligned on an 8 byte boundary; so its probably best to use MMX here. + + SetTag((u8*)pMem128); + copyTag(); if(nloop > 0) { @@ -599,9 +635,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) { do { if (GetReg() == 0xe) { - gsHandler(pMem); + gsHandler((u8*)pMem128); } - incTag(1); + copyTag(); } while(StepReg() && size > 0 && SIGNAL_IMR_Pending == false); } else @@ -644,11 +680,14 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) curreg = 0; nloop = 0; } - incTag(len); + + MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len ); + pMem128 += len; + size -= len; } break; case GIF_FLG_REGLIST: - { + { GIF_LOG("Reglist Mode EOP %x", tag.EOP); // In reglist mode, the GIF packs 2 registers into each QWC. 
The nloop however @@ -687,8 +726,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) nloop = 0; } - incTag(len); - + MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len ); + pMem128 += len; + size -= len; } break; case GIF_FLG_IMAGE: @@ -696,13 +736,15 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) { GIF_LOG("IMAGE Mode EOP %x", tag.EOP); int len = aMin(size, nloop); - incTag(len); + + MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len ); + + pMem128 += len; + size -= len; nloop -= len; } break; } - - } if(pathidx == GIF_PATH_1) @@ -713,11 +755,11 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) { size = 0x3ff - startSize; startSize = 0x3ff; - pMem -= 0x4000; + pMem128 -= 0x400; } else { - // Note: The BIOS does an XGKICK on the VU1 and lets yt DMA to the GS without an EOP + // Note: The BIOS does an XGKICK on the VU1 and lets it DMA to the GS without an EOP // (seemingly to loop forever), only to write an EOP later on. No other game is known to // do anything of the sort. // So lets just cap the DMA at 16k, and force it to "look" like it's terminated for now. @@ -727,6 +769,11 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) Console.Warning("GIFTAG error, size exceeded VU memory size %x", startSize); nloop = 0; + + // Don't send the packet to the GS -- its incomplete and might cause the GS plugin + // to get confused and die. >_< + + ringpos = original_ringpos; } } } @@ -793,47 +840,18 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) gif->qwc -= size; } } - - return size; } -// Processes a GIFtag & packet, and throws out some gsIRQs as needed. -// Used to keep interrupts in sync with the EE, while the GS itself -// runs potentially several frames behind. -// Parameters: -// size - max size of incoming data stream, in qwc (simd128) -__forceinline int GIFPath_ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) +__forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size) { -#ifdef PCSX2_GSRING_SAMPLING_STATS - static uptr profStartPtr = 0; - static uptr profEndPtr = 0; - if (profStartPtr == 0) { - __asm - { - __beginfunc: - mov profStartPtr, offset __beginfunc; - mov profEndPtr, offset __endfunc; - } - ProfilerRegisterSource( "GSRingBufCopy", (void*)profStartPtr, profEndPtr - profStartPtr ); - } -#endif - - int retSize = s_gifPath[pathidx].ParseTag(pathidx, pMem, size); - -#ifdef PCSX2_GSRING_SAMPLING_STATS - __asm - { - __endfunc: - nop; - } -#endif - return retSize; + return s_gifPath[pathidx].CopyTag(pathidx, pMem, size); } -//Quick version for queueing PATH1 data - +// Quick version for queueing PATH1 data. +// This version calculates the real length of the packet data only. It does not process +// IRQs or DMA status updates. 
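// [Editor's note -- not part of the patch] Division of labor after this change:
// ParseTagQuick below only measures a packet (used when queueing PATH1 data into
// Path1Buffer), while GIFPath_CopyTag above parses the stream *and* copies it
// straight into the MTGS ringbuffer, advancing GetMTGS().m_packet_ringpos as it
// goes -- which is why the old parse-then-memcpy call sites could be collapsed.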
__forceinline int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size) { int retSize = s_gifPath[pathidx].ParseTagQuick(pathidx, pMem, size); diff --git a/pcsx2/x86/microVU_Lower.inl b/pcsx2/x86/microVU_Lower.inl index b28427a46f..de20032e9e 100644 --- a/pcsx2/x86/microVU_Lower.inl +++ b/pcsx2/x86/microVU_Lower.inl @@ -1101,27 +1101,15 @@ void __fastcall mVU_XGKICK_(u32 addr) { if(gifRegs->stat.APATH <= GIF_APATH1 || (gifRegs->stat.APATH == GIF_APATH3 && gifRegs->stat.IP3 == true) && SIGNAL_IMR_Pending == false) { - if(Path1WritePos != 0) { //Flush any pending transfers so things dont go up in the wrong order while(gifRegs->stat.P1Q == true) gsPath1Interrupt(); } - size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff); - pDest = GetMTGS().GetDataPacketPtr(); - if (size > diff) { - // fixme: one of these days the following *16's will get cleaned up when we introduce - // a special qwc/simd16 optimized version of memcpy_aligned. :) - //DevCon.Status("XGkick Wrap!"); - memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16); - size -= diff; - pDest += diff*16; - memcpy_aligned(pDest, microVU1.regs->Mem, size*16); - } - else { - memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16); - } + GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400); + size = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, diff); GetMTGS().SendDataPacket(); + if(GSTransferStatus.PTH1 == STOPPED_MODE) { gifRegs->stat.OPH = false; @@ -1141,14 +1129,14 @@ void __fastcall mVU_XGKICK_(u32 addr) { // fixme: one of these days the following *16's will get cleaned up when we introduce // a special qwc/simd16 optimized version of memcpy_aligned. :) //DevCon.Status("XGkick Wrap!"); - memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16); + memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff); Path1WritePos += size; size -= diff; pDest += diff*16; - memcpy_aligned(pDest, microVU1.regs->Mem, size*16); + memcpy_aligned(pDest, microVU1.regs->Mem, size); } else { - memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16); + memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size); Path1WritePos += size; } //if(!gifRegs->stat.P1Q) CPU_INT(28, 128); diff --git a/pcsx2/x86/sVU_Lower.cpp b/pcsx2/x86/sVU_Lower.cpp index c8d103477b..30ec18fd91 100644 --- a/pcsx2/x86/sVU_Lower.cpp +++ b/pcsx2/x86/sVU_Lower.cpp @@ -1988,21 +1988,10 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) //Flush any pending transfers so things dont go up in the wrong order while(gifRegs->stat.P1Q == true) gsPath1Interrupt(); } - size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff); - pDest = GetMTGS().GetDataPacketPtr(); - if (size > diff) { - // fixme: one of these days the following *16's will get cleaned up when we introduce - // a special qwc/simd16 optimized version of memcpy_aligned. 
:) - - memcpy_aligned(pDest, VU1.Mem + addr, diff*16); - size -= diff; - pDest += diff*16; - memcpy_aligned(pDest, VU1.Mem, size*16); - } - else { - memcpy_aligned(pDest, VU1.Mem + addr, size*16); - } + GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400); + size = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, diff); GetMTGS().SendDataPacket(); + if(GSTransferStatus.PTH1 == STOPPED_MODE ) { gifRegs->stat.OPH = false; @@ -2015,8 +2004,6 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) size = GIFPath_ParseTagQuick(GIF_PATH_1, data, diff); pDest = &Path1Buffer[Path1WritePos*16]; - - pxAssumeMsg((Path1WritePos+size < sizeof(Path1Buffer)), "XGKick Buffer Overflow detected on Path1Buffer!"); //DevCon.Warning("Storing size %x PATH 1", size); @@ -2024,14 +2011,14 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) // fixme: one of these days the following *16's will get cleaned up when we introduce // a special qwc/simd16 optimized version of memcpy_aligned. :) //DevCon.Status("XGkick Wrap!"); - memcpy_aligned(pDest, VU1.Mem + addr, diff*16); + memcpy_aligned(pDest, VU1.Mem + addr, diff); Path1WritePos += size; size -= diff; pDest += diff*16; - memcpy_aligned(pDest, VU1.Mem, size*16); + memcpy_aligned(pDest, VU1.Mem, size); } else { - memcpy_aligned(pDest, VU1.Mem + addr, size*16); + memcpy_aligned(pDest, VU1.Mem + addr, size); Path1WritePos += size; } //if(!gifRegs->stat.P1Q) CPU_INT(28, 128); From 472358345a0ba9d2096b8a14cf90a87e52599ef6 Mon Sep 17 00:00:00 2001 From: arcum42 Date: Sun, 11 Jul 2010 06:16:16 +0000 Subject: [PATCH 03/26] ReorderingMTGS: zzogl-pg: Add GSgifTransfer. pcsx2: Add an include so Linux compiles. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3459 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/include/PS2Edefs.h | 1 + pcsx2/ps2/GIFpath.cpp | 1 + plugins/zzogl-pg/opengl/GS.h | 2 +- plugins/zzogl-pg/opengl/GifTransfer.cpp | 13 +++++++++++-- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/common/include/PS2Edefs.h b/common/include/PS2Edefs.h index f394cf5025..64a98e1016 100644 --- a/common/include/PS2Edefs.h +++ b/common/include/PS2Edefs.h @@ -248,6 +248,7 @@ void CALLBACK GSsetSettingsDir( const char* dir ); void CALLBACK GSsetLogDir( const char* dir ); void CALLBACK GSvsync(int field); +void CALLBACK GSgifTransfer(u32 *pMem, u32 addr); void CALLBACK GSgifTransfer1(u32 *pMem, u32 addr); void CALLBACK GSgifTransfer2(u32 *pMem, u32 size); void CALLBACK GSgifTransfer3(u32 *pMem, u32 size); diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp index 3b5f477e0e..bb148ac3b4 100644 --- a/pcsx2/ps2/GIFpath.cpp +++ b/pcsx2/ps2/GIFpath.cpp @@ -19,6 +19,7 @@ #include "Gif.h" #include "Vif_Dma.h" #include "Vif.h" +#include // -------------------------------------------------------------------------------------- // GIFpath -- the GIFtag Parser diff --git a/plugins/zzogl-pg/opengl/GS.h b/plugins/zzogl-pg/opengl/GS.h index 3ac73bde9b..8ef2d0175b 100644 --- a/plugins/zzogl-pg/opengl/GS.h +++ b/plugins/zzogl-pg/opengl/GS.h @@ -635,7 +635,7 @@ typedef struct int imageTransfer; int imageWnew, imageHnew, imageX, imageY, imageEndX, imageEndY; - pathInfo path[3]; + pathInfo path[4]; GIFRegDIMX dimx; void setRGBA(u32 r, u32 g, u32 b, u32 a) { diff --git a/plugins/zzogl-pg/opengl/GifTransfer.cpp b/plugins/zzogl-pg/opengl/GifTransfer.cpp index d8776eff13..4939f53dd9 100644 --- a/plugins/zzogl-pg/opengl/GifTransfer.cpp +++ b/plugins/zzogl-pg/opengl/GifTransfer.cpp @@ -265,8 +265,17 @@ void CALLBACK GSgifTransfer3(u32 *pMem, u32 size) 
_GSgifTransfer<2>(pMem, size); } -void InitPath() +void CALLBACK GSgifTransfer(u32 *pMem, u32 size) { - gs.path[0].mode = gs.path[1].mode = gs.path[2].mode = 0; + FUNCLOG + + //ZZLog::GS_Log("GSgifTransfer3 size = %lx (mode %d, gs.path3.tag.nloop = %d).", size, gs.path[2].mode, gs.path[2].tag.nloop); + + _GSgifTransfer<3>(pMem, size); +} + +void InitPath() +{ + gs.path[0].mode = gs.path[1].mode = gs.path[2].mode = gs.path[3].mode = 0; } From ec7e1ed0a11bf3626f9f59e95a65a9d6aaae472b Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Sun, 11 Jul 2010 13:23:56 +0000 Subject: [PATCH 04/26] ReorderingMTGS: Bugfix for possible corruption/crash if a vsync happens at the ends of the ringbuffer. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3464 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/GS.h | 5 +++++ pcsx2/MTGS.cpp | 14 +++++++------- pcsx2/ps2/GIFpath.cpp | 22 ++++++++++++++++++++-- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/pcsx2/GS.h b/pcsx2/GS.h index d3232ef2aa..e23aaf0a07 100644 --- a/pcsx2/GS.h +++ b/pcsx2/GS.h @@ -443,3 +443,8 @@ struct MTGS_BufferedData }; extern __aligned(32) MTGS_BufferedData RingBuffer; + +// FIXME: These belong in common with other memcpy tools. Will move them there later if no one +// else beats me to it. --air +extern void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len ); +extern void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len ); diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index 04a3a42db1..3dd1eba59c 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -132,14 +132,14 @@ void SysMtgsThread::PostVsyncEnd() uint packsize = sizeof(RingCmdPacket_Vsync) / 16; PrepDataPacket(GS_RINGTYPE_VSYNC, packsize); - RingCmdPacket_Vsync& local( *(RingCmdPacket_Vsync*)GetDataPacketPtr() ); + MemCopy_WrappedDest( (u128*)PS2MEM_GS, RingBuffer.m_Ring, m_packet_ringpos, RingBufferSize, 0xf ); - memcpy_fast( local.regset1, PS2MEM_GS, sizeof(local.regset1) ); - local.csr = GSCSRr; - local.imr = GSIMR; - local.siglblid = GSSIGLBLID; - - m_packet_ringpos += packsize; + u32* remainder = (u32*)GetDataPacketPtr(); + remainder[0] = GSCSRr; + remainder[1] = GSIMR; + (GSRegSIGBLID&)remainder[2] = GSSIGLBLID; + m_packet_ringpos = (m_packet_ringpos + 1) & RingBufferMask; + SendDataPacket(); // Alter-frame flushing! Restarts the ringbuffer (wraps) on every other frame. 
This is a diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp index bb148ac3b4..8ea9091ce0 100644 --- a/pcsx2/ps2/GIFpath.cpp +++ b/pcsx2/ps2/GIFpath.cpp @@ -528,10 +528,10 @@ void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint uint endpos = destStart + len; if( endpos >= destSize ) { - uint firstcopylen = RingBufferSize - destStart; + uint firstcopylen = destSize - destStart; memcpy_aligned(&destBase[destStart], src, firstcopylen ); - destStart = endpos & RingBufferMask; + destStart = endpos % destSize; memcpy_aligned(destBase, src+firstcopylen, destStart ); } else @@ -541,6 +541,24 @@ void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint } } +void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len ) +{ + uint endpos = srcStart + len; + if( endpos >= srcSize ) + { + uint firstcopylen = srcSize - srcStart; + memcpy_aligned(dest, &srcBase[srcStart], firstcopylen ); + + srcStart = endpos & srcSize; + memcpy_aligned(dest+firstcopylen, srcBase, srcStart ); + } + else + { + memcpy_aligned(dest, &srcBase[srcStart], len ); + srcStart += len; + } +} + // [TODO] optimization: If later templated, we can have Paths 1 and 3 use aligned SSE movs, // since only PATH2 can feed us unaligned source data. #define copyTag() do { \ From a6b3acb5d0a9da5b92c7f6feac2a328f4fddd15f Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Mon, 12 Jul 2010 04:04:40 +0000 Subject: [PATCH 05/26] ReorderingMTGS: Minor performance tweak. MTGS was only queuing 1 frame ahead instead of 2. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3467 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/MTGS.cpp | 74 +++++--------------------------------------------- 1 file changed, 7 insertions(+), 67 deletions(-) diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index 3dd1eba59c..5727ca9929 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -149,7 +149,7 @@ void SysMtgsThread::PostVsyncEnd() // and they also allow us to reuse the front of the ringbuffer more often, which should improve // L2 cache performance. - if( AtomicIncrement(m_QueuedFrameCount) == 0 ) return; + if( AtomicIncrement(m_QueuedFrameCount) < 2 ) return; uint readpos = volatize(m_RingPos); uint freeroom; @@ -164,7 +164,7 @@ void SysMtgsThread::PostVsyncEnd() m_SignalRingPosition = totalAccum; - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Vsync Sleep!\t\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition ); + //Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\tringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", readpos, m_WritePos, m_SignalRingPosition ); AtomicExchange( m_SignalRingEnable, 1 ); SetEvent(); @@ -694,7 +694,7 @@ void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" ); m_SignalRingPosition = somedone; - //Console.WriteLn( Color_Blue, "(EEcore Sleep) GenStall \tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", readpos, writepos, m_RingWrapSpot, m_SignalRingPosition ); + //Console.WriteLn( Color_Blue, "(EEcore Sleep) PrepDataPacker \tringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", readpos, writepos, m_SignalRingPosition ); do { AtomicExchange( m_SignalRingEnable, 1 ); @@ -708,6 +708,7 @@ void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) } else { + //Console.WriteLn( Color_StrongGray, "(EEcore Spin) PrepDataPacket!" 
); SetEvent(); do { SpinWait(); @@ -744,69 +745,6 @@ void SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, u32 size ) PrepDataPacket( (MTGS_RingCommand)pathidx, size ); } -#if 0 -void SysMtgsThread::RestartRingbuffer( uint packsize ) -{ - if( m_WritePos == 0 ) return; - const uint thefuture = packsize; - - //Console.WriteLn( Color_Magenta, "**** Ringbuffer Restart!!" ); - // Always kick the MTGS into action for a ringbuffer restart. - SetEvent(); - - uint readpos = volatize(m_RingPos); - - if( (readpos > m_WritePos) || (readpos <= thefuture) ) - { - // We have to be careful not to leapfrog our read-position, which would happen if - // it's greater than the current write position (since wrapping writepos to 0 would - // be the act of skipping PAST readpos). Stall until it loops around to the - // beginning of the buffer, and past the size of our packet allocation. - - uint somedone; - - if( readpos > m_WritePos ) - somedone = (m_RingWrapSpot - readpos) + packsize + 1; - else - somedone = (packsize + 1) - readpos; - - if( somedone > 0x80 ) - { - m_SignalRingPosition = somedone; - //Console.WriteLn( Color_Blue, "(EEcore Sleep) Restart!\tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", - // readpos, m_WritePos, m_RingWrapSpot, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); - } while( (readpos > m_WritePos) || (readpos <= thefuture) ); - } - else - { - SetEvent(); - do { - SpinWait(); - readpos = volatize(m_RingPos); - } while( (readpos > m_WritePos) || (readpos <= thefuture) ); - } - } - - PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; - - tag.command = GS_RINGTYPE_RESTART; - - m_RingWrapSpot = m_WritePos; - m_WritePos = 0; - m_QueuedFrameCount = 0; - - if( EmuConfig.GS.SynchronousMTGS ) - WaitGS(); -} -#endif - __forceinline uint SysMtgsThread::_PrepForSimplePacket() { #ifdef RINGBUF_DEBUG_STACK @@ -843,7 +781,7 @@ __forceinline uint SysMtgsThread::_PrepForSimplePacket() { m_SignalRingPosition = somedone; - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Simple Sleep!\t\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition ); + //Console.WriteLn( Color_Blue, "(EEcore Sleep) PrepSimplePacket\tringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", readpos, m_WritePos, m_SignalRingPosition ); do { AtomicExchange( m_SignalRingEnable, 1 ); @@ -857,6 +795,8 @@ __forceinline uint SysMtgsThread::_PrepForSimplePacket() } else { + //Console.WriteLn( Color_StrongGray, "(EEcore Spin) PrepSimplePacket!" ); + SetEvent(); do { SpinWait(); From 934578c8fe50e3785437cb352115268663d809f0 Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Mon, 12 Jul 2010 19:40:30 +0000 Subject: [PATCH 06/26] ReorderingMTGS: Added a new optimized memcpy_amd_qwc, for use by GIFpath copies. After much studying, we determined this is about as efficient as memcpy will ever get, for what we're doing with it. DevNot: Win32-only at the moment -- needs a GAS port (but that shouldn't be hard). I made some notes in the code about it. 
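[Editor's note, not author text: per the Threading.h comment added in this patch,
memcpy_qwc deliberately leaves out write fencing so several copies can be issued
back-to-back; the new Threading::StoreFence() (a bare sfence) is provided to be
called once before handing the buffered data over to the GS thread.]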
git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3472 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/include/Utilities/MemcpyFast.h | 5 +- common/include/Utilities/Threading.h | 4 + common/src/Utilities/Windows/WinThreads.cpp | 5 + common/src/Utilities/x86/MemcpyFast.cpp | 147 ++++++++++++-------- pcsx2/Config.h | 3 + pcsx2/MTGS.cpp | 14 +- pcsx2/Pcsx2Config.cpp | 2 + pcsx2/ps2/GIFpath.cpp | 32 ++--- pcsx2/x86/microVU_Lower.inl | 6 +- 9 files changed, 133 insertions(+), 85 deletions(-) diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h index 76526a5eed..a8184b345b 100644 --- a/common/include/Utilities/MemcpyFast.h +++ b/common/include/Utilities/MemcpyFast.h @@ -28,6 +28,7 @@ # include "win_memzero.h" extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes); + extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes); extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize); extern void memxor_mmx(void* dst, const void* src1, int cmpsize); @@ -37,6 +38,8 @@ void _memset16_unaligned( void* dest, u16 data, size_t size ); #define memcpy_fast memcpy_amd_ // Fast memcpy -#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c*16) // Memcpy with 16-byte Aligned addresses +#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c) // Memcpy with 16-byte Aligned addresses #define memcpy_const memcpy_amd_ // Memcpy with constant size #define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned +#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c) +//#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16) diff --git a/common/include/Utilities/Threading.h b/common/include/Utilities/Threading.h index 5df1b80621..a3fa5261fa 100644 --- a/common/include/Utilities/Threading.h +++ b/common/include/Utilities/Threading.h @@ -129,6 +129,10 @@ namespace Threading // For use in spin/wait loops. extern void SpinWait(); + + // Use prior to committing data to another thread (internal memcpy_qwc does not use fencing, + // so that many memcpys can be issued in a row more efficiently) + extern void StoreFence(); // Optional implementation to enable hires thread/process scheduler for the operating system. // Needed by Windows, but might not be relevant to other platforms. diff --git a/common/src/Utilities/Windows/WinThreads.cpp b/common/src/Utilities/Windows/WinThreads.cpp index 0133f89e38..22cdfb21d6 100644 --- a/common/src/Utilities/Windows/WinThreads.cpp +++ b/common/src/Utilities/Windows/WinThreads.cpp @@ -36,6 +36,11 @@ __forceinline void Threading::SpinWait() __asm pause; } +__forceinline void Threading::StoreFence() +{ + __asm sfence; +} + __forceinline void Threading::EnableHiresScheduler() { // This improves accuracy of Sleep() by some amount, and only adds a negligible amount of diff --git a/common/src/Utilities/x86/MemcpyFast.cpp b/common/src/Utilities/x86/MemcpyFast.cpp index 40caf98308..bbf2fb8d5f 100644 --- a/common/src/Utilities/x86/MemcpyFast.cpp +++ b/common/src/Utilities/x86/MemcpyFast.cpp @@ -146,7 +146,7 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy add esi, 64 ; update source pointer add edi, 64 ; update destination pointer - dec eax ; count down + sub eax, 1 jnz $memcpy_ic_1 ; last 64-byte block? $memcpy_ic_2: @@ -189,64 +189,15 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy movq mm1,[esi-8] movntq [edi-24], mm2 movntq [edi-16], mm0 - dec eax movntq [edi-8], mm1 + + sub eax, 1 jnz $memcpy_uc_1 ; last 64-byte block? 
jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed) -// For the largest size blocks, a special technique called Block Prefetch -// can be used to accelerate the read operations. Block Prefetch reads -// one address per cache line, for a series of cache lines, in a short loop. -// This is faster than using software prefetch. The technique is great for -// getting maximum read bandwidth, especially in DDR memory systems. - -// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to -// help keep the code cache footprint of memcpy_fast to a minimum. -/* -$memcpy_bp_1: ; large blocks, block prefetch copy - - cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop? - jl $memcpy_64_test ; no, back to regular uncached copy - - mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X - add esi, CACHEBLOCK * 64 ; move to the top of the block -align 16 -$memcpy_bp_2: - mov edx, [esi-64] ; grab one address per cache line - mov edx, [esi-128] ; grab one address per cache line - sub esi, 128 ; go reverse order to suppress HW prefetcher - dec eax ; count down the cache lines - jnz $memcpy_bp_2 ; keep grabbing more lines into cache - - mov eax, CACHEBLOCK ; now that it's in cache, do the copy -align 16 -$memcpy_bp_3: - movq mm0, [esi ] ; read 64 bits - movq mm1, [esi+ 8] - movq mm2, [esi+16] - movq mm3, [esi+24] - movq mm4, [esi+32] - movq mm5, [esi+40] - movq mm6, [esi+48] - movq mm7, [esi+56] - add esi, 64 ; update source pointer - movntq [edi ], mm0 ; write 64 bits, bypassing cache - movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU - movntq [edi+16], mm2 ; from READING the destination address - movntq [edi+24], mm3 ; into the cache, only to be over-written, - movntq [edi+32], mm4 ; so that also helps performance - movntq [edi+40], mm5 - movntq [edi+48], mm6 - movntq [edi+56], mm7 - add edi, 64 ; update dest pointer - - dec eax ; count down - - jnz $memcpy_bp_3 ; keep copying - sub ecx, CACHEBLOCK ; update the 64-byte block count - jmp $memcpy_bp_1 ; keep processing chunks -*/ +// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been +// disabled to help keep the code cache footprint of memcpy_fast to a minimum. // The smallest copy uses the X86 "movsd" instruction, in an optimized // form which is an "unrolled loop". Then it handles the last few bytes. @@ -274,17 +225,99 @@ $memcpy_last_few: ; dword aligned from before movsd's rep movsb ; the last 1, 2, or 3 bytes $memcpy_final: + pop esi + pop edi + emms ; clean up the MMX state sfence ; flush the write buffer //mov eax, [dest] ; ret value = destination pointer - pop esi - pop edi - ret 4 } } +// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned. +__forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) +{ + // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM + // registers will improve copy performance, because they won't. Use of XMMs is only + // warranted in situations where both source and dest are guaranteed aligned to 16 bytes, + // and even then the benefits are typically minimal (sometimes slower depending on the + // amount of data being copied). + // + // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them. + // --air + + // Linux Conversion note: + // This code would benefit nicely from having inline-able GAS syntax, since it should + // allow GCC to optimize the first 3 instructions out of existence in many scenarios. 
+ // And its called enough times to probably merit the extra effort to ensure proper + // optimization. --air + + __asm + { + mov ecx, [dest] + mov edx, [src] + mov eax, [qwc] ; keep a copy of count + shr eax, 1 + jz $memcpy_qwc_1 ; only one 16 byte block to copy? + + cmp eax, IN_CACHE_COPY/32 + jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air) + +$memcpy_qwc_loop2: ; 32-byte blocks, uncached copy + prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air) + + movq mm0,[edx+0] ; read 64 bits + movq mm1,[edx+8] + movq mm2,[edx+16] + movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache + movntq [ecx+8], mm1 + movq mm3,[edx+24] + movntq [ecx+16], mm2 + movntq [ecx+24], mm3 + + add edx,32 ; update source pointer + add ecx,32 ; update destination pointer + sub eax,1 + jnz $memcpy_qwc_loop2 ; last 64-byte block? + sfence ; flush the write buffer + jmp $memcpy_qwc_1 + +; 32-byte blocks, cached! +; This *is* important. Removing this and using exclusively non-temporal stores +; results in noticable speed loss! + +$memcpy_qwc_loop1: + prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air) + + movq mm0,[edx+0] ; read 64 bits + movq mm1,[edx+8] + movq mm2,[edx+16] + movq [ecx+0], mm0 ; write 64 bits, bypassing the cache + movq [ecx+8], mm1 + movq mm3,[edx+24] + movq [ecx+16], mm2 + movq [ecx+24], mm3 + + add edx,32 ; update source pointer + add ecx,32 ; update destination pointer + sub eax,1 + jnz $memcpy_qwc_loop1 ; last 64-byte block? + +$memcpy_qwc_1: + test [qwc],1 + jz $memcpy_qwc_final + movq mm0,[edx] + movq mm1,[edx+8] + movq [ecx], mm0 + movq [ecx+8], mm1 + +$memcpy_qwc_final: + emms ; clean up the MMX state + } +} + // mmx mem-compare implementation, size has to be a multiple of 8 // returns 0 is equal, nonzero value if not equal // ~10 times faster than standard memcmp diff --git a/pcsx2/Config.h b/pcsx2/Config.h index a81f51d8e8..cc83918e92 100644 --- a/pcsx2/Config.h +++ b/pcsx2/Config.h @@ -395,6 +395,7 @@ struct Pcsx2Config // style. Useful for debugging potential bugs in the MTGS pipeline. bool SynchronousMTGS; bool DisableOutput; + int VsyncQueueSize; bool FrameLimitEnable; bool FrameSkipEnable; @@ -420,6 +421,8 @@ struct Pcsx2Config return OpEqu( SynchronousMTGS ) && OpEqu( DisableOutput ) && + OpEqu( VsyncQueueSize ) && + OpEqu( FrameSkipEnable ) && OpEqu( FrameLimitEnable ) && OpEqu( VsyncEnable ) && diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index 5727ca9929..da894e81d8 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -142,14 +142,11 @@ void SysMtgsThread::PostVsyncEnd() SendDataPacket(); - // Alter-frame flushing! Restarts the ringbuffer (wraps) on every other frame. This is a - // mandatory feature that prevents the MTGS from queuing more than 2 frames at any time. - // (queued frames cause input lag and desynced audio -- bad!). Ring restarts work for this - // because they act as sync points where the EE must stall to wait for the GS to catch-up, - // and they also allow us to reuse the front of the ringbuffer more often, which should improve - // L2 cache performance. + // If the MTGS is allowed to queue a lot of frames in advance, it creates input lag. + // Use the Queued FrameCount to stall the EE if another vsync is already queued in + // the ringbuffer. 
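// [Editor's sketch -- illustration, not part of the patch] With the VsyncQueueSize
// default of 2 added to Pcsx2Config.cpp in this same patch, the line below lets the
// EE run ahead until two vsyncs are already queued:
//
//   if( AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize ) return;
//
// The GS thread decrements m_QueuedFrameCount as it retires each vsync tag (see the
// AtomicDecrement added to the GS_RINGTYPE_VSYNC handler in patch 02), so the EE
// stays at most VsyncQueueSize frames ahead -- bounding input lag without the old
// forced ringbuffer restarts.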
- if( AtomicIncrement(m_QueuedFrameCount) < 2 ) return; + if( AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize ) return; uint readpos = volatize(m_RingPos); uint freeroom; @@ -190,7 +187,7 @@ void SysMtgsThread::OpenPlugin() { if( m_PluginOpened ) return; - memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS)/16 ); + memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) ); GSsetBaseMem( RingBuffer.Regs ); GSirqCallback( dummyIrqCallback ); @@ -624,6 +621,7 @@ void SysMtgsThread::SendDataPacket() PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos]; tag.data[0] = actualSize; + //Threading::StoreFence(); m_WritePos = m_packet_ringpos; if( EmuConfig.GS.SynchronousMTGS ) diff --git a/pcsx2/Pcsx2Config.cpp b/pcsx2/Pcsx2Config.cpp index efa01f4c59..a1aa88f307 100644 --- a/pcsx2/Pcsx2Config.cpp +++ b/pcsx2/Pcsx2Config.cpp @@ -217,6 +217,7 @@ Pcsx2Config::GSOptions::GSOptions() SynchronousMTGS = false; DisableOutput = false; + VsyncQueueSize = 2; DefaultRegionMode = Region_NTSC; FramesToDraw = 2; @@ -234,6 +235,7 @@ void Pcsx2Config::GSOptions::LoadSave( IniInterface& ini ) IniEntry( SynchronousMTGS ); IniEntry( DisableOutput ); + IniEntry( VsyncQueueSize ); IniEntry( FrameLimitEnable ); IniEntry( FrameSkipEnable ); diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp index 8ea9091ce0..f1a2c94ded 100644 --- a/pcsx2/ps2/GIFpath.cpp +++ b/pcsx2/ps2/GIFpath.cpp @@ -526,36 +526,36 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len ) { uint endpos = destStart + len; - if( endpos >= destSize ) + if( endpos < destSize ) { - uint firstcopylen = destSize - destStart; - memcpy_aligned(&destBase[destStart], src, firstcopylen ); - - destStart = endpos % destSize; - memcpy_aligned(destBase, src+firstcopylen, destStart ); + memcpy_qwc(&destBase[destStart], src, len ); + destStart += len; } else { - memcpy_aligned(&destBase[destStart], src, len ); - destStart += len; + uint firstcopylen = destSize - destStart; + memcpy_qwc(&destBase[destStart], src, firstcopylen ); + + destStart = endpos % destSize; + memcpy_qwc(destBase, src+firstcopylen, destStart ); } } void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len ) { uint endpos = srcStart + len; - if( endpos >= srcSize ) + if( endpos < srcSize ) { - uint firstcopylen = srcSize - srcStart; - memcpy_aligned(dest, &srcBase[srcStart], firstcopylen ); - - srcStart = endpos & srcSize; - memcpy_aligned(dest+firstcopylen, srcBase, srcStart ); + memcpy_qwc(dest, &srcBase[srcStart], len ); + srcStart += len; } else { - memcpy_aligned(dest, &srcBase[srcStart], len ); - srcStart += len; + uint firstcopylen = srcSize - srcStart; + memcpy_qwc(dest, &srcBase[srcStart], firstcopylen ); + + srcStart = endpos & srcSize; + memcpy_qwc(dest+firstcopylen, srcBase, srcStart ); } } diff --git a/pcsx2/x86/microVU_Lower.inl b/pcsx2/x86/microVU_Lower.inl index de20032e9e..0068975b99 100644 --- a/pcsx2/x86/microVU_Lower.inl +++ b/pcsx2/x86/microVU_Lower.inl @@ -1129,14 +1129,14 @@ void __fastcall mVU_XGKICK_(u32 addr) { // fixme: one of these days the following *16's will get cleaned up when we introduce // a special qwc/simd16 optimized version of memcpy_aligned. 
:) //DevCon.Status("XGkick Wrap!"); - memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff); + memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff); Path1WritePos += size; size -= diff; pDest += diff*16; - memcpy_aligned(pDest, microVU1.regs->Mem, size); + memcpy_qwc(pDest, microVU1.regs->Mem, size); } else { - memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size); + memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size); Path1WritePos += size; } //if(!gifRegs->stat.P1Q) CPU_INT(28, 128); From 43cd559801e070257065bb5e9f7c799844044b27 Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Tue, 13 Jul 2010 05:20:42 +0000 Subject: [PATCH 07/26] ReorderingMTGS: Templated the GIFPath parsers, to allow for SSE optimizations. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3474 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/GS.cpp | 1 + pcsx2/GS.h | 10 ++++++ pcsx2/ps2/GIFpath.cpp | 74 +++++++++++++++++++++++++++++-------------- 3 files changed, 62 insertions(+), 23 deletions(-) diff --git a/pcsx2/GS.cpp b/pcsx2/GS.cpp index b9028de2ee..fa7d5cfedd 100644 --- a/pcsx2/GS.cpp +++ b/pcsx2/GS.cpp @@ -49,6 +49,7 @@ void gsSetRegionMode( GS_RegionMode region ) void gsInit() { memzero(g_RealGSMem); + GIFPath_Initialize(); } extern bool SIGNAL_IMR_Pending; diff --git a/pcsx2/GS.h b/pcsx2/GS.h index e23aaf0a07..eeb337f2b6 100644 --- a/pcsx2/GS.h +++ b/pcsx2/GS.h @@ -18,6 +18,15 @@ #include "Common.h" #include "System/SysThreads.h" +enum CpuExtType +{ + CpuExt_Base, + CpuExt_MMX, + CpuExt_SSE, + CpuExt_SSE2, + CpuExt_SSE41, +}; + extern __aligned16 u8 g_RealGSMem[Ps2MemSize::GSregs]; enum CSR_FifoState @@ -229,6 +238,7 @@ enum GIF_PATH GIF_PATH_3, }; +extern void GIFPath_Initialize(); extern int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size); extern int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size); extern void GIFPath_Reset(); diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp index f1a2c94ded..2f2907e2d2 100644 --- a/pcsx2/ps2/GIFpath.cpp +++ b/pcsx2/ps2/GIFpath.cpp @@ -93,12 +93,16 @@ struct GIFPath void Reset(); void PrepPackedRegs(); - void SetTag(const void* mem); bool StepReg(); u8 GetReg(); bool IsActive() const; - int CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size); + template< CpuExtType CpuExt, bool Aligned > + void SetTag(const void* mem); + + template< CpuExtType CpuExt, int pathidx > + int CopyTag(const u128* pMem, u32 size); + int ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size); }; @@ -286,10 +290,14 @@ __forceinline void GIFPath::PrepPackedRegs() } } + +template< CpuExtType CpuExt, bool Aligned > __forceinline void GIFPath::SetTag(const void* mem) { - _mm_store_ps( (float*)&tag, _mm_loadu_ps((float*)mem) ); - //const_cast(tag) = *((GIFTAG*)mem); + if( CpuExt >= CpuExt_SSE ) + _mm_store_ps( (float*)&tag, Aligned ? _mm_load_ps((const float*)mem) : _mm_loadu_ps((const float*)mem) ); + else + const_cast(tag) = *((GIFTAG*)mem); nloop = tag.NLOOP; curreg = 0; @@ -373,10 +381,9 @@ static __forceinline void gsHandler(const u8* pMem) #define aMin(x, y) std::min(x, y) // Parameters: -// size (path1) - difference between the end of VU memory and pMem. -// size (path2/3) - max size of incoming data stream, in qwc (simd128) - - +// size - max size of incoming data stream, in qwc (simd128). If the path is PATH1, and the +// path does not terminate (EOP) within the specified size, it is assumed that the path must +// loop around to the start of VU memory and continue processing. 
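As an aside on the templated SetTag above: the CpuExt/Aligned parameters reduce
to a compile-time choice of 16-byte load. A minimal sketch of that dispatch
(LoadTag128 is hypothetical; dst is assumed 16-byte aligned, as &tag is):

	#include <cstring>
	#include <xmmintrin.h>

	template< bool UseSse, bool Aligned >
	inline void LoadTag128( void* dst, const void* src )
	{
		if( UseSse )
			_mm_store_ps( (float*)dst,
				Aligned ? _mm_load_ps((const float*)src) : _mm_loadu_ps((const float*)src) );
		else
			std::memcpy( dst, src, 16 );	// base path: plain 16-byte struct copy
	}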
__forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size) { u32 startSize = size; // Start Size @@ -384,7 +391,7 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s while (size > 0) { if (!nloop) { - SetTag(pMem); + SetTag(pMem); incTag(1); } else @@ -523,7 +530,7 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s return size; } -void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len ) +__forceinline void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len ) { uint endpos = destStart + len; if( endpos < destSize ) @@ -541,7 +548,7 @@ void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint } } -void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len ) +__forceinline void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len ) { uint endpos = srcStart + len; if( endpos < srcSize ) @@ -559,16 +566,21 @@ void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128 } } -// [TODO] optimization: If later templated, we can have Paths 1 and 3 use aligned SSE movs, -// since only PATH2 can feed us unaligned source data. #define copyTag() do { \ - /*RingBuffer.m_Ring[ringpos] = *pMem128;*/ \ - _mm_store_ps( (float*)&RingBuffer.m_Ring[ringpos], _mm_loadu_ps((float*)pMem128)); \ + if( CpuExt >= CpuExt_SSE ) \ + _mm_store_ps( (float*)&RingBuffer.m_Ring[ringpos], (pathidx!=GIF_PATH_2) ? _mm_load_ps((float*)pMem128) : _mm_loadu_ps((float*)pMem128)); \ + else \ + RingBuffer.m_Ring[ringpos] = *pMem128; \ ++pMem128; --size; \ ringpos = (ringpos+1)&RingBufferMask; \ } while(false) -__forceinline int GIFPath::CopyTag(GIF_PATH pathidx, const u128* pMem128, u32 size) +// Parameters: +// size - max size of incoming data stream, in qwc (simd128). If the path is PATH1, and the +// path does not terminate (EOP) within the specified size, it is assumed that the path must +// loop around to the start of VU memory and continue processing. +template< CpuExtType CpuExt, int pathidx > +__forceinline int GIFPath::CopyTag(const u128* pMem128, u32 size) { uint& ringpos = GetMTGS().m_packet_ringpos; const uint original_ringpos = ringpos; @@ -578,12 +590,7 @@ __forceinline int GIFPath::CopyTag(GIF_PATH pathidx, const u128* pMem128, u32 si while (size > 0) { if (!nloop) { - // [TODO] Optimization: Use MMX intrinsics for SetTag and CopyTag, which both currently - // produce a series of mov eax,[src]; mov [dest],eax instructions to copy these - // individual qwcs. Warning: Path2 transfers are not always QWC-aligned, but they are - // always aligned on an 8 byte boundary; so its probably best to use MMX here. - - SetTag((u8*)pMem128); + SetTag((u8*)pMem128); copyTag(); if(nloop > 0) @@ -863,9 +870,30 @@ __forceinline int GIFPath::CopyTag(GIF_PATH pathidx, const u128* pMem128, u32 si return size; } +typedef int __fastcall FnType_CopyTag(const u128* pMem, u32 size); + +static __aligned16 FnType_CopyTag* tbl_CopyTag[3]; + +// Parameters: +// size - max size of incoming data stream, in qwc (simd128). If the path is PATH1, and the +// path does not terminate (EOP) within the specified size, it is assumed that the path must +// loop around to the start of VU memory and continue processing. 
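MemCopy_WrappedDest above is the core of the new free-flowing ring wrap; the
same split-copy in standalone form (a sketch: std::memcpy stands in for
memcpy_qwc, and the element type is generalized):

	#include <cstring>

	template< typename T >
	void WrappedWrite( const T* src, T* ringBase, unsigned& ringPos, unsigned ringSize, unsigned len )
	{
		unsigned endpos = ringPos + len;
		if( endpos < ringSize )
		{
			std::memcpy( &ringBase[ringPos], src, len * sizeof(T) );
			ringPos += len;
		}
		else
		{
			unsigned first = ringSize - ringPos;	// fill to the end of the ring...
			std::memcpy( &ringBase[ringPos], src, first * sizeof(T) );
			ringPos = endpos % ringSize;			// ...then wrap to the start.
			std::memcpy( ringBase, src + first, ringPos * sizeof(T) );
		}
	}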
+template< CpuExtType CpuExt, int pathidx >
+static int __fastcall _CopyTag_tmpl(const u128* pMem, u32 size)
+{
+	return s_gifPath[pathidx].CopyTag<CpuExt, pathidx>(pMem, size);
+}
+
+void GIFPath_Initialize()
+{
+	tbl_CopyTag[0] = x86caps.hasStreamingSIMDExtensions ? _CopyTag_tmpl<CpuExt_SSE, 0> : _CopyTag_tmpl<CpuExt_Base, 0>;
+	tbl_CopyTag[1] = x86caps.hasStreamingSIMDExtensions ? _CopyTag_tmpl<CpuExt_SSE, 1> : _CopyTag_tmpl<CpuExt_Base, 1>;
+	tbl_CopyTag[2] = x86caps.hasStreamingSIMDExtensions ? _CopyTag_tmpl<CpuExt_SSE, 2> : _CopyTag_tmpl<CpuExt_Base, 2>;
+}
+
 __forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size)
 {
-	return s_gifPath[pathidx].CopyTag(pathidx, pMem, size);
+	return tbl_CopyTag[pathidx](pMem, size);
 }
 
 // Quick version for queueing PATH1 data.
From f6d0222a8c796f30b921328aed0d8242f478c916 Mon Sep 17 00:00:00 2001
From: arcum42
Date: Tue, 13 Jul 2010 09:28:24 +0000
Subject: [PATCH 08/26] ReorderingMTGS: Hackfix Linux, until some assembly is written.

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3476 96395faa-99c1-11dd-bbfe-3dabce05a288
---
 common/include/Utilities/MemcpyFast.h | 5 ++++-
 pcsx2/ps2/GIFpath.cpp                 | 8 ++++++++
 plugins/spu2-x/src/Linux/SPU2-X.cbp   | 2 --
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h
index a8184b345b..d78f9f8a8e 100644
--- a/common/include/Utilities/MemcpyFast.h
+++ b/common/include/Utilities/MemcpyFast.h
@@ -41,5 +41,8 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
 #define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c) // Memcpy with 16-byte Aligned addresses
 #define memcpy_const memcpy_amd_ // Memcpy with constant size
 #define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
+#ifndef __LINUX__
 #define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c)
-//#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16)
+#else
+#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16)
+#endif
diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp
index 2f2907e2d2..09c89c1200 100644
--- a/pcsx2/ps2/GIFpath.cpp
+++ b/pcsx2/ps2/GIFpath.cpp
@@ -886,9 +886,17 @@ static int __fastcall _CopyTag_tmpl(const u128* pMem, u32 size)
 
 void GIFPath_Initialize()
 {
+#ifdef __LINUX__
+	// It's already thrown an exception if it isn't SSE, and the check was giving me a compilation error.
+	// I could fix it, but why bother?
+	tbl_CopyTag[0] = _CopyTag_tmpl<CpuExt_SSE, 0>;
+	tbl_CopyTag[1] = _CopyTag_tmpl<CpuExt_SSE, 1>;
+	tbl_CopyTag[2] = _CopyTag_tmpl<CpuExt_SSE, 2>;
+#else
 	tbl_CopyTag[0] = x86caps.hasStreamingSIMDExtensions ? _CopyTag_tmpl<CpuExt_SSE, 0> : _CopyTag_tmpl<CpuExt_Base, 0>;
 	tbl_CopyTag[1] = x86caps.hasStreamingSIMDExtensions ? _CopyTag_tmpl<CpuExt_SSE, 1> : _CopyTag_tmpl<CpuExt_Base, 1>;
 	tbl_CopyTag[2] = x86caps.hasStreamingSIMDExtensions ? _CopyTag_tmpl<CpuExt_SSE, 2> : _CopyTag_tmpl<CpuExt_Base, 2>;
+#endif
 }
 
 __forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size)
diff --git a/plugins/spu2-x/src/Linux/SPU2-X.cbp b/plugins/spu2-x/src/Linux/SPU2-X.cbp
index c262b9f674..fa27c73506 100644
--- a/plugins/spu2-x/src/Linux/SPU2-X.cbp
+++ b/plugins/spu2-x/src/Linux/SPU2-X.cbp
@@ -195,8 +195,6 @@
-
-
From d1e0922417b4c69c2b66311023ba9464235428c7 Mon Sep 17 00:00:00 2001
From: arcum42
Date: Tue, 13 Jul 2010 10:48:35 +0000
Subject: [PATCH 09/26] ReorderingMTGS: Initial Linux version of memcpy_amd_qwc. Disabled for now, till I get a chance to look it over better.
git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3477 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/build/Utilities/Utilities.cbp | 4 ++ common/include/Utilities/MemcpyFast.h | 92 +++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/common/build/Utilities/Utilities.cbp b/common/build/Utilities/Utilities.cbp index 0e98314365..0d7fbbe596 100644 --- a/common/build/Utilities/Utilities.cbp +++ b/common/build/Utilities/Utilities.cbp @@ -208,6 +208,10 @@ + + diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h index d78f9f8a8e..9fc9331651 100644 --- a/common/include/Utilities/MemcpyFast.h +++ b/common/include/Utilities/MemcpyFast.h @@ -23,6 +23,95 @@ extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize); extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize); +#if 0 + // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now. + // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned. + static __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) + { + // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM + // registers will improve copy performance, because they won't. Use of XMMs is only + // warranted in situations where both source and dest are guaranteed aligned to 16 bytes, + // and even then the benefits are typically minimal (sometimes slower depending on the + // amount of data being copied). + // + // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them. + // --air + + // Linux Conversion note: + // This code would benefit nicely from having inline-able GAS syntax, since it should + // allow GCC to optimize the first 3 instructions out of existence in many scenarios. + // And its called enough times to probably merit the extra effort to ensure proper + // optimization. --air + + __asm__ + ( + ".intel_syntax noprefix\n" + "mov ecx, [%[dest]]\n" + "mov edx, [%[src]]\n" + "mov eax, [%[qwc]]\n" // keep a copy of count + "shr eax, 1\n" + "jz memcpy_qwc_1\n" // only one 16 byte block to copy? + + "cmp eax, 64\n" // "IN_CACHE_COPY/32" + "jb memcpy_qwc_loop1\n" // small copies should be cached (definite speedup --air) + + "memcpy_qwc_loop2:\n" // 32-byte blocks, uncached copy + "prefetchnta [edx + 568]\n" // start reading ahead (tested: it helps! --air) + + "movq mm0,[edx+0]\n" // read 64 bits + "movq mm1,[edx+8]\n" + "movq mm2,[edx+16]\n" + "movntq [ecx+0], mm0\n" // write 64 bits, bypassing the cache + "movntq [ecx+8], mm1\n" + "movq mm3,[edx+24]\n" + "movntq [ecx+16], mm2\n" + "movntq [ecx+24], mm3\n" + + "add edx,32\n" // update source pointer + "add ecx,32\n" // update destination pointer + "sub eax,1\n" + "jnz memcpy_qwc_loop2\n" // last 64-byte block? + "sfence\n" // flush the write buffer + "jmp memcpy_qwc_1\n" + + // 32-byte blocks, cached! + // This *is* important. Removing this and using exclusively non-temporal stores + // results in noticable speed loss! + + "memcpy_qwc_loop1:\n" + "prefetchnta [edx + 568]\n" // start reading ahead (tested: it helps! 
--air) + + "movq mm0,[edx+0]\n" // read 64 bits + "movq mm1,[edx+8]\n" + "movq mm2,[edx+16]\n" + "movq [ecx+0], mm0\n" // write 64 bits, bypassing the cache + "movq [ecx+8], mm1\n" + "movq mm3,[edx+24]\n" + "movq [ecx+16], mm2\n" + "movq [ecx+24], mm3\n" + + "add edx,32\n" // update source pointer + "add ecx,32\n" // update destination pointer + "sub eax,1\n" + "jnz memcpy_qwc_loop1\n" // last 64-byte block? + + "memcpy_qwc_1:\n" + "test [%[qwc]],dword ptr 1\n" + "jz memcpy_qwc_final\n" + "movq mm0,[edx]\n" + "movq mm1,[edx+8]\n" + "movq [ecx], mm0\n" + "movq [ecx+8], mm1\n" + + "memcpy_qwc_final:\n" + "emms\n" // clean up the MMX state + ".att_syntax\n" + : "=r"(dest), "=r"(src), "=r"(qwc) + : [dest]"r"(dest), [src]"r"(src), [qwc]"r"(qwc) + //: Needs a clobber list here + ); + } +#endif #else # include "win_memzero.h" @@ -41,8 +130,11 @@ void _memset16_unaligned( void* dest, u16 data, size_t size ); #define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c) // Memcpy with 16-byte Aligned addresses #define memcpy_const memcpy_amd_ // Memcpy with constant size #define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned + +//#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c) #ifndef __LINUX__ #define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c) #else #define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16) +//#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c) #endif From 168a60ad13ab1b01ce832605ee5e67a120ad211c Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Tue, 13 Jul 2010 16:34:27 +0000 Subject: [PATCH 10/26] ReorderingMTGS: Some cleanups and minor fixes to thread benching. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3478 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/src/Utilities/x86/MemcpyFast.cpp | 8 +-- pcsx2/GS.h | 7 +- pcsx2/MTGS.cpp | 86 +++++++------------------ 3 files changed, 33 insertions(+), 68 deletions(-) diff --git a/common/src/Utilities/x86/MemcpyFast.cpp b/common/src/Utilities/x86/MemcpyFast.cpp index bbf2fb8d5f..64a8191068 100644 --- a/common/src/Utilities/x86/MemcpyFast.cpp +++ b/common/src/Utilities/x86/MemcpyFast.cpp @@ -256,9 +256,9 @@ __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) __asm { - mov ecx, [dest] - mov edx, [src] - mov eax, [qwc] ; keep a copy of count + mov ecx, dest + mov edx, src + mov eax, qwc ; keep a copy of count shr eax, 1 jz $memcpy_qwc_1 ; only one 16 byte block to copy? @@ -306,7 +306,7 @@ $memcpy_qwc_loop1: jnz $memcpy_qwc_loop1 ; last 64-byte block? $memcpy_qwc_1: - test [qwc],1 + test qwc,1 jz $memcpy_qwc_final movq mm0,[edx] movq mm1,[edx+8] diff --git a/pcsx2/GS.h b/pcsx2/GS.h index eeb337f2b6..a0fc4f7bb2 100644 --- a/pcsx2/GS.h +++ b/pcsx2/GS.h @@ -258,7 +258,6 @@ enum MTGS_RingCommand GS_RINGTYPE_P1 , GS_RINGTYPE_P2 , GS_RINGTYPE_P3 -, GS_RINGTYPE_RESTART , GS_RINGTYPE_VSYNC , GS_RINGTYPE_FRAMESKIP , GS_RINGTYPE_FREEZE @@ -291,10 +290,12 @@ public: volatile u32 m_SignalRingEnable; volatile s32 m_SignalRingPosition; - int m_QueuedFrameCount; + volatile s32 m_QueuedFrameCount; + volatile u32 m_VsyncSignalListener; - Mutex m_lock_RingBufferBusy; + Mutex m_mtx_RingBufferBusy; Semaphore m_sem_OnRingReset; + Semaphore m_sem_Vsync; // used to keep multiple threads from sending packets to the ringbuffer concurrently. 
// (currently not used or implemented -- is a planned feature for a future threaded VU1) diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index da894e81d8..ace42c811c 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -77,6 +77,7 @@ void SysMtgsThread::OnStart() m_packet_ringpos = 0; m_QueuedFrameCount = 0; + m_VsyncSignalListener = false; m_SignalRingEnable = 0; m_SignalRingPosition= 0; @@ -106,6 +107,7 @@ void SysMtgsThread::ResetGS() m_RingPos = m_WritePos; m_QueuedFrameCount = 0; + m_VsyncSignalListener = false; MTGS_LOG( "MTGS: Sending Reset..." ); SendSimplePacket( GS_RINGTYPE_RESET, 0, 0, 0 ); @@ -139,36 +141,22 @@ void SysMtgsThread::PostVsyncEnd() remainder[1] = GSIMR; (GSRegSIGBLID&)remainder[2] = GSSIGLBLID; m_packet_ringpos = (m_packet_ringpos + 1) & RingBufferMask; - + SendDataPacket(); + // Vsyncs should always start the GS thread, regardless of how little has actually be queued. + if (m_CopyDataTally != 0) SetEvent(); + // If the MTGS is allowed to queue a lot of frames in advance, it creates input lag. - // Use the Queued FrameCount to stall the EE if another vsync is already queued in - // the ringbuffer. + // Use the Queued FrameCount to stall the EE if another vsync (or two) are already queued + // in the ringbuffer. The queue limit is disabled when FrameLimiting is disabled, since + // the queue can have perverse effects on framerate benchmarking. - if( AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize ) return; + if ((AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize) || !EmuConfig.GS.FrameLimitEnable) return; - uint readpos = volatize(m_RingPos); - uint freeroom; - - if (m_WritePos < readpos) - freeroom = readpos - m_WritePos; - else - freeroom = RingBufferSize - (m_WritePos - readpos); - - uint totalAccum = RingBufferSize - freeroom; - uint somedone = totalAccum / 4; - - m_SignalRingPosition = totalAccum; - - //Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\tringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", readpos, m_WritePos, m_SignalRingPosition ); - - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - - pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); + m_VsyncSignalListener = true; + //Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\t\tringpos=0x%06x, writepos=0x%06x", volatize(m_RingPos), m_WritePos ); + m_sem_Vsync.WaitNoCancel(); } struct PacketTagType @@ -256,7 +244,7 @@ protected: public: RingBufferLock( SysMtgsThread& mtgs ) - : ScopedLock( mtgs.m_lock_RingBufferBusy ) + : ScopedLock( mtgs.m_mtx_RingBufferBusy ) , m_mtgs( mtgs ) { m_mtgs.m_RingBufferIsBusy = true; @@ -397,25 +385,20 @@ void SysMtgsThread::ExecuteTaskInThread() { switch( tag.command ) { - case GS_RINGTYPE_RESTART: - //MTGS_LOG( "(MTGS Packet Read) ringtype=Restart" ); - m_RingPos = 0; - continue; - case GS_RINGTYPE_VSYNC: { const int qsize = tag.data[0]; ringposinc += qsize; MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", !!(((u32&)RingBuffer.Regs[0x1000]) & 0x2000) ? 0 : 1, tag.data[1] ? "true" : "false" ); - + // Mail in the important GS registers. RingCmdPacket_Vsync& local((RingCmdPacket_Vsync&)RingBuffer[m_RingPos+1]); memcpy_fast( RingBuffer.Regs, local.regset1, sizeof(local.regset1)); ((u32&)RingBuffer.Regs[0x1000]) = local.csr; ((u32&)RingBuffer.Regs[0x1010]) = local.imr; ((GSRegSIGBLID&)RingBuffer.Regs[0x1080]) = local.siglblid; - + // CSR & 0x2000; is the pageflip id. 
GSvsync(((u32&)RingBuffer.Regs[0x1000]) & 0x2000); gsFrameSkip(); @@ -426,7 +409,12 @@ void SysMtgsThread::ExecuteTaskInThread() PADupdate(0); AtomicDecrement( m_QueuedFrameCount ); + if (!!AtomicExchange(m_VsyncSignalListener, false)) + m_sem_Vsync.Post(); + + busy.Release(); StateCheckInThread(); + busy.Acquire(); } break; @@ -512,6 +500,9 @@ void SysMtgsThread::ExecuteTaskInThread() m_sem_OnRingReset.Post(); } + if (!!AtomicExchange(m_VsyncSignalListener, false)) + m_sem_Vsync.Post(); + //Console.Warning( "(MTGS Thread) Nothing to do! ringpos=0x%06x", m_RingPos ); } } @@ -558,7 +549,7 @@ void SysMtgsThread::WaitGS() RethrowException(); do { - m_lock_RingBufferBusy.Wait(); + m_mtx_RingBufferBusy.Wait(); RethrowException(); } while( volatize(m_RingPos) != m_WritePos ); } @@ -588,32 +579,6 @@ void SysMtgsThread::SendDataPacket() // make sure a previous copy block has been started somewhere. pxAssert( m_packet_size != 0 ); - #if 0 - uint temp = m_packet_ringpos + m_packet_size; - pxAssert( temp <= RingBufferSize ); - temp &= RingBufferMask; - - if( IsDebugBuild ) - { - if( m_packet_ringpos + m_packet_size < RingBufferSize ) - { - uint readpos = volatize(m_RingPos); - if( readpos != m_WritePos ) - { - // The writepos should never leapfrog the readpos - // since that indicates a bad write. - if( m_packet_ringpos < readpos ) - pxAssert( temp < readpos ); - } - - // Updating the writepos should never make it equal the readpos, since - // that would stop the buffer prematurely (and indicates bad code in the - // ringbuffer manager) - pxAssert( readpos != temp ); - } - } - #endif - uint actualSize = ((m_packet_ringpos - m_packet_startpos) & RingBufferMask)-1; pxAssert( actualSize <= m_packet_size ); pxAssert( m_packet_ringpos < RingBufferSize ); @@ -621,7 +586,6 @@ void SysMtgsThread::SendDataPacket() PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos]; tag.data[0] = actualSize; - //Threading::StoreFence(); m_WritePos = m_packet_ringpos; if( EmuConfig.GS.SynchronousMTGS ) From d9477ab5f49d34f369b399c2fd5a13c980aedf29 Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Tue, 13 Jul 2010 16:36:57 +0000 Subject: [PATCH 11/26] ReorderingMTGS: only disable the vsync queue limit is both framelimiting AND vsync are disabled. (ensures no mysterious half-second input lag if some user has a bizarre config) git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3479 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/MTGS.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index ace42c811c..cd20f945f4 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -149,10 +149,10 @@ void SysMtgsThread::PostVsyncEnd() // If the MTGS is allowed to queue a lot of frames in advance, it creates input lag. // Use the Queued FrameCount to stall the EE if another vsync (or two) are already queued - // in the ringbuffer. The queue limit is disabled when FrameLimiting is disabled, since - // the queue can have perverse effects on framerate benchmarking. + // in the ringbuffer. The queue limit is disabled when both FrameLimiting and Vsync are + // disabled, since the queue can have perverse effects on framerate benchmarking. 
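The test below, expressed as a standalone predicate (a sketch; the parameters
correspond to EmuConfig.GS fields but are passed explicitly here):

	// Stall only when a queue limit applies at all: with the framelimiter and
	// vsync both off, benchmarking configs are never throttled by the queue.
	inline bool ShouldStallOnVsyncQueue( int queuedFrames, int queueSize,
	                                     bool frameLimit, bool vsync )
	{
		return (frameLimit || vsync) && (queuedFrames >= queueSize);
	}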
- if ((AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize) || !EmuConfig.GS.FrameLimitEnable) return; + if ((AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize) || (!EmuConfig.GS.VsyncEnable && !EmuConfig.GS.FrameLimitEnable)) return; m_VsyncSignalListener = true; //Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\t\tringpos=0x%06x, writepos=0x%06x", volatize(m_RingPos), m_WritePos ); From 6ded71561c2fa518c9e6d3d426d9b4dac1102226 Mon Sep 17 00:00:00 2001 From: arcum42 Date: Wed, 14 Jul 2010 09:19:46 +0000 Subject: [PATCH 12/26] ReorderingMTGS: Revise memcpy_amd_qwc for Linux. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3484 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/include/Utilities/MemcpyFast.h | 81 ++++++++++++--------------- 1 file changed, 37 insertions(+), 44 deletions(-) diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h index 9fc9331651..56c4d0ba39 100644 --- a/common/include/Utilities/MemcpyFast.h +++ b/common/include/Utilities/MemcpyFast.h @@ -23,7 +23,6 @@ extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize); extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize); -#if 0 // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now. // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned. static __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) @@ -46,30 +45,30 @@ __asm__ ( ".intel_syntax noprefix\n" - "mov ecx, [%[dest]]\n" - "mov edx, [%[src]]\n" - "mov eax, [%[qwc]]\n" // keep a copy of count - "shr eax, 1\n" + //"mov ecx, [%[dest]]\n" + //"mov edx, [%[src]]\n" + //"mov eax, [%[qwc]]\n" // keep a copy of count + "shr %[qwc], 1\n" "jz memcpy_qwc_1\n" // only one 16 byte block to copy? - "cmp eax, 64\n" // "IN_CACHE_COPY/32" + "cmp %[qwc], 64\n" // "IN_CACHE_COPY/32" "jb memcpy_qwc_loop1\n" // small copies should be cached (definite speedup --air) "memcpy_qwc_loop2:\n" // 32-byte blocks, uncached copy - "prefetchnta [edx + 568]\n" // start reading ahead (tested: it helps! --air) + "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) - "movq mm0,[edx+0]\n" // read 64 bits - "movq mm1,[edx+8]\n" - "movq mm2,[edx+16]\n" - "movntq [ecx+0], mm0\n" // write 64 bits, bypassing the cache - "movntq [ecx+8], mm1\n" - "movq mm3,[edx+24]\n" - "movntq [ecx+16], mm2\n" - "movntq [ecx+24], mm3\n" + "movq mm0,[%[src]+0]\n" // read 64 bits + "movq mm1,[%[src]+8]\n" + "movq mm2,[%[src]+16]\n" + "movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache + "movntq [%[dest]+8], mm1\n" + "movq mm3,[%[src]+24]\n" + "movntq [%[dest]+16], mm2\n" + "movntq [%[dest]+24], mm3\n" - "add edx,32\n" // update source pointer - "add ecx,32\n" // update destination pointer - "sub eax,1\n" + "add %[src],32\n" // update source pointer + "add %[dest],32\n" // update destination pointer + "sub %[qwc],1\n" "jnz memcpy_qwc_loop2\n" // last 64-byte block? "sfence\n" // flush the write buffer "jmp memcpy_qwc_1\n" @@ -79,39 +78,38 @@ // results in noticable speed loss! "memcpy_qwc_loop1:\n" - "prefetchnta [edx + 568]\n" // start reading ahead (tested: it helps! --air) + "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! 
--air) - "movq mm0,[edx+0]\n" // read 64 bits - "movq mm1,[edx+8]\n" - "movq mm2,[edx+16]\n" - "movq [ecx+0], mm0\n" // write 64 bits, bypassing the cache - "movq [ecx+8], mm1\n" - "movq mm3,[edx+24]\n" - "movq [ecx+16], mm2\n" - "movq [ecx+24], mm3\n" + "movq mm0,[%[src]+0]\n" // read 64 bits + "movq mm1,[%[src]+8]\n" + "movq mm2,[%[src]+16]\n" + "movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache + "movq [%[dest]+8], mm1\n" + "movq mm3,[%[src]+24]\n" + "movq [%[dest]+16], mm2\n" + "movq [%[dest]+24], mm3\n" - "add edx,32\n" // update source pointer - "add ecx,32\n" // update destination pointer - "sub eax,1\n" + "add %[src],32\n" // update source pointer + "add %[dest],32\n" // update destination pointer + "sub %[qwc],1\n" "jnz memcpy_qwc_loop1\n" // last 64-byte block? "memcpy_qwc_1:\n" "test [%[qwc]],dword ptr 1\n" "jz memcpy_qwc_final\n" - "movq mm0,[edx]\n" - "movq mm1,[edx+8]\n" - "movq [ecx], mm0\n" - "movq [ecx+8], mm1\n" + "movq mm0,[%[src]]\n" + "movq mm1,[%[src]+8]\n" + "movq [%[dest]], mm0\n" + "movq [%[dest]+8], mm1\n" "memcpy_qwc_final:\n" "emms\n" // clean up the MMX state ".att_syntax\n" - : "=r"(dest), "=r"(src), "=r"(qwc) - : [dest]"r"(dest), [src]"r"(src), [qwc]"r"(qwc) - //: Needs a clobber list here + : "=&r"(dest), "=&r"(src), "=&r"(qwc) + : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc) + : "memory", "mm0", "mm1", "mm2", "mm3" ); } -#endif #else # include "win_memzero.h" @@ -131,10 +129,5 @@ void _memset16_unaligned( void* dest, u16 data, size_t size ); #define memcpy_const memcpy_amd_ // Memcpy with constant size #define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned -//#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c) -#ifndef __LINUX__ #define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c) -#else -#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16) -//#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c) -#endif +//#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16) From e793f91993e5bcdb7401dfe282db3701de8944d7 Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Wed, 14 Jul 2010 14:23:59 +0000 Subject: [PATCH 13/26] ReorderingMTGS: Linux asm memcpy fixes. (untested) git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3488 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/include/Utilities/MemcpyFast.h | 72 +++++++++++++-------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h index 56c4d0ba39..c43087a931 100644 --- a/common/include/Utilities/MemcpyFast.h +++ b/common/include/Utilities/MemcpyFast.h @@ -48,62 +48,62 @@ //"mov ecx, [%[dest]]\n" //"mov edx, [%[src]]\n" //"mov eax, [%[qwc]]\n" // keep a copy of count - "shr %[qwc], 1\n" - "jz memcpy_qwc_1\n" // only one 16 byte block to copy? + "cmp %[qwc], 1\n" + "jbe memcpy_qwc_1\n" // only one 16 byte block to copy? - "cmp %[qwc], 64\n" // "IN_CACHE_COPY/32" - "jb memcpy_qwc_loop1\n" // small copies should be cached (definite speedup --air) + "cmp %[qwc], 128\n" // "IN_CACHE_COPY/16" + "jb memcpy_qwc_loop1\n" // small copies should be cached (definite speedup --air) - "memcpy_qwc_loop2:\n" // 32-byte blocks, uncached copy - "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) + "memcpy_qwc_loop2:\n" // 32-byte blocks, uncached copy + "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! 
--air) - "movq mm0,[%[src]+0]\n" // read 64 bits - "movq mm1,[%[src]+8]\n" - "movq mm2,[%[src]+16]\n" - "movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache - "movntq [%[dest]+8], mm1\n" - "movq mm3,[%[src]+24]\n" - "movntq [%[dest]+16], mm2\n" - "movntq [%[dest]+24], mm3\n" + "movq mm0,[%[src]+0]\n" // read 64 bits + "movq mm1,[%[src]+8]\n" + "movq mm2,[%[src]+16]\n" + "movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache + "movntq [%[dest]+8], mm1\n" + "movq mm3,[%[src]+24]\n" + "movntq [%[dest]+16], mm2\n" + "movntq [%[dest]+24], mm3\n" "add %[src],32\n" // update source pointer "add %[dest],32\n" // update destination pointer - "sub %[qwc],1\n" - "jnz memcpy_qwc_loop2\n" // last 64-byte block? - "sfence\n" // flush the write buffer + "sub %[qwc],2\n" + "jnz memcpy_qwc_loop2\n" // last 64-byte block? + "sfence\n" // flush the write buffer "jmp memcpy_qwc_1\n" // 32-byte blocks, cached! // This *is* important. Removing this and using exclusively non-temporal stores - // results in noticable speed loss! + // results in noticeable speed loss! "memcpy_qwc_loop1:\n" - "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) + "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) - "movq mm0,[%[src]+0]\n" // read 64 bits - "movq mm1,[%[src]+8]\n" - "movq mm2,[%[src]+16]\n" - "movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache - "movq [%[dest]+8], mm1\n" - "movq mm3,[%[src]+24]\n" - "movq [%[dest]+16], mm2\n" - "movq [%[dest]+24], mm3\n" + "movq mm0,[%[src]+0]\n" // read 64 bits + "movq mm1,[%[src]+8]\n" + "movq mm2,[%[src]+16]\n" + "movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache + "movq [%[dest]+8], mm1\n" + "movq mm3,[%[src]+24]\n" + "movq [%[dest]+16], mm2\n" + "movq [%[dest]+24], mm3\n" "add %[src],32\n" // update source pointer "add %[dest],32\n" // update destination pointer - "sub %[qwc],1\n" - "jnz memcpy_qwc_loop1\n" // last 64-byte block? + "sub %[qwc],2\n" + "jnz memcpy_qwc_loop1\n" // last 64-byte block? "memcpy_qwc_1:\n" - "test [%[qwc]],dword ptr 1\n" - "jz memcpy_qwc_final\n" - "movq mm0,[%[src]]\n" - "movq mm1,[%[src]+8]\n" - "movq [%[dest]], mm0\n" - "movq [%[dest]+8], mm1\n" + "test [%qwc],1\n" + "jz memcpy_qwc_final\n" + "movq mm0,[%[src]]\n" + "movq mm1,[%[src]+8]\n" + "movq [%[dest]], mm0\n" + "movq [%[dest]+8], mm1\n" "memcpy_qwc_final:\n" - "emms\n" // clean up the MMX state + "emms\n" // clean up the MMX state ".att_syntax\n" : "=&r"(dest), "=&r"(src), "=&r"(qwc) : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc) From c8f16a1cde4198280af8077dab413f4ae24aa964 Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Wed, 14 Jul 2010 14:40:13 +0000 Subject: [PATCH 14/26] ReorderingMTGS: Linux memcpy attempt #1527, Action! git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3489 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/include/Utilities/MemcpyFast.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h index c43087a931..fa3b017746 100644 --- a/common/include/Utilities/MemcpyFast.h +++ b/common/include/Utilities/MemcpyFast.h @@ -48,10 +48,11 @@ //"mov ecx, [%[dest]]\n" //"mov edx, [%[src]]\n" //"mov eax, [%[qwc]]\n" // keep a copy of count - "cmp %[qwc], 1\n" - "jbe memcpy_qwc_1\n" // only one 16 byte block to copy? + "mov eax, %[qwc]\n" + "shr eax, 1\n" + "jz memcpy_qwc_1\n" // only one 16 byte block to copy? 
- "cmp %[qwc], 128\n" // "IN_CACHE_COPY/16" + "cmp %[qwc], 64\n" // "IN_CACHE_COPY/32" "jb memcpy_qwc_loop1\n" // small copies should be cached (definite speedup --air) "memcpy_qwc_loop2:\n" // 32-byte blocks, uncached copy @@ -68,7 +69,7 @@ "add %[src],32\n" // update source pointer "add %[dest],32\n" // update destination pointer - "sub %[qwc],2\n" + "sub eax,1\n" "jnz memcpy_qwc_loop2\n" // last 64-byte block? "sfence\n" // flush the write buffer "jmp memcpy_qwc_1\n" @@ -91,7 +92,7 @@ "add %[src],32\n" // update source pointer "add %[dest],32\n" // update destination pointer - "sub %[qwc],2\n" + "sub eax,1\n" "jnz memcpy_qwc_loop1\n" // last 64-byte block? "memcpy_qwc_1:\n" @@ -107,7 +108,7 @@ ".att_syntax\n" : "=&r"(dest), "=&r"(src), "=&r"(qwc) : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc) - : "memory", "mm0", "mm1", "mm2", "mm3" + : "memory", "eax", "mm0", "mm1", "mm2", "mm3" ); } #else From 2f3452ec2577061ffa2d324301b4e4b8fb22887d Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Thu, 15 Jul 2010 05:21:26 +0000 Subject: [PATCH 15/26] ReorderingMTGS: * Make PCSX2 bare minimum reqs include SSE as well as MMX. * Minor bugfix which could have affected MTGS performance. * Default GIFpath stuff to use SSE opts. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3491 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/GS.cpp | 1 - pcsx2/GS.h | 20 ++--- pcsx2/MTGS.cpp | 201 ++++++++++++++++-------------------------- pcsx2/gui/AppInit.cpp | 10 +-- pcsx2/ps2/GIFpath.cpp | 65 +++++--------- 5 files changed, 110 insertions(+), 187 deletions(-) diff --git a/pcsx2/GS.cpp b/pcsx2/GS.cpp index fa7d5cfedd..b9028de2ee 100644 --- a/pcsx2/GS.cpp +++ b/pcsx2/GS.cpp @@ -49,7 +49,6 @@ void gsSetRegionMode( GS_RegionMode region ) void gsInit() { memzero(g_RealGSMem); - GIFPath_Initialize(); } extern bool SIGNAL_IMR_Pending; diff --git a/pcsx2/GS.h b/pcsx2/GS.h index a0fc4f7bb2..8162149218 100644 --- a/pcsx2/GS.h +++ b/pcsx2/GS.h @@ -18,15 +18,6 @@ #include "Common.h" #include "System/SysThreads.h" -enum CpuExtType -{ - CpuExt_Base, - CpuExt_MMX, - CpuExt_SSE, - CpuExt_SSE2, - CpuExt_SSE41, -}; - extern __aligned16 u8 g_RealGSMem[Ps2MemSize::GSregs]; enum CSR_FifoState @@ -282,8 +273,8 @@ class SysMtgsThread : public SysThreadBase typedef SysThreadBase _parent; public: - // note: when m_RingPos == m_WritePos, the fifo is empty - uint m_RingPos; // cur pos gs is reading from + // note: when m_ReadPos == m_WritePos, the fifo is empty + uint m_ReadPos; // cur pos gs is reading from uint m_WritePos; // cur pos ee thread is writing to volatile bool m_RingBufferIsBusy; @@ -313,7 +304,7 @@ public: uint m_packet_startpos; // size of the packet (data only, ie. not including the 16 byte command!) uint m_packet_size; // size of the packet (data only, ie. not including the 16 byte command!) - uint m_packet_ringpos; // index of the data location in the ringbuffer. + uint m_packet_writepos; // index of the data location in the ringbuffer. 
#ifdef RINGBUF_DEBUG_STACK Threading::Mutex m_lock_Stack; @@ -356,9 +347,10 @@ protected: void OnResumeInThread( bool IsSuspended ); void OnCleanupInThread(); + void GenericStall( uint size ); + // Used internally by SendSimplePacket type functions - uint _PrepForSimplePacket(); - void _FinishSimplePacket( uint future_writepos ); + void _FinishSimplePacket(); void ExecuteTaskInThread(); }; diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index cd20f945f4..3b0f4694be 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -70,11 +70,11 @@ void SysMtgsThread::OnStart() { m_PluginOpened = false; - m_RingPos = 0; + m_ReadPos = 0; m_WritePos = 0; m_RingBufferIsBusy = false; m_packet_size = 0; - m_packet_ringpos = 0; + m_packet_writepos = 0; m_QueuedFrameCount = 0; m_VsyncSignalListener = false; @@ -98,14 +98,14 @@ void SysMtgsThread::OnResumeReady() void SysMtgsThread::ResetGS() { - pxAssertDev( !IsOpen() || (m_RingPos == m_WritePos), "Must close or terminate the GS thread prior to gsReset." ); + pxAssertDev( !IsOpen() || (m_ReadPos == m_WritePos), "Must close or terminate the GS thread prior to gsReset." ); // MTGS Reset process: // * clear the ringbuffer. // * Signal a reset. // * clear the path and byRegs structs (used by GIFtagDummy) - m_RingPos = m_WritePos; + m_ReadPos = m_WritePos; m_QueuedFrameCount = 0; m_VsyncSignalListener = false; @@ -134,13 +134,13 @@ void SysMtgsThread::PostVsyncEnd() uint packsize = sizeof(RingCmdPacket_Vsync) / 16; PrepDataPacket(GS_RINGTYPE_VSYNC, packsize); - MemCopy_WrappedDest( (u128*)PS2MEM_GS, RingBuffer.m_Ring, m_packet_ringpos, RingBufferSize, 0xf ); + MemCopy_WrappedDest( (u128*)PS2MEM_GS, RingBuffer.m_Ring, m_packet_writepos, RingBufferSize, 0xf ); u32* remainder = (u32*)GetDataPacketPtr(); remainder[0] = GSCSRr; remainder[1] = GSIMR; (GSRegSIGBLID&)remainder[2] = GSSIGLBLID; - m_packet_ringpos = (m_packet_ringpos + 1) & RingBufferMask; + m_packet_writepos = (m_packet_writepos + 1) & RingBufferMask; SendDataPacket(); @@ -155,7 +155,7 @@ void SysMtgsThread::PostVsyncEnd() if ((AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize) || (!EmuConfig.GS.VsyncEnable && !EmuConfig.GS.FrameLimitEnable)) return; m_VsyncSignalListener = true; - //Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\t\tringpos=0x%06x, writepos=0x%06x", volatize(m_RingPos), m_WritePos ); + //Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\t\tringpos=0x%06x, writepos=0x%06x", volatize(m_ReadPos), m_WritePos ); m_sem_Vsync.WaitNoCancel(); } @@ -239,6 +239,8 @@ void SysMtgsThread::OpenPlugin() class RingBufferLock : public ScopedLock { + typedef ScopedLock _parent; + protected: SysMtgsThread& m_mtgs; @@ -254,6 +256,18 @@ public: { m_mtgs.m_RingBufferIsBusy = false; } + + void Acquire() + { + _parent::Acquire(); + m_mtgs.m_RingBufferIsBusy = true; + } + + void Release() + { + m_mtgs.m_RingBufferIsBusy = false; + _parent::Release(); + } }; void SysMtgsThread::ExecuteTaskInThread() @@ -262,31 +276,33 @@ void SysMtgsThread::ExecuteTaskInThread() PacketTagType prevCmd; #endif + RingBufferLock busy( *this ); + while( true ) { + busy.Release(); + // Performance note: Both of these perform cancellation tests, but pthread_testcancel // is very optimized (only 1 instruction test in most cases), so no point in trying // to avoid it. 
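RingBufferLock above is a busy-flag guard around the ring mutex; the same
pattern with standard types (a sketch, not the project's Threading classes):

	#include <mutex>

	// Hold the mutex and publish a 'busy' flag for the guard's lifetime, so a
	// waiter (WaitGS in this series) can sleep on the mutex until idle.
	class ScopedBusyLock
	{
		std::mutex&		m_mtx;
		volatile bool&	m_busy;

	public:
		ScopedBusyLock( std::mutex& mtx, volatile bool& busy )
			: m_mtx( mtx ), m_busy( busy )
		{
			m_mtx.lock();
			m_busy = true;
		}

		~ScopedBusyLock()
		{
			m_busy = false;
			m_mtx.unlock();
		}
	};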
m_sem_event.WaitWithoutYield(); StateCheckInThread(); + busy.Acquire(); - { - RingBufferLock busy( *this ); - - // note: m_RingPos is intentionally not volatile, because it should only + // note: m_ReadPos is intentionally not volatile, because it should only // ever be modified by this thread. - while( m_RingPos != volatize(m_WritePos)) + while( m_ReadPos != volatize(m_WritePos)) { if( EmuConfig.GS.DisableOutput ) { - m_RingPos = m_WritePos; + m_ReadPos = m_WritePos; continue; } - pxAssert( m_RingPos < RingBufferSize ); + pxAssert( m_ReadPos < RingBufferSize ); - const PacketTagType& tag = (PacketTagType&)RingBuffer[m_RingPos]; + const PacketTagType& tag = (PacketTagType&)RingBuffer[m_ReadPos]; u32 ringposinc = 1; #ifdef RINGBUF_DEBUG_STACK @@ -294,11 +310,11 @@ void SysMtgsThread::ExecuteTaskInThread() m_lock_Stack.Lock(); uptr stackpos = ringposStack.back(); - if( stackpos != m_RingPos ) + if( stackpos != m_ReadPos ) { - Console.Error( "MTGS Ringbuffer Critical Failure ---> %x to %x (prevCmd: %x)\n", stackpos, m_RingPos, prevCmd.command ); + Console.Error( "MTGS Ringbuffer Critical Failure ---> %x to %x (prevCmd: %x)\n", stackpos, m_ReadPos, prevCmd.command ); } - pxAssert( stackpos == m_RingPos ); + pxAssert( stackpos == m_ReadPos ); prevCmd = tag; ringposStack.pop_back(); m_lock_Stack.Release(); @@ -308,7 +324,7 @@ void SysMtgsThread::ExecuteTaskInThread() { case GS_RINGTYPE_P1: { - uint datapos = (m_RingPos+1) & RingBufferMask; + uint datapos = (m_ReadPos+1) & RingBufferMask; const int qsize = tag.data[0]; const u128* data = &RingBuffer[datapos]; @@ -333,7 +349,7 @@ void SysMtgsThread::ExecuteTaskInThread() case GS_RINGTYPE_P2: { - uint datapos = (m_RingPos+1) & RingBufferMask; + uint datapos = (m_ReadPos+1) & RingBufferMask; const int qsize = tag.data[0]; const u128* data = &RingBuffer[datapos]; @@ -358,7 +374,7 @@ void SysMtgsThread::ExecuteTaskInThread() case GS_RINGTYPE_P3: { - uint datapos = (m_RingPos+1) & RingBufferMask; + uint datapos = (m_ReadPos+1) & RingBufferMask; const int qsize = tag.data[0]; const u128* data = &RingBuffer[datapos]; @@ -393,11 +409,13 @@ void SysMtgsThread::ExecuteTaskInThread() MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", !!(((u32&)RingBuffer.Regs[0x1000]) & 0x2000) ? 0 : 1, tag.data[1] ? "true" : "false" ); // Mail in the important GS registers. - RingCmdPacket_Vsync& local((RingCmdPacket_Vsync&)RingBuffer[m_RingPos+1]); - memcpy_fast( RingBuffer.Regs, local.regset1, sizeof(local.regset1)); - ((u32&)RingBuffer.Regs[0x1000]) = local.csr; - ((u32&)RingBuffer.Regs[0x1010]) = local.imr; - ((GSRegSIGBLID&)RingBuffer.Regs[0x1080]) = local.siglblid; + uint datapos = (m_ReadPos+1) & RingBufferMask; + MemCopy_WrappedSrc( RingBuffer.m_Ring, datapos, RingBufferSize, (u128*)RingBuffer.Regs, 0xf ); + + u32* remainder = (u32*)&RingBuffer[datapos]; + GSCSRr = remainder[0]; + GSIMR = remainder[1]; + GSSIGLBLID = (GSRegSIGBLID&)remainder[2]; // CSR & 0x2000; is the pageflip id. GSvsync(((u32&)RingBuffer.Regs[0x1000]) & 0x2000); @@ -454,9 +472,9 @@ void SysMtgsThread::ExecuteTaskInThread() #ifdef PCSX2_DEVBUILD default: - Console.Error("GSThreadProc, bad packet (%x) at m_RingPos: %x, m_WritePos: %x", tag.command, m_RingPos, m_WritePos); + Console.Error("GSThreadProc, bad packet (%x) at m_ReadPos: %x, m_WritePos: %x", tag.command, m_ReadPos, m_WritePos); pxFail( "Bad packet encountered in the MTGS Ringbuffer." ); - m_RingPos = m_WritePos; + m_ReadPos = m_WritePos; continue; #else // Optimized performance in non-Dev builds. 
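For reference, the shape of the consumer loop above as a standalone
single-producer/single-consumer drain (a sketch: volatize() is modeled by an
acquire load, and per-packet dispatch is abstracted into a callback):

	#include <atomic>

	void DrainRing( std::atomic<unsigned>& readPos, const std::atomic<unsigned>& writePos,
	                unsigned ringMask, unsigned (*processPacketAt)(unsigned) )
	{
		unsigned rp = readPos.load( std::memory_order_relaxed );	// owned by this thread
		while( rp != writePos.load( std::memory_order_acquire ) )
		{
			unsigned qwords = processPacketAt( rp );		// returns tag + data size, in qwc
			rp = (rp + qwords) & ringMask;
			readPos.store( rp, std::memory_order_release );	// publish for EE-side stalls
		}
	}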
@@ -466,28 +484,29 @@ void SysMtgsThread::ExecuteTaskInThread() } } - uint newringpos = (m_RingPos + ringposinc) & RingBufferMask; + uint newringpos = (m_ReadPos + ringposinc) & RingBufferMask; if( EmuConfig.GS.SynchronousMTGS ) { pxAssert( m_WritePos == newringpos ); } - m_RingPos = newringpos; + m_ReadPos = newringpos; if( m_SignalRingEnable != 0 ) { // The EEcore has requested a signal after some amount of processed data. if( AtomicExchangeSub( m_SignalRingPosition, ringposinc ) <= 0 ) { - // Make sure to post the signal after the m_RingPos has been updated... + // Make sure to post the signal after the m_ReadPos has been updated... AtomicExchange( m_SignalRingEnable, 0 ); m_sem_OnRingReset.Post(); continue; } } } - } + + busy.Release(); // Safety valve in case standard signals fail for some reason -- this ensures the EEcore // won't sleep the eternity, even if SignalRingPosition didn't reach 0 for some reason. @@ -503,7 +522,7 @@ void SysMtgsThread::ExecuteTaskInThread() if (!!AtomicExchange(m_VsyncSignalListener, false)) m_sem_Vsync.Post(); - //Console.Warning( "(MTGS Thread) Nothing to do! ringpos=0x%06x", m_RingPos ); + //Console.Warning( "(MTGS Thread) Nothing to do! ringpos=0x%06x", m_ReadPos ); } } @@ -543,7 +562,7 @@ void SysMtgsThread::WaitGS() if( m_ExecMode == ExecMode_NoThreadYet || !IsRunning() ) return; if( !pxAssertDev( IsOpen(), "MTGS Warning! WaitGS issued on a closed thread." ) ) return; - if( volatize(m_RingPos) != m_WritePos ) + if( volatize(m_ReadPos) != m_WritePos ) { SetEvent(); RethrowException(); @@ -551,7 +570,7 @@ void SysMtgsThread::WaitGS() do { m_mtx_RingBufferBusy.Wait(); RethrowException(); - } while( volatize(m_RingPos) != m_WritePos ); + } while( volatize(m_ReadPos) != m_WritePos ); } // Completely synchronize GS and MTGS register states. @@ -570,7 +589,7 @@ void SysMtgsThread::SetEvent() u8* SysMtgsThread::GetDataPacketPtr() const { - return (u8*)&RingBuffer[m_packet_ringpos & RingBufferMask]; + return (u8*)&RingBuffer[m_packet_writepos & RingBufferMask]; } // Closes the data packet send command, and initiates the gs thread (if needed). @@ -579,14 +598,14 @@ void SysMtgsThread::SendDataPacket() // make sure a previous copy block has been started somewhere. pxAssert( m_packet_size != 0 ); - uint actualSize = ((m_packet_ringpos - m_packet_startpos) & RingBufferMask)-1; + uint actualSize = ((m_packet_writepos - m_packet_startpos) & RingBufferMask)-1; pxAssert( actualSize <= m_packet_size ); - pxAssert( m_packet_ringpos < RingBufferSize ); + pxAssert( m_packet_writepos < RingBufferSize ); PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos]; tag.data[0] = actualSize; - m_WritePos = m_packet_ringpos; + m_WritePos = m_packet_writepos; if( EmuConfig.GS.SynchronousMTGS ) { @@ -603,29 +622,23 @@ void SysMtgsThread::SendDataPacket() //m_PacketLocker.Release(); } -void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) +void SysMtgsThread::GenericStall( uint size ) { // Note on volatiles: m_WritePos is not modified by the GS thread, so there's no need // to use volatile reads here. We do cache it though, since we know it never changes, // except for calls to RingbufferRestert() -- handled below. - uint writepos = m_WritePos; - - // Checks if a previous copy was started without an accompanying call to GSRINGBUF_DONECOPY - pxAssert( m_packet_size == 0 ); + const uint writepos = m_WritePos; // Sanity checks! (within the confines of our ringbuffer please!) 
pxAssert( size < RingBufferSize ); pxAssert( writepos < RingBufferSize ); - m_packet_size = size; - ++size; // takes into account our RingCommand QWC. - // generic gs wait/stall. // if the writepos is past the readpos then we're safe. // But if not then we need to make sure the readpos is outside the scope of // the block about to be written (writepos + size) - uint readpos = volatize(m_RingPos); + uint readpos = volatize(m_ReadPos); uint endpos = writepos+size; uint freeroom; @@ -662,7 +675,7 @@ void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) AtomicExchange( m_SignalRingEnable, 1 ); SetEvent(); m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); + readpos = volatize(m_ReadPos); //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); } while( (writepos < readpos) && (writepos+size >= readpos) ); @@ -674,16 +687,17 @@ void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) SetEvent(); do { SpinWait(); - readpos = volatize(m_RingPos); + readpos = volatize(m_ReadPos); } while( (writepos < readpos) && (writepos+size >= readpos) ); } } +} -#ifdef RINGBUF_DEBUG_STACK - m_lock_Stack.Lock(); - ringposStack.push_front( writepos ); - m_lock_Stack.Release(); -#endif +void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) +{ + m_packet_size = size; + ++size; // takes into account our RingCommand QWC. + GenericStall(size); // Command qword: Low word is the command, and the high word is the packet // length in SIMDs (128 bits). @@ -692,7 +706,7 @@ void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) tag.command = cmd; tag.data[0] = m_packet_size; m_packet_startpos = m_WritePos; - m_packet_ringpos = (m_WritePos + 1) & RingBufferMask; + m_packet_writepos = (m_WritePos + 1) & RingBufferMask; } // Returns the amount of giftag data processed (in simd128 values). @@ -707,71 +721,10 @@ void SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, u32 size ) PrepDataPacket( (MTGS_RingCommand)pathidx, size ); } -__forceinline uint SysMtgsThread::_PrepForSimplePacket() +__forceinline void SysMtgsThread::_FinishSimplePacket() { -#ifdef RINGBUF_DEBUG_STACK - m_lock_Stack.Lock(); - ringposStack.push_front( m_WritePos ); - m_lock_Stack.Release(); -#endif - - uint future_writepos = m_WritePos+1; - pxAssert( future_writepos <= RingBufferSize ); - - future_writepos &= RingBufferMask; - if( future_writepos == 0 ) - m_QueuedFrameCount = 0; - - uint readpos = volatize(m_RingPos); - if( future_writepos == readpos ) - { - // The ringbuffer read pos is blocking the future write position, so stall out - // until the read position has moved. 
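Both the removed _PrepForSimplePacket and the new GenericStall compute free
space the same way; as a standalone helper (a sketch, assuming ringSize covers
the whole buffer and one slot is effectively reserved):

	inline unsigned RingFreeRoom( unsigned writePos, unsigned readPos, unsigned ringSize )
	{
		// Writer behind reader: the gap is direct. Otherwise the free space is
		// everything outside the occupied [readPos, writePos) span.
		return ( writePos < readPos ) ? ( readPos - writePos )
		                              : ( ringSize - ( writePos - readPos ) );
	}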
- - uint freeroom; - - if (future_writepos < readpos) - freeroom = readpos - future_writepos; - else - freeroom = RingBufferSize - (future_writepos - readpos); - - uint totalAccum = RingBufferSize - freeroom; - - uint somedone = totalAccum / 4; - - if( somedone > 0x80 ) - { - m_SignalRingPosition = somedone; - - //Console.WriteLn( Color_Blue, "(EEcore Sleep) PrepSimplePacket\tringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", readpos, m_WritePos, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Simple Post-sleep Report!\tringpos=0x%06x", readpos ); - } while( future_writepos == readpos ); - - pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); - } - else - { - //Console.WriteLn( Color_StrongGray, "(EEcore Spin) PrepSimplePacket!" ); - - SetEvent(); - do { - SpinWait(); - } while( future_writepos == volatize(m_RingPos) ); - } - } - - return future_writepos; -} - -__forceinline void SysMtgsThread::_FinishSimplePacket( uint future_writepos ) -{ - pxAssert( future_writepos != volatize(m_RingPos) ); + uint future_writepos = (m_WritePos+1) & RingBufferMask; + pxAssert( future_writepos != volatize(m_ReadPos) ); m_WritePos = future_writepos; if( EmuConfig.GS.SynchronousMTGS ) @@ -784,7 +737,7 @@ void SysMtgsThread::SendSimplePacket( MTGS_RingCommand type, int data0, int data { //ScopedLock locker( m_PacketLocker ); - const uint thefuture = _PrepForSimplePacket(); + GenericStall(1); PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; tag.command = type; @@ -792,21 +745,21 @@ void SysMtgsThread::SendSimplePacket( MTGS_RingCommand type, int data0, int data tag.data[1] = data1; tag.data[2] = data2; - _FinishSimplePacket( thefuture ); + _FinishSimplePacket(); } void SysMtgsThread::SendPointerPacket( MTGS_RingCommand type, u32 data0, void* data1 ) { //ScopedLock locker( m_PacketLocker ); - const uint thefuture = _PrepForSimplePacket(); + GenericStall(1); PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; tag.command = type; tag.data[0] = data0; *(uptr*)&tag.data[1] = (uptr)data1; - _FinishSimplePacket( thefuture ); + _FinishSimplePacket(); } void SysMtgsThread::SendGameCRC( u32 crc ) diff --git a/pcsx2/gui/AppInit.cpp b/pcsx2/gui/AppInit.cpp index 70985ad576..7d36249b39 100644 --- a/pcsx2/gui/AppInit.cpp +++ b/pcsx2/gui/AppInit.cpp @@ -189,13 +189,13 @@ void Pcsx2App::DetectCpuAndUserMode() x86caps.CountCores(); x86caps.SIMD_EstablishMXCSRmask(); - if( !x86caps.hasMultimediaExtensions ) + if( !x86caps.hasMultimediaExtensions || !x86caps.hasStreamingSIMDExtensions ) { - // Note: due to memcpy_fast, we need minimum MMX even for interpreters. This will - // hopefully change later once we have a dynamically recompiled memcpy. + // Note: Due to optimizations to GIFpath parsers, memcpy, and possibly other things, we need + // a bare minimum of SSE supported by the CPU. throw Exception::HardwareDeficiency() - .SetDiagMsg(L"Critical Failure: MMX Extensions not available.") - .SetUserMsg(_("MMX extensions are not available. PCSX2 requires cpu with MMX extension support to run.")); + .SetDiagMsg(L"Critical Failure: SSE Extensions not available.") + .SetUserMsg(_("SSE extensions are not available. 
PCSX2 requires a cpu that supports the SSE instruction set.")); } ReadUserModeSettings(); diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp index 09c89c1200..bced656f13 100644 --- a/pcsx2/ps2/GIFpath.cpp +++ b/pcsx2/ps2/GIFpath.cpp @@ -97,10 +97,10 @@ struct GIFPath u8 GetReg(); bool IsActive() const; - template< CpuExtType CpuExt, bool Aligned > + template< bool Aligned > void SetTag(const void* mem); - template< CpuExtType CpuExt, int pathidx > + template< GIF_PATH pathidx, bool Aligned > int CopyTag(const u128* pMem, u32 size); int ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size); @@ -291,13 +291,10 @@ __forceinline void GIFPath::PrepPackedRegs() } -template< CpuExtType CpuExt, bool Aligned > +template< bool Aligned > __forceinline void GIFPath::SetTag(const void* mem) { - if( CpuExt >= CpuExt_SSE ) - _mm_store_ps( (float*)&tag, Aligned ? _mm_load_ps((const float*)mem) : _mm_loadu_ps((const float*)mem) ); - else - const_cast(tag) = *((GIFTAG*)mem); + _mm_store_ps( (float*)&tag, Aligned ? _mm_load_ps((const float*)mem) : _mm_loadu_ps((const float*)mem) ); nloop = tag.NLOOP; curreg = 0; @@ -391,7 +388,7 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s while (size > 0) { if (!nloop) { - SetTag(pMem); + SetTag(pMem); incTag(1); } else @@ -567,10 +564,7 @@ __forceinline void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint } #define copyTag() do { \ - if( CpuExt >= CpuExt_SSE ) \ - _mm_store_ps( (float*)&RingBuffer.m_Ring[ringpos], (pathidx!=GIF_PATH_2) ? _mm_load_ps((float*)pMem128) : _mm_loadu_ps((float*)pMem128)); \ - else \ - RingBuffer.m_Ring[ringpos] = *pMem128; \ + _mm_store_ps( (float*)&RingBuffer.m_Ring[ringpos], Aligned ? _mm_load_ps((float*)pMem128) : _mm_loadu_ps((float*)pMem128)); \ ++pMem128; --size; \ ringpos = (ringpos+1)&RingBufferMask; \ } while(false) @@ -579,10 +573,10 @@ __forceinline void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint // size - max size of incoming data stream, in qwc (simd128). If the path is PATH1, and the // path does not terminate (EOP) within the specified size, it is assumed that the path must // loop around to the start of VU memory and continue processing. -template< CpuExtType CpuExt, int pathidx > +template< GIF_PATH pathidx, bool Aligned > __forceinline int GIFPath::CopyTag(const u128* pMem128, u32 size) { - uint& ringpos = GetMTGS().m_packet_ringpos; + uint& ringpos = GetMTGS().m_packet_writepos; const uint original_ringpos = ringpos; u32 startSize = size; // Start Size @@ -590,7 +584,7 @@ __forceinline int GIFPath::CopyTag(const u128* pMem128, u32 size) while (size > 0) { if (!nloop) { - SetTag((u8*)pMem128); + SetTag((u8*)pMem128); copyTag(); if(nloop > 0) @@ -795,7 +789,8 @@ __forceinline int GIFPath::CopyTag(const u128* pMem128, u32 size) Console.Warning("GIFTAG error, size exceeded VU memory size %x", startSize); nloop = 0; - + const_cast(tag).EOP = 1; + // Don't send the packet to the GS -- its incomplete and might cause the GS plugin // to get confused and die. >_< @@ -870,41 +865,25 @@ __forceinline int GIFPath::CopyTag(const u128* pMem128, u32 size) return size; } -typedef int __fastcall FnType_CopyTag(const u128* pMem, u32 size); - -static __aligned16 FnType_CopyTag* tbl_CopyTag[3]; - // Parameters: // size - max size of incoming data stream, in qwc (simd128). If the path is PATH1, and the // path does not terminate (EOP) within the specified size, it is assumed that the path must // loop around to the start of VU memory and continue processing. 
-template< CpuExtType CpuExt, int pathidx > -static int __fastcall _CopyTag_tmpl(const u128* pMem, u32 size) -{ - return s_gifPath[pathidx].CopyTag(pMem, size); -} - -void GIFPath_Initialize() -{ -#ifdef __LINUX__ - // It's already thrown an exception if it isn't SSE, and the check was giving me a compilation error. - // I could fix it, but why bother? - tbl_CopyTag[0] = _CopyTag_tmpl; - tbl_CopyTag[1] = _CopyTag_tmpl; - tbl_CopyTag[2] = _CopyTag_tmpl; -#else - tbl_CopyTag[0] = x86caps.hasStreamingSIMDExtensions ? _CopyTag_tmpl : _CopyTag_tmpl; - tbl_CopyTag[1] = x86caps.hasStreamingSIMDExtensions ? _CopyTag_tmpl : _CopyTag_tmpl; - tbl_CopyTag[2] = x86caps.hasStreamingSIMDExtensions ? _CopyTag_tmpl : _CopyTag_tmpl; -#endif -} - __forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size) { - return tbl_CopyTag[pathidx](pMem, size); + switch( pathidx ) + { + case GIF_PATH_1: return s_gifPath[GIF_PATH_1].CopyTag(pMem, size); + case GIF_PATH_2: return s_gifPath[GIF_PATH_2].CopyTag(pMem, size); + case GIF_PATH_3: return s_gifPath[GIF_PATH_3].CopyTag(pMem, size); + + jNO_DEFAULT; + } + + return 0; // unreachable } -// Quick version for queueing PATH1 data. +// Quick version for queuing PATH1 data. // This version calculates the real length of the packet data only. It does not process // IRQs or DMA status updates. __forceinline int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size) From d10b60d560c44256f317c2488d3614bfe9b97247 Mon Sep 17 00:00:00 2001 From: arcum42 Date: Fri, 16 Jul 2010 09:50:36 +0000 Subject: [PATCH 16/26] ReorderingMTGS: Change the location of the Linux version of memcpy_amd_qwc for the moment, so it compiles. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3501 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/include/Utilities/MemcpyFast.h | 89 +--------------------- common/src/Utilities/x86/MemcpyVibes.cpp | 97 +++++++++++++++++++++++- 2 files changed, 97 insertions(+), 89 deletions(-) diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h index c8f7e64b2f..012d8cdfaa 100644 --- a/common/include/Utilities/MemcpyFast.h +++ b/common/include/Utilities/MemcpyFast.h @@ -22,95 +22,8 @@ extern "C" void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes); extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize); extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize); + extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes); - // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now. - // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned. - static __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) - { - // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM - // registers will improve copy performance, because they won't. Use of XMMs is only - // warranted in situations where both source and dest are guaranteed aligned to 16 bytes, - // and even then the benefits are typically minimal (sometimes slower depending on the - // amount of data being copied). - // - // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them. - // --air - - // Linux Conversion note: - // This code would benefit nicely from having inline-able GAS syntax, since it should - // allow GCC to optimize the first 3 instructions out of existence in many scenarios. 
- // And its called enough times to probably merit the extra effort to ensure proper - // optimization. --air - - __asm__ - ( - ".intel_syntax noprefix\n" - //"mov ecx, [%[dest]]\n" - //"mov edx, [%[src]]\n" - //"mov eax, [%[qwc]]\n" // keep a copy of count - "mov eax, %[qwc]\n" - "shr eax, 1\n" - "jz memcpy_qwc_1\n" // only one 16 byte block to copy? - - "cmp %[qwc], 64\n" // "IN_CACHE_COPY/32" - "jb memcpy_qwc_loop1\n" // small copies should be cached (definite speedup --air) - - "memcpy_qwc_loop2:\n" // 32-byte blocks, uncached copy - "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) - - "movq mm0,[%[src]+0]\n" // read 64 bits - "movq mm1,[%[src]+8]\n" - "movq mm2,[%[src]+16]\n" - "movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache - "movntq [%[dest]+8], mm1\n" - "movq mm3,[%[src]+24]\n" - "movntq [%[dest]+16], mm2\n" - "movntq [%[dest]+24], mm3\n" - - "add %[src],32\n" // update source pointer - "add %[dest],32\n" // update destination pointer - "sub eax,1\n" - "jnz memcpy_qwc_loop2\n" // last 64-byte block? - "sfence\n" // flush the write buffer - "jmp memcpy_qwc_1\n" - - // 32-byte blocks, cached! - // This *is* important. Removing this and using exclusively non-temporal stores - // results in noticeable speed loss! - - "memcpy_qwc_loop1:\n" - "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) - - "movq mm0,[%[src]+0]\n" // read 64 bits - "movq mm1,[%[src]+8]\n" - "movq mm2,[%[src]+16]\n" - "movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache - "movq [%[dest]+8], mm1\n" - "movq mm3,[%[src]+24]\n" - "movq [%[dest]+16], mm2\n" - "movq [%[dest]+24], mm3\n" - - "add %[src],32\n" // update source pointer - "add %[dest],32\n" // update destination pointer - "sub eax,1\n" - "jnz memcpy_qwc_loop1\n" // last 64-byte block? - - "memcpy_qwc_1:\n" - "test [%qwc],1\n" - "jz memcpy_qwc_final\n" - "movq mm0,[%[src]]\n" - "movq mm1,[%[src]+8]\n" - "movq [%[dest]], mm0\n" - "movq [%[dest]+8], mm1\n" - - "memcpy_qwc_final:\n" - "emms\n" // clean up the MMX state - ".att_syntax\n" - : "=&r"(dest), "=&r"(src), "=&r"(qwc) - : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc) - : "memory", "eax", "mm0", "mm1", "mm2", "mm3" - ); - } #else # include "win_memzero.h" diff --git a/common/src/Utilities/x86/MemcpyVibes.cpp b/common/src/Utilities/x86/MemcpyVibes.cpp index 11ac0c10fd..ced85b9e92 100644 --- a/common/src/Utilities/x86/MemcpyVibes.cpp +++ b/common/src/Utilities/x86/MemcpyVibes.cpp @@ -155,4 +155,99 @@ __forceinline void memcpy_vibes(void * dest, const void * src, int size) { } #endif -#endif +#endif + +// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment, +// to get around compilation issues with having it in the headers. +#ifdef __LINUX__ + + // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now. + // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned. + __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) + { + // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM + // registers will improve copy performance, because they won't. Use of XMMs is only + // warranted in situations where both source and dest are guaranteed aligned to 16 bytes, + // and even then the benefits are typically minimal (sometimes slower depending on the + // amount of data being copied). 
+ // + // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them. + // --air + + // Linux Conversion note: + // This code would benefit nicely from having inline-able GAS syntax, since it should + // allow GCC to optimize the first 3 instructions out of existence in many scenarios. + // And its called enough times to probably merit the extra effort to ensure proper + // optimization. --air + + __asm__ + ( + ".intel_syntax noprefix\n" + //"mov ecx, [%[dest]]\n" + //"mov edx, [%[src]]\n" + //"mov eax, [%[qwc]]\n" // keep a copy of count + "mov eax, %[qwc]\n" + "shr eax, 1\n" + "jz memcpy_qwc_1\n" // only one 16 byte block to copy? + + "cmp %[qwc], 64\n" // "IN_CACHE_COPY/32" + "jb memcpy_qwc_loop1\n" // small copies should be cached (definite speedup --air) + + "memcpy_qwc_loop2:\n" // 32-byte blocks, uncached copy + "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) + + "movq mm0,[%[src]+0]\n" // read 64 bits + "movq mm1,[%[src]+8]\n" + "movq mm2,[%[src]+16]\n" + "movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache + "movntq [%[dest]+8], mm1\n" + "movq mm3,[%[src]+24]\n" + "movntq [%[dest]+16], mm2\n" + "movntq [%[dest]+24], mm3\n" + + "add %[src],32\n" // update source pointer + "add %[dest],32\n" // update destination pointer + "sub eax,1\n" + "jnz memcpy_qwc_loop2\n" // last 64-byte block? + "sfence\n" // flush the write buffer + "jmp memcpy_qwc_1\n" + + // 32-byte blocks, cached! + // This *is* important. Removing this and using exclusively non-temporal stores + // results in noticeable speed loss! + + "memcpy_qwc_loop1:\n" + "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) + + "movq mm0,[%[src]+0]\n" // read 64 bits + "movq mm1,[%[src]+8]\n" + "movq mm2,[%[src]+16]\n" + "movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache + "movq [%[dest]+8], mm1\n" + "movq mm3,[%[src]+24]\n" + "movq [%[dest]+16], mm2\n" + "movq [%[dest]+24], mm3\n" + + "add %[src],32\n" // update source pointer + "add %[dest],32\n" // update destination pointer + "sub eax,1\n" + "jnz memcpy_qwc_loop1\n" // last 64-byte block? + + "memcpy_qwc_1:\n" + "test %[qwc],1\n" + "jz memcpy_qwc_final\n" + "movq mm0,[%[src]]\n" + "movq mm1,[%[src]+8]\n" + "movq [%[dest]], mm0\n" + "movq [%[dest]+8], mm1\n" + + "memcpy_qwc_final:\n" + "emms\n" // clean up the MMX state + ".att_syntax\n" + : "=&r"(dest), "=&r"(src), "=&r"(qwc) + : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc) + : "memory", "eax", "mm0", "mm1", "mm2", "mm3" + ); + } +#endif + From 32942ec9a674d78c62d7bf924259673635229a06 Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Fri, 16 Jul 2010 16:48:08 +0000 Subject: [PATCH 17/26] ReorderingMTGS: * Added a few assertions to detect when PATH transfers are started that violate other pending PATH transfers. * Removed a lot of obsolete code from vif1's DIRECT handler (Vif_Codes.cpp) * Add alignment to Path1buffer to avoid SSE alignment faults. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3503 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/Gif.cpp | 3 +- pcsx2/Gif.h | 2 +- pcsx2/Vif_Codes.cpp | 90 +++++++++++-------------------------------- pcsx2/ps2/GIFpath.cpp | 18 +++++++-- 4 files changed, 40 insertions(+), 73 deletions(-) diff --git a/pcsx2/Gif.cpp b/pcsx2/Gif.cpp index 37679de3ce..77002ab55b 100644 --- a/pcsx2/Gif.cpp +++ b/pcsx2/Gif.cpp @@ -36,7 +36,7 @@ static u32 gifqwc = 0; static bool gifmfifoirq = false; //Just some temporary bits to store Path1 transfers if another is in progress. 
-u8 Path1Buffer[0x1000000]; +__aligned16 u8 Path1Buffer[0x1000000]; u32 Path1WritePos = 0; u32 Path1ReadPos = 0; @@ -65,7 +65,6 @@ void gsPath1Interrupt() uint count = GIFPath_CopyTag(GIF_PATH_1, ((u128*)Path1Buffer) + Path1ReadPos, size); GetMTGS().SendDataPacket(); - pxAssume( count == size ); Path1ReadPos += count; if(GSTransferStatus.PTH1 == STOPPED_MODE) diff --git a/pcsx2/Gif.h b/pcsx2/Gif.h index 8533cdc98d..097cb03532 100644 --- a/pcsx2/Gif.h +++ b/pcsx2/Gif.h @@ -290,7 +290,7 @@ extern void gifMFIFOInterrupt(); //Just some temporary bits to store Path1 transfers if another is in progress. extern void gsPath1Interrupt(); -extern u8 Path1Buffer[0x1000000]; +extern __aligned16 u8 Path1Buffer[0x1000000]; extern u32 Path1WritePos; extern u32 Path1ReadPos; #endif diff --git a/pcsx2/Vif_Codes.cpp b/pcsx2/Vif_Codes.cpp index f7af604502..5de48fb8dd 100644 --- a/pcsx2/Vif_Codes.cpp +++ b/pcsx2/Vif_Codes.cpp @@ -134,7 +134,8 @@ template _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { } pass2 { vif1Only(); - + nVifStruct& v = nVif[1]; + if (GSTransferStatus.PTH3 < IDLE_MODE || gifRegs->stat.P1Q == true) { if(gifRegs->stat.APATH == GIF_APATH2 || ((GSTransferStatus.PTH3 <= IMAGE_MODE && gifRegs->stat.IMT && (vif1.cmd & 0x7f) == 0x50)) && gifRegs->stat.P1Q == false) @@ -167,79 +168,34 @@ template _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { return 0; } - - + // HACK ATTACK! + // we shouldn't be clearing the queue flag here at all. Ideally, the queue statuses + // should be checked, handled, and cleared from the EOP check in GIFPath only. --air gifRegs->stat.clear_flags(GIF_STAT_P2Q); - nVifStruct& v = nVif[1]; - const int ret = aMin(vif1.vifpacketsize, vif1.tag.size); - u32 size = ret << 2; + // the tag size should ALWAYS be 128 bits (qwc). If it isn't, it means there's a serious bug + // somewhere in the VIF (likely relating to +/-'ing the tag.size during processing). + pxAssumeMsg( (vif1.tag.size & 1) == 0, "Invalid Vif1 DIRECT packet size detected!" ); - //gifRegs->stat.APATH = GIF_APATH2; //Flag is cleared in vif1interrupt to simulate it being in progress. - - //In the original code we were saving this data, it seems if it does happen, its just blank, so we ignore it. 
- - if (!size) { DevCon.WriteLn("Path2: No Data Transfer?"); } - + const int minSize = aMin(vif1.vifpacketsize, vif1.tag.size)/4; + uint ret; - if(vif1.vifpacketsize < 4 && v.bSize < 16) - { - nVifStruct& v = nVif[idx]; + if (!minSize) + DevCon.Warning("VIF DIRECT (PATH2): No Data Transfer?"); - memcpy(&v.buffer[v.bPtr], data, vif1.vifpacketsize << 2); - v.bSize += vif1.vifpacketsize << 2; - v.bPtr += vif1.vifpacketsize << 2; - vif1.tag.size -= vif1.vifpacketsize; - if(vif1.tag.size == 0) - { - DevCon.Warning("Missaligned packet on DIRECT end!"); - vif1.cmd = 0; - } - return vif1.vifpacketsize; - } - else - { - nVifStruct& v = nVif[idx]; - if(v.bSize) - { - int ret = 0; + GetMTGS().PrepDataPacket(GIF_PATH_2, minSize); + ret = GIFPath_CopyTag(GIF_PATH_2, (u128*)data, minSize)*4; + GetMTGS().SendDataPacket(); - if(v.bSize < 16) - { - if(((16 - v.bSize) >> 2) > vif1.vifpacketsize) DevCon.Warning("Not Enough Data!"); - ret = (16 - v.bSize) >> 2; - memcpy(&v.buffer[v.bPtr], data, ret << 2); - vif1.tag.size -= ret; - v.bSize = 0; - v.bPtr = 0; - } - GetMTGS().PrepDataPacket(GIF_PATH_2, 1); - GIFPath_CopyTag(GIF_PATH_2, (u128*)v.buffer, 1); - GetMTGS().SendDataPacket(); + vif1.tag.size -= ret; - if(vif1.tag.size == 0) - { - vif1.cmd = 0; - } - vif1.vifstalled = true; - return ret; - } - else - { - GetMTGS().PrepDataPacket(GIF_PATH_2, size/16); - uint count = GIFPath_CopyTag(GIF_PATH_2, (u128*)data, size/16) * 4; - GetMTGS().SendDataPacket(); - - vif1.tag.size -= count; - if(vif1.tag.size == 0) - { - vif1.cmd = 0; - } - vif1.vifstalled = true; - return count; - } - } - + if(vif1.tag.size == 0) + { + vif1.cmd = 0; + gifRegs->stat.clear_flags(GIF_STAT_APATH2 | GIF_STAT_OPH); + } + vif1.vifstalled = true; + return ret; } return 0; } diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp index a0e8ac58a5..859f4db46a 100644 --- a/pcsx2/ps2/GIFpath.cpp +++ b/pcsx2/ps2/GIFpath.cpp @@ -817,6 +817,9 @@ __forceinline int GIFPath::CopyTag(const u128* pMem128, u32 size) gsIrq(); } } + + // [TODO] : DMAC Arbitration rights should select the next queued GIF transfer here. 
+ break; } if(SIGNAL_IMR_Pending == true) @@ -873,9 +876,18 @@ __forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size) { switch( pathidx ) { - case GIF_PATH_1: return s_gifPath[GIF_PATH_1].CopyTag(pMem, size); - case GIF_PATH_2: return s_gifPath[GIF_PATH_2].CopyTag(pMem, size); - case GIF_PATH_3: return s_gifPath[GIF_PATH_3].CopyTag(pMem, size); + case GIF_PATH_1: + pxAssertMsg(!s_gifPath[GIF_PATH_2].IsActive(), "GIFpath conflict: Attempted to start PATH1 while PATH2 is already active."); + pxAssertMsg(!s_gifPath[GIF_PATH_3].IsActive(), "GIFpath conflict: Attempted to start PATH1 while PATH3 is already active."); + return s_gifPath[GIF_PATH_1].CopyTag(pMem, size); + case GIF_PATH_2: + pxAssertMsg(!s_gifPath[GIF_PATH_1].IsActive(), "GIFpath conflict: Attempted to start PATH2 while PATH1 is already active."); + pxAssertMsg(!s_gifPath[GIF_PATH_3].IsActive(), "GIFpath conflict: Attempted to start PATH2 while PATH3 is already active."); + return s_gifPath[GIF_PATH_2].CopyTag(pMem, size); + case GIF_PATH_3: + pxAssertMsg(!s_gifPath[GIF_PATH_1].IsActive(), "GIFpath conflict: Attempted to start PATH3 while PATH1 is already active."); + pxAssertMsg(!s_gifPath[GIF_PATH_2].IsActive(), "GIFpath conflict: Attempted to start PATH3 while PATH2 is already active."); + return s_gifPath[GIF_PATH_3].CopyTag(pMem, size); jNO_DEFAULT; } From 5bbdb688966a6a8c3d434525c496bd4ae8c803a3 Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Fri, 16 Jul 2010 20:31:00 +0000 Subject: [PATCH 18/26] ReorderingMTGS: Assertion fixes, some comments notes on MARK behavior. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3504 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/Vif_Codes.cpp | 3 +-- pcsx2/Vif_Transfer.cpp | 27 ++++++++++++++++++--------- pcsx2/ps2/GIFpath.cpp | 4 ++-- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/pcsx2/Vif_Codes.cpp b/pcsx2/Vif_Codes.cpp index 5de48fb8dd..db58db1b82 100644 --- a/pcsx2/Vif_Codes.cpp +++ b/pcsx2/Vif_Codes.cpp @@ -134,7 +134,6 @@ template _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { } pass2 { vif1Only(); - nVifStruct& v = nVif[1]; if (GSTransferStatus.PTH3 < IDLE_MODE || gifRegs->stat.P1Q == true) { @@ -175,7 +174,7 @@ template _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { // the tag size should ALWAYS be 128 bits (qwc). If it isn't, it means there's a serious bug // somewhere in the VIF (likely relating to +/-'ing the tag.size during processing). - pxAssumeMsg( (vif1.tag.size & 1) == 0, "Invalid Vif1 DIRECT packet size detected!" ); + pxAssumeMsg( (vif1.tag.size & 3) == 0, "Invalid Vif1 DIRECT packet size detected!" ); const int minSize = aMin(vif1.vifpacketsize, vif1.tag.size)/4; uint ret; diff --git a/pcsx2/Vif_Transfer.cpp b/pcsx2/Vif_Transfer.cpp index a5f80db686..21aafa5529 100644 --- a/pcsx2/Vif_Transfer.cpp +++ b/pcsx2/Vif_Transfer.cpp @@ -36,16 +36,8 @@ _vifT bool analyzeIbit(u32* &data, int iBit) { if (iBit && !vifX.cmd && !vifXRegs->err.MII) { //DevCon.WriteLn("Vif I-Bit IRQ"); vifX.irq++; - // On i-bit, the command is run, vif stalls etc, - // however if the vifcode is MARK, you do NOT stall, just send IRQ. - Max Payne shows this up. - //if(((vifXRegs->code >> 24) & 0x7f) == 0x7) return 0; - // If we have a vifcode with i-bit, the following instruction - // should stall unless its MARK?.. we test that case here... - // Not 100% sure if this is the correct behavior, so printing - // a console message to see games that use this. 
(cottonvibes) - - // Okay did some testing with Max Payne, it does this + // Okay did some testing with Max Payne, it does this: // VifMark value = 0x666 (i know, evil!) // NOP with I Bit // VifMark value = 0 @@ -53,6 +45,23 @@ _vifT bool analyzeIbit(u32* &data, int iBit) { // If you break after the 2nd Mark has run, the game reports invalid mark 0 and the game dies. // So it has to occur here, testing a theory that it only doesn't stall if the command with // the iBit IS mark, but still sends the IRQ to let the cpu know the mark is there. (Refraction) + // + // -------------------------- + // + // This is how it probably works: i-bit sets the IRQ flag, and VIF keeps running until it encounters + // a non-MARK instruction. This includes the *current* instruction. ie, execution only continues + // unimpeded if MARK[i] is specified, and keeps executing unimpeded until any non-MARK command. + // Any other command with an I bit should stall immediately. + // Example: + // + // VifMark[i] value = 0x321 (with I bit) + // VifMark value = 0 + // VifMark value = 0x333 + // NOP + // + // ... the VIF should not stall and raise the interrupt until after the NOP is processed. + // So the final value for MARK as the game sees it will be 0x333. --air + return runMark(data); } return 0; diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp index 859f4db46a..843b0605eb 100644 --- a/pcsx2/ps2/GIFpath.cpp +++ b/pcsx2/ps2/GIFpath.cpp @@ -878,11 +878,11 @@ __forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size) { case GIF_PATH_1: pxAssertMsg(!s_gifPath[GIF_PATH_2].IsActive(), "GIFpath conflict: Attempted to start PATH1 while PATH2 is already active."); - pxAssertMsg(!s_gifPath[GIF_PATH_3].IsActive(), "GIFpath conflict: Attempted to start PATH1 while PATH3 is already active."); + pxAssertMsg(!s_gifPath[GIF_PATH_3].IsActive() || (GSTransferStatus.PTH3 == IMAGE_MODE), "GIFpath conflict: Attempted to start PATH1 while PATH3 is already active."); return s_gifPath[GIF_PATH_1].CopyTag(pMem, size); case GIF_PATH_2: pxAssertMsg(!s_gifPath[GIF_PATH_1].IsActive(), "GIFpath conflict: Attempted to start PATH2 while PATH1 is already active."); - pxAssertMsg(!s_gifPath[GIF_PATH_3].IsActive(), "GIFpath conflict: Attempted to start PATH2 while PATH3 is already active."); + pxAssertMsg(!s_gifPath[GIF_PATH_3].IsActive() || (GSTransferStatus.PTH3 == IMAGE_MODE), "GIFpath conflict: Attempted to start PATH2 while PATH3 is already active."); return s_gifPath[GIF_PATH_2].CopyTag(pMem, size); case GIF_PATH_3: pxAssertMsg(!s_gifPath[GIF_PATH_1].IsActive(), "GIFpath conflict: Attempted to start PATH3 while PATH1 is already active."); From 1be80fbe53f6a163bd45cc016170afb511f7ebab Mon Sep 17 00:00:00 2001 From: arcum42 Date: Fri, 16 Jul 2010 23:31:28 +0000 Subject: [PATCH 19/26] ReorderingMTGS: Comment out a few unused variables. 
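An aside on the MARK behavior worked out in the Vif_Transfer.cpp notes above: the rule reduces to a one-line predicate. A hypothetical sketch (the helper name is invented here for illustration; it is not part of the patch):

	// With an i-bit interrupt pending, only MARK (vifcode 0x07) keeps executing;
	// the first non-MARK command stalls the VIF and lets the IRQ through.
	static bool ibitShouldStall( u32 vifcode )
	{
		return ((vifcode >> 24) & 0x7f) != 0x07;	// 0x07 == MARK
	}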
git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3505 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/FiFo.cpp | 2 +- pcsx2/Gif.cpp | 1 - pcsx2/MTGS.cpp | 2 +- pcsx2/x86/ix86-32/iR5900-32.cpp | 4 ++-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pcsx2/FiFo.cpp b/pcsx2/FiFo.cpp index f93b80242b..28122713c4 100644 --- a/pcsx2/FiFo.cpp +++ b/pcsx2/FiFo.cpp @@ -196,7 +196,7 @@ void __fastcall WriteFIFO_page_6(u32 mem, const mem128_t *value) nloop0_packet[2] = psHu32(GIF_FIFO + 8); nloop0_packet[3] = psHu32(GIF_FIFO + 12); GetMTGS().PrepDataPacket(GIF_PATH_3, 1); - u64* data = (u64*)GetMTGS().GetDataPacketPtr(); + //u64* data = (u64*)GetMTGS().GetDataPacketPtr(); GIFPath_CopyTag( GIF_PATH_3, (u128*)nloop0_packet, 1 ); GetMTGS().SendDataPacket(); if(GSTransferStatus.PTH3 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH3 ) diff --git a/pcsx2/Gif.cpp b/pcsx2/Gif.cpp index 77002ab55b..ba1a9deb0f 100644 --- a/pcsx2/Gif.cpp +++ b/pcsx2/Gif.cpp @@ -163,7 +163,6 @@ static u32 WRITERING_DMA(tDMA_TAG *pMem, u32 qwc) int _GIFchain() { tDMA_TAG *pMem; - int qwc = 0; pMem = dmaGetAddr(gif->madr, false); if (pMem == NULL) diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index 3b0f4694be..fc70d9cf77 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -639,7 +639,7 @@ void SysMtgsThread::GenericStall( uint size ) // the block about to be written (writepos + size) uint readpos = volatize(m_ReadPos); - uint endpos = writepos+size; + //uint endpos = writepos+size; uint freeroom; if (writepos < readpos) diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp index fe4510b4a5..87dbeb055c 100644 --- a/pcsx2/x86/ix86-32/iR5900-32.cpp +++ b/pcsx2/x86/ix86-32/iR5900-32.cpp @@ -1258,11 +1258,11 @@ void recompileNextInstruction(int delayslot) // Calling of this function can be enabled or disabled through the use of EmuConfig.Recompiler.PreBlockChecks static void __fastcall PreBlockCheck( u32 blockpc ) { - static int lastrec = 0; + /*static int lastrec = 0; static int curcount = 0; const int skip = 0; - /*if( blockpc != 0x81fc0 ) {//&& lastrec != g_lastpc ) { + if( blockpc != 0x81fc0 ) {//&& lastrec != g_lastpc ) { curcount++; if( curcount > skip ) { From ce2b9e30fc7056d85c37354478e39f35161d9a9c Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Sat, 17 Jul 2010 00:14:41 +0000 Subject: [PATCH 20/26] ReorderingMTGS: threading bugfixes, ringbuffer would do bad things when it got full (GS load 80%+), or when vsyncs wrapped around the edge of the ring. 
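The full-ring failure here is an off-by-one at heart: writepos == readpos must mean "empty", so the writer can never be allowed to consume the last free slot. A minimal sketch of the corrected wait condition (simplified from the GenericStall hunk below; names as in MTGS.cpp):

	uint freeroom;
	if (writepos < readpos)
		freeroom = readpos - writepos;						// free space sits between write and read
	else
		freeroom = RingBufferSize - (writepos - readpos);	// free space wraps around the ring edge

	// Stall while freeroom <= size (not '<'): committing exactly 'size' slots
	// would land writepos on top of readpos, making a full ring look empty.
	while (freeroom <= size)
	{
		// sleep or spin (see the hunk), then re-read m_ReadPos and recompute freeroom
	}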
git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3507 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/MTGS.cpp | 25 +++++++++++++++++++------ pcsx2/ps2/GIFpath.cpp | 2 +- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index fc70d9cf77..06b841dacb 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -639,7 +639,6 @@ void SysMtgsThread::GenericStall( uint size ) // the block about to be written (writepos + size) uint readpos = volatize(m_ReadPos); - //uint endpos = writepos+size; uint freeroom; if (writepos < readpos) @@ -647,7 +646,7 @@ void SysMtgsThread::GenericStall( uint size ) else freeroom = RingBufferSize - (writepos - readpos); - if (freeroom < size) + if (freeroom <= size) { // writepos will overlap readpos if we commit the data, so we need to wait until // readpos is out past the end of the future write pos, or until it wraps around @@ -671,13 +670,20 @@ void SysMtgsThread::GenericStall( uint size ) //Console.WriteLn( Color_Blue, "(EEcore Sleep) PrepDataPacker \tringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", readpos, writepos, m_SignalRingPosition ); - do { + while(true) { AtomicExchange( m_SignalRingEnable, 1 ); SetEvent(); m_sem_OnRingReset.WaitWithoutYield(); readpos = volatize(m_ReadPos); //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); - } while( (writepos < readpos) && (writepos+size >= readpos) ); + + if (writepos < readpos) + freeroom = readpos - writepos; + else + freeroom = RingBufferSize - (writepos - readpos); + + if (freeroom > size) break; + } pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); } @@ -685,10 +691,17 @@ void SysMtgsThread::GenericStall( uint size ) { //Console.WriteLn( Color_StrongGray, "(EEcore Spin) PrepDataPacket!" ); SetEvent(); - do { + while(true) { SpinWait(); readpos = volatize(m_ReadPos); - } while( (writepos < readpos) && (writepos+size >= readpos) ); + + if (writepos < readpos) + freeroom = readpos - writepos; + else + freeroom = RingBufferSize - (writepos - readpos); + + if (freeroom > size) break; + } } } } diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp index 843b0605eb..049fbb566b 100644 --- a/pcsx2/ps2/GIFpath.cpp +++ b/pcsx2/ps2/GIFpath.cpp @@ -558,7 +558,7 @@ __forceinline void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint uint firstcopylen = srcSize - srcStart; memcpy_qwc(dest, &srcBase[srcStart], firstcopylen ); - srcStart = endpos & srcSize; + srcStart = endpos % srcSize; memcpy_qwc(dest+firstcopylen, srcBase, srcStart ); } } From 1c9cefd778b7cb83ecb0f331bb807fa945182039 Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Sat, 17 Jul 2010 12:11:37 +0000 Subject: [PATCH 21/26] ReorderingMTGS: * fixes flickering screen in Soul Calibur 3 (caused by VSYNC register bug) * Optimized upload of queued Path1 transfers; such that all Path1's are uploaded as a single MTGS packet. 
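One character in the GIFpath.cpp hunk above carries the vsync-wrap fix: srcSize is a length (a qwc count), not a power-of-two mask, so 'endpos & srcSize' could never yield the wrapped offset -- modulo is the correct reduction. Restated as a sketch of the whole wrapped-source copy (the non-wrapping branch is an assumption; only the wrap branch appears in the hunk):

	// Sketch only. Copies 'len' qwc out of a circular source buffer, updating
	// srcStart. 'endpos % srcSize' also folds the exact-edge case (endpos ==
	// srcSize) back to zero, which '& srcSize' got wrong.
	void MemCopy_WrappedSrc_sketch( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len )
	{
		uint endpos = srcStart + len;
		if( endpos <= srcSize )
		{
			memcpy_qwc( dest, &srcBase[srcStart], len );
			srcStart = endpos % srcSize;
		}
		else
		{
			uint firstcopylen = srcSize - srcStart;					// qwc up to the edge of the source
			memcpy_qwc( dest, &srcBase[srcStart], firstcopylen );
			srcStart = endpos % srcSize;							// modulo, NOT '& srcSize'
			memcpy_qwc( dest+firstcopylen, srcBase, srcStart );		// remainder, starting from the base
		}
	}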
git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3515 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/Gif.cpp | 23 +++++++++++++---------- pcsx2/MTGS.cpp | 9 ++++++--- pcsx2/ps2/GIFpath.cpp | 1 + pcsx2/x86/microVU_Lower.inl | 17 +++++++---------- 4 files changed, 27 insertions(+), 23 deletions(-) diff --git a/pcsx2/Gif.cpp b/pcsx2/Gif.cpp index ba1a9deb0f..dd2ed58856 100644 --- a/pcsx2/Gif.cpp +++ b/pcsx2/Gif.cpp @@ -57,21 +57,24 @@ void gsPath1Interrupt() if((gifRegs->stat.APATH <= GIF_APATH1 || (gifRegs->stat.IP3 == true && gifRegs->stat.APATH == GIF_APATH3)) && Path1WritePos > 0 && !gifRegs->stat.PSE) { gifRegs->stat.P1Q = false; - while(Path1WritePos > 0) + + if (uint size = (Path1WritePos - Path1ReadPos)) { - uint size = (Path1WritePos - Path1ReadPos); GetMTGS().PrepDataPacket(GIF_PATH_1, size); //DevCon.Warning("Flush Size = %x", size); - - uint count = GIFPath_CopyTag(GIF_PATH_1, ((u128*)Path1Buffer) + Path1ReadPos, size); - GetMTGS().SendDataPacket(); - Path1ReadPos += count; - - if(GSTransferStatus.PTH1 == STOPPED_MODE) + while(size > 0) { - gifRegs->stat.OPH = false; - gifRegs->stat.APATH = GIF_APATH_IDLE; + uint count = GIFPath_CopyTag(GIF_PATH_1, ((u128*)Path1Buffer) + Path1ReadPos, size); + Path1ReadPos += count; + size -= count; + + if(GSTransferStatus.PTH1 == STOPPED_MODE) + { + gifRegs->stat.OPH = false; + gifRegs->stat.APATH = GIF_APATH_IDLE; + } } + GetMTGS().SendDataPacket(); if(Path1ReadPos == Path1WritePos) { diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index 06b841dacb..115e08c3c5 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -409,13 +409,16 @@ void SysMtgsThread::ExecuteTaskInThread() MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", !!(((u32&)RingBuffer.Regs[0x1000]) & 0x2000) ? 0 : 1, tag.data[1] ? "true" : "false" ); // Mail in the important GS registers. + // This seemingly obtuse system is needed in order to handle cases where the vsync data wraps + // around the edge of the ringbuffer. If not for that I'd just use a struct. >_< + uint datapos = (m_ReadPos+1) & RingBufferMask; MemCopy_WrappedSrc( RingBuffer.m_Ring, datapos, RingBufferSize, (u128*)RingBuffer.Regs, 0xf ); u32* remainder = (u32*)&RingBuffer[datapos]; - GSCSRr = remainder[0]; - GSIMR = remainder[1]; - GSSIGLBLID = (GSRegSIGBLID&)remainder[2]; + ((u32&)RingBuffer.Regs[0x1000]) = remainder[0]; + ((u32&)RingBuffer.Regs[0x1010]) = remainder[1]; + ((GSRegSIGBLID&)RingBuffer.Regs[0x1080]) = (GSRegSIGBLID&)remainder[2]; // CSR & 0x2000; is the pageflip id. 
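	// (For reference: within the mailed register block, 0x1000 = CSR, 0x1010 = IMR,
	// and 0x1080 = SIGLBLID -- the same registers the GSCSRr/GSIMR/GSSIGLBLID
	// globals held before this change.)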
GSvsync(((u32&)RingBuffer.Regs[0x1000]) & 0x2000); diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp index 049fbb566b..4dfd10438a 100644 --- a/pcsx2/ps2/GIFpath.cpp +++ b/pcsx2/ps2/GIFpath.cpp @@ -515,6 +515,7 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s Console.Warning("GIFTAG error, size exceeded VU memory size %x", startSize); nloop = 0; + const_cast(tag).EOP = 1; } } } diff --git a/pcsx2/x86/microVU_Lower.inl b/pcsx2/x86/microVU_Lower.inl index 0068975b99..0079cc083d 100644 --- a/pcsx2/x86/microVU_Lower.inl +++ b/pcsx2/x86/microVU_Lower.inl @@ -1097,7 +1097,6 @@ void __fastcall mVU_XGKICK_(u32 addr) { u8* data = microVU1.regs->Mem + (addr*16); u32 diff = 0x400 - addr; u32 size; - u8* pDest; if(gifRegs->stat.APATH <= GIF_APATH1 || (gifRegs->stat.APATH == GIF_APATH3 && gifRegs->stat.IP3 == true) && SIGNAL_IMR_Pending == false) { @@ -1120,24 +1119,22 @@ void __fastcall mVU_XGKICK_(u32 addr) { { //DevCon.Warning("GIF APATH busy %x Holding for later W %x, R %x", gifRegs->stat.APATH, Path1WritePos, Path1ReadPos); size = GIFPath_ParseTagQuick(GIF_PATH_1, data, diff); - pDest = &Path1Buffer[Path1WritePos*16]; + u8* pDest = &Path1Buffer[Path1WritePos*16]; - pxAssumeMsg((Path1WritePos+size < sizeof(Path1Buffer)), "XGKick Buffer Overflow detected on Path1Buffer!"); + Path1WritePos += size; + + pxAssumeMsg((Path1WritePos < sizeof(Path1Buffer)), "XGKick Buffer Overflow detected on Path1Buffer!"); //DevCon.Warning("Storing size %x PATH 1", size); if (size > diff) { - // fixme: one of these days the following *16's will get cleaned up when we introduce - // a special qwc/simd16 optimized version of memcpy_aligned. :) //DevCon.Status("XGkick Wrap!"); - memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff); - Path1WritePos += size; + memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16); size -= diff; pDest += diff*16; - memcpy_qwc(pDest, microVU1.regs->Mem, size); + memcpy_aligned(pDest, microVU1.regs->Mem, size*16); } else { - memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size); - Path1WritePos += size; + memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16); } //if(!gifRegs->stat.P1Q) CPU_INT(28, 128); gifRegs->stat.P1Q = true; From 2d4c7aaa25034c471f14900a0bb00caee6e4b0fb Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Sat, 17 Jul 2010 15:03:45 +0000 Subject: [PATCH 22/26] ReorderingMTGS: More tweaks to asm memcpy files (made code changes to Linux side, comment changes to Win32 side). Linux Devs: Let's get this memcpy thing finalized, if its not already. I'd like to merge the current state of this branch into trunk as soon as possible, since its currently looking very stable and has been, up to this point, a code cleanup and stabilization project. 
(more invasive changes coming soon) git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3518 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/include/Utilities/MemcpyFast.h | 10 +- common/src/Utilities/x86/MemcpyFast.cpp | 11 +- common/src/Utilities/x86/MemcpyVibes.cpp | 187 +++++++++++------------ 3 files changed, 102 insertions(+), 106 deletions(-) diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h index 012d8cdfaa..800c1071b6 100644 --- a/common/include/Utilities/MemcpyFast.h +++ b/common/include/Utilities/MemcpyFast.h @@ -42,10 +42,12 @@ void _memset16_unaligned( void* dest, u16 data, size_t size ); extern void memcpy_vibes(void * dest, const void * src, int size); extern void gen_memcpy_vibes(); -#define memcpy_fast memcpy_amd_ // Fast memcpy +#define memcpy_fast memcpy_amd_ // Fast memcpy #define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c) // Memcpy with 16-byte Aligned addresses -#define memcpy_const memcpy_amd_ // Memcpy with constant size -#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned -#define memcpy_qwc_ memcpy_vibes // Memcpy in aligned qwc increments, with 0x400 qwc or less +#define memcpy_const memcpy_amd_ // Memcpy with constant size +#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned +#define memcpy_qwc_ memcpy_vibes // Memcpy in aligned qwc increments, with 0x400 qwc or less #define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c) + +// Useful alternative if we think memcpy_amd_qwc is buggy //#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16) diff --git a/common/src/Utilities/x86/MemcpyFast.cpp b/common/src/Utilities/x86/MemcpyFast.cpp index 64a8191068..0c8af9e63e 100644 --- a/common/src/Utilities/x86/MemcpyFast.cpp +++ b/common/src/Utilities/x86/MemcpyFast.cpp @@ -41,12 +41,10 @@ MEMCPY_AMD.CPP ******************************************************************************/ -// Very optimized memcpy() routine for AMD Athlon and Duron family. -// This code uses any of FOUR different basic copy methods, depending -// on the transfer size. // NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or // "Streaming Store"), and also uses the software prefetch instructions, -// be sure you're running on Athlon/Duron or other recent CPU before calling! +// be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before +// calling! #define TINY_BLOCK_COPY 64 // upper limit for movsd type copy // The smallest copy uses the X86 "movsd" instruction, in an optimized @@ -68,10 +66,8 @@ MEMCPY_AMD.CPP #if defined(_MSC_VER) -// -------------------------------------------------------------------------------------- -// Fast memcpy as coded by AMD, and then improved by air. -// -------------------------------------------------------------------------------------- +// Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs. __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n) { __asm @@ -92,6 +88,7 @@ __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_ jbe $memcpy_do_align ; it appears to be slower cmp eax, 64*1024 jbe $memcpy_align_done + $memcpy_do_align: mov eax, 8 ; a trick that's faster than rep movsb... 
sub eax, edi ; align destination to qword diff --git a/common/src/Utilities/x86/MemcpyVibes.cpp b/common/src/Utilities/x86/MemcpyVibes.cpp index ced85b9e92..dd050bc4c6 100644 --- a/common/src/Utilities/x86/MemcpyVibes.cpp +++ b/common/src/Utilities/x86/MemcpyVibes.cpp @@ -155,99 +155,96 @@ __forceinline void memcpy_vibes(void * dest, const void * src, int size) { } #endif -#endif - -// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment, -// to get around compilation issues with having it in the headers. -#ifdef __LINUX__ - - // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now. - // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned. - __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) - { - // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM - // registers will improve copy performance, because they won't. Use of XMMs is only - // warranted in situations where both source and dest are guaranteed aligned to 16 bytes, - // and even then the benefits are typically minimal (sometimes slower depending on the - // amount of data being copied). - // - // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them. - // --air - - // Linux Conversion note: - // This code would benefit nicely from having inline-able GAS syntax, since it should - // allow GCC to optimize the first 3 instructions out of existence in many scenarios. - // And its called enough times to probably merit the extra effort to ensure proper - // optimization. --air - - __asm__ - ( - ".intel_syntax noprefix\n" - //"mov ecx, [%[dest]]\n" - //"mov edx, [%[src]]\n" - //"mov eax, [%[qwc]]\n" // keep a copy of count - "mov eax, %[qwc]\n" - "shr eax, 1\n" - "jz memcpy_qwc_1\n" // only one 16 byte block to copy? - - "cmp %[qwc], 64\n" // "IN_CACHE_COPY/32" - "jb memcpy_qwc_loop1\n" // small copies should be cached (definite speedup --air) - - "memcpy_qwc_loop2:\n" // 32-byte blocks, uncached copy - "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) - - "movq mm0,[%[src]+0]\n" // read 64 bits - "movq mm1,[%[src]+8]\n" - "movq mm2,[%[src]+16]\n" - "movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache - "movntq [%[dest]+8], mm1\n" - "movq mm3,[%[src]+24]\n" - "movntq [%[dest]+16], mm2\n" - "movntq [%[dest]+24], mm3\n" - - "add %[src],32\n" // update source pointer - "add %[dest],32\n" // update destination pointer - "sub eax,1\n" - "jnz memcpy_qwc_loop2\n" // last 64-byte block? - "sfence\n" // flush the write buffer - "jmp memcpy_qwc_1\n" - - // 32-byte blocks, cached! - // This *is* important. Removing this and using exclusively non-temporal stores - // results in noticeable speed loss! - - "memcpy_qwc_loop1:\n" - "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) - - "movq mm0,[%[src]+0]\n" // read 64 bits - "movq mm1,[%[src]+8]\n" - "movq mm2,[%[src]+16]\n" - "movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache - "movq [%[dest]+8], mm1\n" - "movq mm3,[%[src]+24]\n" - "movq [%[dest]+16], mm2\n" - "movq [%[dest]+24], mm3\n" - - "add %[src],32\n" // update source pointer - "add %[dest],32\n" // update destination pointer - "sub eax,1\n" - "jnz memcpy_qwc_loop1\n" // last 64-byte block? 
- - "memcpy_qwc_1:\n" - "test %[qwc],1\n" - "jz memcpy_qwc_final\n" - "movq mm0,[%[src]]\n" - "movq mm1,[%[src]+8]\n" - "movq [%[dest]], mm0\n" - "movq [%[dest]+8], mm1\n" - - "memcpy_qwc_final:\n" - "emms\n" // clean up the MMX state - ".att_syntax\n" - : "=&r"(dest), "=&r"(src), "=&r"(qwc) - : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc) - : "memory", "eax", "mm0", "mm1", "mm2", "mm3" - ); - } -#endif +#endif + +// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment, +// to get around compilation issues with having it in the headers. +#ifdef __LINUX__ + + // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now. + // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned. + __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) + { + // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM + // registers will improve copy performance, because they won't. Use of XMMs is only + // warranted in situations where both source and dest are guaranteed aligned to 16 bytes, + // and even then the benefits are typically minimal (sometimes slower depending on the + // amount of data being copied). + // + // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them. + // --air + + // Linux Conversion note: + // This code would benefit nicely from having inline-able GAS syntax, since it should + // allow GCC to optimize the first 3 instructions out of existence in many scenarios. + // And its called enough times to probably merit the extra effort to ensure proper + // optimization. --air + + __asm__ + ( + ".intel_syntax noprefix\n" + "mov eax, %[qwc]\n" // keep a copy of count for looping + "shr eax, 1\n" + "jz memcpy_qwc_1\n" // only one 16 byte block to copy? + + "cmp eax, 64\n" // "IN_CACHE_COPY/32" + "jb memcpy_qwc_loop1\n" // small copies should be cached (definite speedup --air) + + "memcpy_qwc_loop2:\n" // 32-byte blocks, uncached copy + "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) + + "movq mm0,[%[src]+0]\n" // read 64 bits + "movq mm1,[%[src]+8]\n" + "movq mm2,[%[src]+16]\n" + "movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache + "movntq [%[dest]+8], mm1\n" + "movq mm3,[%[src]+24]\n" + "movntq [%[dest]+16], mm2\n" + "movntq [%[dest]+24], mm3\n" + + "add %[src],32\n" // update source pointer + "add %[dest],32\n" // update destination pointer + "sub eax,1\n" + "jnz memcpy_qwc_loop2\n" // last 64-byte block? + "sfence\n" // flush the write buffer + "jmp memcpy_qwc_1\n" + + // 32-byte blocks, cached! + // This *is* important. Removing this and using exclusively non-temporal stores + // results in noticeable speed loss! + + "memcpy_qwc_loop1:\n" + "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) + + "movq mm0,[%[src]+0]\n" // read 64 bits + "movq mm1,[%[src]+8]\n" + "movq mm2,[%[src]+16]\n" + "movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache + "movq [%[dest]+8], mm1\n" + "movq mm3,[%[src]+24]\n" + "movq [%[dest]+16], mm2\n" + "movq [%[dest]+24], mm3\n" + + "add %[src],32\n" // update source pointer + "add %[dest],32\n" // update destination pointer + "sub eax,1\n" + "jnz memcpy_qwc_loop1\n" // last 64-byte block? 
+ + "memcpy_qwc_1:\n" + "testl %[qwc],1\n" + "jz memcpy_qwc_final\n" + "movq mm0,[%[src]]\n" + "movq mm1,[%[src]+8]\n" + "movq [%[dest]], mm0\n" + "movq [%[dest]+8], mm1\n" + + "memcpy_qwc_final:\n" + "emms\n" // clean up the MMX state + ".att_syntax\n" + : "=&r"(dest), "=&r"(src), "=&r"(qwc) + : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc) + : "memory", "eax", "mm0", "mm1", "mm2", "mm3" + ); + } +#endif From 4f62554702a6be3d90e70d3038e7de53fb9f0dbc Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Sat, 17 Jul 2010 18:05:13 +0000 Subject: [PATCH 23/26] ReorderingMTGS: Revert changes to Vif_Codes.cpp from earlier, until other bugs in VIF processing can be resolved. (fixes ICO bootup). git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3519 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/src/Utilities/Exceptions.cpp | 2 +- pcsx2/Vif_Codes.cpp | 82 +++++++++++++++++++++++------ pcsx2/gui/AppAssert.cpp | 6 +-- 3 files changed, 69 insertions(+), 21 deletions(-) diff --git a/common/src/Utilities/Exceptions.cpp b/common/src/Utilities/Exceptions.cpp index 7ff4440f7f..6e4869fc5e 100644 --- a/common/src/Utilities/Exceptions.cpp +++ b/common/src/Utilities/Exceptions.cpp @@ -71,7 +71,7 @@ wxString DiagnosticOrigin::ToString( const wxChar* msg ) const bool pxAssertImpl_LogIt( const DiagnosticOrigin& origin, const wxChar *msg ) { - wxLogError( origin.ToString( msg ) ); + wxLogError( L"%s", origin.ToString( msg ) ); return false; } diff --git a/pcsx2/Vif_Codes.cpp b/pcsx2/Vif_Codes.cpp index db58db1b82..d7208d1976 100644 --- a/pcsx2/Vif_Codes.cpp +++ b/pcsx2/Vif_Codes.cpp @@ -134,7 +134,7 @@ template _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { } pass2 { vif1Only(); - + if (GSTransferStatus.PTH3 < IDLE_MODE || gifRegs->stat.P1Q == true) { if(gifRegs->stat.APATH == GIF_APATH2 || ((GSTransferStatus.PTH3 <= IMAGE_MODE && gifRegs->stat.IMT && (vif1.cmd & 0x7f) == 0x50)) && gifRegs->stat.P1Q == false) @@ -174,27 +174,75 @@ template _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { // the tag size should ALWAYS be 128 bits (qwc). If it isn't, it means there's a serious bug // somewhere in the VIF (likely relating to +/-'ing the tag.size during processing). - pxAssumeMsg( (vif1.tag.size & 3) == 0, "Invalid Vif1 DIRECT packet size detected!" ); + // NOTE: ICO [PAL] exploits this during bootup. Needs investigation. --air + //pxAssumeMsg( (vif1.tag.size & 3) == 0, "Invalid Vif1 DIRECT packet size detected!" ); - const int minSize = aMin(vif1.vifpacketsize, vif1.tag.size)/4; - uint ret; + nVifStruct& v = nVif[1]; + const int ret = aMin(vif1.vifpacketsize, vif1.tag.size); + u32 size = ret << 2; - if (!minSize) - DevCon.Warning("VIF DIRECT (PATH2): No Data Transfer?"); + //gifRegs->stat.APATH = GIF_APATH2; //Flag is cleared in vif1interrupt to simulate it being in progress. + + //In the original code we were saving this data, it seems if it does happen, its just blank, so we ignore it. 
+ + if (!size) { DevCon.WriteLn("Path2: No Data Transfer?"); } + - GetMTGS().PrepDataPacket(GIF_PATH_2, minSize); - ret = GIFPath_CopyTag(GIF_PATH_2, (u128*)data, minSize)*4; - GetMTGS().SendDataPacket(); + if(vif1.vifpacketsize < 4 && v.bSize < 16) + { + memcpy(&v.buffer[v.bPtr], data, vif1.vifpacketsize << 2); + v.bSize += vif1.vifpacketsize << 2; + v.bPtr += vif1.vifpacketsize << 2; + vif1.tag.size -= vif1.vifpacketsize; + if(vif1.tag.size == 0) + { + DevCon.Warning("Missaligned packet on DIRECT end!"); + vif1.cmd = 0; + } + return vif1.vifpacketsize; + } + else + { + if(v.bSize) + { + int ret = 0; - vif1.tag.size -= ret; + if(v.bSize < 16) + { + if(((16 - v.bSize) >> 2) > vif1.vifpacketsize) DevCon.Warning("Not Enough Data!"); + ret = (16 - v.bSize) >> 2; + memcpy(&v.buffer[v.bPtr], data, ret << 2); + vif1.tag.size -= ret; + v.bSize = 0; + v.bPtr = 0; + } + GetMTGS().PrepDataPacket(GIF_PATH_2, 1); + GIFPath_CopyTag(GIF_PATH_2, (u128*)v.buffer, 1); + GetMTGS().SendDataPacket(); - if(vif1.tag.size == 0) - { - vif1.cmd = 0; - gifRegs->stat.clear_flags(GIF_STAT_APATH2 | GIF_STAT_OPH); - } - vif1.vifstalled = true; - return ret; + if(vif1.tag.size == 0) + { + vif1.cmd = 0; + } + vif1.vifstalled = true; + return ret; + } + else + { + GetMTGS().PrepDataPacket(GIF_PATH_2, size/16); + uint count = GIFPath_CopyTag(GIF_PATH_2, (u128*)data, size/16) * 4; + GetMTGS().SendDataPacket(); + + vif1.tag.size -= count; + if(vif1.tag.size == 0) + { + vif1.cmd = 0; + } + vif1.vifstalled = true; + return count; + } + } + } return 0; } diff --git a/pcsx2/gui/AppAssert.cpp b/pcsx2/gui/AppAssert.cpp index 6d33082872..87f55d2449 100644 --- a/pcsx2/gui/AppAssert.cpp +++ b/pcsx2/gui/AppAssert.cpp @@ -134,10 +134,10 @@ bool AppDoAssert( const DiagnosticOrigin& origin, const wxChar *msg ) wxString trace( pxGetStackTrace(origin.function) ); wxString dbgmsg( origin.ToString( msg ) ); - wxMessageOutputDebug().Printf( dbgmsg ); + wxMessageOutputDebug().Printf( L"%s", dbgmsg ); - Console.Error( dbgmsg ); - Console.WriteLn( trace ); + Console.Error( L"%s", dbgmsg ); + Console.WriteLn( L"%s", trace ); wxString windowmsg( L"Assertion failed: " ); if( msg != NULL ) From 6eb8f100f69b19f7452925a7f86cde51bbb43415 Mon Sep 17 00:00:00 2001 From: arcum42 Date: Sat, 17 Jul 2010 20:08:01 +0000 Subject: [PATCH 24/26] ReorderingMTGS: Fix memcpy_amd_qwc. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3520 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/src/Utilities/x86/MemcpyVibes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/Utilities/x86/MemcpyVibes.cpp b/common/src/Utilities/x86/MemcpyVibes.cpp index dd050bc4c6..32d13895a1 100644 --- a/common/src/Utilities/x86/MemcpyVibes.cpp +++ b/common/src/Utilities/x86/MemcpyVibes.cpp @@ -231,7 +231,7 @@ __forceinline void memcpy_vibes(void * dest, const void * src, int size) { "jnz memcpy_qwc_loop1\n" // last 64-byte block? "memcpy_qwc_1:\n" - "testl %[qwc],1\n" + "test %[qwc],1\n" "jz memcpy_qwc_final\n" "movq mm0,[%[src]]\n" "movq mm1,[%[src]+8]\n" From ffdf8223fdcc186e8128580a07c93971c8535122 Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Sat, 17 Jul 2010 20:32:58 +0000 Subject: [PATCH 25/26] ReorderingMTGS: Quick cleanups to gifMFIFO's path parsing. 
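The shape of the cleanup that follows: bracket the whole MFIFO transfer in a single PrepDataPacket/SendDataPacket pair, split the copy at the ring edge, and attempt the second segment only if the first ran to completion. Distilled (error paths dropped; names as in the mfifoGIFrbTransfer hunk):

	GetMTGS().PrepDataPacket(GIF_PATH_3, mfifoqwc);

	// qwc from madr up to the edge of the MFIFO ring:
	uint s1     = ((dmacRegs->rbor.ADDR + dmacRegs->rbsr.RMSK + 16) - gif->madr) >> 4;
	uint copied = GIFPath_CopyTag(GIF_PATH_3, (u128*)PSM(gif->madr), s1);

	if (copied == s1)	// second segment only if the first didn't abort prematurely
		copied += GIFPath_CopyTag(GIF_PATH_3, (u128*)PSM(dmacRegs->rbor.ADDR), mfifoqwc - s1);

	GetMTGS().SendDataPacket();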
git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3521 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/Gif.cpp | 58 ++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/pcsx2/Gif.cpp b/pcsx2/Gif.cpp index dd2ed58856..78aaaec3bb 100644 --- a/pcsx2/Gif.cpp +++ b/pcsx2/Gif.cpp @@ -152,7 +152,6 @@ __forceinline void gsInterrupt() static u32 WRITERING_DMA(u32 *pMem, u32 qwc) { GetMTGS().PrepDataPacket(GIF_PATH_3, qwc); - //uint len1 = GIFPath_ParseTag(GIF_PATH_3, (u8*)pMem, qwc ); uint size = GIFPath_CopyTag(GIF_PATH_3, (u128*)pMem, qwc ); GetMTGS().SendDataPacket(); return size; @@ -180,11 +179,6 @@ int _GIFchain() return -1; } - //in Intermittent Mode it enabled, IMAGE_MODE transfers are sliced. - - ///(gifRegs->stat.IMT && GSTransferStatus.PTH3 <= IMAGE_MODE) qwc = min((int)gif->qwc, 8); - /*else qwc = gif->qwc;*/ - return WRITERING_DMA(pMem, gif->qwc); } @@ -448,42 +442,44 @@ static __forceinline bool mfifoGIFrbTransfer() u32 mfifoqwc = min(gifqwc, (u32)gif->qwc); u32 *src; + GetMTGS().PrepDataPacket(GIF_PATH_3, mfifoqwc); + + // TODO (minor optimization): The new GIFpath parser can do rather efficient wrapping of + // its own internally now. We just need to groom a version of it that can wrap around MFIFO + // memory similarly to how it wraps VU1 memory on PATH1. + /* Check if the transfer should wrap around the ring buffer */ if ((gif->madr + mfifoqwc * 16) > (dmacRegs->rbor.ADDR + dmacRegs->rbsr.RMSK + 16)) { uint s1 = ((dmacRegs->rbor.ADDR + dmacRegs->rbsr.RMSK + 16) - gif->madr) >> 4; uint s2 = (mfifoqwc - s1); - // fixme - I don't think these should use WRITERING_DMA, since our source - // isn't the DmaGetAddr(gif->madr) address that WRITERING_DMA expects. /* it does (wrap around), so first copy 's1' bytes from 'addr' to 'data' */ + /* and second copy 's2' bytes from 'maddr' to '&data[s1]' */ + src = (u32*)PSM(gif->madr); if (src == NULL) return false; - s1 = WRITERING_DMA(src, s1); + uint copied = GIFPath_CopyTag(GIF_PATH_3, (u128*)src, s1); - if (s1 == (mfifoqwc - s2)) + if (copied == s1) // but only copy second if first didn't abort prematurely for some reason. { - /* and second copy 's2' bytes from 'maddr' to '&data[s1]' */ src = (u32*)PSM(dmacRegs->rbor.ADDR); if (src == NULL) return false; - s2 = WRITERING_DMA(src, s2); - } - else - { - s2 = 0; + copied += GIFPath_CopyTag(GIF_PATH_3, (u128*)src, s2); } - mfifoqwc = s1 + s2; + mfifoqwc = copied; } else { /* it doesn't, so just transfer 'qwc*16' words from 'gif->madr' to GS */ src = (u32*)PSM(gif->madr); if (src == NULL) return false; - mfifoqwc = WRITERING_DMA(src, mfifoqwc); + mfifoqwc = GIFPath_CopyTag(GIF_PATH_3, (u128*)src, mfifoqwc); gif->madr = dmacRegs->rbor.ADDR + (gif->madr & dmacRegs->rbsr.RMSK); } + GetMTGS().SendDataPacket(); gifqwc -= mfifoqwc; return true; @@ -569,36 +565,36 @@ void mfifoGIFtransfer(int qwc) switch (ptag->ID) { - case TAG_REFE: // Refe - Transfer Packet According to ADDR field + case TAG_REFE: // Refe - Transfer Packet According to ADDR field gif->tadr = qwctag(gif->tadr + 16); gifstate = GIF_STATE_DONE; //End Transfer break; - case TAG_CNT: // CNT - Transfer QWC following the tag. + case TAG_CNT: // CNT - Transfer QWC following the tag. 
 			gif->madr = qwctag(gif->tadr + 16);				//Set MADR to QW after Tag
-			gif->tadr = qwctag(gif->madr + (gif->qwc << 4));	//Set TADR to QW following the data
+			gif->tadr = qwctag(gif->madr + (gif->qwc << 4));	//Set TADR to QW following the data
 			gifstate = GIF_STATE_READY;
 			break;
-		case TAG_NEXT: // Next - Transfer QWC following tag. TADR = ADDR
+		case TAG_NEXT: // Next - Transfer QWC following tag. TADR = ADDR
 		{
-			u32 temp = gif->madr;								//Temporarily Store ADDR
-			gif->madr = qwctag(gif->tadr + 16);					//Set MADR to QW following the tag
-			gif->tadr = temp;									//Copy temporarily stored ADDR to Tag
+			u32 temp = gif->madr;								//Temporarily Store ADDR
+			gif->madr = qwctag(gif->tadr + 16);					//Set MADR to QW following the tag
+			gif->tadr = temp;									//Copy temporarily stored ADDR to Tag
 			gifstate = GIF_STATE_READY;
 			break;
 		}
-		case TAG_REF: // Ref - Transfer QWC from ADDR field
-		case TAG_REFS: // Refs - Transfer QWC from ADDR field (Stall Control)
+		case TAG_REF: // Ref - Transfer QWC from ADDR field
+		case TAG_REFS: // Refs - Transfer QWC from ADDR field (Stall Control)
 			gif->tadr = qwctag(gif->tadr + 16);					//Set TADR to next tag
 			gifstate = GIF_STATE_READY;
 			break;
-		case TAG_END: // End - Transfer QWC following the tag
-			gif->madr = qwctag(gif->tadr + 16);					//Set MADR to data following the tag
-			gif->tadr = qwctag(gif->madr + (gif->qwc << 4));	//Set TADR to QW following the data
-			gifstate = GIF_STATE_DONE;							//End Transfer
+		case TAG_END: // End - Transfer QWC following the tag
+			gif->madr = qwctag(gif->tadr + 16);					//Set MADR to data following the tag
+			gif->tadr = qwctag(gif->madr + (gif->qwc << 4));	//Set TADR to QW following the data
+			gifstate = GIF_STATE_DONE;							//End Transfer
 			break;
 	}

From f7d0ed2dc53c4a36667caceda72aaee95c4b3369 Mon Sep 17 00:00:00 2001
From: ramapcsx2
Date: Sat, 17 Jul 2010 22:37:36 +0000
Subject: [PATCH 26/26] ReorderingMTGS: Clean up and unify all OPH flag
 handling to be in the BUSDIR handler.

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3522 96395faa-99c1-11dd-bbfe-3dabce05a288
---
 pcsx2/FiFo.cpp              |  2 --
 pcsx2/GS.cpp                | 15 ++++++++++-----
 pcsx2/Gif.cpp               |  9 +++------
 pcsx2/Vif1_Dma.cpp          |  6 ------
 pcsx2/Vif1_MFIFO.cpp        |  1 -
 pcsx2/ps2/GIFpath.cpp       |  7 ++++---
 pcsx2/x86/microVU_Lower.inl |  1 -
 pcsx2/x86/sVU_Lower.cpp     |  1 -
 8 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/pcsx2/FiFo.cpp b/pcsx2/FiFo.cpp
index 28122713c4..282eb68aaa 100644
--- a/pcsx2/FiFo.cpp
+++ b/pcsx2/FiFo.cpp
@@ -164,7 +164,6 @@ void __fastcall WriteFIFO_page_5(u32 mem, const mem128_t *value)
 
 	if(GSTransferStatus.PTH2 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH2)
 	{
-		if(gifRegs->stat.DIR == 0)gifRegs->stat.OPH = false;
 		gifRegs->stat.APATH = GIF_APATH_IDLE;
 		if(gifRegs->stat.P1Q) gsPath1Interrupt();
 	}
@@ -201,7 +200,6 @@ void __fastcall WriteFIFO_page_6(u32 mem, const mem128_t *value)
 	GetMTGS().SendDataPacket();
 	if(GSTransferStatus.PTH3 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH3 )
 	{
-		if(gifRegs->stat.DIR == 0)gifRegs->stat.OPH = false;
 		gifRegs->stat.APATH = GIF_APATH_IDLE;
 		if(gifRegs->stat.P1Q) gsPath1Interrupt();
 	}
diff --git a/pcsx2/GS.cpp b/pcsx2/GS.cpp
index b9028de2ee..4dc0251393 100644
--- a/pcsx2/GS.cpp
+++ b/pcsx2/GS.cpp
@@ -282,14 +282,19 @@ void __fastcall gsWrite64_page_01( u32 mem, const mem64_t* value )
 	{
 		case 0x12001040: //busdir
 
-			//This is probably a complete hack, however writing to BUSDIR "should" start a transfer (Bleach Blade Battlers)
-			//Only problem is it kills killzone :( leaving it commented out for now.
+			//This is probably a complete hack, however writing to BUSDIR "should" start a transfer
+			//(Bleach Blade Battlers, Growlanser 2 and 3, Wizardry)
+			//Only problem is it kills killzone :(.
 			// (yes it *is* a complete hack; both lines here in fact --air)
 			//=========================================================================
-			//gifRegs->stat.OPH = true;
+			//Console.Warning("BUSDIR write! Setting OPH and DIR to = %x",(u32)value[0]);
+			if ((u32)value[0] == 1)
+				gifRegs->stat.OPH = true;
+			else
+				gifRegs->stat.OPH = false;
+
+			gifRegs->stat.DIR = (u32)value[0];
 			//=========================================================================
-			gifRegs->stat.DIR = (u32)value;
-
 			// BUSDIR INSANITY !! MTGS FLUSH NEEDED
 			//
 			// Yup folks. BUSDIR is evil. The only safe way to handle it is to flush the whole MTGS
diff --git a/pcsx2/Gif.cpp b/pcsx2/Gif.cpp
index 78aaaec3bb..137df0564f 100644
--- a/pcsx2/Gif.cpp
+++ b/pcsx2/Gif.cpp
@@ -69,8 +69,7 @@ void gsPath1Interrupt()
 			size -= count;
 
 			if(GSTransferStatus.PTH1 == STOPPED_MODE)
-			{
-				gifRegs->stat.OPH = false;
+			{
 				gifRegs->stat.APATH = GIF_APATH_IDLE;
 			}
 		}
@@ -106,7 +105,6 @@ __forceinline void gsInterrupt()
 
 	if(GSTransferStatus.PTH3 >= PENDINGSTOP_MODE && gifRegs->stat.APATH == GIF_APATH3 )
 	{
-		gifRegs->stat.OPH = false;
 		GSTransferStatus.PTH3 = STOPPED_MODE;
 		gifRegs->stat.APATH = GIF_APATH_IDLE;
 		if(gifRegs->stat.P1Q) gsPath1Interrupt();
@@ -319,7 +317,7 @@ void GIFdma()
 
-	//gifRegs->stat.OPH = true;
+	//gifRegs->stat.OPH = true; // why set the GS output path flag here? (rama)
 	gifRegs->stat.FQC = min((u16)0x10, gif->qwc);// FQC=31, hack ;) (for values of 31 that equal 16) [ used to be 0xE00; // APATH=3]
 	//Check with Path3 masking games
 	if (gif->qwc > 0) {
@@ -338,7 +336,7 @@ void GIFdma()
 	}
 
-	//gifRegs->stat.OPH = true;
+	//gifRegs->stat.OPH = true; // why set the GS output path flag here? (rama)
 	// Transfer Dn_QWC from Dn_MADR to GIF
 	if ((gif->chcr.MOD == NORMAL_MODE) || (gif->qwc > 0)) // Normal Mode
 	{
@@ -632,7 +630,6 @@ void gifMFIFOInterrupt()
 
 	if(GSTransferStatus.PTH3 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH3 )
 	{
-		gifRegs->stat.OPH = false;
 		gifRegs->stat.APATH = GIF_APATH_IDLE;
 		if(gifRegs->stat.P1Q) gsPath1Interrupt();
 	}
diff --git a/pcsx2/Vif1_Dma.cpp b/pcsx2/Vif1_Dma.cpp
index c7e42ad814..8f738a590e 100644
--- a/pcsx2/Vif1_Dma.cpp
+++ b/pcsx2/Vif1_Dma.cpp
@@ -345,7 +345,6 @@ __forceinline void vif1Interrupt()
 
 	if(GSTransferStatus.PTH2 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH2)
 	{
-		gifRegs->stat.OPH = false;
 		gifRegs->stat.APATH = GIF_APATH_IDLE;
 		if(gifRegs->stat.P1Q) gsPath1Interrupt();
 	}
@@ -440,11 +439,6 @@ __forceinline void vif1Interrupt()
 	if (vif1.cmd != 0) Console.WriteLn("vif1.cmd still set %x tag size %x", vif1.cmd, vif1.tag.size);
 #endif
-
-	if((vif1ch->chcr.DIR == VIF_NORMAL_TO_MEM_MODE) && vif1.GSLastDownloadSize <= 16)
-	{	//Reverse fifo has finished and nothing is left, so lets clear the outputting flag
-		gifRegs->stat.OPH = false;
-	}
 	vif1ch->chcr.STR = false;
 	vif1.vifstalled = false;
 	g_vifCycles = 0;
diff --git a/pcsx2/Vif1_MFIFO.cpp b/pcsx2/Vif1_MFIFO.cpp
index 64ff291b90..86f8008e6f 100644
--- a/pcsx2/Vif1_MFIFO.cpp
+++ b/pcsx2/Vif1_MFIFO.cpp
@@ -239,7 +239,6 @@ void vifMFIFOInterrupt()
 	if(GSTransferStatus.PTH2 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH2)
 	{
 		GSTransferStatus.PTH2 = STOPPED_MODE;
-		if(gifRegs->stat.DIR == 0)gifRegs->stat.OPH = false;
 		gifRegs->stat.APATH = GIF_APATH_IDLE;
 		if(gifRegs->stat.P1Q) gsPath1Interrupt();
 		/*gifRegs->stat.APATH = GIF_APATH_IDLE;
diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp
index 4dfd10438a..1c9366ec21 100644
--- a/pcsx2/ps2/GIFpath.cpp
+++ b/pcsx2/ps2/GIFpath.cpp
@@ -357,7 +357,8 @@ static __forceinline void gsHandler(const u8* pMem)
 			// qwords, rounded down; any extra bits are lost
 			// games must take care to ensure transfer rectangles are exact multiples of a qword
 			vif1.GSLastDownloadSize = vif1.TRXREG.RRW * vif1.TRXREG.RRH * bpp >> 7;
-			gifRegs->stat.OPH = true;
+			//DevCon.Warning("GS download in progress. OPH = %x", gifRegs->stat.OPH);
+			//gifRegs->stat.OPH = true; // Too early to set it here. It should be done on a BUSDIR call (rama)
 		}
 	}
 	if (reg >= 0x60)
@@ -617,7 +618,7 @@ __forceinline int GIFPath::CopyTag(const u128* pMem128, u32 size)
 		}
 		if(GSTransferStatus.PTH3 < PENDINGSTOP_MODE || pathidx != 2)
 		{
-			gifRegs->stat.OPH = true;
+			//gifRegs->stat.OPH = true; // why set the GS output path flag here? (rama)
 			gifRegs->stat.APATH = pathidx + 1;
 		}
 
@@ -645,7 +646,7 @@ __forceinline int GIFPath::CopyTag(const u128* pMem128, u32 size)
 				break;
 			}
 			gifRegs->stat.APATH = pathidx + 1;
-			gifRegs->stat.OPH = true;
+			//gifRegs->stat.OPH = true; // why set the GS output path flag here? (rama)
 			switch(tag.FLG) {
 				case GIF_FLG_PACKED:
diff --git a/pcsx2/x86/microVU_Lower.inl b/pcsx2/x86/microVU_Lower.inl
index 0079cc083d..380a75863d 100644
--- a/pcsx2/x86/microVU_Lower.inl
+++ b/pcsx2/x86/microVU_Lower.inl
@@ -1111,7 +1111,6 @@ void __fastcall mVU_XGKICK_(u32 addr) {
 
 		if(GSTransferStatus.PTH1 == STOPPED_MODE)
 		{
-			gifRegs->stat.OPH = false;
 			gifRegs->stat.APATH = GIF_APATH_IDLE;
 		}
 	}
diff --git a/pcsx2/x86/sVU_Lower.cpp b/pcsx2/x86/sVU_Lower.cpp
index 30ec18fd91..5191748f78 100644
--- a/pcsx2/x86/sVU_Lower.cpp
+++ b/pcsx2/x86/sVU_Lower.cpp
@@ -1994,7 +1994,6 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
 
 		if(GSTransferStatus.PTH1 == STOPPED_MODE )
 		{
-			gifRegs->stat.OPH = false;
 			gifRegs->stat.APATH = GIF_APATH_IDLE;
 		}
 	}
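
Note on the unified OPH policy: the net effect of patch 26 is that GIF_STAT.OPH is no
longer toggled ad-hoc by the individual path handlers (FIFO writes, DMA interrupts,
XGKICK), and BUSDIR writes become the single source of truth for it. Below is a minimal
standalone sketch of that policy, not the actual pcsx2 code path: the GifStat struct,
its plain-field layout, and the gsWriteBusdir name are simplified stand-ins (the real
gifRegs->stat is a hardware bitfield reached through gsWrite64_page_01, and the
mandatory MTGS flush is elided).

    #include <cstdint>

    // Simplified stand-in for the GIF_STAT register (the real one is a bitfield).
    struct GifStat
    {
        bool     OPH;   // output path in progress; now driven only by BUSDIR
        uint32_t DIR;   // transfer direction latched from the BUSDIR write
    };

    static GifStat s_stat;

    // Hypothetical handler mirroring the unified logic the patch adds at
    // case 0x12001040: a BUSDIR write of 1 flags a GS->EE download in
    // progress; any other value clears the flag. DIR always latches the
    // written value.
    void gsWriteBusdir(uint64_t value)
    {
        s_stat.OPH = (static_cast<uint32_t>(value) == 1);
        s_stat.DIR = static_cast<uint32_t>(value);
        // ...the real handler then performs a full MTGS flush, per the
        // "BUSDIR INSANITY" comment in the GS.cpp hunk above.
    }

    int main()
    {
        gsWriteBusdir(1);   // download begins: OPH set, DIR = 1
        gsWriteBusdir(0);   // direction back to EE->GS: OPH cleared, DIR = 0
        return 0;
    }

This also explains the companion hunks: gsHandler's early OPH=true on TRXDIR setup was
"too early" (per the rama comment) because the download only actually starts once the
game flips BUSDIR, so every other OPH write in the series is either deleted or left
commented out.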