ReorderingMTGS:

* Implemented GIFPath_CopyTag, which performs a "copy-in-place" while parsing tags (big speedup over the old parse-then-copy strategy, especially with the SSE intrinsics I've included for kicks). * Removed the old ringbuffer 'restart' mechanism and replaced it with a truly free-flowing wrapping mechanism. Utilizes the ringbuffer more efficiently, and removes quite a bit of overhead from the MTGS's PrepDataPacket call. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3458 96395faa-99c1-11dd-bbfe-3dabce05a288
2010-07-11 04:53:50 +00:00 · 2010-07-11 04:53:50 +00:00 · a9084741bc
parent 65f50f009f
commit a9084741bc
12 changed files with 304 additions and 301 deletions
--- a/common/include/PS2Edefs.h
+++ b/common/include/PS2Edefs.h
@ -564,6 +564,7 @@ typedef void (CALLBACK* _PS2EsetEmuVersion)(const char* emuId, u32 version);		//
 typedef s32  (CALLBACK* _GSopen)(void *pDsp, char *Title, int multithread);
 typedef s32  (CALLBACK* _GSopen2)( void *pDsp, u32 flags );
 typedef void (CALLBACK* _GSvsync)(int field);
+typedef void (CALLBACK* _GSgifTransfer)(u32 *pMem, u32 size);
 typedef void (CALLBACK* _GSgifTransfer1)(u32 *pMem, u32 addr);
 typedef void (CALLBACK* _GSgifTransfer2)(u32 *pMem, u32 size);
 typedef void (CALLBACK* _GSgifTransfer3)(u32 *pMem, u32 size);
@ -723,6 +724,7 @@ typedef void (CALLBACK* _FWirqCallback)(void (*callback)());
 extern _GSopen            GSopen;
 extern _GSopen2           GSopen2;
 extern _GSvsync           GSvsync;
+extern _GSgifTransfer     GSgifTransfer;
 extern _GSgifTransfer1    GSgifTransfer1;
 extern _GSgifTransfer2    GSgifTransfer2;
 extern _GSgifTransfer3    GSgifTransfer3;
--- a/common/include/Utilities/MemcpyFast.h
+++ b/common/include/Utilities/MemcpyFast.h
@ -36,7 +36,7 @@
 // Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
 void _memset16_unaligned( void* dest, u16 data, size_t size );

-#define memcpy_fast		memcpy_amd_ // Fast memcpy
-#define memcpy_aligned	memcpy_amd_	// Memcpy with 16-byte Aligned addresses
-#define memcpy_const	memcpy_amd_	// Memcpy with constant size
-#define memcpy_constA	memcpy_amd_ // Memcpy with constant size and 16-byte aligned
+#define memcpy_fast				memcpy_amd_ // Fast memcpy
+#define memcpy_aligned(d,s,c)	memcpy_amd_((d),(s),(c)*16)	// Memcpy with 16-byte Aligned addresses; c counts 16-byte units (args parenthesized so expressions like a+b scale correctly)
+#define memcpy_const			memcpy_amd_	// Memcpy with constant size
+#define memcpy_constA			memcpy_amd_ // Memcpy with constant size and 16-byte aligned
--- a/pcsx2/FiFo.cpp
+++ b/pcsx2/FiFo.cpp
@ -195,10 +195,9 @@ void __fastcall WriteFIFO_page_6(u32 mem, const mem128_t *value)
 	nloop0_packet[1] = psHu32(GIF_FIFO + 4);
 	nloop0_packet[2] = psHu32(GIF_FIFO + 8);
 	nloop0_packet[3] = psHu32(GIF_FIFO + 12);
-	GetMTGS().PrepDataPacket(GIF_PATH_3, (u8*)nloop0_packet, 1);
+	GetMTGS().PrepDataPacket(GIF_PATH_3, 1);
 	u64* data = (u64*)GetMTGS().GetDataPacketPtr();
-	data[0] = value[0];
-	data[1] = value[1];
+	GIFPath_CopyTag( GIF_PATH_3, (u128*)nloop0_packet, 1 );
 	GetMTGS().SendDataPacket();
 	if(GSTransferStatus.PTH3 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH3 )
 	{
--- a/pcsx2/GS.h
+++ b/pcsx2/GS.h
@ -229,7 +229,7 @@ enum GIF_PATH
 	GIF_PATH_3,
 };

-extern int  GIFPath_ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size);
+extern int  GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size);
 extern int  GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size);
 extern void GIFPath_Reset();
 extern void GIFPath_Clear( GIF_PATH pathidx );
@ -282,7 +282,6 @@ public:
 	volatile s32	m_SignalRingPosition;

 	int				m_QueuedFrameCount;
-	u32				m_RingWrapSpot;

 	Mutex			m_lock_RingBufferBusy;
 	Semaphore		m_sem_OnRingReset;
@ -301,6 +300,7 @@ public:
 	// These vars maintain instance data for sending Data Packets.
 	// Only one data packet can be constructed and uploaded at a time.

+	uint			m_packet_startpos;	// index of the packet's command tag in the ringbuffer (set by PrepDataPacket, read back by SendDataPacket).
 	uint			m_packet_size;		// size of the packet (data only, ie. not including the 16 byte command!)
 	uint			m_packet_ringpos;	// index of the data location in the ringbuffer.

@ -317,14 +317,13 @@ public:
 	void WaitGS();
 	void ResetGS();

-	int PrepDataPacket( MTGS_RingCommand cmd, u32 size );
-	int PrepDataPacket( GIF_PATH pathidx, const u8*  srcdata, u32 size );
+	void PrepDataPacket( MTGS_RingCommand cmd, u32 size );
+	void PrepDataPacket( GIF_PATH pathidx, u32 size );
 	void SendDataPacket();
 	void SendGameCRC( u32 crc );
 	void WaitForOpen();
 	void Freeze( int mode, MTGS_FreezeData& data );

-	void RestartRingbuffer( uint packsize=0 );
 	void SendSimplePacket( MTGS_RingCommand type, int data0, int data1, int data2 );
 	void SendPointerPacket( MTGS_RingCommand type, u32 data0, void* data1 );

@ -416,3 +415,31 @@ extern int g_nLeftGSFrames;

 #endif

+// Size of the ringbuffer as a power of 2 -- size is a multiple of simd128s.
+// (actual size is 1<<RingBufferSizeFactor simd vectors [128-bit values])
+// A value of 19 is an 8meg ring buffer.  18 would be 4 megs, and 20 would be 16 megs.
+// Default was 2mb, but some games with lots of MTGS activity want 8mb to run fast (rama)
+static const uint RingBufferSizeFactor = 19;
+
+// size of the ringbuffer in simd128's.
+static const uint RingBufferSize = 1<<RingBufferSizeFactor;
+
+// Mask to apply to ring buffer indices to wrap the pointer from end to
+// start (the wrapping is what makes it a ringbuffer, yo!)
+static const uint RingBufferMask = RingBufferSize - 1;
+
+// Backing storage for the MTGS ringbuffer, plus a byte array sized by Ps2MemSize::GSregs
+// (OpenPlugin copies PS2MEM_GS into Regs and hands it to GSsetBaseMem).
+struct MTGS_BufferedData
+{
+	u128		m_Ring[RingBufferSize];		// the ring itself: RingBufferSize 128-bit entries.
+	u8			Regs[Ps2MemSize::GSregs];
+
+	// Intentionally empty: members are left uninitialized here.
+	MTGS_BufferedData() {}
+
+	// Returns the ring entry at idx.  The index is not wrapped here; callers must pass
+	// a value below RingBufferSize (checked by the assertion).
+	u128& operator[]( uint idx )
+	{
+		pxAssert( idx < RingBufferSize );
+		return m_Ring[idx];
+	}
+};
+
+extern __aligned(32) MTGS_BufferedData RingBuffer;
--- a/pcsx2/Gif.cpp
+++ b/pcsx2/Gif.cpp
@ -59,16 +59,15 @@ void gsPath1Interrupt()
 		gifRegs->stat.P1Q = false;
 		while(Path1WritePos > 0)
 		{
-			u32 size  = GetMTGS().PrepDataPacket(GIF_PATH_1, Path1Buffer + (Path1ReadPos  * 16), (Path1WritePos - Path1ReadPos));
-			u8* pDest = GetMTGS().GetDataPacketPtr();
+			uint size = (Path1WritePos - Path1ReadPos);
+			GetMTGS().PrepDataPacket(GIF_PATH_1, size);
 			//DevCon.Warning("Flush Size = %x", size);
-			
-			memcpy_aligned(pDest, Path1Buffer + (Path1ReadPos * 16), size  * 16);
-			GetMTGS().SendDataPacket();
-			

-			Path1ReadPos += size;
-			
+			uint count = GIFPath_CopyTag(GIF_PATH_1, ((u128*)Path1Buffer) + Path1ReadPos, size);
+			GetMTGS().SendDataPacket();
+			pxAssume( count == size );
+			Path1ReadPos += count;
+
 			if(GSTransferStatus.PTH1 == STOPPED_MODE)
 			{
 				gifRegs->stat.OPH = false;				
@ -150,11 +149,9 @@ __forceinline void gsInterrupt()

 static u32 WRITERING_DMA(u32 *pMem, u32 qwc)
 {
-	int size   = GetMTGS().PrepDataPacket(GIF_PATH_3, (u8*)pMem, qwc);
-	u8* pgsmem = GetMTGS().GetDataPacketPtr();
-
-	memcpy_aligned(pgsmem, pMem, size<<4);
-
+	GetMTGS().PrepDataPacket(GIF_PATH_3, qwc);
+	//uint len1 = GIFPath_ParseTag(GIF_PATH_3, (u8*)pMem, qwc );
+	uint size = GIFPath_CopyTag(GIF_PATH_3, (u128*)pMem, qwc );
 	GetMTGS().SendDataPacket();
 	return size;
 }
--- a/pcsx2/MTGS.cpp
+++ b/pcsx2/MTGS.cpp
@ -29,7 +29,7 @@

 using namespace Threading;

-#if 0 // PCSX2_DEBUG
+#if 0 //PCSX2_DEBUG
 #	define MTGS_LOG Console.WriteLn
 #else
 #	define MTGS_LOG 0&&
@ -46,34 +46,7 @@ using namespace Threading;
 //  MTGS Threaded Class Implementation
 // =====================================================================================================

-// Size of the ringbuffer as a power of 2 -- size is a multiple of simd128s.
-// (actual size is 1<<m_RingBufferSizeFactor simd vectors [128-bit values])
-// A value of 19 is a 8meg ring buffer.  18 would be 4 megs, and 20 would be 16 megs.
-// Default was 2mb, but some games with lots of MTGS activity want 8mb to run fast (rama)
-static const uint RingBufferSizeFactor = 19;
-
-// size of the ringbuffer in simd128's.
-static const uint RingBufferSize = 1<<RingBufferSizeFactor;
-
-// Mask to apply to ring buffer indices to wrap the pointer from end to
-// start (the wrapping is what makes it a ringbuffer, yo!)
-static const uint RingBufferMask = RingBufferSize - 1;
-
-struct MTGS_BufferedData
-{
-	u128		m_Ring[RingBufferSize];
-	u8			Regs[Ps2MemSize::GSregs];
-
-	MTGS_BufferedData() {}
-
-	u128& operator[]( uint idx )
-	{
-		pxAssert( idx < RingBufferSize );
-		return m_Ring[idx];
-	}
-};
-
-static __aligned(32) MTGS_BufferedData RingBuffer;
+__aligned(32) MTGS_BufferedData RingBuffer;
 extern bool renderswitch;


@ -106,7 +79,6 @@ void SysMtgsThread::OnStart()
 	m_QueuedFrameCount	= 0;
 	m_SignalRingEnable	= 0;
 	m_SignalRingPosition= 0;
-	m_RingWrapSpot		= 0;

 	m_CopyDataTally		= 0;

@ -125,12 +97,15 @@ void SysMtgsThread::OnResumeReady()

 void SysMtgsThread::ResetGS()
 {
+	pxAssertDev( !IsOpen() || (m_RingPos == m_WritePos), "Must close or terminate the GS thread prior to gsReset." );
+
 	// MTGS Reset process:
 	//  * clear the ringbuffer.
 	//  * Signal a reset.
 	//  * clear the path and byRegs structs (used by GIFtagDummy)

 	m_RingPos = m_WritePos;
+	m_QueuedFrameCount = 0;

 	MTGS_LOG( "MTGS: Sending Reset..." );
 	SendSimplePacket( GS_RINGTYPE_RESET, 0, 0, 0 );
@ -155,7 +130,8 @@ void SysMtgsThread::PostVsyncEnd()
 	// 256-byte copy is only a few dozen cycles -- executed 60 times a second -- so probably
 	// not worth the effort or overhead of trying to selectively avoid it.

-	PrepDataPacket(GS_RINGTYPE_VSYNC, sizeof(RingCmdPacket_Vsync));
+	uint packsize = sizeof(RingCmdPacket_Vsync) / 16;
+	PrepDataPacket(GS_RINGTYPE_VSYNC, packsize);
 	RingCmdPacket_Vsync& local( *(RingCmdPacket_Vsync*)GetDataPacketPtr() );

 	memcpy_fast( local.regset1, PS2MEM_GS, sizeof(local.regset1) );
@ -163,6 +139,7 @@ void SysMtgsThread::PostVsyncEnd()
 	local.imr = GSIMR;
 	local.siglblid = GSSIGLBLID;

+	m_packet_ringpos += packsize;
 	SendDataPacket();

 	// Alter-frame flushing!  Restarts the ringbuffer (wraps) on every other frame.  This is a
@ -172,13 +149,29 @@ void SysMtgsThread::PostVsyncEnd()
 	// and they also allow us to reuse the front of the ringbuffer more often, which should improve
 	// L2 cache performance.

-	if( m_QueuedFrameCount > 0 )
-		RestartRingbuffer();
+	if( AtomicIncrement(m_QueuedFrameCount) == 0 ) return;
+
+	uint readpos = volatize(m_RingPos);
+	uint freeroom;
+
+	if (m_WritePos < readpos)
+		freeroom = readpos - m_WritePos;
 	else
-	{
-		m_QueuedFrameCount++;
-		SetEvent();
-	}
+		freeroom = RingBufferSize - (m_WritePos - readpos);
+
+	uint totalAccum	= RingBufferSize - freeroom;
+	uint somedone	= totalAccum / 4;
+
+	m_SignalRingPosition = totalAccum;
+
+	//Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Vsync Sleep!\t\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition );
+
+	AtomicExchange( m_SignalRingEnable, 1 );
+	SetEvent();
+	m_sem_OnRingReset.WaitWithoutYield();
+	readpos = volatize(m_RingPos);
+
+	pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" );
 }

 struct PacketTagType
@ -197,7 +190,7 @@ void SysMtgsThread::OpenPlugin()
 {
 	if( m_PluginOpened ) return;

-	memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
+	memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS)/16 );
 	GSsetBaseMem( RingBuffer.Regs );
 	GSirqCallback( dummyIrqCallback );

@ -330,38 +323,75 @@ void SysMtgsThread::ExecuteTaskInThread()
 			{
 				case GS_RINGTYPE_P1:
 				{
+					uint datapos = (m_RingPos+1) & RingBufferMask;
 					const int qsize = tag.data[0];
-					const u128* data = &RingBuffer[m_RingPos+1];
+					const u128* data = &RingBuffer[datapos];

 					MTGS_LOG( "(MTGS Packet Read) ringtype=P1, qwc=%u", qsize );

-					// make sure that tag>>16 is the MAX size readable
-					GSgifTransfer1((u32*)(data - 0x400 + qsize), 0x4000-qsize*16);
-					//GSgifTransfer1((u32*)data, qsize);
+					uint endpos = datapos + qsize;
+					if( endpos >= RingBufferSize )
+					{
+						uint firstcopylen = RingBufferSize - datapos;
+						GSgifTransfer( (u32*)data, firstcopylen );
+						datapos = endpos & RingBufferMask;
+						GSgifTransfer( (u32*)RingBuffer.m_Ring, datapos );
+					}
+					else
+					{
+						GSgifTransfer( (u32*)data, qsize );
+					}
+
 					ringposinc += qsize;
 				}
 				break;

 				case GS_RINGTYPE_P2:
 				{
+					uint datapos = (m_RingPos+1) & RingBufferMask;
 					const int qsize = tag.data[0];
-					const u128* data = &RingBuffer[m_RingPos+1];
+					const u128* data = &RingBuffer[datapos];

 					MTGS_LOG( "(MTGS Packet Read) ringtype=P2, qwc=%u", qsize );

-					GSgifTransfer2((u32*)data, qsize);
+					uint endpos = datapos + qsize;
+					if( endpos >= RingBufferSize )
+					{
+						uint firstcopylen = RingBufferSize - datapos;
+						GSgifTransfer2( (u32*)data, firstcopylen );
+						datapos = endpos & RingBufferMask;
+						GSgifTransfer2( (u32*)RingBuffer.m_Ring, datapos );
+					}
+					else
+					{
+						GSgifTransfer2( (u32*)data, qsize );
+					}
+
 					ringposinc += qsize;
 				}
 				break;

 				case GS_RINGTYPE_P3:
 				{
+					uint datapos = (m_RingPos+1) & RingBufferMask;
 					const int qsize = tag.data[0];
-					const u128* data = &RingBuffer[m_RingPos+1];
+					const u128* data = &RingBuffer[datapos];

 					MTGS_LOG( "(MTGS Packet Read) ringtype=P3, qwc=%u", qsize );

-					GSgifTransfer3((u32*)data, qsize);
+					uint endpos = datapos + qsize;
+					if( endpos >= RingBufferSize )
+					{
+						uint firstcopylen = RingBufferSize - datapos;
+						GSgifTransfer3( (u32*)data, firstcopylen );
+						datapos = endpos & RingBufferMask;
+						GSgifTransfer3( (u32*)RingBuffer.m_Ring, datapos );
+					}
+					else
+					{
+						GSgifTransfer3( (u32*)data, qsize );
+					}
+
 					ringposinc += qsize;
 				}
 				break;
@ -380,7 +410,7 @@ void SysMtgsThread::ExecuteTaskInThread()
 							const int qsize = tag.data[0];
 							ringposinc += qsize;

-							MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", tag.data[0], tag.data[1] ? "true" : "false" );
+							MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", !!(((u32&)RingBuffer.Regs[0x1000]) & 0x2000) ? 0 : 1, tag.data[1] ? "true" : "false" );
 							
 							// Mail in the important GS registers.
 							RingCmdPacket_Vsync& local((RingCmdPacket_Vsync&)RingBuffer[m_RingPos+1]);
@ -398,6 +428,7 @@ void SysMtgsThread::ExecuteTaskInThread()
 							if( (GSopen2 == NULL) && (PADupdate != NULL) )
 								PADupdate(0);

+							AtomicDecrement( m_QueuedFrameCount );
 							StateCheckInThread();
 						}
 						break;
@ -450,9 +481,14 @@ void SysMtgsThread::ExecuteTaskInThread()
 				}
 			}

-			uint newringpos = m_RingPos + ringposinc;
-			pxAssert( newringpos <= RingBufferSize );
-			m_RingPos = newringpos & RingBufferMask;
+			uint newringpos = (m_RingPos + ringposinc) & RingBufferMask;
+
+			if( EmuConfig.GS.SynchronousMTGS )
+			{
+				pxAssert( m_WritePos == newringpos );
+			}
+			
+			m_RingPos = newringpos;

 			if( m_SignalRingEnable != 0 )
 			{
@ -546,7 +582,7 @@ void SysMtgsThread::SetEvent()

 u8* SysMtgsThread::GetDataPacketPtr() const
 {
-	return (u8*)&RingBuffer[m_packet_ringpos];
+	return (u8*)&RingBuffer[m_packet_ringpos & RingBufferMask];
 }

 // Closes the data packet send command, and initiates the gs thread (if needed).
@ -555,6 +591,7 @@ void SysMtgsThread::SendDataPacket()
 	// make sure a previous copy block has been started somewhere.
 	pxAssert( m_packet_size != 0 );

+	#if 0
 	uint temp = m_packet_ringpos + m_packet_size;
 	pxAssert( temp <= RingBufferSize );
 	temp &= RingBufferMask;
@ -578,8 +615,16 @@ void SysMtgsThread::SendDataPacket()
 			pxAssert( readpos != temp );
 		}
 	}
+	#endif

-	m_WritePos = temp;
+	uint actualSize = ((m_packet_ringpos - m_packet_startpos) & RingBufferMask)-1;
+	pxAssert( actualSize <= m_packet_size );
+	pxAssert( m_packet_ringpos < RingBufferSize );
+
+	PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos];
+	tag.data[0] = actualSize;
+
+	m_WritePos = m_packet_ringpos;

 	if( EmuConfig.GS.SynchronousMTGS )
 	{
@ -596,7 +641,7 @@ void SysMtgsThread::SendDataPacket()
 	//m_PacketLocker.Release();
 }

-int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size )
+void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size )
 {
 	// Note on volatiles: m_WritePos is not modified by the GS thread, so there's no need
 	// to use volatile reads here.  We do cache it though, since we know it never changes,
@ -613,119 +658,63 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size )
 	m_packet_size = size;
 	++size;			// takes into account our RingCommand QWC.

-	if( writepos + size < RingBufferSize )
+	// generic gs wait/stall.
+	// if the writepos is past the readpos then we're safe.
+	// But if not then we need to make sure the readpos is outside the scope of
+	// the block about to be written (writepos + size)
+
+	uint readpos = volatize(m_RingPos);
+	uint endpos = writepos+size;
+	uint freeroom;
+
+	if (writepos < readpos)
+		freeroom = readpos - writepos;
+	else
+		freeroom = RingBufferSize - (writepos - readpos);
+
+	if (freeroom < size)
 	{
-		// generic gs wait/stall.
-		// if the writepos is past the readpos then we're safe.
-		// But if not then we need to make sure the readpos is outside the scope of
-		// the block about to be written (writepos + size)
+		// writepos will overlap readpos if we commit the data, so we need to wait until
+		// readpos is out past the end of the future write pos, or until it wraps around
+		// (in which case writepos will be >= readpos).

-		uint readpos = volatize(m_RingPos);
-		if( (writepos < readpos) && (writepos+size >= readpos) )
+		// Ideally though we want to wait longer, because if we just toss in this packet
+		// the next packet will likely stall up too.  So lets set a condition for the MTGS
+		// thread to wake up the EE once there's a sizable chunk of the ringbuffer emptied.
+
+		uint somedone	= (RingBufferSize - freeroom) / 4;
+		if( somedone < size+1 ) somedone = size + 1;
+
+		// FMV Optimization: FMVs typically send *very* little data to the GS, in some cases
+		// every other frame is nothing more than a page swap.  Sleeping the EEcore is a
+		// waste of time, and we get better results using a spinwait.
+
+		if( somedone > 0x80 )
 		{
-			// writepos is behind the readpos and will overlap it if we commit the data,
-			// so we need to wait until readpos is out past the end of the future write pos,
-			// or until it wraps around (in which case writepos will be >= readpos).
+			pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" );
+			m_SignalRingPosition = somedone;

-			// Ideally though we want to wait longer, because if we just toss in this packet
-			// the next packet will likely stall up too.  So lets set a condition for the MTGS
-			// thread to wake up the EE once there's a sizable chunk of the ringbuffer emptied.
+			//Console.WriteLn( Color_Blue, "(EEcore Sleep) GenStall \tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", readpos, writepos, m_RingWrapSpot, m_SignalRingPosition );

-			uint totalAccum	= (m_RingWrapSpot - readpos) + writepos;
-			uint somedone	= totalAccum / 4;
-			if( somedone < size+1 ) somedone = size + 1;
-
-			// FMV Optimization: FMVs typically send *very* little data to the GS, in some cases
-			// every other frame is nothing more than a page swap.  Sleeping the EEcore is a
-			// waste of time, and we get better results using a spinwait.
-
-			if( somedone > 0x80 )
-			{
-				pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" );
-				m_SignalRingPosition = somedone;
-
-				//Console.WriteLn( Color_Blue, "(EEcore Sleep) GenStall \tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", readpos, writepos, m_RingWrapSpot, m_SignalRingPosition );
-
-				do {
-					AtomicExchange( m_SignalRingEnable, 1 );
-					SetEvent();
-					m_sem_OnRingReset.WaitWithoutYield();
-					readpos = volatize(m_RingPos);
-					//Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos );
-				} while( (writepos < readpos) && (writepos+size >= readpos) );
-
-				pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" );
-			}
-			else
-			{
+			do {
+				AtomicExchange( m_SignalRingEnable, 1 );
 				SetEvent();
-				do {
-					SpinWait();
-					readpos = volatize(m_RingPos);
-				} while( (writepos < readpos) && (writepos+size >= readpos) );
-			}
+				m_sem_OnRingReset.WaitWithoutYield();
+				readpos = volatize(m_RingPos);
+				//Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos );
+			} while( (writepos < readpos) && (writepos+size >= readpos) );
+
+			pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" );
+		}
+		else
+		{
+			SetEvent();
+			do {
+				SpinWait();
+				readpos = volatize(m_RingPos);
+			} while( (writepos < readpos) && (writepos+size >= readpos) );
 		}
 	}
-	else if( writepos + size > RingBufferSize )
-	{
-		pxAssert( writepos != 0 );
-
-		// If the incoming packet doesn't fit, then start over from the start of the ring
-		// buffer (it's a lot easier than trying to wrap the packet around the end of the
-		// buffer).
-
-		//Console.WriteLn( "MTGS > Ringbuffer Got Filled!");
-		RestartRingbuffer( size );
-		writepos = m_WritePos;
-	}
-    else	// always true - if( writepos + size == MTGS_RINGBUFFEREND )
-	{
-		// Yay.  Perfect fit.  What are the odds?
-		// Copy is ready so long as readpos is less than writepos and *not* equal to the
-		// base of the ringbuffer (otherwise the buffer will stop when the writepos is
-		// wrapped around to zero later-on in SendDataPacket).
-
-		uint readpos = volatize(m_RingPos);
-		//Console.WriteLn( "MTGS > Perfect Fit!\tringpos=0x%06x, writepos=0x%06x", readpos, writepos );
-		if( readpos > writepos || readpos == 0 )
-		{
-			uint totalAccum	= (readpos == 0) ? RingBufferSize : ((m_RingWrapSpot - readpos) + writepos);
-			uint somedone	= totalAccum / 4;
-			if( somedone < size+1 ) somedone = size + 1;
-
-			// FMV Optimization: (see above) This condition of a perfect fit is so rare that optimizing
-			// for it is pointless -- but it was also mindlessly simple copy-paste.  So there. :p
-
-			if( somedone > 0x80 )
-			{
-				m_SignalRingPosition = somedone;
-
-				//Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Sleep!\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition );
-
-				do {
-					AtomicExchange( m_SignalRingEnable, 1 );
-					SetEvent();
-					m_sem_OnRingReset.WaitWithoutYield();
-					readpos = volatize(m_RingPos);
-					//Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Post-sleep Report!\tringpos=0x%06x", readpos );
-				} while( (writepos < readpos) || (readpos==0) );
-
-				pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" );
-			}
-			else
-			{
-				//Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Spin!" );
-				SetEvent();
-				do {
-					SpinWait();
-					readpos = volatize(m_RingPos);
-				} while( (writepos < readpos) || (readpos==0) );
-			}
-		}
-
-		m_QueuedFrameCount = 0;
-		m_RingWrapSpot = RingBufferSize;
-    }

 #ifdef RINGBUF_DEBUG_STACK
 	m_lock_Stack.Lock();
@ -739,9 +728,8 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size )
 	PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos];
 	tag.command = cmd;
 	tag.data[0] = m_packet_size;
-	m_packet_ringpos = m_WritePos + 1;
-
-	return m_packet_size;
+	m_packet_startpos = m_WritePos;
+	m_packet_ringpos = (m_WritePos + 1) & RingBufferMask;
 }

 // Returns the amount of giftag data processed (in simd128 values).
@ -749,13 +737,14 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size )
 // around VU memory instead of having buffer overflow...
 // Parameters:
 //  size - size of the packet data, in simd128's
-int SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size )
+void SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, u32 size )
 {
 	//m_PacketLocker.Acquire();

-	return PrepDataPacket( (MTGS_RingCommand)pathidx, GIFPath_ParseTag(pathidx, srcdata, size) );
+	PrepDataPacket( (MTGS_RingCommand)pathidx, size );
 }

+#if 0
 void SysMtgsThread::RestartRingbuffer( uint packsize )
 {
 	if( m_WritePos == 0 ) return;
@ -816,6 +805,7 @@ void SysMtgsThread::RestartRingbuffer( uint packsize )
 	if( EmuConfig.GS.SynchronousMTGS )
 		WaitGS();
 }
+#endif

 __forceinline uint SysMtgsThread::_PrepForSimplePacket()
 {
@ -830,10 +820,7 @@ __forceinline uint SysMtgsThread::_PrepForSimplePacket()

    future_writepos &= RingBufferMask;
    if( future_writepos == 0 )
-    {
 		m_QueuedFrameCount = 0;
-		m_RingWrapSpot = RingBufferSize;
-	}

 	uint readpos = volatize(m_RingPos);
 	if( future_writepos == readpos )
@ -841,7 +828,15 @@ __forceinline uint SysMtgsThread::_PrepForSimplePacket()
 		// The ringbuffer read pos is blocking the future write position, so stall out
 		// until the read position has moved.

-		uint totalAccum	= (m_RingWrapSpot - readpos) + future_writepos;
+		uint freeroom;
+
+		if (future_writepos < readpos)
+			freeroom = readpos - future_writepos;
+		else
+			freeroom = RingBufferSize - (future_writepos - readpos);
+
+		uint totalAccum	= RingBufferSize - freeroom;
+
 		uint somedone	= totalAccum / 4;

 		if( somedone > 0x80 )
--- a/pcsx2/PluginManager.cpp
+++ b/pcsx2/PluginManager.cpp
@ -144,6 +144,7 @@ static s32  CALLBACK fallback_test() { return 0; }
 _GSvsync           GSvsync;
 _GSopen            GSopen;
 _GSopen2           GSopen2;
+_GSgifTransfer     GSgifTransfer;
 _GSgifTransfer1    GSgifTransfer1;
 _GSgifTransfer2    GSgifTransfer2;
 _GSgifTransfer3    GSgifTransfer3;
@ -309,7 +310,8 @@ static const LegacyApi_ReqMethod s_MethMessReq_GS[] =
 {
 	{	"GSopen",			(vMeth**)&GSopen,			NULL	},
 	{	"GSvsync",			(vMeth**)&GSvsync,			NULL	},
-	{	"GSgifTransfer1",	(vMeth**)&GSgifTransfer1,	NULL	},
+	{	"GSgifTransfer",	(vMeth**)&GSgifTransfer,	NULL	},
+	//{	"GSgifTransfer1",	(vMeth**)&GSgifTransfer1,	NULL	},
 	{	"GSgifTransfer2",	(vMeth**)&GSgifTransfer2,	NULL	},
 	{	"GSgifTransfer3",	(vMeth**)&GSgifTransfer3,	NULL	},
 	{	"GSreadFIFO2",		(vMeth**)&GSreadFIFO2,		NULL	},
--- a/pcsx2/VUops.cpp
+++ b/pcsx2/VUops.cpp
@ -2057,21 +2057,8 @@ void _vuXGKICK(VURegs * VU)

 	u8* data = ((u8*)VU->Mem + ((VU->VI[_Is_].US[0]*16) & 0x3fff));
 	u32 size;
-	size = GetMTGS().PrepDataPacket( GIF_PATH_1, data, (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4);
-	u8* pmem = GetMTGS().GetDataPacketPtr();
-
-	if((size << 4) > (u32)(0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)))
-	{
-		//DevCon.Warning("addr + Size = 0x%x, transferring %x then doing %x", ((VU->VI[_Is_].US[0]*16) & 0x3fff) + (size << 4), (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4, size - (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff) >> 4));
-		memcpy_aligned(pmem, (u8*)VU->Mem+((VU->VI[_Is_].US[0]*16) & 0x3fff), 0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff));
-		size -= (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4;
-		//DevCon.Warning("Size left %x", size);
-		pmem += 0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff);
-		memcpy_aligned(pmem, (u8*)VU->Mem, size<<4);
-	}
-	else {
-		memcpy_aligned(pmem, (u8*)VU->Mem+((VU->VI[_Is_].US[0]*16) & 0x3fff), size<<4);
-	}
+	GetMTGS().PrepDataPacket( GIF_PATH_1, 0x400 );
+	size = GIFPath_CopyTag( GIF_PATH_1, (u128*)data, (0x400-(VU->VI[_Is_].US[0] & 0x3ff)) );
 	GetMTGS().SendDataPacket();
 }

--- a/pcsx2/Vif_Codes.cpp
+++ b/pcsx2/Vif_Codes.cpp
@ -213,8 +213,8 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) {
 						v.bSize = 0;
 						v.bPtr = 0;						
 					}
-					const uint count = GetMTGS().PrepDataPacket(GIF_PATH_2, v.buffer, 1);
-					memcpy_fast(GetMTGS().GetDataPacketPtr(), v.buffer, count << 4);
+					GetMTGS().PrepDataPacket(GIF_PATH_2, 1);
+					GIFPath_CopyTag(GIF_PATH_2, (u128*)v.buffer, 1);
 					GetMTGS().SendDataPacket();

 					if(vif1.tag.size == 0) 
@ -226,16 +226,17 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) {
 				}
 				else
 				{
-					const uint count = GetMTGS().PrepDataPacket(GIF_PATH_2, data, size >> 4);
-					memcpy_fast(GetMTGS().GetDataPacketPtr(), data, count << 4);
+					GetMTGS().PrepDataPacket(GIF_PATH_2, size/16);
+					uint count = GIFPath_CopyTag(GIF_PATH_2, (u128*)data, size/16) * 4;
 					GetMTGS().SendDataPacket();
-					vif1.tag.size -= count << 2;
+
+					vif1.tag.size -= count;
 					if(vif1.tag.size == 0) 
 					{
 						vif1.cmd = 0;
 					}
 					vif1.vifstalled    = true;
-					return count << 2;
+					return count;
 				}
 			}
 			
--- a/pcsx2/ps2/GIFpath.cpp
+++ b/pcsx2/ps2/GIFpath.cpp
@ -97,7 +97,7 @@ struct GIFPath
 	u8 GetReg();
 	bool IsActive() const;

-	int ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size);
+	int CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size);
 	int ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size);
 };

@ -287,7 +287,8 @@ __forceinline void GIFPath::PrepPackedRegs()

 __forceinline void GIFPath::SetTag(const void* mem)
 {
-	const_cast<GIFTAG&>(tag) = *((GIFTAG*)mem);
+	_mm_store_ps( (float*)&tag, _mm_loadu_ps((float*)mem) );
+	//const_cast<GIFTAG&>(tag) = *((GIFTAG*)mem);

 	nloop	= tag.NLOOP;
 	curreg	= 0;
@ -521,15 +522,50 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s
 	return size;
 }

-__forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
+// Copies len 128-bit entries from src into the circular buffer destBase, starting at index
+// destStart and continuing from index 0 if the end of the buffer is reached.
+// Parameters:
+//  src       - source data; read linearly (never wrapped).
+//  destBase  - base of the circular destination buffer.
+//  destStart - in: index of the first entry to write.  out: wrapped index just past the
+//              last entry written.
+//  destSize  - capacity of destBase in 128-bit entries.  The wrap point is derived from
+//              this value rather than from the global RingBufferSize/RingBufferMask, so
+//              the function is correct for any destination the caller describes.
+//  len       - number of entries to copy; expected not to exceed destSize.
+void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len )
+{
+	uint endpos = destStart + len;
+	if( endpos >= destSize )
+	{
+		// Fill up to the end of the buffer first...
+		uint firstcopylen = destSize - destStart;
+		memcpy_aligned(&destBase[destStart], src, firstcopylen );
+
+		// ...then put the remainder (possibly zero entries) at the base.
+		destStart = endpos % destSize;
+		memcpy_aligned(destBase, src+firstcopylen, destStart );
+	}
+	else
+	{
+		memcpy_aligned(&destBase[destStart], src, len );
+		destStart += len;
+	}
+}
+
+// [TODO] optimization: If later templated, we can have Paths 1 and 3 use aligned SSE movs,
+// since only PATH2 can feed us unaligned source data.
+//
+// copyTag: copies one 128-bit entry from pMem128 into RingBuffer.m_Ring[ringpos] (unaligned
+// load, aligned store), then advances pMem128, decrements size, and steps ringpos forward
+// with wrap-around via RingBufferMask.  Relies on pMem128, size and ringpos being in scope
+// at the point of use.
+#define copyTag() do {						\
+	/*RingBuffer.m_Ring[ringpos] = *pMem128;*/	\
+	_mm_store_ps( (float*)&RingBuffer.m_Ring[ringpos], _mm_loadu_ps((float*)pMem128)); \
+	++pMem128; --size;						\
+	ringpos = (ringpos+1)&RingBufferMask;	\
+} while(false)
+
+__forceinline int GIFPath::CopyTag(GIF_PATH pathidx, const u128* pMem128, u32 size)
+{
+	uint& ringpos = GetMTGS().m_packet_ringpos;
+	const uint original_ringpos = ringpos;
+
 	u32	startSize =  size;						// Start Size

 	while (size > 0) {
 		if (!nloop) {

-			SetTag(pMem);
-			incTag(1);		
+			// [TODO] Optimization: Use MMX intrinsics for SetTag and CopyTag, which both currently
+			//   produce a series of mov eax,[src]; mov [dest],eax instructions to copy these
+			//   individual qwcs.  Warning: Path2 transfers are not always QWC-aligned, but they are
+			//   always aligned on an 8 byte boundary; so its probably best to use MMX here.
+
+			SetTag((u8*)pMem128);
+			copyTag();
 			
 			if(nloop > 0)
 			{
@ -599,9 +635,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
 					{
 						do {
 							if (GetReg() == 0xe) {
-								gsHandler(pMem);
+								gsHandler((u8*)pMem128);
 							}
-							incTag(1);
+							copyTag();
 						} while(StepReg() && size > 0 && SIGNAL_IMR_Pending == false);
 					}
 					else
@ -644,11 +680,14 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
 							curreg = 0;
 							nloop = 0;
 						}
-						incTag(len);
+
+						MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len );
+						pMem128 += len;
+						size -= len;
 					}
 				break;
 				case GIF_FLG_REGLIST:
-				{				
+				{
 					GIF_LOG("Reglist Mode EOP %x", tag.EOP);

 					// In reglist mode, the GIF packs 2 registers into each QWC.  The nloop however
@ -687,8 +726,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
 						nloop = 0;
 					}

-					incTag(len);
-
+					MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len );
+					pMem128 += len;
+					size -= len;
 				}
 				break;
 				case GIF_FLG_IMAGE:
@ -696,13 +736,15 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
 				{
 					GIF_LOG("IMAGE Mode EOP %x", tag.EOP);
 					int len = aMin(size, nloop);
-					incTag(len);
+
+					MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len );
+
+					pMem128 += len;
+					size -= len;
 					nloop -= len;
 				}
 				break;
 			}
-			
-
 		}

 		if(pathidx == GIF_PATH_1)
@ -713,11 +755,11 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
 				{
 					size = 0x3ff - startSize;
 					startSize = 0x3ff;
-					pMem -= 0x4000;
+					pMem128 -= 0x400;
 				}
 				else
 				{
-					// Note: The BIOS does an XGKICK on the VU1 and lets yt DMA to the GS without an EOP
+					// Note: The BIOS does an XGKICK on the VU1 and lets it DMA to the GS without an EOP
 					// (seemingly to loop forever), only to write an EOP later on.  No other game is known to
 					// do anything of the sort.
 					// So lets just cap the DMA at 16k, and force it to "look" like it's terminated for now.
@ -727,6 +769,11 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)

 					Console.Warning("GIFTAG error, size exceeded VU memory size %x", startSize);
 					nloop	= 0;
+					
+					// Don't send the packet to the GS -- it's incomplete and might cause the GS plugin
+					// to get confused and die. >_<
+					
+					ringpos = original_ringpos;
 				}
 			}
 		}
@ -793,47 +840,18 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
 			gif->qwc  -= size;
 		}
 	}
-	
-	

 	return size;
 }

-// Processes a GIFtag & packet, and throws out some gsIRQs as needed.
-// Used to keep interrupts in sync with the EE, while the GS itself
-// runs potentially several frames behind.
-// Parameters:
-//   size  - max size of incoming data stream, in qwc (simd128)
-__forceinline int GIFPath_ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
+__forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size)
 {
-#ifdef PCSX2_GSRING_SAMPLING_STATS
-	static uptr profStartPtr = 0;
-	static uptr profEndPtr = 0;
-	if (profStartPtr == 0) {
-		__asm
-		{
-		__beginfunc:
-			mov profStartPtr, offset __beginfunc;
-			mov profEndPtr, offset __endfunc;
-		}
-		ProfilerRegisterSource( "GSRingBufCopy", (void*)profStartPtr, profEndPtr - profStartPtr );
-	}
-#endif
-
-	int retSize = s_gifPath[pathidx].ParseTag(pathidx, pMem, size);
-
-#ifdef PCSX2_GSRING_SAMPLING_STATS
-	__asm
-	{
-		__endfunc:
-			nop;
-	}
-#endif
-	return retSize;
+	return s_gifPath[pathidx].CopyTag(pathidx, pMem, size);
 }

-//Quick version for queueing PATH1 data
-
+// Quick version for queueing PATH1 data.
+// This version calculates the real length of the packet data only.  It does not process
+// IRQs or DMA status updates.
 __forceinline int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size)
 {
 	int retSize = s_gifPath[pathidx].ParseTagQuick(pathidx, pMem, size);
--- a/pcsx2/x86/microVU_Lower.inl
+++ b/pcsx2/x86/microVU_Lower.inl
@ -1101,27 +1101,15 @@ void __fastcall mVU_XGKICK_(u32 addr) {
 	
 	if(gifRegs->stat.APATH <= GIF_APATH1 || (gifRegs->stat.APATH == GIF_APATH3 && gifRegs->stat.IP3 == true) && SIGNAL_IMR_Pending == false)
 	{
-
 		if(Path1WritePos != 0)	
 		{
 			//Flush any pending transfers so things dont go up in the wrong order
 			while(gifRegs->stat.P1Q == true) gsPath1Interrupt();
 		}
-		size  = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff);
-		pDest = GetMTGS().GetDataPacketPtr();
-		if (size > diff) {
-			// fixme: one of these days the following *16's will get cleaned up when we introduce
-			// a special qwc/simd16 optimized version of memcpy_aligned. :)
-			//DevCon.Status("XGkick Wrap!");
-			memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16);
-			size  -= diff;
-			pDest += diff*16;
-			memcpy_aligned(pDest, microVU1.regs->Mem, size*16);
-		}
-		else {
-			memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16);
-		}
+		GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400);
+		size = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, diff);
 		GetMTGS().SendDataPacket();
+
 		if(GSTransferStatus.PTH1 == STOPPED_MODE)
 		{
 			gifRegs->stat.OPH = false;
@ -1141,14 +1129,14 @@ void __fastcall mVU_XGKICK_(u32 addr) {
 			// fixme: one of these days the following *16's will get cleaned up when we introduce
 			// a special qwc/simd16 optimized version of memcpy_aligned. :)
 			//DevCon.Status("XGkick Wrap!");
-			memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16);
+			memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff);
 			Path1WritePos += size;
 			size  -= diff;
 			pDest += diff*16;
-			memcpy_aligned(pDest, microVU1.regs->Mem, size*16);			
+			memcpy_aligned(pDest, microVU1.regs->Mem, size);			
 		}
 		else {
-			memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16);
+			memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size);
 			Path1WritePos += size;
 		}
 		//if(!gifRegs->stat.P1Q) CPU_INT(28, 128);
--- a/pcsx2/x86/sVU_Lower.cpp
+++ b/pcsx2/x86/sVU_Lower.cpp
@ -1988,21 +1988,10 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
 			//Flush any pending transfers so things dont go up in the wrong order
 			while(gifRegs->stat.P1Q == true) gsPath1Interrupt();
 		}
-		size  = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff);
-		pDest = GetMTGS().GetDataPacketPtr();
-		if (size > diff) {
-			// fixme: one of these days the following *16's will get cleaned up when we introduce
-			// a special qwc/simd16 optimized version of memcpy_aligned. :)
-			
-			memcpy_aligned(pDest, VU1.Mem + addr, diff*16);
-			size  -= diff;
-			pDest += diff*16;
-			memcpy_aligned(pDest, VU1.Mem, size*16);
-		}
-		else {
-			memcpy_aligned(pDest, VU1.Mem + addr, size*16);
-		}
+		GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400);
+		size = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, diff);
 		GetMTGS().SendDataPacket();
+
 		if(GSTransferStatus.PTH1 == STOPPED_MODE )
 		{
 			gifRegs->stat.OPH = false;
@ -2015,8 +2004,6 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
 		size = GIFPath_ParseTagQuick(GIF_PATH_1, data, diff);
 		pDest = &Path1Buffer[Path1WritePos*16];

-
-
 		pxAssumeMsg((Path1WritePos+size < sizeof(Path1Buffer)), "XGKick Buffer Overflow detected on Path1Buffer!");

 		//DevCon.Warning("Storing size %x PATH 1", size);
@ -2024,14 +2011,14 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
 			// fixme: one of these days the following *16's will get cleaned up when we introduce
 			// a special qwc/simd16 optimized version of memcpy_aligned. :)
 			//DevCon.Status("XGkick Wrap!");
-			memcpy_aligned(pDest, VU1.Mem + addr, diff*16);
+			memcpy_aligned(pDest, VU1.Mem + addr, diff);
 			Path1WritePos += size;
 			size  -= diff;
 			pDest += diff*16;
-			memcpy_aligned(pDest, VU1.Mem, size*16);			
+			memcpy_aligned(pDest, VU1.Mem, size);			
 		}
 		else {
-			memcpy_aligned(pDest, VU1.Mem + addr, size*16);
+			memcpy_aligned(pDest, VU1.Mem + addr, size);
 			Path1WritePos += size;
 		}
 		//if(!gifRegs->stat.P1Q) CPU_INT(28, 128);