Optimized memcpy_fast. In addition to being quite a bit faster, it also auto-preserves mmx registers now. So I was also able to remove almost every instance of FreezeMMXRegs (all except those used to guard the GS plugin calls). memcpy_fast (aka memcpy_amd_) is now faster than memcpy_raz for *all* scenarios, so it's been made the new default.

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@642 a6443dda-0b58-4228-96e9-037be469359c
Jake.Stine 2009-01-27 05:12:54 +00:00 committed by Gregory Hainaut
parent 4781be9e59
commit 44f5117d24
14 changed files with 248 additions and 169 deletions
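For reference, the call-site cleanup this enables looks like the following sketch (copy_block_before/copy_block_after are hypothetical illustration-only helpers; the memcpy_amd_ prototype matches the one declared in the headers changed below):

#include <cstddef>

// Stand-in declarations, for illustration only; the real ones live in the PCSX2 headers.
extern void FreezeMMXRegs(int freeze);
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
#define memcpy_fast memcpy_amd_

void copy_block_before(void *dst, const void *src, size_t bytes)
{
	// Old pattern: memcpy_fast clobbered the MMX registers, so every call site
	// had to bracket the copy with FreezeMMXRegs() to protect recompiler state.
	FreezeMMXRegs(1);
	memcpy_fast(dst, src, bytes);
	FreezeMMXRegs(0);
}

void copy_block_after(void *dst, const void *src, size_t bytes)
{
	// New pattern: memcpy_amd_ backs up and restores mm0-mm3 internally,
	// so the guards can be dropped (kept only around GS plugin calls).
	memcpy_fast(dst, src, bytes);
}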

View File

@@ -865,8 +865,6 @@ int cdvdReadSector() {
 return -1;
 }
-FreezeMMXRegs(1);
 const u32 madr = HW_DMA3_MADR;
 // if raw dvd sector 'fill in the blanks'
@@ -935,7 +933,6 @@ int cdvdReadSector() {
 HW_DMA3_BCR_H16-= (cdvd.BlockSize / (HW_DMA3_BCR_L16*4));
 HW_DMA3_MADR+= cdvd.BlockSize;
-FreezeMMXRegs(0);
 return 0;
 }
@@ -2024,9 +2021,7 @@ void cdvdWrite16(u8 rt) // SCOMMAND
 if (cdvd.mg_size + cdvd.ParamC > cdvd.mg_maxsize)
 cdvd.Result[0] = 0x80;
 else{
-FreezeMMXRegs(1);
 memcpy_fast(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC);
-FreezeMMXRegs(0);
 cdvd.mg_size += cdvd.ParamC;
 cdvd.Result[0] = 0; // 0 complete ; 1 busy ; 0x80 error
 }
@@ -2034,11 +2029,9 @@ void cdvdWrite16(u8 rt) // SCOMMAND
 case 0x8E: // sceMgReadData
 SetResultSize( std::min(16, cdvd.mg_size) );
-FreezeMMXRegs(1);
 memcpy_fast(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC);
 cdvd.mg_size -= cdvd.ResultC;
 memcpy_fast(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size);
-FreezeMMXRegs(0);
 break;
 case 0x88: // secrman: __mechacon_auth_0x88 //for now it is the same; so, fall;)
@@ -2089,9 +2082,7 @@ fail_pol_cal:
 SetResultSize(3);//in:0
 {
 int bit_ofs = mg_BIToffset(cdvd.mg_buffer);
-FreezeMMXRegs(1);
 memcpy_fast(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]);
-FreezeMMXRegs(0);
 }
 cdvd.mg_maxsize = 0; // don't allow any write
 cdvd.mg_size = 8+16*cdvd.mg_buffer[4];//new offset, i just moved the data

View File

@@ -189,7 +189,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
 return 0;
 buff = CDVDgetBuffer();
 if (buff==NULL) return 0;
-FreezeMMXRegs(1);
 switch (mode->datapattern){
 case CdSecS2048:
 memcpy_fast((void*)((uptr)buf+2048*i), buff, 2048);break;//only data
@@ -198,7 +197,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
 case CdSecS2340:
 memcpy_fast((void*)((uptr)buf+2340*i), buff, 2340);break;//without sync
 }
-FreezeMMXRegs(0);
 }
 return 1;
 }
@@ -216,9 +214,7 @@ int DvdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
 // switch (mode->datapattern){
 // case CdSecS2064:
 ((u32*)buf)[0] = i + 0x30000;
-FreezeMMXRegs(1);
 memcpy_fast((u8*)buf+12, buff, 2048);
-FreezeMMXRegs(0);
 buf = (char*)buf + 2064; break;
 // default:
 // return 0;
@@ -253,9 +249,7 @@ int CDVD_GetVolumeDescriptor(void){
 if ((localVolDesc.filesystemType == 1) ||
 (localVolDesc.filesystemType == 2))
 {
-FreezeMMXRegs(1);
 memcpy_fast(&CDVolDesc, &localVolDesc, sizeof(cdVolDesc));
-FreezeMMXRegs(0);
 }
 }
 else

View File

@@ -188,9 +188,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){
 RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
 return 0;
 }
-FreezeMMXRegs(1);
 memcpy_fast(buffer, lb + off_sector, ssize);
-FreezeMMXRegs(0);
 }
 if (asize) if (CdRead(asector, asize >> 11, buffer+ssize, &cdReadMode) != TRUE){
 RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
@@ -201,9 +199,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){
 RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
 return 0;
 }
-FreezeMMXRegs(1);
 memcpy_fast(buffer+ssize+asize, lb, esize);
-FreezeMMXRegs(0);
 }
 /***********************
 // Now work out where we want to start reading from

View File

@@ -527,9 +527,7 @@ void cdrReadInterrupt() {
 CDREAD_INT((cdr.Mode & 0x80) ? (cdReadTime / 2) : cdReadTime);
 return;
 }
-FreezeMMXRegs(1);
 memcpy_fast(cdr.Transfer, buf+12, 2340);
-FreezeMMXRegs(0);
 cdr.Stat = DataReady;
 CDR_LOG(" %x:%x:%x\n", cdr.Transfer[0], cdr.Transfer[1], cdr.Transfer[2]);
@@ -923,9 +921,7 @@ void psxDma3(u32 madr, u32 bcr, u32 chcr) {
 }
 cdsize = (bcr & 0xffff) * 4;
-FreezeMMXRegs(1);
 memcpy_fast((u8*)PSXM(madr), cdr.pTransfer, cdsize);
-FreezeMMXRegs(0);
 psxCpu->Clear(madr, cdsize/4);
 cdr.pTransfer+=cdsize;

View File

@@ -575,11 +575,11 @@ static void WRITERING_DMA(u32 *pMem, u32 qwc)
 {
 pendmem = (pendmem&~0xfff)-16;
 }
-memcpy_raz_(pgsmem, pMem, pendmem-(u32)gif->madr+16);
+memcpy_aligned(pgsmem, pMem, pendmem-(u32)gif->madr+16);
 }
 else
 #endif
-memcpy_raz_(pgsmem, pMem, sizetoread);
+memcpy_aligned(pgsmem, pMem, sizetoread);
 mtgsThread->SendDataPacket();
 }

View File

@@ -329,7 +329,7 @@ static __forceinline u8* dmaGetAddr(u32 mem)
 #else
+// Note: Dma addresses are guaranteed to be aligned to 16 bytes (128 bits)
 static __forceinline void *dmaGetAddr(u32 addr) {
 u8 *ptr;
@@ -355,35 +355,17 @@ void hwShutdown();
 // hw read functions
 extern u8 hwRead8 (u32 mem);
-int hwConstRead8 (u32 x86reg, u32 mem, u32 sign);
 extern u16 hwRead16(u32 mem);
-int hwConstRead16(u32 x86reg, u32 mem, u32 sign);
 extern u32 hwRead32(u32 mem);
-int hwConstRead32(u32 x86reg, u32 mem);
-u64 hwRead64(u32 mem);
-void hwConstRead64(u32 mem, int mmreg);
-void hwRead128(u32 mem, u64 *out);
-void hwConstRead128(u32 mem, int xmmreg);
+extern u64 hwRead64(u32 mem);
+extern void hwRead128(u32 mem, u64 *out);
 // hw write functions
-void hwWrite8 (u32 mem, u8 value);
-void hwConstWrite8 (u32 mem, int mmreg);
-void hwWrite16(u32 mem, u16 value);
-void hwConstWrite16(u32 mem, int mmreg);
-void hwWrite32(u32 mem, u32 value);
-void hwConstWrite32(u32 mem, int mmreg);
-void hwWrite64(u32 mem, u64 value);
-void hwConstWrite64(u32 mem, int mmreg);
-void hwWrite128(u32 mem, const u64 *value);
-void hwConstWrite128(u32 mem, int xmmreg);
+extern void hwWrite8 (u32 mem, u8 value);
+extern void hwWrite16(u32 mem, u16 value);
+extern void hwWrite32(u32 mem, u32 value);
+extern void hwWrite64(u32 mem, u64 value);
+extern void hwWrite128(u32 mem, const u64 *value);
 void hwIntcIrq(int n);
 void hwDmacIrq(int n);
@@ -394,6 +376,18 @@ int hwMFIFOWrite(u32 addr, u8 *data, u32 size);
 int hwDmacSrcChainWithStack(DMACh *dma, int id);
 int hwDmacSrcChain(DMACh *dma, int id);
+int hwConstRead8 (u32 x86reg, u32 mem, u32 sign);
+int hwConstRead16(u32 x86reg, u32 mem, u32 sign);
+int hwConstRead32(u32 x86reg, u32 mem);
+void hwConstRead64(u32 mem, int mmreg);
+void hwConstRead128(u32 mem, int xmmreg);
+void hwConstWrite8 (u32 mem, int mmreg);
+void hwConstWrite16(u32 mem, int mmreg);
+void hwConstWrite32(u32 mem, int mmreg);
+void hwConstWrite64(u32 mem, int mmreg);
+void hwConstWrite128(u32 mem, int xmmreg);
 #ifdef PCSX2_VIRTUAL_MEM
 void iMemRead32Check();
 #endif

View File

@@ -430,7 +430,7 @@ int mtgsThreadObject::Callback()
 {
 Console::WriteLn("MTGS > Thread Started, Opening GS Plugin...");
-memcpy_raz_( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) );
+memcpy_aligned( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) );
 GSsetBaseMem( m_gsMem );
 m_returncode = GSopen((void *)&pDsp, "PCSX2", 1);

View File

@@ -230,12 +230,19 @@ extern u8 g_globalXMMSaved;
 void _memset16_unaligned( void* dest, u16 data, size_t size );
 #if defined(_WIN32) && !defined(__x86_64__)
-// faster memcpy
-extern void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes);
-extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t qwc);
-extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t n);
+// The new simplified memcpy_amd_ is now faster than memcpy_raz_.
+// memcpy_amd_ also does mmx register saving, negating the need for freezeregs (code cleanup!)
+// Additionally, using one single memcpy implementation keeps the code cache cleaner.
+//extern void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes);
+//extern void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes);
+//extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes);
+extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
 # include "windows/memzero.h"
 # define memcpy_fast memcpy_amd_
+# define memcpy_aligned memcpy_amd_
 #else
@@ -243,6 +250,10 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
 #define memcpy_fast memcpy
 #define memcpy_raz_ memcpy
 #define memcpy_raz_u memcpy
+#define memcpy_aligned memcpy
+#define memcpy_raz_u memcpy
 #include "Linux/memzero.h"
 #endif

View File

@@ -226,13 +226,11 @@ extern void mfifoGIFtransfer(int);
 #define gif ((DMACh*)&PS2MEM_HW[0xA000])
 void dmaSPR0() { // fromSPR
 int qwc = spr0->qwc;
-FreezeMMXRegs(1);
 SPR_LOG("dmaSPR0 chcr = %lx, madr = %lx, qwc = %lx, sadr = %lx\n",
 spr0->chcr, spr0->madr, spr0->qwc, spr0->sadr);
 _dmaSPR0();
-FreezeMMXRegs(0);
 if ((psHu32(DMAC_CTRL) & 0xC) == 0xC) { // GIF MFIFO
 if((spr0->madr & ~psHu32(DMAC_RBSR)) != psHu32(DMAC_RBOR)) SysPrintf("GIF MFIFO Write outside MFIFO area\n");
 spr0->madr = psHu32(DMAC_RBOR) + (spr0->madr & psHu32(DMAC_RBSR));
@@ -308,7 +306,6 @@ void _SPR1interleave() {
 void dmaSPR1() { // toSPR
-FreezeMMXRegs(1);
 #ifdef SPR_LOG
 SPR_LOG("dmaSPR1 chcr = 0x%x, madr = 0x%x, qwc = 0x%x\n"
 " tadr = 0x%x, sadr = 0x%x\n",
@@ -325,7 +322,6 @@ void dmaSPR1() { // toSPR
 // Transfer Dn_QWC from Dn_MADR to SPR1
 SPR1chain();
 CPU_INT(9, cycles);
-FreezeMMXRegs(0);
 return;
 } else if ((spr1->chcr & 0xc) == 0x4){
 int cycles = 0;
@@ -338,7 +334,6 @@ void dmaSPR1() { // toSPR
 // Transfer Dn_QWC from Dn_MADR to SPR1
 SPR1chain();
 CPU_INT(9, cycles);
-FreezeMMXRegs(0);
 return;
 }
 // Chain Mode
@@ -382,7 +377,6 @@ void dmaSPR1() { // toSPR
 } else { // Interleave Mode
 _SPR1interleave();
 }
-FreezeMMXRegs(0);
 }

View File

@@ -574,9 +574,7 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
 // v4-32
 if(vifRegs->mode == 0 && !(vifRegs->code & 0x10000000) && vif->usn == 0){
 vifRegs->num -= size>>4;
-FreezeMMXRegs(1);
 memcpy_fast((u8*)dest, cdata, size);
-FreezeMMXRegs(0);
 size = 0;
 //unpacktotal += GetCPUTick()-basetick;
 return;
@@ -814,9 +812,7 @@ static __forceinline void _vif0mpgTransfer(u32 addr, u32 *data, int size) {
 fclose(f);
 }*/
 if (memcmp(VU0.Micro + addr, data, size << 2)) {
-FreezeMMXRegs(1);
 memcpy_fast(VU0.Micro + addr, data, size << 2);
-FreezeMMXRegs(0);
 CpuVU0->Clear(addr, size);
 }
 }
@@ -1490,9 +1486,7 @@ static __forceinline void _vif1mpgTransfer(u32 addr, u32 *data, int size) {
 }*/
 assert( VU1.Micro > 0 );
 if (memcmp(VU1.Micro + addr, data, size << 2)) {
-FreezeMMXRegs(1);
 memcpy_fast(VU1.Micro + addr, data, size << 2);
-FreezeMMXRegs(0);
 CpuVU1->Clear(addr, size);
 }
 }
@@ -1644,7 +1638,7 @@ static int Vif1TransDirectHL(u32 *data){
 {
 //unaligned copy.VIF handling is -very- messy, so i'l use this code til i fix it :)
 const uint count = mtgsThread->PrepDataPacket( GIF_PATH_2, data, ret<<2 );
-memcpy_raz_u( mtgsThread->GetDataPacketPtr(), data, count );
+memcpy_fast( mtgsThread->GetDataPacketPtr(), data, count );
 mtgsThread->SendDataPacket();
 }
 else {

View File

@@ -49,6 +49,9 @@ namespace Threading
 }
 cpuinfo.LogicalCores = CPUs;
+if( LogicalCoresPerPhysicalCPU > CPUs) // for 1-socket HTT-disabled machines
+LogicalCoresPerPhysicalCPU = CPUs;
 cpuinfo.PhysicalCores = ( CPUs / LogicalCoresPerPhysicalCPU ) * PhysicalCoresPerPhysicalCPU;
 ptw32_smp_system = ( cpuinfo.LogicalCores > 1 ) ? TRUE : FALSE;
 }

View File

@@ -88,35 +88,27 @@ void checkregs()
 #endif
-__declspec(align(16)) static u8 _xmm_backup[16*2];
-// this one checks for alignments too ...
-__declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes)
+PCSX2_ALIGNED16( static u8 _xmm_backup[16*2] );
+PCSX2_ALIGNED16( static u8 _mmx_backup[8*4] );
+static __declspec(naked) void __fastcall _memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
 {
-// If src is aligned, use memcpy_raz instead:
-__asm
-{
-test edx,0xf;
-jz memcpy_raz_;
-}
 // MOVSRC = opcode used to read. I use the same code for the aligned version, with a different define :)
-#define MOVSRC movups
+#define MOVSRC movdqu
+#define MOVDST movdqa
 __asm
 {
 //Reads before reads, to avoid stalls
 mov eax,[esp+4];
 //Make sure to save xmm0, it must be preserved ...
-movaps [_xmm_backup+0x00],xmm0;
+movaps [_xmm_backup],xmm0;
 //if >=128 bytes use 128 byte unrolled loop
 //i use cmp ..,127 + jna because 127 is encodable using the simm8 form
 cmp eax,127;
 jna _loop_1;
-//unrolled version also touches xmm1, save it :)
-movaps [_xmm_backup+0x10],xmm1;
 //since this is a common branch target it could be good to align it -- no idea if it has any effect :p
 align 16
@@ -124,34 +116,111 @@ __declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size
 _loop_8:
 MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
-MOVSRC xmm1,[edx+0x10];
+MOVDST [ecx+0x00],xmm0; //then write :p
+MOVSRC xmm0,[edx+0x10];
+MOVDST [ecx+0x10],xmm0;
 sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
-movaps [ecx+0x00],xmm0; //then write :p
-movaps [ecx+0x10],xmm1;
 sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
 MOVSRC xmm0,[edx+0x20-128];
-MOVSRC xmm1,[edx+0x30-128];
+MOVDST [ecx+0x20-128],xmm0;
+MOVSRC xmm0,[edx+0x30-128];
+MOVDST [ecx+0x30-128],xmm0;
 add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
-movaps [ecx+0x20-128],xmm0;
-movaps [ecx+0x30-128],xmm1;
 MOVSRC xmm0,[edx+0x40-128];
-MOVSRC xmm1,[edx+0x50-128];
-movaps [ecx+0x40-128],xmm0;
-movaps [ecx+0x50-128],xmm1;
+MOVDST [ecx+0x40-128],xmm0;
+MOVSRC xmm0,[edx+0x50-128];
+MOVDST [ecx+0x50-128],xmm0;
 MOVSRC xmm0,[edx+0x60-128];
-MOVSRC xmm1,[edx+0x70-128];
-movaps [ecx+0x60-128],xmm0;
-movaps [ecx+0x70-128],xmm1;
+MOVDST [ecx+0x60-128],xmm0;
+MOVSRC xmm0,[edx+0x70-128];
+MOVDST [ecx+0x70-128],xmm0;
 //127~ja, 127 is encodable as simm8 :)
 cmp eax,127;
 ja _loop_8;
-//restore xmm1 :)
-movaps xmm1,[_xmm_backup+0x10];
+//direct copy for 0~7 qwords
+//in order to avoid the inc/dec of all 3 registers
+//i use negative relative addressing from the top of the buffers
+//[top-current index]
+_loop_1:
+//prepare the regs for 'negative relative addressing'
+add edx,eax;
+add ecx,eax;
+neg eax;
+jz cleanup; //exit if nothing to do
+_loop_1_inner:
+MOVSRC xmm0,[edx+eax];
+MOVDST [ecx+eax],xmm0;
+add eax,16; //while the offset is still negative we have data to copy
+js _loop_1_inner;
+//done !
+cleanup:
+//restore xmm and exit ~)
+movaps xmm0,[_xmm_backup];
+ret 4;
+}
+#undef MOVSRC
+#undef MOVDST
+}
+static __declspec(naked) void __fastcall _memcpy_raz_udst(void *dest, const void *src, size_t bytes)
+{
+// MOVDST = opcode used to read. I use the same code for the aligned version, with a different define :)
+#define MOVSRC movaps
+#define MOVDST movups
+__asm
+{
+//Reads before reads, to avoid stalls
+mov eax,[esp+4];
+//Make sure to save xmm0, it must be preserved ...
+movaps [_xmm_backup],xmm0;
+//if >=128 bytes use 128 byte unrolled loop
+//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
+cmp eax,127;
+jna _loop_1;
+//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
+align 16
+//128 byte unrolled loop
+_loop_8:
+MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
+MOVDST [ecx+0x00],xmm0; //then write :p
+MOVSRC xmm0,[edx+0x10];
+MOVDST [ecx+0x10],xmm0;
+sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
+sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
+MOVSRC xmm0,[edx+0x20-128];
+MOVDST [ecx+0x20-128],xmm0;
+MOVSRC xmm0,[edx+0x30-128];
+MOVDST [ecx+0x30-128],xmm0;
+add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
+MOVSRC xmm0,[edx+0x40-128];
+MOVDST [ecx+0x40-128],xmm0;
+MOVSRC xmm0,[edx+0x50-128];
+MOVDST [ecx+0x50-128],xmm0;
+MOVSRC xmm0,[edx+0x60-128];
+MOVDST [ecx+0x60-128],xmm0;
+MOVSRC xmm0,[edx+0x70-128];
+MOVDST [ecx+0x70-128],xmm0;
+//127~ja, 127 is encodable as simm8 :)
+cmp eax,127;
+ja _loop_8;
 //direct copy for 0~7 qwords
 //in order to avoid the inc/dec of all 3 registers
@@ -168,22 +237,24 @@ _loop_1:
 _loop_1_inner:
 MOVSRC xmm0,[edx+eax];
 movaps [ecx+eax],xmm0;
 add eax,16; //while the offset is still negative we have data to copy
 js _loop_1_inner;
 //done !
 cleanup:
 //restore xmm and exit ~)
-movaps xmm0,[_xmm_backup+0x00];
+movaps xmm0,[_xmm_backup];
 ret 4;
 }
 #undef MOVSRC
+#undef MOVDST
 }
 // Custom memcpy, only for 16 byte aligned stuff (used for mtgs)
 // This function is optimized for medium-small transfer sizes (<2048, >=128). No prefetching is
 // used since the reads are linear and the cache logic can predict em :)
+// *OBSOLETE* -- memcpy_amd_ has been optimized and is now faster.
 __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes)
 {
 // Code Implementation Notes:
@@ -191,21 +262,19 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_
 // MOVSRC = opcode used to read. I use the same code for the unaligned version, with a different define :)
 #define MOVSRC movaps
+#define MOVDST movaps
 __asm
 {
 //Reads before reads, to avoid stalls
 mov eax,[esp+4];
 //Make sure to save xmm0, it must be preserved ...
-movaps [_xmm_backup+0x00],xmm0;
+movaps [_xmm_backup],xmm0;
 //if >=128 bytes use 128 byte unrolled loop
 //i use cmp ..,127 + jna because 127 is encodable using the simm8 form
 cmp eax,127;
 jna _loop_1;
-//unrolled version also toiches xmm1, save it :)
-movaps [_xmm_backup+0x10],xmm1;
 //since this is a common branch target it could be good to align it -- no idea if it has any effect :p
 align 16
@@ -213,35 +282,32 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_
 _loop_8:
 MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
-MOVSRC xmm1,[edx+0x10];
+MOVDST [ecx+0x00],xmm0; //then write :p
+MOVSRC xmm0,[edx+0x10];
+MOVDST [ecx+0x10],xmm0;
 sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
-movaps [ecx+0x00],xmm0; //then write :p
-movaps [ecx+0x10],xmm1;
 sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
 MOVSRC xmm0,[edx+0x20-128];
-MOVSRC xmm1,[edx+0x30-128];
+MOVDST [ecx+0x20-128],xmm0;
+MOVSRC xmm0,[edx+0x30-128];
+MOVDST [ecx+0x30-128],xmm0;
 add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
-movaps [ecx+0x20-128],xmm0;
-movaps [ecx+0x30-128],xmm1;
 MOVSRC xmm0,[edx+0x40-128];
-MOVSRC xmm1,[edx+0x50-128];
-movaps [ecx+0x40-128],xmm0;
-movaps [ecx+0x50-128],xmm1;
+MOVDST [ecx+0x40-128],xmm0;
+MOVSRC xmm0,[edx+0x50-128];
+MOVDST [ecx+0x50-128],xmm0;
 MOVSRC xmm0,[edx+0x60-128];
-MOVSRC xmm1,[edx+0x70-128];
-movaps [ecx+0x60-128],xmm0;
-movaps [ecx+0x70-128],xmm1;
+MOVDST [ecx+0x60-128],xmm0;
+MOVSRC xmm0,[edx+0x70-128];
+MOVDST [ecx+0x70-128],xmm0;
 //127~ja, 127 is encodable as simm8 :)
 cmp eax,127;
 ja _loop_8;
-//restore xmm1 :)
-movaps xmm1,[_xmm_backup+0x10];
 //direct copy for 0~7 qwords
 //in order to avoid the inc/dec of all 3 registers
 //i use negative relative addressing from the top of the buffers
@@ -256,7 +322,7 @@ _loop_1:
 _loop_1_inner:
 MOVSRC xmm0,[edx+eax];
-movaps [ecx+eax],xmm0;
+MOVDST [ecx+eax],xmm0;
 add eax,16; //while the offset is still negative we have data to copy
 js _loop_1_inner;
@@ -264,44 +330,64 @@ _loop_1_inner:
 //done !
 cleanup:
 //restore xmm and exit ~)
-movaps xmm0,[_xmm_backup+0x00];
+movaps xmm0,[_xmm_backup];
 ret 4;
 }
+#undef MOVSRC
+#undef MOVDST
 }
-#undef MOVSRC
+// This memcpy routine is for use in situations where the source buffer's alignment is indeterminate.
+__forceinline void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
+{
+if( ((uptr)src & 0xf) == 0 )
+memcpy_raz_( dest, src, bytes );
+else
+_memcpy_raz_usrc( dest, src, bytes );
+}
+// This memcpy routine is for use in situations where the destination buffer's alignment is indeterminate.
+__forceinline void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes)
+{
+if( ((uptr)dest & 0xf) == 0 )
+memcpy_raz_( dest, src, bytes );
+else
+_memcpy_raz_udst( dest, src, bytes );
+}
 //////////////////////////////////////////////////////////////////////////
-// Fast memcpy as coded by AMD.
+// Fast memcpy as coded by AMD, and thn improved by air.
 //
-// This function clobbers all MMX registers, and is generally not optimal for short memory
-// copies due to the amount of overhead required to test for alignments, copy length,
-// and other ABI overhead.
-void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
+// This routine preserves mmx registers! It's the complete real deal!
+__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
 {
-__asm {
+__asm
+{
+push edi
+push esi
 mov edi, ecx ; destination
 mov esi, edx ; source
-mov ecx, [n] ; number of bytes to copy
-mov ebx, ecx ; keep a copy of count
+mov ecx, [esp+12] ; number of bytes to copy
+mov eax, ecx ; keep a copy of count
 cld
-cmp ecx, TINY_BLOCK_COPY
+cmp eax, TINY_BLOCK_COPY
 jb $memcpy_ic_3 ; tiny? skip mmx copy
-cmp ecx, 32*1024 ; don't align between 32k-64k because
+cmp eax, 32*1024 ; don't align between 32k-64k because
 jbe $memcpy_do_align ; it appears to be slower
-cmp ecx, 64*1024
+cmp eax, 64*1024
 jbe $memcpy_align_done
 $memcpy_do_align:
-mov ecx, 8 ; a trick that's faster than rep movsb...
-sub ecx, edi ; align destination to qword
-and ecx, 111b ; get the low bits
-sub ebx, ecx ; update copy count
-neg ecx ; set up to jump into the array
-add ecx, offset $memcpy_align_done
-jmp ecx ; jump to array of movsb's
+mov eax, 8 ; a trick that's faster than rep movsb...
+sub eax, edi ; align destination to qword
+and eax, 111b ; get the low bits
+sub ecx, eax ; update copy count
+neg eax ; set up to jump into the array
+add eax, offset $memcpy_align_done
+jmp eax ; jump to array of movsb's
 align 4
 movsb
@@ -314,13 +400,18 @@
 movsb
 $memcpy_align_done: ; destination is dword aligned
-mov ecx, ebx ; number of bytes left to copy
-shr ecx, 6 ; get 64-byte block count
+mov eax, ecx ; number of bytes left to copy
+shr eax, 6 ; get 64-byte block count
 jz $memcpy_ic_2 ; finish the last few bytes
-cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
+cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
 jae $memcpy_uc_test
+movq [_mmx_backup+0x00],mm0
+movq [_mmx_backup+0x08],mm1
+movq [_mmx_backup+0x10],mm2
+movq [_mmx_backup+0x18],mm3
 // This is small block copy that uses the MMX registers to copy 8 bytes
 // at a time. It uses the "unrolled loop" optimization, and also uses
 // the software prefetch instruction to get the data into the cache.
@@ -348,30 +439,39 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy
 add esi, 64 ; update source pointer
 add edi, 64 ; update destination pointer
-dec ecx ; count down
+dec eax ; count down
 jnz $memcpy_ic_1 ; last 64-byte block?
+movq mm0,[_mmx_backup+0x00]
+movq mm1,[_mmx_backup+0x08]
+movq mm2,[_mmx_backup+0x10]
+movq mm3,[_mmx_backup+0x18]
 $memcpy_ic_2:
-mov ecx, ebx ; has valid low 6 bits of the byte count
+mov eax, ecx ; has valid low 6 bits of the byte count
 $memcpy_ic_3:
-shr ecx, 2 ; dword count
-and ecx, 1111b ; only look at the "remainder" bits
-neg ecx ; set up to jump into the array
-add ecx, offset $memcpy_last_few
-jmp ecx ; jump to array of movsd's
+shr eax, 2 ; dword count
+and eax, 1111b ; only look at the "remainder" bits
+neg eax ; set up to jump into the array
+add eax, offset $memcpy_last_few
+jmp eax ; jump to array of movsd's
 $memcpy_uc_test:
-cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
+/*cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
 jae $memcpy_bp_1
-$memcpy_64_test:
-or ecx, ecx ; tail end of block prefetch will jump here
+$memcpy_64_test:*/
+or eax, eax ; tail end of block prefetch will jump here
 jz $memcpy_ic_2 ; no more 64-byte blocks left
 // For larger blocks, which will spill beyond the cache, it's faster to
 // use the Streaming Store instruction MOVNTQ. This write instruction
 // bypasses the cache and writes straight to main memory. This code also
 // uses the software prefetch instruction to pre-read the data.
+movq [_mmx_backup+0x00],mm0
+movq [_mmx_backup+0x08],mm1
+movq [_mmx_backup+0x10],mm2
 align 16
 $memcpy_uc_1: ; 64-byte blocks, uncached copy
@@ -394,17 +494,25 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy
 movq mm1,[esi-8]
 movntq [edi-24], mm2
 movntq [edi-16], mm0
-dec ecx
+dec eax
 movntq [edi-8], mm1
 jnz $memcpy_uc_1 ; last 64-byte block?
-jmp $memcpy_ic_2 ; almost done
+movq mm0,[_mmx_backup+0x00]
+movq mm1,[_mmx_backup+0x08]
+movq mm2,[_mmx_backup+0x10]
+jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
 // For the largest size blocks, a special technique called Block Prefetch
 // can be used to accelerate the read operations. Block Prefetch reads
 // one address per cache line, for a series of cache lines, in a short loop.
 // This is faster than using software prefetch. The technique is great for
 // getting maximum read bandwidth, especially in DDR memory systems.
+// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
+// help keep the code cache footprint of memcpy_fast to a minimum.
+/*
 $memcpy_bp_1: ; large blocks, block prefetch copy
 cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
@@ -447,6 +555,7 @@ $memcpy_bp_3:
 jnz $memcpy_bp_3 ; keep copying
 sub ecx, CACHEBLOCK ; update the 64-byte block count
 jmp $memcpy_bp_1 ; keep processing chunks
+*/
 // The smallest copy uses the X86 "movsd" instruction, in an optimized
 // form which is an "unrolled loop". Then it handles the last few bytes.
@@ -469,8 +578,8 @@
 movsd
 $memcpy_last_few: ; dword aligned from before movsd's
-mov ecx, ebx ; has valid low 2 bits of the byte count
-and ecx, 11b ; the last few cows must come home
+mov eax, ecx ; has valid low 2 bits of the byte count
+and eax, 11b ; the last few cows must come home
 jz $memcpy_final ; no more, let's leave
 rep movsb ; the last 1, 2, or 3 bytes
@@ -479,10 +588,14 @@ $memcpy_final:
 sfence ; flush the write buffer
 //mov eax, [dest] ; ret value = destination pointer
+pop esi
+pop edi
+ret 4
 }
 }
-// mmx memcpy implementation, size has to be a multiple of 8
+// mmx mem-compare implementation, size has to be a multiple of 8
 // returns 0 is equal, nonzero value if not equal
 // ~10 times faster than standard memcmp
 // (zerofrog)

View File

@@ -1977,12 +1977,7 @@ void VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
 //if( size > 0 )
 {
 u8* pmem = mtgsThread->GetDataPacketPtr();
-//FreezeMMXRegs(1);
-//memcpy_fast(pmem, (u8*)pMem+addr, size);
-//FreezeMMXRegs(0);
-// we can use the faster memcpy_raz_ here (src/dest are garaunteed to be aligned)
-memcpy_raz_(pmem, (u8*)pMem+addr, size);
+memcpy_aligned(pmem, (u8*)pMem+addr, size);
 mtgsThread->SendDataPacket();
 }
 }

View File

@@ -880,9 +880,7 @@ static VuFunctionHeader* SuperVURecompileProgram(u32 startpc, int vuindex)
 #ifdef SUPERVU_CACHING
 //memxor_mmx(r.checksum, &VU->Micro[r.start], r.size);
 r.pmem = malloc(r.size);
-FreezeMMXRegs(1);
 memcpy_fast(r.pmem, &VU->Micro[r.start], r.size);
-FreezeMMXRegs(0);
 #endif
 s_pFnHeader->ranges.push_back(r);
 }