From 44f5117d2453dfa284585accc3d91fa9e7d3fbf9 Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Tue, 27 Jan 2009 05:12:54 +0000 Subject: [PATCH] Optimized memcpy_fast. In addition to being quite a bit faster, it also auto-preserves mmx registers now. So I was also able to remove almost every instance of FreezeMMXRegs (all except those used to guard the GS plugin calls). memcpy_fast (aka memcpy_amd_) is now faster than memcpy_raz for *all* scenarios, so it's been made the new default. git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@642 a6443dda-0b58-4228-96e9-037be469359c --- pcsx2/CDVD.cpp | 9 -- pcsx2/CDVDiso.cpp | 6 - pcsx2/CDVDisodrv.cpp | 4 - pcsx2/CdRom.cpp | 4 - pcsx2/GS.cpp | 4 +- pcsx2/Hw.h | 46 +++--- pcsx2/MTGS.cpp | 2 +- pcsx2/Misc.h | 19 ++- pcsx2/SPR.cpp | 6 - pcsx2/VifDma.cpp | 8 +- pcsx2/windows/WinThreads.cpp | 3 + pcsx2/x86/fast_routines.cpp | 297 ++++++++++++++++++++++++----------- pcsx2/x86/iVUmicroLower.cpp | 7 +- pcsx2/x86/iVUzerorec.cpp | 2 - 14 files changed, 248 insertions(+), 169 deletions(-) diff --git a/pcsx2/CDVD.cpp b/pcsx2/CDVD.cpp index e5daf56318..39078a2829 100644 --- a/pcsx2/CDVD.cpp +++ b/pcsx2/CDVD.cpp @@ -865,8 +865,6 @@ int cdvdReadSector() { return -1; } - FreezeMMXRegs(1); - const u32 madr = HW_DMA3_MADR; // if raw dvd sector 'fill in the blanks' @@ -935,7 +933,6 @@ int cdvdReadSector() { HW_DMA3_BCR_H16-= (cdvd.BlockSize / (HW_DMA3_BCR_L16*4)); HW_DMA3_MADR+= cdvd.BlockSize; - FreezeMMXRegs(0); return 0; } @@ -2024,9 +2021,7 @@ void cdvdWrite16(u8 rt) // SCOMMAND if (cdvd.mg_size + cdvd.ParamC > cdvd.mg_maxsize) cdvd.Result[0] = 0x80; else{ - FreezeMMXRegs(1); memcpy_fast(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC); - FreezeMMXRegs(0); cdvd.mg_size += cdvd.ParamC; cdvd.Result[0] = 0; // 0 complete ; 1 busy ; 0x80 error } @@ -2034,11 +2029,9 @@ void cdvdWrite16(u8 rt) // SCOMMAND case 0x8E: // sceMgReadData SetResultSize( std::min(16, cdvd.mg_size) ); - FreezeMMXRegs(1); memcpy_fast(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC); cdvd.mg_size -= cdvd.ResultC; memcpy_fast(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size); - FreezeMMXRegs(0); break; case 0x88: // secrman: __mechacon_auth_0x88 //for now it is the same; so, fall;) @@ -2089,9 +2082,7 @@ fail_pol_cal: SetResultSize(3);//in:0 { int bit_ofs = mg_BIToffset(cdvd.mg_buffer); - FreezeMMXRegs(1); memcpy_fast(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]); - FreezeMMXRegs(0); } cdvd.mg_maxsize = 0; // don't allow any write cdvd.mg_size = 8+16*cdvd.mg_buffer[4];//new offset, i just moved the data diff --git a/pcsx2/CDVDiso.cpp b/pcsx2/CDVDiso.cpp index a9401b5b5b..5b26848f50 100644 --- a/pcsx2/CDVDiso.cpp +++ b/pcsx2/CDVDiso.cpp @@ -189,7 +189,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){ return 0; buff = CDVDgetBuffer(); if (buff==NULL) return 0; - FreezeMMXRegs(1); switch (mode->datapattern){ case CdSecS2048: memcpy_fast((void*)((uptr)buf+2048*i), buff, 2048);break;//only data @@ -198,7 +197,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){ case CdSecS2340: memcpy_fast((void*)((uptr)buf+2340*i), buff, 2340);break;//without sync } - FreezeMMXRegs(0); } return 1; } @@ -216,9 +214,7 @@ int DvdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){ // switch (mode->datapattern){ // case CdSecS2064: ((u32*)buf)[0] = i + 0x30000; - FreezeMMXRegs(1); memcpy_fast((u8*)buf+12, buff, 2048); - FreezeMMXRegs(0); buf = (char*)buf + 2064; break; // default: // return 0; @@ -253,9 +249,7 @@ int 
CDVD_GetVolumeDescriptor(void){ if ((localVolDesc.filesystemType == 1) || (localVolDesc.filesystemType == 2)) { - FreezeMMXRegs(1); memcpy_fast(&CDVolDesc, &localVolDesc, sizeof(cdVolDesc)); - FreezeMMXRegs(0); } } else diff --git a/pcsx2/CDVDisodrv.cpp b/pcsx2/CDVDisodrv.cpp index 12bc083921..6f699e857a 100644 --- a/pcsx2/CDVDisodrv.cpp +++ b/pcsx2/CDVDisodrv.cpp @@ -188,9 +188,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){ RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n"); return 0; } - FreezeMMXRegs(1); memcpy_fast(buffer, lb + off_sector, ssize); - FreezeMMXRegs(0); } if (asize) if (CdRead(asector, asize >> 11, buffer+ssize, &cdReadMode) != TRUE){ RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n"); @@ -201,9 +199,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){ RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n"); return 0; } - FreezeMMXRegs(1); memcpy_fast(buffer+ssize+asize, lb, esize); - FreezeMMXRegs(0); } /*********************** // Now work out where we want to start reading from diff --git a/pcsx2/CdRom.cpp b/pcsx2/CdRom.cpp index 5985ade57d..583cb15e1d 100644 --- a/pcsx2/CdRom.cpp +++ b/pcsx2/CdRom.cpp @@ -527,9 +527,7 @@ void cdrReadInterrupt() { CDREAD_INT((cdr.Mode & 0x80) ? (cdReadTime / 2) : cdReadTime); return; } - FreezeMMXRegs(1); memcpy_fast(cdr.Transfer, buf+12, 2340); - FreezeMMXRegs(0); cdr.Stat = DataReady; CDR_LOG(" %x:%x:%x\n", cdr.Transfer[0], cdr.Transfer[1], cdr.Transfer[2]); @@ -923,9 +921,7 @@ void psxDma3(u32 madr, u32 bcr, u32 chcr) { } cdsize = (bcr & 0xffff) * 4; - FreezeMMXRegs(1); memcpy_fast((u8*)PSXM(madr), cdr.pTransfer, cdsize); - FreezeMMXRegs(0); psxCpu->Clear(madr, cdsize/4); cdr.pTransfer+=cdsize; diff --git a/pcsx2/GS.cpp b/pcsx2/GS.cpp index 024f2b5500..916c58b6ca 100644 --- a/pcsx2/GS.cpp +++ b/pcsx2/GS.cpp @@ -575,11 +575,11 @@ static void WRITERING_DMA(u32 *pMem, u32 qwc) { pendmem = (pendmem&~0xfff)-16; } - memcpy_raz_(pgsmem, pMem, pendmem-(u32)gif->madr+16); + memcpy_aligned(pgsmem, pMem, pendmem-(u32)gif->madr+16); } else #endif - memcpy_raz_(pgsmem, pMem, sizetoread); + memcpy_aligned(pgsmem, pMem, sizetoread); mtgsThread->SendDataPacket(); } diff --git a/pcsx2/Hw.h b/pcsx2/Hw.h index 2c1a83c52c..609f509f19 100644 --- a/pcsx2/Hw.h +++ b/pcsx2/Hw.h @@ -329,7 +329,7 @@ static __forceinline u8* dmaGetAddr(u32 mem) #else - +// Note: Dma addresses are guaranteed to be aligned to 16 bytes (128 bits) static __forceinline void *dmaGetAddr(u32 addr) { u8 *ptr; @@ -355,35 +355,17 @@ void hwShutdown(); // hw read functions extern u8 hwRead8 (u32 mem); -int hwConstRead8 (u32 x86reg, u32 mem, u32 sign); - extern u16 hwRead16(u32 mem); -int hwConstRead16(u32 x86reg, u32 mem, u32 sign); - extern u32 hwRead32(u32 mem); -int hwConstRead32(u32 x86reg, u32 mem); - -u64 hwRead64(u32 mem); -void hwConstRead64(u32 mem, int mmreg); - -void hwRead128(u32 mem, u64 *out); -void hwConstRead128(u32 mem, int xmmreg); +extern u64 hwRead64(u32 mem); +extern void hwRead128(u32 mem, u64 *out); // hw write functions -void hwWrite8 (u32 mem, u8 value); -void hwConstWrite8 (u32 mem, int mmreg); - -void hwWrite16(u32 mem, u16 value); -void hwConstWrite16(u32 mem, int mmreg); - -void hwWrite32(u32 mem, u32 value); -void hwConstWrite32(u32 mem, int mmreg); - -void hwWrite64(u32 mem, u64 value); -void hwConstWrite64(u32 mem, int mmreg); - -void hwWrite128(u32 mem, const u64 *value); -void hwConstWrite128(u32 mem, int xmmreg); +extern void hwWrite8 (u32 mem, u8 value); +extern void hwWrite16(u32 mem, u16 
value); +extern void hwWrite32(u32 mem, u32 value); +extern void hwWrite64(u32 mem, u64 value); +extern void hwWrite128(u32 mem, const u64 *value); void hwIntcIrq(int n); void hwDmacIrq(int n); @@ -394,6 +376,18 @@ int hwMFIFOWrite(u32 addr, u8 *data, u32 size); int hwDmacSrcChainWithStack(DMACh *dma, int id); int hwDmacSrcChain(DMACh *dma, int id); +int hwConstRead8 (u32 x86reg, u32 mem, u32 sign); +int hwConstRead16(u32 x86reg, u32 mem, u32 sign); +int hwConstRead32(u32 x86reg, u32 mem); +void hwConstRead64(u32 mem, int mmreg); +void hwConstRead128(u32 mem, int xmmreg); + +void hwConstWrite8 (u32 mem, int mmreg); +void hwConstWrite16(u32 mem, int mmreg); +void hwConstWrite32(u32 mem, int mmreg); +void hwConstWrite64(u32 mem, int mmreg); +void hwConstWrite128(u32 mem, int xmmreg); + #ifdef PCSX2_VIRTUAL_MEM void iMemRead32Check(); #endif diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index 943796a310..a367df92d6 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -430,7 +430,7 @@ int mtgsThreadObject::Callback() { Console::WriteLn("MTGS > Thread Started, Opening GS Plugin..."); - memcpy_raz_( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) ); + memcpy_aligned( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) ); GSsetBaseMem( m_gsMem ); m_returncode = GSopen((void *)&pDsp, "PCSX2", 1); diff --git a/pcsx2/Misc.h b/pcsx2/Misc.h index e8a5503070..9f6cb3f1ad 100644 --- a/pcsx2/Misc.h +++ b/pcsx2/Misc.h @@ -230,12 +230,19 @@ extern u8 g_globalXMMSaved; void _memset16_unaligned( void* dest, u16 data, size_t size ); #if defined(_WIN32) && !defined(__x86_64__) - // faster memcpy - extern void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes); - extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t qwc); - extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t n); + + // The new simplified memcpy_amd_ is now faster than memcpy_raz_. + // memcpy_amd_ also does mmx register saving, negating the need for freezeregs (code cleanup!) + // Additionally, using one single memcpy implementation keeps the code cache cleaner. 
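As a concrete illustration of what this consolidation changes at the call sites touched by this patch (the buffer names below are placeholders, not actual PCSX2 symbols): code no longer needs to bracket copies with FreezeMMXRegs, because memcpy_amd_ saves and restores the MMX registers it touches internally.

    // old call-site pattern (removed throughout this patch):
    FreezeMMXRegs(1);
    memcpy_fast(dstBuf, srcBuf, byteCount);
    FreezeMMXRegs(0);

    // new call-site pattern -- the copy routine protects mm0-mm3 on its own:
    memcpy_fast(dstBuf, srcBuf, byteCount);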
+ + //extern void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes); + //extern void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes); + //extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes); + extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes); + # include "windows/memzero.h" # define memcpy_fast memcpy_amd_ +# define memcpy_aligned memcpy_amd_ #else @@ -243,6 +250,10 @@ void _memset16_unaligned( void* dest, u16 data, size_t size ); #define memcpy_fast memcpy #define memcpy_raz_ memcpy #define memcpy_raz_u memcpy + + #define memcpy_aligned memcpy + #define memcpy_raz_u memcpy + #include "Linux/memzero.h" #endif diff --git a/pcsx2/SPR.cpp b/pcsx2/SPR.cpp index 0813cc9ca2..d32f479b56 100644 --- a/pcsx2/SPR.cpp +++ b/pcsx2/SPR.cpp @@ -226,13 +226,11 @@ extern void mfifoGIFtransfer(int); #define gif ((DMACh*)&PS2MEM_HW[0xA000]) void dmaSPR0() { // fromSPR int qwc = spr0->qwc; - FreezeMMXRegs(1); SPR_LOG("dmaSPR0 chcr = %lx, madr = %lx, qwc = %lx, sadr = %lx\n", spr0->chcr, spr0->madr, spr0->qwc, spr0->sadr); _dmaSPR0(); - FreezeMMXRegs(0); if ((psHu32(DMAC_CTRL) & 0xC) == 0xC) { // GIF MFIFO if((spr0->madr & ~psHu32(DMAC_RBSR)) != psHu32(DMAC_RBOR)) SysPrintf("GIF MFIFO Write outside MFIFO area\n"); spr0->madr = psHu32(DMAC_RBOR) + (spr0->madr & psHu32(DMAC_RBSR)); @@ -308,7 +306,6 @@ void _SPR1interleave() { void dmaSPR1() { // toSPR - FreezeMMXRegs(1); #ifdef SPR_LOG SPR_LOG("dmaSPR1 chcr = 0x%x, madr = 0x%x, qwc = 0x%x\n" " tadr = 0x%x, sadr = 0x%x\n", @@ -325,7 +322,6 @@ void dmaSPR1() { // toSPR // Transfer Dn_QWC from Dn_MADR to SPR1 SPR1chain(); CPU_INT(9, cycles); - FreezeMMXRegs(0); return; } else if ((spr1->chcr & 0xc) == 0x4){ int cycles = 0; @@ -338,7 +334,6 @@ void dmaSPR1() { // toSPR // Transfer Dn_QWC from Dn_MADR to SPR1 SPR1chain(); CPU_INT(9, cycles); - FreezeMMXRegs(0); return; } // Chain Mode @@ -382,7 +377,6 @@ void dmaSPR1() { // toSPR } else { // Interleave Mode _SPR1interleave(); } - FreezeMMXRegs(0); } diff --git a/pcsx2/VifDma.cpp b/pcsx2/VifDma.cpp index 17e6b528b5..141ab8b468 100644 --- a/pcsx2/VifDma.cpp +++ b/pcsx2/VifDma.cpp @@ -574,9 +574,7 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma // v4-32 if(vifRegs->mode == 0 && !(vifRegs->code & 0x10000000) && vif->usn == 0){ vifRegs->num -= size>>4; - FreezeMMXRegs(1); memcpy_fast((u8*)dest, cdata, size); - FreezeMMXRegs(0); size = 0; //unpacktotal += GetCPUTick()-basetick; return; @@ -814,9 +812,7 @@ static __forceinline void _vif0mpgTransfer(u32 addr, u32 *data, int size) { fclose(f); }*/ if (memcmp(VU0.Micro + addr, data, size << 2)) { - FreezeMMXRegs(1); memcpy_fast(VU0.Micro + addr, data, size << 2); - FreezeMMXRegs(0); CpuVU0->Clear(addr, size); } } @@ -1490,9 +1486,7 @@ static __forceinline void _vif1mpgTransfer(u32 addr, u32 *data, int size) { }*/ assert( VU1.Micro > 0 ); if (memcmp(VU1.Micro + addr, data, size << 2)) { - FreezeMMXRegs(1); memcpy_fast(VU1.Micro + addr, data, size << 2); - FreezeMMXRegs(0); CpuVU1->Clear(addr, size); } } @@ -1644,7 +1638,7 @@ static int Vif1TransDirectHL(u32 *data){ { //unaligned copy.VIF handling is -very- messy, so i'l use this code til i fix it :) const uint count = mtgsThread->PrepDataPacket( GIF_PATH_2, data, ret<<2 ); - memcpy_raz_u( mtgsThread->GetDataPacketPtr(), data, count ); + memcpy_fast( mtgsThread->GetDataPacketPtr(), data, count ); mtgsThread->SendDataPacket(); } else { diff --git a/pcsx2/windows/WinThreads.cpp 
b/pcsx2/windows/WinThreads.cpp index 1faa20c157..94ff95796c 100644 --- a/pcsx2/windows/WinThreads.cpp +++ b/pcsx2/windows/WinThreads.cpp @@ -49,6 +49,9 @@ namespace Threading } cpuinfo.LogicalCores = CPUs; + if( LogicalCoresPerPhysicalCPU > CPUs) // for 1-socket HTT-disabled machines + LogicalCoresPerPhysicalCPU = CPUs; + cpuinfo.PhysicalCores = ( CPUs / LogicalCoresPerPhysicalCPU ) * PhysicalCoresPerPhysicalCPU; ptw32_smp_system = ( cpuinfo.LogicalCores > 1 ) ? TRUE : FALSE; } diff --git a/pcsx2/x86/fast_routines.cpp b/pcsx2/x86/fast_routines.cpp index 99ff1a920c..c561a0e0ab 100644 --- a/pcsx2/x86/fast_routines.cpp +++ b/pcsx2/x86/fast_routines.cpp @@ -88,35 +88,27 @@ void checkregs() #endif -__declspec(align(16)) static u8 _xmm_backup[16*2]; +PCSX2_ALIGNED16( static u8 _xmm_backup[16*2] ); +PCSX2_ALIGNED16( static u8 _mmx_backup[8*4] ); -// this one checks for alignments too ... -__declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes) +static __declspec(naked) void __fastcall _memcpy_raz_usrc(void *dest, const void *src, size_t bytes) { - // If src is aligned, use memcpy_raz instead: - __asm - { - test edx,0xf; - jz memcpy_raz_; - } - // MOVSRC = opcode used to read. I use the same code for the aligned version, with a different define :) - #define MOVSRC movups + #define MOVSRC movdqu + #define MOVDST movdqa + __asm { //Reads before reads, to avoid stalls mov eax,[esp+4]; //Make sure to save xmm0, it must be preserved ... - movaps [_xmm_backup+0x00],xmm0; + movaps [_xmm_backup],xmm0; //if >=128 bytes use 128 byte unrolled loop //i use cmp ..,127 + jna because 127 is encodable using the simm8 form cmp eax,127; jna _loop_1; - //unrolled version also touches xmm1, save it :) - movaps [_xmm_backup+0x10],xmm1; - //since this is a common branch target it could be good to align it -- no idea if it has any effect :p align 16 @@ -124,34 +116,111 @@ __declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size _loop_8: MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls - MOVSRC xmm1,[edx+0x10]; + MOVDST [ecx+0x00],xmm0; //then write :p + MOVSRC xmm0,[edx+0x10]; + MOVDST [ecx+0x10],xmm0; sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding - movaps [ecx+0x00],xmm0; //then write :p - movaps [ecx+0x10],xmm1; sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding MOVSRC xmm0,[edx+0x20-128]; - MOVSRC xmm1,[edx+0x30-128]; + MOVDST [ecx+0x20-128],xmm0; + MOVSRC xmm0,[edx+0x30-128]; + MOVDST [ecx+0x30-128],xmm0; add eax,-128; //eax won't be used for a while, so update it here. 
add/-128 for simm8 encoding - movaps [ecx+0x20-128],xmm0; - movaps [ecx+0x30-128],xmm1; MOVSRC xmm0,[edx+0x40-128]; - MOVSRC xmm1,[edx+0x50-128]; - movaps [ecx+0x40-128],xmm0; - movaps [ecx+0x50-128],xmm1; + MOVDST [ecx+0x40-128],xmm0; + MOVSRC xmm0,[edx+0x50-128]; + MOVDST [ecx+0x50-128],xmm0; MOVSRC xmm0,[edx+0x60-128]; - MOVSRC xmm1,[edx+0x70-128]; - movaps [ecx+0x60-128],xmm0; - movaps [ecx+0x70-128],xmm1; + MOVDST [ecx+0x60-128],xmm0; + MOVSRC xmm0,[edx+0x70-128]; + MOVDST [ecx+0x70-128],xmm0; //127~ja, 127 is encodable as simm8 :) cmp eax,127; ja _loop_8; - //restore xmm1 :) - movaps xmm1,[_xmm_backup+0x10]; + //direct copy for 0~7 qwords + //in order to avoid the inc/dec of all 3 registers + //i use negative relative addressing from the top of the buffers + //[top-current index] + +_loop_1: + //prepare the regs for 'negative relative addressing' + add edx,eax; + add ecx,eax; + neg eax; + jz cleanup; //exit if nothing to do + +_loop_1_inner: + MOVSRC xmm0,[edx+eax]; + MOVDST [ecx+eax],xmm0; + + add eax,16; //while the offset is still negative we have data to copy + js _loop_1_inner; + + //done ! +cleanup: + //restore xmm and exit ~) + movaps xmm0,[_xmm_backup]; + ret 4; + } + #undef MOVSRC + #undef MOVDST +} + + +static __declspec(naked) void __fastcall _memcpy_raz_udst(void *dest, const void *src, size_t bytes) +{ + // MOVDST = opcode used to read. I use the same code for the aligned version, with a different define :) + #define MOVSRC movaps + #define MOVDST movups + __asm + { + //Reads before reads, to avoid stalls + mov eax,[esp+4]; + //Make sure to save xmm0, it must be preserved ... + movaps [_xmm_backup],xmm0; + + //if >=128 bytes use 128 byte unrolled loop + //i use cmp ..,127 + jna because 127 is encodable using the simm8 form + cmp eax,127; + jna _loop_1; + + //since this is a common branch target it could be good to align it -- no idea if it has any effect :p + align 16 + + //128 byte unrolled loop +_loop_8: + + MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls + MOVDST [ecx+0x00],xmm0; //then write :p + MOVSRC xmm0,[edx+0x10]; + MOVDST [ecx+0x10],xmm0; + sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding + sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding + + MOVSRC xmm0,[edx+0x20-128]; + MOVDST [ecx+0x20-128],xmm0; + MOVSRC xmm0,[edx+0x30-128]; + MOVDST [ecx+0x30-128],xmm0; + add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding + + MOVSRC xmm0,[edx+0x40-128]; + MOVDST [ecx+0x40-128],xmm0; + MOVSRC xmm0,[edx+0x50-128]; + MOVDST [ecx+0x50-128],xmm0; + + MOVSRC xmm0,[edx+0x60-128]; + MOVDST [ecx+0x60-128],xmm0; + MOVSRC xmm0,[edx+0x70-128]; + MOVDST [ecx+0x70-128],xmm0; + + //127~ja, 127 is encodable as simm8 :) + cmp eax,127; + ja _loop_8; //direct copy for 0~7 qwords //in order to avoid the inc/dec of all 3 registers @@ -168,22 +237,24 @@ _loop_1: _loop_1_inner: MOVSRC xmm0,[edx+eax]; movaps [ecx+eax],xmm0; - + add eax,16; //while the offset is still negative we have data to copy js _loop_1_inner; //done ! cleanup: //restore xmm and exit ~) - movaps xmm0,[_xmm_backup+0x00]; + movaps xmm0,[_xmm_backup]; ret 4; } #undef MOVSRC + #undef MOVDST } + // Custom memcpy, only for 16 byte aligned stuff (used for mtgs) // This function is optimized for medium-small transfer sizes (<2048, >=128). 
No prefetching is // used since the reads are linear and the cache logic can predict em :) - +// *OBSOLETE* -- memcpy_amd_ has been optimized and is now faster. __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes) { // Code Implementation Notes: @@ -191,21 +262,19 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_ // MOVSRC = opcode used to read. I use the same code for the unaligned version, with a different define :) #define MOVSRC movaps + #define MOVDST movaps __asm { //Reads before reads, to avoid stalls mov eax,[esp+4]; //Make sure to save xmm0, it must be preserved ... - movaps [_xmm_backup+0x00],xmm0; + movaps [_xmm_backup],xmm0; //if >=128 bytes use 128 byte unrolled loop //i use cmp ..,127 + jna because 127 is encodable using the simm8 form cmp eax,127; jna _loop_1; - //unrolled version also toiches xmm1, save it :) - movaps [_xmm_backup+0x10],xmm1; - //since this is a common branch target it could be good to align it -- no idea if it has any effect :p align 16 @@ -213,35 +282,32 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_ _loop_8: MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls - MOVSRC xmm1,[edx+0x10]; + MOVDST [ecx+0x00],xmm0; //then write :p + MOVSRC xmm0,[edx+0x10]; + MOVDST [ecx+0x10],xmm0; sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding - movaps [ecx+0x00],xmm0; //then write :p - movaps [ecx+0x10],xmm1; sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding MOVSRC xmm0,[edx+0x20-128]; - MOVSRC xmm1,[edx+0x30-128]; + MOVDST [ecx+0x20-128],xmm0; + MOVSRC xmm0,[edx+0x30-128]; + MOVDST [ecx+0x30-128],xmm0; add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding - movaps [ecx+0x20-128],xmm0; - movaps [ecx+0x30-128],xmm1; MOVSRC xmm0,[edx+0x40-128]; - MOVSRC xmm1,[edx+0x50-128]; - movaps [ecx+0x40-128],xmm0; - movaps [ecx+0x50-128],xmm1; + MOVDST [ecx+0x40-128],xmm0; + MOVSRC xmm0,[edx+0x50-128]; + MOVDST [ecx+0x50-128],xmm0; MOVSRC xmm0,[edx+0x60-128]; - MOVSRC xmm1,[edx+0x70-128]; - movaps [ecx+0x60-128],xmm0; - movaps [ecx+0x70-128],xmm1; + MOVDST [ecx+0x60-128],xmm0; + MOVSRC xmm0,[edx+0x70-128]; + MOVDST [ecx+0x70-128],xmm0; //127~ja, 127 is encodable as simm8 :) cmp eax,127; ja _loop_8; - //restore xmm1 :) - movaps xmm1,[_xmm_backup+0x10]; - //direct copy for 0~7 qwords //in order to avoid the inc/dec of all 3 registers //i use negative relative addressing from the top of the buffers @@ -256,7 +322,7 @@ _loop_1: _loop_1_inner: MOVSRC xmm0,[edx+eax]; - movaps [ecx+eax],xmm0; + MOVDST [ecx+eax],xmm0; add eax,16; //while the offset is still negative we have data to copy js _loop_1_inner; @@ -264,44 +330,64 @@ _loop_1_inner: //done ! cleanup: //restore xmm and exit ~) - movaps xmm0,[_xmm_backup+0x00]; + movaps xmm0,[_xmm_backup]; ret 4; } + #undef MOVSRC + #undef MOVDST } -#undef MOVSRC +// This memcpy routine is for use in situations where the source buffer's alignment is indeterminate. +__forceinline void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes) +{ + if( ((uptr)src & 0xf) == 0 ) + memcpy_raz_( dest, src, bytes ); + else + _memcpy_raz_usrc( dest, src, bytes ); +} + +// This memcpy routine is for use in situations where the destination buffer's alignment is indeterminate. 
+__forceinline void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes) +{ + if( ((uptr)dest & 0xf) == 0 ) + memcpy_raz_( dest, src, bytes ); + else + _memcpy_raz_udst( dest, src, bytes ); +} ////////////////////////////////////////////////////////////////////////// -// Fast memcpy as coded by AMD. - -// This function clobbers all MMX registers, and is generally not optimal for short memory -// copies due to the amount of overhead required to test for alignments, copy length, -// and other ABI overhead. -void __fastcall memcpy_amd_(void *dest, const void *src, size_t n) +// Fast memcpy as coded by AMD, and thn improved by air. +// +// This routine preserves mmx registers! It's the complete real deal! +__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n) { - __asm { + __asm + { + push edi + push esi + mov edi, ecx ; destination mov esi, edx ; source - mov ecx, [n] ; number of bytes to copy - mov ebx, ecx ; keep a copy of count + mov ecx, [esp+12] ; number of bytes to copy + mov eax, ecx ; keep a copy of count cld - cmp ecx, TINY_BLOCK_COPY + cmp eax, TINY_BLOCK_COPY jb $memcpy_ic_3 ; tiny? skip mmx copy - cmp ecx, 32*1024 ; don't align between 32k-64k because + cmp eax, 32*1024 ; don't align between 32k-64k because jbe $memcpy_do_align ; it appears to be slower - cmp ecx, 64*1024 + cmp eax, 64*1024 jbe $memcpy_align_done $memcpy_do_align: - mov ecx, 8 ; a trick that's faster than rep movsb... - sub ecx, edi ; align destination to qword - and ecx, 111b ; get the low bits - sub ebx, ecx ; update copy count - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_align_done - jmp ecx ; jump to array of movsb's + mov eax, 8 ; a trick that's faster than rep movsb... + sub eax, edi ; align destination to qword + and eax, 111b ; get the low bits + sub ecx, eax ; update copy count + neg eax ; set up to jump into the array + add eax, offset $memcpy_align_done + jmp eax ; jump to array of movsb's align 4 movsb @@ -314,13 +400,18 @@ align 4 movsb $memcpy_align_done: ; destination is dword aligned - mov ecx, ebx ; number of bytes left to copy - shr ecx, 6 ; get 64-byte block count + mov eax, ecx ; number of bytes left to copy + shr eax, 6 ; get 64-byte block count jz $memcpy_ic_2 ; finish the last few bytes - cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy + cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy jae $memcpy_uc_test + movq [_mmx_backup+0x00],mm0 + movq [_mmx_backup+0x08],mm1 + movq [_mmx_backup+0x10],mm2 + movq [_mmx_backup+0x18],mm3 + // This is small block copy that uses the MMX registers to copy 8 bytes // at a time. It uses the "unrolled loop" optimization, and also uses // the software prefetch instruction to get the data into the cache. @@ -348,30 +439,39 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy add esi, 64 ; update source pointer add edi, 64 ; update destination pointer - dec ecx ; count down + dec eax ; count down jnz $memcpy_ic_1 ; last 64-byte block? 
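    ; in-cache loop done -- the added movq lines below restore mm0-mm3 from _mmx_backup,
    ; which is what lets every caller drop its FreezeMMXRegs() bracketing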
+ movq mm0,[_mmx_backup+0x00] + movq mm1,[_mmx_backup+0x08] + movq mm2,[_mmx_backup+0x10] + movq mm3,[_mmx_backup+0x18] + $memcpy_ic_2: - mov ecx, ebx ; has valid low 6 bits of the byte count + mov eax, ecx ; has valid low 6 bits of the byte count $memcpy_ic_3: - shr ecx, 2 ; dword count - and ecx, 1111b ; only look at the "remainder" bits - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_last_few - jmp ecx ; jump to array of movsd's + shr eax, 2 ; dword count + and eax, 1111b ; only look at the "remainder" bits + neg eax ; set up to jump into the array + add eax, offset $memcpy_last_few + jmp eax ; jump to array of movsd's $memcpy_uc_test: - cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy + /*cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy jae $memcpy_bp_1 - -$memcpy_64_test: - or ecx, ecx ; tail end of block prefetch will jump here +$memcpy_64_test:*/ + or eax, eax ; tail end of block prefetch will jump here jz $memcpy_ic_2 ; no more 64-byte blocks left // For larger blocks, which will spill beyond the cache, it's faster to // use the Streaming Store instruction MOVNTQ. This write instruction // bypasses the cache and writes straight to main memory. This code also // uses the software prefetch instruction to pre-read the data. + + movq [_mmx_backup+0x00],mm0 + movq [_mmx_backup+0x08],mm1 + movq [_mmx_backup+0x10],mm2 + align 16 $memcpy_uc_1: ; 64-byte blocks, uncached copy @@ -394,17 +494,25 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy movq mm1,[esi-8] movntq [edi-24], mm2 movntq [edi-16], mm0 - dec ecx + dec eax movntq [edi-8], mm1 jnz $memcpy_uc_1 ; last 64-byte block? - jmp $memcpy_ic_2 ; almost done + movq mm0,[_mmx_backup+0x00] + movq mm1,[_mmx_backup+0x08] + movq mm2,[_mmx_backup+0x10] + + jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed) // For the largest size blocks, a special technique called Block Prefetch // can be used to accelerate the read operations. Block Prefetch reads // one address per cache line, for a series of cache lines, in a short loop. // This is faster than using software prefetch. The technique is great for // getting maximum read bandwidth, especially in DDR memory systems. + +// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to +// help keep the code cache footprint of memcpy_fast to a minimum. +/* $memcpy_bp_1: ; large blocks, block prefetch copy cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop? @@ -447,6 +555,7 @@ $memcpy_bp_3: jnz $memcpy_bp_3 ; keep copying sub ecx, CACHEBLOCK ; update the 64-byte block count jmp $memcpy_bp_1 ; keep processing chunks +*/ // The smallest copy uses the X86 "movsd" instruction, in an optimized // form which is an "unrolled loop". Then it handles the last few bytes. 
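The comment above describes the tail path shown in the hunk below. A plain-C sketch of what that path computes (illustrative only; the real code jumps into an unrolled table of movsd instructions rather than looping): the bytes left over after the 64-byte blocks are copied as up to 15 dwords followed by up to 3 single bytes.

    #include <cstring>   // memcpy
    #include <cstddef>   // size_t

    static void memcpy_tail_sketch(unsigned char* d, const unsigned char* s, size_t n)
    {
        size_t dwords = (n >> 2) & 0xF;   // shr eax,2 / and eax,1111b -- dwords in the low 6 bits
        size_t bytes  = n & 0x3;          // and eax,11b -- "the last few cows must come home"
        for (; dwords; --dwords, d += 4, s += 4)
            memcpy(d, s, 4);              // stands in for one unrolled movsd
        for (; bytes; --bytes)
            *d++ = *s++;                  // stands in for rep movsb
    }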
@@ -469,8 +578,8 @@ align 4 movsd $memcpy_last_few: ; dword aligned from before movsd's - mov ecx, ebx ; has valid low 2 bits of the byte count - and ecx, 11b ; the last few cows must come home + mov eax, ecx ; has valid low 2 bits of the byte count + and eax, 11b ; the last few cows must come home jz $memcpy_final ; no more, let's leave rep movsb ; the last 1, 2, or 3 bytes @@ -479,10 +588,14 @@ $memcpy_final: sfence ; flush the write buffer //mov eax, [dest] ; ret value = destination pointer + pop esi + pop edi + + ret 4 } } -// mmx memcpy implementation, size has to be a multiple of 8 +// mmx mem-compare implementation, size has to be a multiple of 8 // returns 0 is equal, nonzero value if not equal // ~10 times faster than standard memcmp // (zerofrog) diff --git a/pcsx2/x86/iVUmicroLower.cpp b/pcsx2/x86/iVUmicroLower.cpp index 42e8a22eb1..ca68eb3bc0 100644 --- a/pcsx2/x86/iVUmicroLower.cpp +++ b/pcsx2/x86/iVUmicroLower.cpp @@ -1977,12 +1977,7 @@ void VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) //if( size > 0 ) { u8* pmem = mtgsThread->GetDataPacketPtr(); - //FreezeMMXRegs(1); - //memcpy_fast(pmem, (u8*)pMem+addr, size); - //FreezeMMXRegs(0); - - // we can use the faster memcpy_raz_ here (src/dest are garaunteed to be aligned) - memcpy_raz_(pmem, (u8*)pMem+addr, size); + memcpy_aligned(pmem, (u8*)pMem+addr, size); mtgsThread->SendDataPacket(); } } diff --git a/pcsx2/x86/iVUzerorec.cpp b/pcsx2/x86/iVUzerorec.cpp index eeee16d8f4..fa11268ec3 100644 --- a/pcsx2/x86/iVUzerorec.cpp +++ b/pcsx2/x86/iVUzerorec.cpp @@ -880,9 +880,7 @@ static VuFunctionHeader* SuperVURecompileProgram(u32 startpc, int vuindex) #ifdef SUPERVU_CACHING //memxor_mmx(r.checksum, &VU->Micro[r.start], r.size); r.pmem = malloc(r.size); - FreezeMMXRegs(1); memcpy_fast(r.pmem, &VU->Micro[r.start], r.size); - FreezeMMXRegs(0); #endif s_pFnHeader->ranges.push_back(r); }
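For reference, a minimal portable sketch (standard SSE2 intrinsics, not the PCSX2 code itself) of the idea behind the new _usrc wrapper above: probe the source pointer's alignment once, then run either an all-aligned loop or a loop pairing unaligned loads with aligned stores. It assumes, as the raz routines do, a 16-byte-aligned destination and a size that is a multiple of 16; the 128-byte unrolling and the negative-index tail trick of the real routines are omitted.

    #include <emmintrin.h>   // SSE2: _mm_load_si128 / _mm_loadu_si128 / _mm_store_si128
    #include <cstdint>
    #include <cstddef>

    static void memcpy_raz_usrc_sketch(void* dest, const void* src, size_t bytes)
    {
        char*       d = static_cast<char*>(dest);
        const char* s = static_cast<const char*>(src);

        if ((reinterpret_cast<uintptr_t>(s) & 0xF) == 0)
        {
            // source is aligned too -- equivalent of the movaps/movdqa path
            for (size_t i = 0; i < bytes; i += 16)
                _mm_store_si128(reinterpret_cast<__m128i*>(d + i),
                                _mm_load_si128(reinterpret_cast<const __m128i*>(s + i)));
        }
        else
        {
            // unaligned source -- movdqu loads with movdqa stores, as in _memcpy_raz_usrc
            for (size_t i = 0; i < bytes; i += 16)
                _mm_store_si128(reinterpret_cast<__m128i*>(d + i),
                                _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i)));
        }
    }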