From 6ded71561c2fa518c9e6d3d426d9b4dac1102226 Mon Sep 17 00:00:00 2001
From: arcum42
Date: Wed, 14 Jul 2010 09:19:46 +0000
Subject: [PATCH] ReorderingMTGS: Revise memcpy_amd_qwc for Linux.

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3484 96395faa-99c1-11dd-bbfe-3dabce05a288
---
 common/include/Utilities/MemcpyFast.h | 81 ++++++++++++---------------
 1 file changed, 37 insertions(+), 44 deletions(-)

diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h
index 9fc9331651..56c4d0ba39 100644
--- a/common/include/Utilities/MemcpyFast.h
+++ b/common/include/Utilities/MemcpyFast.h
@@ -23,7 +23,6 @@
 extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
 extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize);
 
-#if 0 // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
 // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
 static __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
 {
@@ -46,30 +45,30 @@
 	__asm__
 	(
 		".intel_syntax noprefix\n"
-		"mov         ecx, [%[dest]]\n"
-		"mov         edx, [%[src]]\n"
-		"mov         eax, [%[qwc]]\n"          // keep a copy of count
-		"shr         eax, 1\n"
+		//"mov       ecx, [%[dest]]\n"
+		//"mov       edx, [%[src]]\n"
+		//"mov       eax, [%[qwc]]\n"          // keep a copy of count
+		"shr         %[qwc], 1\n"
 		"jz          memcpy_qwc_1\n"           // only one 16 byte block to copy?
 
-		"cmp         eax, 64\n"                // "IN_CACHE_COPY/32"
+		"cmp         %[qwc], 64\n"             // "IN_CACHE_COPY/32"
 		"jb          memcpy_qwc_loop1\n"       // small copies should be cached (definite speedup --air)
 
 	"memcpy_qwc_loop2:\n"                      // 32-byte blocks, uncached copy
-		"prefetchnta [edx + 568]\n"            // start reading ahead (tested: it helps! --air)
+		"prefetchnta [%[src] + 568]\n"         // start reading ahead (tested: it helps! --air)
 
-		"movq        mm0,[edx+0]\n"            // read 64 bits
-		"movq        mm1,[edx+8]\n"
-		"movq        mm2,[edx+16]\n"
-		"movntq      [ecx+0], mm0\n"           // write 64 bits, bypassing the cache
-		"movntq      [ecx+8], mm1\n"
-		"movq        mm3,[edx+24]\n"
-		"movntq      [ecx+16], mm2\n"
-		"movntq      [ecx+24], mm3\n"
+		"movq        mm0,[%[src]+0]\n"         // read 64 bits
+		"movq        mm1,[%[src]+8]\n"
+		"movq        mm2,[%[src]+16]\n"
+		"movntq      [%[dest]+0], mm0\n"       // write 64 bits, bypassing the cache
+		"movntq      [%[dest]+8], mm1\n"
+		"movq        mm3,[%[src]+24]\n"
+		"movntq      [%[dest]+16], mm2\n"
+		"movntq      [%[dest]+24], mm3\n"
 
-		"add         edx,32\n"                 // update source pointer
-		"add         ecx,32\n"                 // update destination pointer
-		"sub         eax,1\n"
+		"add         %[src],32\n"              // update source pointer
+		"add         %[dest],32\n"             // update destination pointer
+		"sub         %[qwc],1\n"
 		"jnz         memcpy_qwc_loop2\n"       // last 64-byte block?
 		"sfence\n"                             // flush the write buffer
 		"jmp         memcpy_qwc_1\n"
@@ -79,39 +78,38 @@
 	// results in noticable speed loss!
 
 	"memcpy_qwc_loop1:\n"
-		"prefetchnta [edx + 568]\n"            // start reading ahead (tested: it helps! --air)
+		"prefetchnta [%[src] + 568]\n"         // start reading ahead (tested: it helps! --air)
 
-		"movq        mm0,[edx+0]\n"            // read 64 bits
-		"movq        mm1,[edx+8]\n"
-		"movq        mm2,[edx+16]\n"
-		"movq        [ecx+0], mm0\n"           // write 64 bits, bypassing the cache
-		"movq        [ecx+8], mm1\n"
-		"movq        mm3,[edx+24]\n"
-		"movq        [ecx+16], mm2\n"
-		"movq        [ecx+24], mm3\n"
+		"movq        mm0,[%[src]+0]\n"         // read 64 bits
+		"movq        mm1,[%[src]+8]\n"
+		"movq        mm2,[%[src]+16]\n"
+		"movq        [%[dest]+0], mm0\n"       // write 64 bits, bypassing the cache
+		"movq        [%[dest]+8], mm1\n"
+		"movq        mm3,[%[src]+24]\n"
+		"movq        [%[dest]+16], mm2\n"
+		"movq        [%[dest]+24], mm3\n"
 
-		"add         edx,32\n"                 // update source pointer
-		"add         ecx,32\n"                 // update destination pointer
-		"sub         eax,1\n"
+		"add         %[src],32\n"              // update source pointer
+		"add         %[dest],32\n"             // update destination pointer
+		"sub         %[qwc],1\n"
 		"jnz         memcpy_qwc_loop1\n"       // last 64-byte block?
 
 	"memcpy_qwc_1:\n"
 		"test        [%[qwc]],dword ptr 1\n"
 		"jz          memcpy_qwc_final\n"
-		"movq        mm0,[edx]\n"
-		"movq        mm1,[edx+8]\n"
-		"movq        [ecx], mm0\n"
-		"movq        [ecx+8], mm1\n"
+		"movq        mm0,[%[src]]\n"
+		"movq        mm1,[%[src]+8]\n"
+		"movq        [%[dest]], mm0\n"
+		"movq        [%[dest]+8], mm1\n"
 
 	"memcpy_qwc_final:\n"
 		"emms\n"                               // clean up the MMX state
 		".att_syntax\n"
-		: "=r"(dest), "=r"(src), "=r"(qwc)
-		: [dest]"r"(dest), [src]"r"(src), [qwc]"r"(qwc)
-		//: Needs a clobber list here
+		: "=&r"(dest), "=&r"(src), "=&r"(qwc)
+		: [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc)
+		: "memory", "mm0", "mm1", "mm2", "mm3"
 	);
 }
-#endif
 
 #else
 #	include "win_memzero.h"
@@ -131,10 +129,5 @@
 void _memset16_unaligned( void* dest, u16 data, size_t size );
 #define memcpy_const   memcpy_amd_   // Memcpy with constant size
 #define memcpy_constA  memcpy_amd_   // Memcpy with constant size and 16-byte aligned
-//#define memcpy_qwc(d,s,c)   memcpy_amd_qwc(d,s,c)
-#ifndef __LINUX__
 #define memcpy_qwc(d,s,c)   memcpy_amd_qwc(d,s,c)
-#else
-#define memcpy_qwc(d,s,c)   memcpy_amd_(d,s,c*16)
-//#define memcpy_qwc(d,s,c)   memcpy_amd_qwc(d,s,c)
-#endif
+//#define memcpy_qwc(d,s,c)   memcpy_amd_(d,s,c*16)
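
Background on the constraint-list change (illustration only, not part of the commit): the old operand list passed dest/src/qwc as plain "r" inputs, left the outputs untied, and had no clobber list (hence the old "Needs a clobber list here" note), so GCC was free to assume the asm modified neither memory nor the MMX registers. The new list declares the three values as early-clobber outputs ("=&r") whose inputs are tied to them through matching constraints ("0"/"1"/"2"), which is what allows the asm body to advance the pointers and the count in place, and it lists "memory" plus mm0-mm3 as clobbers. The sketch below shows the same idiom on a stripped-down cached copy; it is a hypothetical, self-contained example (AT&T syntax rather than the patch's .intel_syntax block, made-up copy_qwords/main names) and assumes a GCC-compatible compiler on an x86/x86-64 target with MMX.

#include <stddef.h>
#include <stdio.h>

// Illustrative only: copies 'qwc' 16-byte blocks with MMX loads and cached stores.
static void copy_qwords(void* dest, const void* src, size_t qwc)
{
    if (qwc == 0)
        return;                          // the loop below assumes at least one block

    __asm__ __volatile__
    (
        "1:\n\t"
        "movq   (%[src]), %%mm0\n\t"     // read 64 bits
        "movq   8(%[src]), %%mm1\n\t"
        "movq   %%mm0, (%[dest])\n\t"    // write 64 bits (through the cache)
        "movq   %%mm1, 8(%[dest])\n\t"
        "add    $16, %[src]\n\t"         // advance the operands in place, which is
        "add    $16, %[dest]\n\t"        // legal because they are declared as
        "sub    $1, %[qwc]\n\t"          // outputs rather than plain inputs
        "jnz    1b\n\t"
        "emms"                           // clean up the MMX state
        : [dest] "=&r" (dest), [src] "=&r" (src), [qwc] "=&r" (qwc)  // early-clobber outputs
        : "0" (dest), "1" (src), "2" (qwc)                           // inputs tied to the same registers
        : "memory", "mm0", "mm1"         // the asm writes memory and uses mm0/mm1
    );
}

int main(void)
{
    unsigned char src[32], dst[32] = {0};
    for (int i = 0; i < 32; ++i)
        src[i] = (unsigned char)i;

    copy_qwords(dst, src, 2);            // 2 QWCs = 32 bytes
    printf("%d %d\n", dst[0], dst[31]);  // expect "0 31"
    return 0;
}

Without the matching constraints and the "memory" clobber, GCC could reorder or cache surrounding loads and stores across the asm, and without __volatile__ (or some consumed output) it could drop the statement entirely; the patch's switch from untied "r" inputs to tied outputs plus an explicit clobber list addresses the first of these concerns for the Linux build.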