From d1e0922417b4c69c2b66311023ba9464235428c7 Mon Sep 17 00:00:00 2001
From: arcum42
Date: Tue, 13 Jul 2010 10:48:35 +0000
Subject: [PATCH] ReorderingMTGS: Initial Linux version of memcpy_amd_qwc.
 Disabled for now, till I get a chance to look it over better.

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3477 96395faa-99c1-11dd-bbfe-3dabce05a288
---
 common/build/Utilities/Utilities.cbp  |  4 ++
 common/include/Utilities/MemcpyFast.h | 92 +++++++++++++++++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/common/build/Utilities/Utilities.cbp b/common/build/Utilities/Utilities.cbp
index 0e98314365..0d7fbbe596 100644
--- a/common/build/Utilities/Utilities.cbp
+++ b/common/build/Utilities/Utilities.cbp
@@ -208,6 +208,10 @@
+
+
diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h
index d78f9f8a8e..9fc9331651 100644
--- a/common/include/Utilities/MemcpyFast.h
+++ b/common/include/Utilities/MemcpyFast.h
@@ -23,6 +23,95 @@
 extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
 extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize);
 
+#if 0
+    // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
+    // Quadword Copy! Count is in QWCs (128 bits).  Neither source nor dest need to be aligned.
+    static __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
+    {
+        // Optimization Analysis: This code is *nearly* optimal.  Do not think that using XMM
+        // registers will improve copy performance, because they won't.  Use of XMMs is only
+        // warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
+        // and even then the benefits are typically minimal (sometimes slower depending on the
+        // amount of data being copied).
+        //
+        // Thus: MMX are alignment safe, fast, and widely available.  Lets just stick with them.
+        //   --air
+
+        // Linux Conversion note:
+        //  This code would benefit nicely from having inline-able GAS syntax, since it should
+        //  allow GCC to optimize the first 3 instructions out of existence in many scenarios.
+        //  And its called enough times to probably merit the extra effort to ensure proper
+        //  optimization. --air
+
+        __asm__
+        (
+            ".intel_syntax noprefix\n"
+            "mov ecx, [%[dest]]\n"
+            "mov edx, [%[src]]\n"
+            "mov eax, [%[qwc]]\n"           // keep a copy of count
+            "shr eax, 1\n"
+            "jz memcpy_qwc_1\n"             // only one 16 byte block to copy?
+
+            "cmp eax, 64\n"                 // "IN_CACHE_COPY/32"
+            "jb memcpy_qwc_loop1\n"         // small copies should be cached (definite speedup --air)
+
+            "memcpy_qwc_loop2:\n"           // 32-byte blocks, uncached copy
+            "prefetchnta [edx + 568]\n"     // start reading ahead (tested: it helps! --air)
+
+            "movq mm0,[edx+0]\n"            // read 64 bits
+            "movq mm1,[edx+8]\n"
+            "movq mm2,[edx+16]\n"
+            "movntq [ecx+0], mm0\n"         // write 64 bits, bypassing the cache
+            "movntq [ecx+8], mm1\n"
+            "movq mm3,[edx+24]\n"
+            "movntq [ecx+16], mm2\n"
+            "movntq [ecx+24], mm3\n"
+
+            "add edx,32\n"                  // update source pointer
+            "add ecx,32\n"                  // update destination pointer
+            "sub eax,1\n"
+            "jnz memcpy_qwc_loop2\n"        // last 64-byte block?
+            "sfence\n"                      // flush the write buffer
+            "jmp memcpy_qwc_1\n"
+
+            // 32-byte blocks, cached!
+            // This *is* important.  Removing this and using exclusively non-temporal stores
+            // results in noticable speed loss!
+
+            "memcpy_qwc_loop1:\n"
+            "prefetchnta [edx + 568]\n"     // start reading ahead (tested: it helps! --air)
+
+            "movq mm0,[edx+0]\n"            // read 64 bits
+            "movq mm1,[edx+8]\n"
+            "movq mm2,[edx+16]\n"
+            "movq [ecx+0], mm0\n"           // write 64 bits, bypassing the cache
+            "movq [ecx+8], mm1\n"
+            "movq mm3,[edx+24]\n"
+            "movq [ecx+16], mm2\n"
+            "movq [ecx+24], mm3\n"
+
+            "add edx,32\n"                  // update source pointer
+            "add ecx,32\n"                  // update destination pointer
+            "sub eax,1\n"
+            "jnz memcpy_qwc_loop1\n"        // last 64-byte block?
+
+            "memcpy_qwc_1:\n"
+            "test [%[qwc]],dword ptr 1\n"
+            "jz memcpy_qwc_final\n"
+            "movq mm0,[edx]\n"
+            "movq mm1,[edx+8]\n"
+            "movq [ecx], mm0\n"
+            "movq [ecx+8], mm1\n"
+
+            "memcpy_qwc_final:\n"
+            "emms\n"                        // clean up the MMX state
+            ".att_syntax\n"
+            : "=r"(dest), "=r"(src), "=r"(qwc)
+            : [dest]"r"(dest), [src]"r"(src), [qwc]"r"(qwc)
+            //: Needs a clobber list here
+        );
+    }
+#endif
 
 #else
 # include "win_memzero.h"
@@ -41,8 +130,11 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
 #define memcpy_aligned(d,s,c)   memcpy_amd_(d,s,c)  // Memcpy with 16-byte Aligned addresses
 #define memcpy_const    memcpy_amd_     // Memcpy with constant size
 #define memcpy_constA   memcpy_amd_     // Memcpy with constant size and 16-byte aligned
+
+//#define memcpy_qwc(d,s,c)   memcpy_amd_qwc(d,s,c)
 #ifndef __LINUX__
 #define memcpy_qwc(d,s,c)   memcpy_amd_qwc(d,s,c)
 #else
 #define memcpy_qwc(d,s,c)   memcpy_amd_(d,s,c*16)
+//#define memcpy_qwc(d,s,c)   memcpy_amd_qwc(d,s,c)
 #endif
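The block above is committed disabled (#if 0), and its own comments flag what still has to change before GCC will accept it: there is no clobber list, and operands written as "[%[dest]]" will not assemble once GCC substitutes AT&T-style register names under ".intel_syntax noprefix"; listing dest/src/qwc as both outputs and separate inputs is also redundant. Below is a minimal sketch of one way the routine could be expressed for GCC on a 32-bit x86 target. The function name, the register pinning ("c", "d" and "S" constraints), the numeric local labels (which survive the function being inlined more than once) and the clobber list are choices made for this sketch, not the code the branch eventually committed. Pinning the operands to ecx/edx/esi lets the Intel-syntax body name registers directly, which avoids operand substitution entirely and lets GCC drop the extra pointer reloads the "Linux Conversion note" complains about.

// Sketch only: assumes a 32-bit x86 CPU with SSE's MMX extensions
// (prefetchnta/movntq/sfence).  Not the committed PCSX2 implementation.
#include <stddef.h>

static inline void memcpy_amd_qwc_sketch(void *dest, const void *src, size_t qwc)
{
    __asm__ __volatile__
    (
        ".intel_syntax noprefix\n\t"
        "mov     eax, esi\n\t"            // working copy of the QWC count
        "shr     eax, 1\n\t"              // eax = number of 32-byte blocks
        "jz      2f\n\t"                  // zero or one QWC -> just the tail copy

        "cmp     eax, 64\n\t"             // ~2KB threshold (IN_CACHE_COPY/32)
        "jb      1f\n\t"                  // small copies: keep them in the cache

        "0:\n\t"                          // uncached 32-byte blocks
        "prefetchnta [edx + 568]\n\t"     // read ahead of the copy
        "movq    mm0, [edx]\n\t"
        "movq    mm1, [edx + 8]\n\t"
        "movq    mm2, [edx + 16]\n\t"
        "movq    mm3, [edx + 24]\n\t"
        "movntq  [ecx], mm0\n\t"          // non-temporal stores bypass the cache
        "movntq  [ecx + 8], mm1\n\t"
        "movntq  [ecx + 16], mm2\n\t"
        "movntq  [ecx + 24], mm3\n\t"
        "add     edx, 32\n\t"
        "add     ecx, 32\n\t"
        "sub     eax, 1\n\t"
        "jnz     0b\n\t"
        "sfence\n\t"                      // order the streaming stores
        "jmp     2f\n\t"

        "1:\n\t"                          // cached 32-byte blocks
        "prefetchnta [edx + 568]\n\t"
        "movq    mm0, [edx]\n\t"
        "movq    mm1, [edx + 8]\n\t"
        "movq    mm2, [edx + 16]\n\t"
        "movq    mm3, [edx + 24]\n\t"
        "movq    [ecx], mm0\n\t"          // plain stores stay in the cache
        "movq    [ecx + 8], mm1\n\t"
        "movq    [ecx + 16], mm2\n\t"
        "movq    [ecx + 24], mm3\n\t"
        "add     edx, 32\n\t"
        "add     ecx, 32\n\t"
        "sub     eax, 1\n\t"
        "jnz     1b\n\t"

        "2:\n\t"
        "test    esi, 1\n\t"              // odd QWC count? copy the last 16 bytes
        "jz      3f\n\t"
        "movq    mm0, [edx]\n\t"
        "movq    mm1, [edx + 8]\n\t"
        "movq    [ecx], mm0\n\t"
        "movq    [ecx + 8], mm1\n\t"

        "3:\n\t"
        "emms\n\t"                        // leave the FPU usable again
        ".att_syntax prefix\n\t"
        : "+c"(dest), "+d"(src)           // ecx = dest, edx = src (both advanced)
        : "S"(qwc)                        // esi = QWC count, read-only
        : "eax", "mm0", "mm1", "mm2", "mm3", "memory"
    );
}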
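A quick, purely illustrative sanity check of the sketch (compiled 32-bit, e.g. with g++ -m32, on an SSE-capable CPU): the buffer size is chosen so the copy exercises both the non-temporal loop (64 or more 32-byte blocks) and the odd trailing quadword.

#include <cstddef>
#include <cstdio>
#include <cstring>
// assumes memcpy_amd_qwc_sketch from the sketch above is in scope

int main()
{
    // 129 QWCs = 2064 bytes: 64 full 32-byte blocks (hits the uncached
    // threshold) plus one odd trailing quadword.
    static unsigned char src[129 * 16], dst[129 * 16];
    for (size_t i = 0; i < sizeof(src); ++i)
        src[i] = static_cast<unsigned char>(i * 7 + 3);

    memcpy_amd_qwc_sketch(dst, src, 129);

    std::printf("%s\n", std::memcmp(dst, src, sizeof(src)) == 0 ? "match" : "MISMATCH");
    return 0;
}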