ReorderingMTGS: More tweaks to asm memcpy files (made code changes to Linux side, comment changes to Win32 side).

Linux Devs: Let's get this memcpy thing finalized, if it's not already. I'd like to merge the current state of this branch into trunk as soon as possible, since it's currently looking very stable and has been, up to this point, a code cleanup and stabilization project. (More invasive changes are coming soon.)

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3518 96395faa-99c1-11dd-bbfe-3dabce05a288
Jake.Stine 2010-07-17 15:03:45 +00:00
parent 1c9cefd778
commit 2d4c7aaa25
3 changed files with 102 additions and 106 deletions

View File

@ -42,10 +42,12 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
extern void memcpy_vibes(void * dest, const void * src, int size);
extern void gen_memcpy_vibes();
#define memcpy_fast memcpy_amd_ // Fast memcpy
#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c) // Memcpy with 16-byte Aligned addresses
#define memcpy_const memcpy_amd_ // Memcpy with constant size
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
#define memcpy_qwc_ memcpy_vibes // Memcpy in aligned qwc increments, with 0x400 qwc or less
#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c)
// Useful alternative if we think memcpy_amd_qwc is buggy
//#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16)
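Since the count argument conventions differ between these macros (bytes for memcpy_fast, 128-bit quadwords for memcpy_qwc), here is a minimal usage sketch; the function name and buffer types are placeholders, not code from this commit:

#include <cstdint>

// Hypothetical call site: copy 1 KiB of data expressed as 64 quadwords.
void CopyPacket_example(uint8_t* dest, const uint8_t* src)
{
	memcpy_qwc(dest, src, 64);          // count is in QWCs (64 * 16 = 1024 bytes)
	//memcpy_fast(dest, src, 64 * 16);  // equivalent byte-count form via the fallback above
}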

View File

@ -41,12 +41,10 @@
MEMCPY_AMD.CPP
******************************************************************************/
// Very optimized memcpy() routine for AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before
// calling!
#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
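For readers skimming the asm, a rough C++ outline of the size-tier dispatch described above may help. It is a sketch only: the tiny-copy threshold matches TINY_BLOCK_COPY, while the cached/uncached split values are taken from the AMD reference code this file is based on and are assumptions here; every branch falls back to std::memcpy so the sketch stays portable.

#include <cstring>
#include <cstddef>

static const size_t kTinyBlockCopy = 64;          // below this: string-instruction ("movsd") copy
static const size_t kInCacheCopy   = 64 * 1024;   // assumed: below this, MMX loads + ordinary cached stores
static const size_t kUncachedCopy  = 197 * 1024;  // assumed: below this, MMX loads + movntq streaming stores

void memcpy_tiered_sketch(void* dest, const void* src, size_t n)
{
	// Each branch stands in for one of the four asm copy strategies; the real
	// routine selects them with compares and jumps on the byte count.
	if (n < kTinyBlockCopy)
		std::memcpy(dest, src, n);   // tiny copy
	else if (n < kInCacheCopy)
		std::memcpy(dest, src, n);   // medium copy, data left in cache
	else if (n < kUncachedCopy)
		std::memcpy(dest, src, n);   // large copy, cache-bypassing stores
	else
		std::memcpy(dest, src, n);   // huge copy, block prefetch + streaming stores
}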
@ -68,10 +66,8 @@ MEMCPY_AMD.CPP
#if defined(_MSC_VER)
// Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs.
__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
{
__asm
@ -92,6 +88,7 @@ __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_
jbe $memcpy_do_align ; it appears to be slower
cmp eax, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov eax, 8 ; a trick that's faster than rep movsb...
sub eax, edi ; align destination to qword
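As a side note on the hunk above: the alignment preamble only runs for copies larger than 64 KiB (smaller ones skip it because, per the comment, the fixup measured slower). A small sketch of the head-byte count it computes, assuming the usual mask-to-low-3-bits step that follows later in the full routine:

#include <cstdint>
#include <cstddef>

// Bytes to copy one at a time so that dest becomes 8-byte (qword) aligned;
// mirrors "mov eax, 8 / sub eax, edi" plus the low-3-bit mask applied later.
static size_t bytes_until_qword_aligned(const void* dest)
{
	return (8 - (reinterpret_cast<uintptr_t>(dest) & 7)) & 7;
}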

View File

@ -155,99 +155,96 @@ __forceinline void memcpy_vibes(void * dest, const void * src, int size) {
}
#endif
#endif
// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment,
// to get around compilation issues with having it in the headers.
#ifdef __LINUX__
// This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
__forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
// registers will improve copy performance, because they won't. Use of XMMs is only
// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
// and even then the benefits are typically minimal (sometimes slower depending on the
// amount of data being copied).
//
// Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them.
// --air
// Linux Conversion note:
// This code would benefit nicely from having inline-able GAS syntax, since it should
// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
// And it's called enough times to probably merit the extra effort to ensure proper
// optimization. --air
__asm__
(
".intel_syntax noprefix\n"
"mov eax, %[qwc]\n" // keep a copy of count for looping
"shr eax, 1\n"
"jz memcpy_qwc_1\n" // only one 16 byte block to copy?
"cmp eax, 64\n" // "IN_CACHE_COPY/32"
"jb memcpy_qwc_loop1\n" // small copies should be cached (definite speedup --air)
"memcpy_qwc_loop2:\n" // 32-byte blocks, uncached copy
"prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)
"movq mm0,[%[src]+0]\n" // read 64 bits
"movq mm1,[%[src]+8]\n"
"movq mm2,[%[src]+16]\n"
"movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache
"movntq [%[dest]+8], mm1\n"
"movq mm3,[%[src]+24]\n"
"movntq [%[dest]+16], mm2\n"
"movntq [%[dest]+24], mm3\n"
"add %[src],32\n" // update source pointer
"add %[dest],32\n" // update destination pointer
"sub eax,1\n"
"jnz memcpy_qwc_loop2\n" // last 64-byte block?
"sfence\n" // flush the write buffer
"jmp memcpy_qwc_1\n"
// 32-byte blocks, cached!
// This *is* important. Removing this and using exclusively non-temporal stores
// results in noticeable speed loss!
"memcpy_qwc_loop1:\n"
"prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)
"movq mm0,[%[src]+0]\n" // read 64 bits
"movq mm1,[%[src]+8]\n"
"movq mm2,[%[src]+16]\n"
"movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache
"movq [%[dest]+8], mm1\n"
"movq mm3,[%[src]+24]\n"
"movq [%[dest]+16], mm2\n"
"movq [%[dest]+24], mm3\n"
"add %[src],32\n" // update source pointer
"add %[dest],32\n" // update destination pointer
"sub eax,1\n"
"jnz memcpy_qwc_loop1\n" // last 64-byte block?
"memcpy_qwc_1:\n"
"testl %[qwc],1\n"
"jz memcpy_qwc_final\n"
"movq mm0,[%[src]]\n"
"movq mm1,[%[src]+8]\n"
"movq [%[dest]], mm0\n"
"movq [%[dest]+8], mm1\n"
"memcpy_qwc_final:\n"
"emms\n" // clean up the MMX state
".att_syntax\n"
: "=&r"(dest), "=&r"(src), "=&r"(qwc)
: [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc)
: "memory", "eax", "mm0", "mm1", "mm2", "mm3"
);
}
#endif
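For readers who prefer intrinsics over GAS, a rough C++ rendering of the same structure follows. It is illustrative only, not the routine PCSX2 builds: the MMX intrinsics used here (_mm_stream_pi, _mm_empty, etc.) are unavailable on 64-bit MSVC, and no attempt is made to mirror the register constraints of the inline asm.

#include <mmintrin.h>   // __m64, _mm_empty
#include <xmmintrin.h>  // _mm_stream_pi, _mm_prefetch, _mm_sfence
#include <cstddef>

static void memcpy_qwc_sketch(void* dest, const void* src, size_t qwc)
{
	const __m64* s = static_cast<const __m64*>(src);
	__m64*       d = static_cast<__m64*>(dest);

	size_t pairs = qwc >> 1;                   // 32-byte (2 QWC) iterations
	const bool streaming = (pairs >= 64);      // the asm's "cmp eax, 64" threshold (2 KiB and up)

	for (; pairs != 0; --pairs, s += 4, d += 4)
	{
		_mm_prefetch(reinterpret_cast<const char*>(s) + 568, _MM_HINT_NTA);
		__m64 m0 = s[0], m1 = s[1], m2 = s[2], m3 = s[3];
		if (streaming)
		{
			// Large copies: movntq-style non-temporal stores that bypass the cache.
			_mm_stream_pi(d + 0, m0); _mm_stream_pi(d + 1, m1);
			_mm_stream_pi(d + 2, m2); _mm_stream_pi(d + 3, m3);
		}
		else
		{
			// Small copies: ordinary stores, keeping the data cached.
			d[0] = m0; d[1] = m1; d[2] = m2; d[3] = m3;
		}
	}
	if (streaming)
		_mm_sfence();                          // flush the write-combining buffers

	if (qwc & 1)                               // one trailing 16-byte block
	{
		d[0] = s[0];
		d[1] = s[1];
	}
	_mm_empty();                               // emms: restore the x87/MMX state
}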