mirror of https://github.com/PCSX2/pcsx2.git
ReorderingMTGS: More tweaks to asm memcpy files (made code changes to the Linux side, comment changes to the Win32 side).

Linux devs: let's get this memcpy thing finalized, if it isn't already. I'd like to merge the current state of this branch into trunk as soon as possible, since it's currently looking very stable and has been, up to this point, a code cleanup and stabilization project. (More invasive changes coming soon.)

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3518 96395faa-99c1-11dd-bbfe-3dabce05a288
parent 1c9cefd778
commit 2d4c7aaa25
@@ -42,10 +42,12 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
 extern void memcpy_vibes(void * dest, const void * src, int size);
 extern void gen_memcpy_vibes();
 
 #define memcpy_fast             memcpy_amd_            // Fast memcpy
 #define memcpy_aligned(d,s,c)   memcpy_amd_(d,s,c)     // Memcpy with 16-byte Aligned addresses
 #define memcpy_const            memcpy_amd_            // Memcpy with constant size
 #define memcpy_constA           memcpy_amd_            // Memcpy with constant size and 16-byte aligned
 #define memcpy_qwc_             memcpy_vibes           // Memcpy in aligned qwc increments, with 0x400 qwc or less
 #define memcpy_qwc(d,s,c)       memcpy_amd_qwc(d,s,c)
 
+// Useful alternative if we think memcpy_amd_qwc is buggy
 //#define memcpy_qwc(d,s,c)     memcpy_amd_(d,s,c*16)
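Note on usage: the count passed to memcpy_qwc is in 128-bit quadwords (QWCs), not bytes, which is why the commented-out fallback above multiplies by 16. A minimal, self-contained sketch of the call pattern (illustrative only; the fallback define stands in for the real memcpy_amd_qwc so the snippet compiles on its own):

    #include <cstring>
    #include <cstddef>

    // Fallback form taken from the commented-out define above; the real build
    // maps memcpy_qwc to memcpy_amd_qwc instead.
    #define memcpy_qwc(d, s, c)   std::memcpy((d), (s), (c) * 16)

    // Hypothetical helper: copies 'bytes' (assumed to be a multiple of 16).
    void CopyQwords(void* dst, const void* src, size_t bytes)
    {
        size_t qwc = bytes / 16;        // convert bytes to 128-bit quadword count
        memcpy_qwc(dst, src, qwc);
    }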
@@ -41,12 +41,10 @@
   MEMCPY_AMD.CPP
 ******************************************************************************/
 
-// Very optimized memcpy() routine for AMD Athlon and Duron family.
-// This code uses any of FOUR different basic copy methods, depending
-// on the transfer size.
 // NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
 // "Streaming Store"), and also uses the software prefetch instructions,
-// be sure you're running on Athlon/Duron or other recent CPU before calling!
+// be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before
+// calling!
 
 #define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
 // The smallest copy uses the X86 "movsd" instruction, in an optimized
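The NOTE above is the core idea of the routine: large copies use MOVNTQ non-temporal ("streaming") stores together with software prefetch so the copied data bypasses the cache. A hedged C++ illustration of the same pattern using MMX/SSE intrinsics instead of raw mnemonics (the function name and the 512-byte prefetch distance are illustrative, not from this file; assumes a 32-bit-era toolchain where MMX intrinsics are available):

    #include <cstddef>
    #include <mmintrin.h>    // __m64, _mm_empty
    #include <xmmintrin.h>   // _mm_stream_pi, _mm_prefetch, _mm_sfence

    // Streams 'bytes' (assumed to be a multiple of 32) from src to dest using
    // non-temporal 64-bit stores, mirroring the MOVNTQ/PREFETCHNTA pattern.
    void stream_copy32(void* dest, const void* src, size_t bytes)
    {
        __m64*       d = static_cast<__m64*>(dest);
        const __m64* s = static_cast<const __m64*>(src);

        for (size_t i = 0; i < bytes / 32; ++i)
        {
            _mm_prefetch(reinterpret_cast<const char*>(s) + 512, _MM_HINT_NTA); // read ahead
            __m64 a = s[0], b = s[1], c = s[2], e = s[3];
            _mm_stream_pi(d + 0, a);   // MOVNTQ: write 64 bits, bypassing the cache
            _mm_stream_pi(d + 1, b);
            _mm_stream_pi(d + 2, c);
            _mm_stream_pi(d + 3, e);
            s += 4;
            d += 4;
        }
        _mm_sfence();   // flush the write-combining buffers
        _mm_empty();    // EMMS: clean up the MMX state
    }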
@@ -68,10 +66,8 @@ MEMCPY_AMD.CPP
 
 #if defined(_MSC_VER)
 
-// --------------------------------------------------------------------------------------
-//   Fast memcpy as coded by AMD, and then improved by air.
-// --------------------------------------------------------------------------------------
 
+// Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs.
 __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
 {
     __asm
@@ -92,6 +88,7 @@ __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_
     jbe     $memcpy_do_align    ; it appears to be slower
     cmp     eax, 64*1024
     jbe     $memcpy_align_done
 
 $memcpy_do_align:
     mov     eax, 8              ; a trick that's faster than rep movsb...
     sub     eax, edi            ; align destination to qword
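For context, the $memcpy_do_align path above copies just enough leading bytes to bring the destination up to an 8-byte (qword) boundary before the fast loops run; the "8 minus the low pointer bits" arithmetic is the whole trick. A hedged C sketch of that calculation (illustrative helper, not part of the file):

    #include <cstdint>
    #include <cstddef>

    // Bytes needed to advance 'p' to the next 8-byte boundary; 0 if already aligned.
    // Sketch of the "mov eax, 8 / sub eax, edi" idea with the low bits masked off.
    static size_t bytes_to_qword_align(const void* p)
    {
        return (8 - (reinterpret_cast<uintptr_t>(p) & 7)) & 7;
    }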
@@ -155,99 +155,96 @@ __forceinline void memcpy_vibes(void * dest, const void * src, int size) {
 }
 
 #endif
 #endif
 
 // Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment,
 // to get around compilation issues with having it in the headers.
 #ifdef __LINUX__
 
 // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
 // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
 __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
 {
     // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
     // registers will improve copy performance, because they won't. Use of XMMs is only
     // warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
     // and even then the benefits are typically minimal (sometimes slower depending on the
     // amount of data being copied).
     //
     // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them.
     //   --air
 
     // Linux Conversion note:
     //  This code would benefit nicely from having inline-able GAS syntax, since it should
     //  allow GCC to optimize the first 3 instructions out of existence in many scenarios.
     //  And its called enough times to probably merit the extra effort to ensure proper
     //  optimization. --air
 
     __asm__
     (
         ".intel_syntax noprefix\n"
-            //"mov ecx, [%[dest]]\n"
-            //"mov edx, [%[src]]\n"
-            //"mov eax, [%[qwc]]\n"          // keep a copy of count
-            "mov eax, %[qwc]\n"
-            "shr eax, 1\n"
-            "jz memcpy_qwc_1\n"              // only one 16 byte block to copy?
-
-            "cmp %[qwc], 64\n"               // "IN_CACHE_COPY/32"
-            "jb memcpy_qwc_loop1\n"          // small copies should be cached (definite speedup --air)
+            "mov eax, %[qwc]\n"              // keep a copy of count for looping
+            "shr eax, 1\n"
+            "jz memcpy_qwc_1\n"              // only one 16 byte block to copy?
+
+            "cmp eax, 64\n"                  // "IN_CACHE_COPY/32"
+            "jb memcpy_qwc_loop1\n"          // small copies should be cached (definite speedup --air)
 
     "memcpy_qwc_loop2:\n"                    // 32-byte blocks, uncached copy
         "prefetchnta [%[src] + 568]\n"       // start reading ahead (tested: it helps! --air)
 
         "movq mm0,[%[src]+0]\n"              // read 64 bits
         "movq mm1,[%[src]+8]\n"
         "movq mm2,[%[src]+16]\n"
         "movntq [%[dest]+0], mm0\n"          // write 64 bits, bypassing the cache
         "movntq [%[dest]+8], mm1\n"
         "movq mm3,[%[src]+24]\n"
         "movntq [%[dest]+16], mm2\n"
         "movntq [%[dest]+24], mm3\n"
 
         "add %[src],32\n"                    // update source pointer
         "add %[dest],32\n"                   // update destination pointer
         "sub eax,1\n"
         "jnz memcpy_qwc_loop2\n"             // last 64-byte block?
         "sfence\n"                           // flush the write buffer
         "jmp memcpy_qwc_1\n"
 
     // 32-byte blocks, cached!
     // This *is* important. Removing this and using exclusively non-temporal stores
     // results in noticeable speed loss!
 
     "memcpy_qwc_loop1:\n"
         "prefetchnta [%[src] + 568]\n"       // start reading ahead (tested: it helps! --air)
 
         "movq mm0,[%[src]+0]\n"              // read 64 bits
         "movq mm1,[%[src]+8]\n"
         "movq mm2,[%[src]+16]\n"
         "movq [%[dest]+0], mm0\n"            // write 64 bits, bypassing the cache
         "movq [%[dest]+8], mm1\n"
         "movq mm3,[%[src]+24]\n"
         "movq [%[dest]+16], mm2\n"
         "movq [%[dest]+24], mm3\n"
 
         "add %[src],32\n"                    // update source pointer
         "add %[dest],32\n"                   // update destination pointer
         "sub eax,1\n"
         "jnz memcpy_qwc_loop1\n"             // last 64-byte block?
 
     "memcpy_qwc_1:\n"
-        "test %[qwc],1\n"
+        "testl %[qwc],1\n"
         "jz memcpy_qwc_final\n"
         "movq mm0,[%[src]]\n"
         "movq mm1,[%[src]+8]\n"
         "movq [%[dest]], mm0\n"
         "movq [%[dest]+8], mm1\n"
 
     "memcpy_qwc_final:\n"
         "emms\n"                             // clean up the MMX state
         ".att_syntax\n"
             : "=&r"(dest), "=&r"(src), "=&r"(qwc)
             : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc)
             : "memory", "eax", "mm0", "mm1", "mm2", "mm3"
     );
 }
 
 #endif
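To summarize the Linux-side routine above for anyone reviewing the GAS conversion: it copies qwc 128-bit quadwords, walking 32 bytes per loop iteration, choosing uncached movntq stores for large blocks and cached movq stores for small ones, then handling a trailing odd quadword. A plain-C reference sketch of just those semantics (no prefetch or non-temporal behaviour; for comparison only, not the project's implementation):

    #include <cstring>
    #include <cstdint>
    #include <cstddef>

    // Reference-only sketch: copies 'qwc' 128-bit quadwords from src to dest.
    // The real memcpy_amd_qwc streams 32 bytes per iteration with MMX moves,
    // switching between cached (movq) and uncached (movntq) stores at the
    // IN_CACHE_COPY threshold, and finishes with one trailing quadword if qwc is odd.
    static void memcpy_qwc_reference(void* dest, const void* src, size_t qwc)
    {
        uint8_t*       d = static_cast<uint8_t*>(dest);
        const uint8_t* s = static_cast<const uint8_t*>(src);

        size_t pairs = qwc >> 1;           // 32-byte blocks, as in the main loops
        for (size_t i = 0; i < pairs; ++i, d += 32, s += 32)
            std::memcpy(d, s, 32);

        if (qwc & 1)                       // trailing 16-byte quadword
            std::memcpy(d, s, 16);
    }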