Remove MMX register freezes from MemcpyFast.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3380 96395faa-99c1-11dd-bbfe-3dabce05a288
Jake.Stine 2010-07-03 13:08:41 +00:00
parent 84c5073b7c
commit 402f19413b
2 changed files with 3 additions and 323 deletions
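For reference, the "MMX register freeze" being removed is the bracket around memcpy_amd_'s block-copy loops that spilled mm0–mm3 into the static _mmx_backup buffer and reloaded them on the way out, so callers could keep live MMX values across the call. Below is a hedged MSVC inline-assembly sketch of that removed pattern only, not the actual routine (which is memcpy_amd_ in the diffs that follow); the buffer layout mirrors the removed movq lines, while the sketch's names and the trailing emms are illustrative assumptions.

// Hedged sketch of the save/restore pattern this commit deletes (illustrative only).
static __declspec(align(16)) unsigned char mmx_backup_sketch[8 * 4];

static __declspec(naked) void mmx_freeze_sketch()
{
	__asm
	{
		mov  edx, offset mmx_backup_sketch
		movq [edx + 0x00], mm0   // spill the caller's mm0..mm3 ...
		movq [edx + 0x08], mm1
		movq [edx + 0x10], mm2
		movq [edx + 0x18], mm3

		// ... the 64-byte block-copy loops ran here ...

		movq mm0, [edx + 0x00]   // ... and reload them before returning
		movq mm1, [edx + 0x08]
		movq mm2, [edx + 0x10]
		movq mm3, [edx + 0x18]
		emms                     // assumed: clear MMX state on exit
		ret
	}
}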


@@ -368,15 +368,9 @@ $memcpy_align_done: // destination is dword aligned
shr eax, 6 // get 64-byte block count
jz $memcpy_ic_2 // finish the last few bytes
mov edx, offset _mmx_backup // will probably need this to save/restore mmx
cmp eax, IN_CACHE_COPY/64 // too big 4 cache? use uncached copy
jae $memcpy_uc_test
movq [edx+0x00],mm0
movq [edx+0x08],mm1
movq [edx+0x10],mm2
movq [edx+0x18],mm3
// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
@@ -407,11 +401,6 @@ $memcpy_ic_1: // 64-byte block copies, in-cache copy
dec eax // count down
jnz $memcpy_ic_1 // last 64-byte block?
movq mm0,[edx+0x00]
movq mm1,[edx+0x08]
movq mm2,[edx+0x10]
movq mm3,[edx+0x18]
$memcpy_ic_2:
mov eax, ecx // has valid low 6 bits of the byte count
$memcpy_ic_3:
@@ -430,10 +419,6 @@ $memcpy_uc_test:
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
movq [edx+0x00],mm0
movq [edx+0x08],mm1
movq [edx+0x10],mm2
.align 16
$memcpy_uc_1: // 64-byte blocks, uncached copy
@@ -460,10 +445,6 @@ $memcpy_uc_1: // 64-byte blocks, uncached copy
movntq [edi-8], mm1
jnz $memcpy_uc_1 // last 64-byte block?
movq mm0,[edx+0x00]
movq mm1,[edx+0x08]
movq mm2,[edx+0x10]
jmp $memcpy_ic_2 // almost done (not needed because large copy below was removed)
// For the largest size blocks, a special technique called Block Prefetch
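The comments in the hunks above describe the two large-block strategies memcpy_amd_ keeps: copies that fit in cache go through an unrolled MMX loop warmed by software prefetch, while larger copies take the $memcpy_uc_* path, which prefetches the source and writes with movntq so the destination bypasses the cache. The sketch below is a rough intrinsics rendering of that uncached idea, not the routine's actual code; it targets 32-bit MMX/SSE, and the alignment requirement, block size, and prefetch distance are assumptions.

#include <cstddef>       // size_t
#include <mmintrin.h>    // __m64, _mm_empty (EMMS)
#include <xmmintrin.h>   // _mm_prefetch, _mm_stream_pi, _mm_sfence

// Assumes 8-byte-aligned pointers and a byte count that is a multiple of 64.
static void streaming_copy_sketch(void* dest, const void* src, size_t bytes)
{
	__m64*       d = static_cast<__m64*>(dest);
	const __m64* s = static_cast<const __m64*>(src);

	for (size_t block = 0; block < bytes / 64; ++block)
	{
		// Pull the source a few cache lines ahead without polluting the cache.
		_mm_prefetch(reinterpret_cast<const char*>(s) + 256, _MM_HINT_NTA);

		for (int q = 0; q < 8; ++q)          // one 64-byte block, 8 bytes at a time
			_mm_stream_pi(d + q, s[q]);      // movntq: write around the cache

		d += 8;
		s += 8;
	}

	_mm_sfence();   // order the non-temporal stores
	_mm_empty();    // emms, as MMX code must issue before FPU use
}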


@@ -64,296 +64,14 @@ MEMCPY_AMD.CPP
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
// Inline assembly syntax for use with Visual C++
#if defined(_MSC_VER)
#ifdef PCSX2_DEBUG
extern u8 g_globalMMXSaved;
#endif
static __aligned16 u8 _xmm_backup[16*2];
static __aligned16 u8 _mmx_backup[8*4];
static __declspec(naked) void __fastcall _memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
{
// MOVSRC = opcode used to read. I use the same code for the aligned version, with a different define :)
#define MOVSRC movdqu
#define MOVDST movdqa
__asm
{
//Reads before reads, to avoid stalls
mov eax,[esp+4];
//Make sure to save xmm0, it must be preserved ...
movaps [_xmm_backup],xmm0;
//if >=128 bytes use 128 byte unrolled loop
//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
cmp eax,127;
jna _loop_1;
//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
align 16
//128 byte unrolled loop
_loop_8:
MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
MOVDST [ecx+0x00],xmm0; //then write :p
MOVSRC xmm0,[edx+0x10];
MOVDST [ecx+0x10],xmm0;
sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
MOVSRC xmm0,[edx+0x20-128];
MOVDST [ecx+0x20-128],xmm0;
MOVSRC xmm0,[edx+0x30-128];
MOVDST [ecx+0x30-128],xmm0;
add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
MOVSRC xmm0,[edx+0x40-128];
MOVDST [ecx+0x40-128],xmm0;
MOVSRC xmm0,[edx+0x50-128];
MOVDST [ecx+0x50-128],xmm0;
MOVSRC xmm0,[edx+0x60-128];
MOVDST [ecx+0x60-128],xmm0;
MOVSRC xmm0,[edx+0x70-128];
MOVDST [ecx+0x70-128],xmm0;
//127~ja, 127 is encodable as simm8 :)
cmp eax,127;
ja _loop_8;
//direct copy for 0~7 qwords
//in order to avoid the inc/dec of all 3 registers
//i use negative relative addressing from the top of the buffers
//[top-current index]
_loop_1:
//prepare the regs for 'negative relative addressing'
add edx,eax;
add ecx,eax;
neg eax;
jz cleanup; //exit if nothing to do
_loop_1_inner:
MOVSRC xmm0,[edx+eax];
MOVDST [ecx+eax],xmm0;
add eax,16; //while the offset is still negative we have data to copy
js _loop_1_inner;
//done !
cleanup:
//restore xmm and exit ~)
movaps xmm0,[_xmm_backup];
ret 4;
}
#undef MOVSRC
#undef MOVDST
}
static __declspec(naked) void __fastcall _memcpy_raz_udst(void *dest, const void *src, size_t bytes)
{
// MOVDST = opcode used to read. I use the same code for the aligned version, with a different define :)
#define MOVSRC movaps
#define MOVDST movups
__asm
{
//Reads before reads, to avoid stalls
mov eax,[esp+4];
//Make sure to save xmm0, it must be preserved ...
movaps [_xmm_backup],xmm0;
//if >=128 bytes use 128 byte unrolled loop
//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
cmp eax,127;
jna _loop_1;
//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
align 16
//128 byte unrolled loop
_loop_8:
MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
MOVDST [ecx+0x00],xmm0; //then write :p
MOVSRC xmm0,[edx+0x10];
MOVDST [ecx+0x10],xmm0;
sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
MOVSRC xmm0,[edx+0x20-128];
MOVDST [ecx+0x20-128],xmm0;
MOVSRC xmm0,[edx+0x30-128];
MOVDST [ecx+0x30-128],xmm0;
add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
MOVSRC xmm0,[edx+0x40-128];
MOVDST [ecx+0x40-128],xmm0;
MOVSRC xmm0,[edx+0x50-128];
MOVDST [ecx+0x50-128],xmm0;
MOVSRC xmm0,[edx+0x60-128];
MOVDST [ecx+0x60-128],xmm0;
MOVSRC xmm0,[edx+0x70-128];
MOVDST [ecx+0x70-128],xmm0;
//127~ja, 127 is encodable as simm8 :)
cmp eax,127;
ja _loop_8;
//direct copy for 0~7 qwords
//in order to avoid the inc/dec of all 3 registers
//i use negative relative addressing from the top of the buffers
//[top-current index]
_loop_1:
//prepare the regs for 'negative relative addressing'
add edx,eax;
add ecx,eax;
neg eax;
jz cleanup; //exit if nothing to do
_loop_1_inner:
MOVSRC xmm0,[edx+eax];
movaps [ecx+eax],xmm0;
add eax,16; //while the offset is still negative we have data to copy
js _loop_1_inner;
//done !
cleanup:
//restore xmm and exit ~)
movaps xmm0,[_xmm_backup];
ret 4;
}
#undef MOVSRC
#undef MOVDST
}
// Custom memcpy, only for 16 byte aligned stuff (used for mtgs)
// This function is optimized for medium-small transfer sizes (<2048, >=128). No prefetching is
// used since the reads are linear and the cache logic can predict em :)
// *OBSOLETE* -- memcpy_amd_ has been optimized and is now faster.
__declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes)
{
// Code Implementation Notes:
// Uses a forward copy, in 128 byte blocks, and then does the remaining in 16 byte blocks :)
// MOVSRC = opcode used to read. I use the same code for the unaligned version, with a different define :)
#define MOVSRC movaps
#define MOVDST movaps
__asm
{
//Reads before reads, to avoid stalls
mov eax,[esp+4];
//Make sure to save xmm0, it must be preserved ...
movaps [_xmm_backup],xmm0;
//if >=128 bytes use 128 byte unrolled loop
//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
cmp eax,127;
jna _loop_1;
//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
align 16
//128 byte unrolled loop
_loop_8:
MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
MOVDST [ecx+0x00],xmm0; //then write :p
MOVSRC xmm0,[edx+0x10];
MOVDST [ecx+0x10],xmm0;
sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
MOVSRC xmm0,[edx+0x20-128];
MOVDST [ecx+0x20-128],xmm0;
MOVSRC xmm0,[edx+0x30-128];
MOVDST [ecx+0x30-128],xmm0;
add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
MOVSRC xmm0,[edx+0x40-128];
MOVDST [ecx+0x40-128],xmm0;
MOVSRC xmm0,[edx+0x50-128];
MOVDST [ecx+0x50-128],xmm0;
MOVSRC xmm0,[edx+0x60-128];
MOVDST [ecx+0x60-128],xmm0;
MOVSRC xmm0,[edx+0x70-128];
MOVDST [ecx+0x70-128],xmm0;
//127~ja, 127 is encodable as simm8 :)
cmp eax,127;
ja _loop_8;
//direct copy for 0~7 qwords
//in order to avoid the inc/dec of all 3 registers
//i use negative relative addressing from the top of the buffers
//[top-current index]
_loop_1:
//prepare the regs for 'negative relative addressing'
add edx,eax;
add ecx,eax;
neg eax;
jz cleanup; //exit if nothing to do
_loop_1_inner:
MOVSRC xmm0,[edx+eax];
MOVDST [ecx+eax],xmm0;
add eax,16; //while the offset is still negative we have data to copy
js _loop_1_inner;
//done !
cleanup:
//restore xmm and exit ~)
movaps xmm0,[_xmm_backup];
ret 4;
}
#undef MOVSRC
#undef MOVDST
}
// This memcpy routine is for use in situations where the source buffer's alignment is indeterminate.
__forceinline void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
{
if( ((uptr)src & 0xf) == 0 )
memcpy_raz_( dest, src, bytes );
else
_memcpy_raz_usrc( dest, src, bytes );
}
// This memcpy routine is for use in situations where the destination buffer's alignment is indeterminate.
__forceinline void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes)
{
if( ((uptr)dest & 0xf) == 0 )
memcpy_raz_( dest, src, bytes );
else
_memcpy_raz_udst( dest, src, bytes );
}
//////////////////////////////////////////////////////////////////////////
// Fast memcpy as coded by AMD, and thn improved by air.
//
// This routine preserves mmx registers! It's the complete real deal!
// --------------------------------------------------------------------------------------
// Fast memcpy as coded by AMD, and then improved by air.
// --------------------------------------------------------------------------------------
__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
{
__asm
@@ -398,15 +116,9 @@ $memcpy_align_done: ; destination is dword aligned
shr eax, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes
mov edx, offset _mmx_backup ; will probably need this to save/restore mmx
cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
jae $memcpy_uc_test
movq [edx+0x00],mm0
movq [edx+0x08],mm1
movq [edx+0x10],mm2
movq [edx+0x18],mm3
// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
@@ -437,11 +149,6 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy
dec eax ; count down
jnz $memcpy_ic_1 ; last 64-byte block?
movq mm0,[edx+0x00]
movq mm1,[edx+0x08]
movq mm2,[edx+0x10]
movq mm3,[edx+0x18]
$memcpy_ic_2:
mov eax, ecx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
@@ -460,10 +167,6 @@ $memcpy_uc_test:
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
movq [edx+0x00],mm0
movq [edx+0x08],mm1
movq [edx+0x10],mm2
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy
@@ -490,10 +193,6 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy
movntq [edi-8], mm1
jnz $memcpy_uc_1 ; last 64-byte block?
movq mm0,[edx+0x00]
movq mm1,[edx+0x08]
movq mm2,[edx+0x10]
jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
// For the largest size blocks, a special technique called Block Prefetch
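The removed memcpy_raz_ helpers in this file finished their sub-128-byte tails with the "negative relative addressing" trick their comments describe: advance both pointers to the end of the copy, negate the remaining count, and let a single add drive the loop instead of bumping three registers. Below is a hedged C++ rendering of just that indexing trick, using SSE2 intrinsics rather than the original inline assembly; the 16-byte granularity follows the removed loop, while the function name and alignment assumptions are illustrative.

#include <cstddef>       // ptrdiff_t
#include <emmintrin.h>   // __m128i, _mm_load_si128, _mm_store_si128

// Assumes both pointers are 16-byte aligned and bytes is a multiple of 16.
static void tail_copy_sketch(void* dest, const void* src, ptrdiff_t bytes)
{
	char*       d = static_cast<char*>(dest) + bytes;        // point past the end
	const char* s = static_cast<const char*>(src) + bytes;

	// off starts negative and climbs toward zero: [end + off] walks forward,
	// and the sign of off alone decides when the loop is done (the js in the asm).
	for (ptrdiff_t off = -bytes; off < 0; off += 16)
	{
		__m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(s + off));
		_mm_store_si128(reinterpret_cast<__m128i*>(d + off), v);
	}
}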