common: remove old memcpy implementation

PCSX2 uses standard memcpy now (thanks to xsacha)
Gregory Hainaut 2014-10-26 22:33:42 +01:00
parent 4d818f6cd9
commit 69e88ffed0
4 changed files with 3 additions and 637 deletions


@@ -32,10 +32,6 @@ extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
void _memset16_unaligned( void* dest, u16 data, size_t size );
// MemcpyVibes.cpp functions
extern void memcpy_vibes(void * dest, const void * src, int size);
extern void gen_memcpy_vibes();
#define memcpy_fast memcpy
#define memcpy_aligned(d,s,c) memcpy(d,s,c)
#define memcpy_const memcpy


@@ -128,7 +128,6 @@ set(UtilitiesSources
wxAppWithHelpers.cpp
wxGuiTools.cpp
wxHelpers.cpp
x86/MemcpyVibes.cpp
)
# variable with all headers of this library


@@ -31,290 +31,19 @@
3dsdk.support@amd.com
******************************************************************************/
// GH: AMD memcpy was removed. The remaining part (memcmp_mmx) is likely from Zerofrog.
// Hopefully memcmp_mmx will be dropped in the future.
#include "PrecompiledHeader.h" #include "PrecompiledHeader.h"
#ifdef _MSC_VER #ifdef _MSC_VER
#pragma warning(disable:4414) #pragma warning(disable:4414)
#endif #endif
/*****************************************************************************
MEMCPY_AMD.CPP
******************************************************************************/
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before
// calling!
#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".
#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.
#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
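The three thresholds above describe a tiered strategy: a movsd-style copy for tiny blocks, cached MMX moves with software prefetch for mid-size blocks, and non-temporal MOVNTQ stores for blocks that would spill out of the cache. Below is a minimal C++ sketch of that dispatch using SSE2 intrinsics instead of the removed MMX assembly; the helper name, the in-sketch threshold constant, and the assumption that dest is 16-byte aligned for the streaming stores are mine, not the source's.

#include <emmintrin.h>   // SSE2: _mm_loadu_si128, _mm_stream_si128, _mm_sfence
#include <cstring>
#include <cstddef>

static void memcpy_tiered_sketch(void* dest, const void* src, size_t n)
{
    const size_t kUncachedCopy = 4 * 1024;     // mirrors UNCACHED_COPY above
    if (n < kUncachedCopy) {
        std::memcpy(dest, src, n);             // tiny / in-cache cases
        return;
    }
    char* d = static_cast<char*>(dest);
    const char* s = static_cast<const char*>(src);
    size_t blocks = n / 16;
    for (size_t i = 0; i < blocks; i++) {
        __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i * 16));
        _mm_stream_si128(reinterpret_cast<__m128i*>(d + i * 16), v);  // bypass the cache
    }
    std::memcpy(d + blocks * 16, s + blocks * 16, n % 16);            // tail bytes
    _mm_sfence();                              // drain the write-combining buffers
}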
// Inline assembly syntax for use with Visual C++
#if defined(_MSC_VER)
// Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs.
__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
{
__asm
{
push edi
push esi
mov edi, ecx ; destination
mov esi, edx ; source
mov ecx, [esp+12] ; number of bytes to copy
mov eax, ecx ; keep a copy of count
cld
cmp eax, TINY_BLOCK_COPY
jb $memcpy_ic_3 ; tiny? skip mmx copy
cmp eax, 32*1024 ; don't align between 32k-64k because
jbe $memcpy_do_align ; it appears to be slower
cmp eax, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov eax, 8 ; a trick that's faster than rep movsb...
sub eax, edi ; align destination to qword
and eax, 111b ; get the low bits
sub ecx, eax ; update copy count
neg eax ; set up to jump into the array
add eax, offset $memcpy_align_done
jmp eax ; jump to array of movsb's
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
$memcpy_align_done: ; destination is dword aligned
mov eax, ecx ; number of bytes left to copy
shr eax, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes
cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
jae $memcpy_uc_test
// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1: ; 64-byte block copies, in-cache copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0, [esi+0] ; read 64 bits
movq mm1, [esi+8]
movq [edi+0], mm0 ; write 64 bits
movq [edi+8], mm1 ; note: the normal movq writes the
movq mm2, [esi+16] ; data to cache; a cache line will be
movq mm3, [esi+24] ; allocated as needed, to store the data
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
sub eax, 1
jnz $memcpy_ic_1 ; last 64-byte block?
$memcpy_ic_2:
mov eax, ecx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
shr eax, 2 ; dword count
and eax, 1111b ; only look at the "remainder" bits
neg eax ; set up to jump into the array
add eax, offset $memcpy_last_few
jmp eax ; jump to array of movsd's
$memcpy_uc_test:
or eax, eax ; tail end of block prefetch will jump here
jz $memcpy_ic_2 ; no more 64-byte blocks left
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0,[esi+0] ; read 64 bits
add edi,64 ; update destination pointer
movq mm1,[esi+8]
add esi,64 ; update source pointer
movq mm2,[esi-48]
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
movq mm0,[esi-40] ; note: movntq also prevents the CPU
movntq [edi-56], mm1 ; from READING the destination address
movq mm1,[esi-32] ; into the cache, only to be over-written
movntq [edi-48], mm2 ; so that also helps performance
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
movntq [edi-8], mm1
sub eax, 1
jnz $memcpy_uc_1 ; last 64-byte block?
jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been
// disabled to help keep the code cache footprint of memcpy_fast to a minimum.
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
align 16
movsd
movsd ; perform last 1-15 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd ; perform last 1-7 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
$memcpy_last_few: ; dword aligned from before movsd's
and ecx, 11b ; the last few cows must come home
jz $memcpy_final ; no more, let's leave
rep movsb ; the last 1, 2, or 3 bytes
$memcpy_final:
pop esi
pop edi
emms ; clean up the MMX state
sfence ; flush the write buffer
//mov eax, [dest] ; ret value = destination pointer
ret 4
}
}
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
__fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
// registers will improve copy performance, because they won't. Use of XMMs is only
// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
// and even then the benefits are typically minimal (sometimes slower depending on the
// amount of data being copied).
//
// Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them.
// --air
// Linux Conversion note:
// This code would benefit nicely from having inline-able GAS syntax, since it should
// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
// And it's called enough times to probably merit the extra effort to ensure proper
// optimization. --air
__asm
{
mov ecx, dest
mov edx, src
mov eax, qwc ; keep a copy of count
shr eax, 1
jz $memcpy_qwc_1 ; only one 16 byte block to copy?
cmp eax, IN_CACHE_COPY/32
jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air)
$memcpy_qwc_loop2: ; 32-byte blocks, uncached copy
prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
movq mm0,[edx+0] ; read 64 bits
movq mm1,[edx+8]
movq mm2,[edx+16]
movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache
movntq [ecx+8], mm1
movq mm3,[edx+24]
movntq [ecx+16], mm2
movntq [ecx+24], mm3
add edx,32 ; update source pointer
add ecx,32 ; update destination pointer
sub eax,1
jnz $memcpy_qwc_loop2 ; last 64-byte block?
sfence ; flush the write buffer
jmp $memcpy_qwc_1
; 32-byte blocks, cached!
; This *is* important. Removing this and using exclusively non-temporal stores
; results in noticeable speed loss!
$memcpy_qwc_loop1:
prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
movq mm0,[edx+0] ; read 64 bits
movq mm1,[edx+8]
movq mm2,[edx+16]
movq [ecx+0], mm0 ; write 64 bits, bypassing the cache
movq [ecx+8], mm1
movq mm3,[edx+24]
movq [ecx+16], mm2
movq [ecx+24], mm3
add edx,32 ; update source pointer
add ecx,32 ; update destination pointer
sub eax,1
jnz $memcpy_qwc_loop1 ; last 64-byte block?
$memcpy_qwc_1:
test qwc,1
jz $memcpy_qwc_final
movq mm0,[edx]
movq mm1,[edx+8]
movq [ecx], mm0
movq [ecx+8], mm1
$memcpy_qwc_final:
emms ; clean up the MMX state
}
}
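For reference, the contract of memcpy_amd_qwc (count in 128-bit quadwords, no alignment requirement on either pointer) can be expressed with unaligned SSE2 loads and stores. This is only an illustration of the interface, not the MMX routine PCSX2 shipped, and the helper name is made up.

#include <emmintrin.h>
#include <cstddef>

static void memcpy_qwc_sketch(void* dest, const void* src, size_t qwc)
{
    __m128i*       d = static_cast<__m128i*>(dest);
    const __m128i* s = static_cast<const __m128i*>(src);
    for (size_t i = 0; i < qwc; i++)                      // one 128-bit block per count
        _mm_storeu_si128(d + i, _mm_loadu_si128(s + i));
}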
// mmx mem-compare implementation, size has to be a multiple of 8
// returns 0 if equal, nonzero value if not equal
// ~10 times faster than standard memcmp
@@ -489,112 +218,4 @@ End:
}
}
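The memcmp_mmx contract described above (size a multiple of 8, 0 on equality, nonzero otherwise) boils down to a quadword-wise compare. A plain C++ sketch of that contract follows, with a hypothetical name and no claim to the MMX routine's speed.

#include <cstdint>
#include <cstring>

static uint8_t memcmp_qwords_sketch(const void* src1, const void* src2, int cmpsize)
{
    const uint8_t* a = static_cast<const uint8_t*>(src1);
    const uint8_t* b = static_cast<const uint8_t*>(src2);
    for (int i = 0; i < cmpsize; i += 8) {
        uint64_t qa, qb;
        std::memcpy(&qa, a + i, sizeof(qa));   // alignment-safe 64-bit loads
        std::memcpy(&qb, b + i, sizeof(qb));
        if (qa != qb)
            return 1;                          // nonzero: buffers differ
    }
    return 0;                                  // equal
}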
// returns the xor of all elements, cmpsize has to be mult of 8
void memxor_mmx(void* dst, const void* src1, int cmpsize)
{
pxAssert( (cmpsize&7) == 0 );
__asm {
mov ecx, cmpsize
mov eax, src1
mov edx, dst
cmp ecx, 64
jl Setup4
movq mm0, [eax]
movq mm1, [eax+8]
movq mm2, [eax+16]
movq mm3, [eax+24]
movq mm4, [eax+32]
movq mm5, [eax+40]
movq mm6, [eax+48]
movq mm7, [eax+56]
sub ecx, 64
add eax, 64
cmp ecx, 64
jl End8
Cmp8:
pxor mm0, [eax]
pxor mm1, [eax+8]
pxor mm2, [eax+16]
pxor mm3, [eax+24]
pxor mm4, [eax+32]
pxor mm5, [eax+40]
pxor mm6, [eax+48]
pxor mm7, [eax+56]
sub ecx, 64
add eax, 64
cmp ecx, 64
jge Cmp8
End8:
pxor mm0, mm4
pxor mm1, mm5
pxor mm2, mm6
pxor mm3, mm7
cmp ecx, 32
jl End4
pxor mm0, [eax]
pxor mm1, [eax+8]
pxor mm2, [eax+16]
pxor mm3, [eax+24]
sub ecx, 32
add eax, 32
jmp End4
Setup4:
cmp ecx, 32
jl Setup2
movq mm0, [eax]
movq mm1, [eax+8]
movq mm2, [eax+16]
movq mm3, [eax+24]
sub ecx, 32
add eax, 32
End4:
pxor mm0, mm2
pxor mm1, mm3
cmp ecx, 16
jl End2
pxor mm0, [eax]
pxor mm1, [eax+8]
sub ecx, 16
add eax, 16
jmp End2
Setup2:
cmp ecx, 16
jl Setup1
movq mm0, [eax]
movq mm1, [eax+8]
sub ecx, 16
add eax, 16
End2:
pxor mm0, mm1
cmp ecx, 8
jl End1
pxor mm0, [eax]
End1:
movq [edx], mm0
jmp End
Setup1:
movq mm0, [eax]
movq [edx], mm0
End:
emms
}
}
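What memxor_mmx computes, stripped of the MMX unrolling above, is an XOR fold of src1 in 64-bit chunks with the single 8-byte result written to dst. A portable sketch under that reading (hypothetical name, not the removed routine):

#include <cstdint>
#include <cstring>

static void memxor_sketch(void* dst, const void* src1, int cmpsize)
{
    uint64_t acc = 0;
    const uint8_t* p = static_cast<const uint8_t*>(src1);
    for (int i = 0; i < cmpsize; i += 8) {     // cmpsize must be a multiple of 8
        uint64_t chunk;
        std::memcpy(&chunk, p + i, sizeof(chunk));
        acc ^= chunk;
    }
    std::memcpy(dst, &acc, sizeof(acc));       // single 64-bit XOR of all elements
}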
#endif


@@ -1,250 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2010 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "x86emitter/x86emitter.h"
#include <xmmintrin.h>
using namespace x86Emitter;
// Max Number of qwc supported
#define _maxSize 0x400
typedef void (__fastcall *_memCpyCall)(void*, void*);
__aligned16 _memCpyCall _memcpy_vibes[_maxSize+1];
#if 1
// this version uses SSE intrinsics to perform an inline copy. MSVC disasm shows pretty
// decent code generation on whole, but it hasn't been benchmarked at all yet --air
__fi void memcpy_vibes(void * dest, const void * src, int size) {
float (*destxmm)[4] = (float(*)[4])dest, (*srcxmm)[4] = (float(*)[4])src;
size_t count = size & ~15, extra = size & 15;
destxmm -= 8 - extra, srcxmm -= 8 - extra;
switch (extra) {
do {
destxmm += 16, srcxmm += 16, count -= 16;
_mm_store_ps(&destxmm[-8][0], _mm_load_ps(&srcxmm[-8][0]));
case 15:
_mm_store_ps(&destxmm[-7][0], _mm_load_ps(&srcxmm[-7][0]));
case 14:
_mm_store_ps(&destxmm[-6][0], _mm_load_ps(&srcxmm[-6][0]));
case 13:
_mm_store_ps(&destxmm[-5][0], _mm_load_ps(&srcxmm[-5][0]));
case 12:
_mm_store_ps(&destxmm[-4][0], _mm_load_ps(&srcxmm[-4][0]));
case 11:
_mm_store_ps(&destxmm[-3][0], _mm_load_ps(&srcxmm[-3][0]));
case 10:
_mm_store_ps(&destxmm[-2][0], _mm_load_ps(&srcxmm[-2][0]));
case 9:
_mm_store_ps(&destxmm[-1][0], _mm_load_ps(&srcxmm[-1][0]));
case 8:
_mm_store_ps(&destxmm[ 0][0], _mm_load_ps(&srcxmm[ 0][0]));
case 7:
_mm_store_ps(&destxmm[ 1][0], _mm_load_ps(&srcxmm[ 1][0]));
case 6:
_mm_store_ps(&destxmm[ 2][0], _mm_load_ps(&srcxmm[ 2][0]));
case 5:
_mm_store_ps(&destxmm[ 3][0], _mm_load_ps(&srcxmm[ 3][0]));
case 4:
_mm_store_ps(&destxmm[ 4][0], _mm_load_ps(&srcxmm[ 4][0]));
case 3:
_mm_store_ps(&destxmm[ 5][0], _mm_load_ps(&srcxmm[ 5][0]));
case 2:
_mm_store_ps(&destxmm[ 6][0], _mm_load_ps(&srcxmm[ 6][0]));
case 1:
_mm_store_ps(&destxmm[ 7][0], _mm_load_ps(&srcxmm[ 7][0]));
case 0: NULL;
} while (count);
}
}
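The switch-wrapped do/while above is a Duff's-device layout: execution jumps into the middle of the unrolled body so the remainder blocks and the main loop share the same stores. A smaller illustration of the same control flow on 32-bit words (hypothetical helper, unrolled by 4 instead of 16):

#include <cstdint>
#include <cstddef>

static void copy_words_duff(uint32_t* dest, const uint32_t* src, size_t count)
{
    if (count == 0)
        return;
    size_t passes = (count + 3) / 4;        // full trips through the unrolled body
    switch (count % 4) {                    // enter mid-body to handle the remainder
    case 0: do { *dest++ = *src++;
    case 3:      *dest++ = *src++;
    case 2:      *dest++ = *src++;
    case 1:      *dest++ = *src++;
            } while (--passes > 0);
    }
}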
#else
#if 1
// This version creates one function with a lot of movaps
// It jumps to the correct movaps entry-point while adding
// the proper offset for adjustment...
static __pagealigned u8 _memCpyExec[__pagesize*16];
void gen_memcpy_vibes() {
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
xSetPtr(_memCpyExec);
int off =-(((_maxSize & 0xf) - 7) << 4);
for (int i = _maxSize, x = 0; i > 0; i--, x=(x+1)&7, off+=16) {
_memcpy_vibes[i] = (_memCpyCall)xGetPtr();
if (off >= 128) {
off = -128;
xADD(edx, 256);
xADD(ecx, 256);
}
const xRegisterSSE xmm_t(x);
xMOVAPS (xmm_t, ptr32[edx+off]);
xMOVNTPS(ptr32[ecx+off], xmm_t);
}
_memcpy_vibes[0] = (_memCpyCall)xGetPtr();
xRET();
pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}
__fi void memcpy_vibes(void * dest, const void * src, int size) {
int offset = ((size & 0xf) - 7) << 4;
_memcpy_vibes[size]((void*)((uptr)dest + offset), (void*)((uptr)src + offset));
}
#else
// This version creates '_maxSize' number of different functions,
// and calls the appropriate one...
static __pagealigned u8 _memCpyExec[__pagesize*_maxSize*2];
void gen_memcpy_vibes() {
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
xSetPtr(_memCpyExec);
for (int i = 0; i < _maxSize+1; i++)
{
int off = 0;
_memcpy_vibes[i] = (_memCpyCall)xGetAlignedCallTarget();
for (int j = 0, x = 0; j < i; j++, x=(x+1)&7, off+=16) {
if (off >= 128) {
off = -128;
xADD(edx, 256);
xADD(ecx, 256);
}
const xRegisterSSE xmm_t(x);
xMOVAPS(xmm_t, ptr32[edx+off]);
xMOVAPS(ptr32[ecx+off], xmm_t);
}
xRET();
pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
}
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}
__fi void memcpy_vibes(void * dest, const void * src, int size) {
_memcpy_vibes[size](dest, src);
}
#endif
#endif
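Both generated variants above share one idea: a table indexed by the qwc count, so the caller jumps straight to a copy body whose length is already known and pays no per-call size test or loop bookkeeping. A portable sketch of that dispatch, with a templated copy standing in for the emitted movaps chains (the names and the tiny table size are mine):

#include <emmintrin.h>
#include <cstddef>

typedef void (*CopyFn)(void* dest, const void* src);

template <size_t QWC>
static void copy_fixed(void* dest, const void* src)
{
    __m128i*       d = static_cast<__m128i*>(dest);
    const __m128i* s = static_cast<const __m128i*>(src);
    for (size_t i = 0; i < QWC; i++)        // trip count known at compile time
        _mm_storeu_si128(d + i, _mm_loadu_si128(s + i));
}

static const CopyFn copy_table[] = {
    copy_fixed<0>, copy_fixed<1>, copy_fixed<2>, copy_fixed<3>, copy_fixed<4>,
};

static void memcpy_dispatch_sketch(void* dest, const void* src, size_t qwc)
{
    copy_table[qwc](dest, src);             // qwc must be below the table size
}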
// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment,
// to get around compilation issues with having it in the headers.
#ifdef __linux__
// This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
__fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
// registers will improve copy performance, because they won't. Use of XMMs is only
// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
// and even then the benefits are typically minimal (sometimes slower depending on the
// amount of data being copied).
//
// Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them.
// --air
// Linux Conversion note:
// This code would benefit nicely from having inline-able GAS syntax, since it should
// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
// And it's called enough times to probably merit the extra effort to ensure proper
// optimization. --air
__asm__ __volatile__
(
".intel_syntax noprefix\n"
"sub %[qwc], 1\n" // dec the counter to ease the count of 16bytes block later (optimization)
// Note after this line, real value of the counter is %[qwc] + 1
"jle memcpy_qwc_1_%=\n" // only one 16 byte block to copy? Or nothing.
"cmp %[qwc], 127\n" // "IN_CACHE_COPY/16"
"jb memcpy_qwc_loop1_%=\n" // small copies should be cached (definite speedup --air)
"memcpy_qwc_loop2_%=:\n" // 32-byte blocks, uncached copy
"prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)
"movq mm0,[%[src]+0]\n" // read 64 bits
"movq mm1,[%[src]+8]\n"
"movq mm2,[%[src]+16]\n"
"movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache
"movntq [%[dest]+8], mm1\n"
"movq mm3,[%[src]+24]\n"
"movntq [%[dest]+16], mm2\n"
"movntq [%[dest]+24], mm3\n"
"add %[src],32\n" // update source pointer
"add %[dest],32\n" // update destination pointer
"sub %[qwc],2\n"
"jg memcpy_qwc_loop2_%=\n" // last 64-byte block?
"sfence\n" // flush the write buffer
"jmp memcpy_qwc_1_%=\n"
// 32-byte blocks, cached!
// This *is* important. Removing this and using exclusively non-temporal stores
// results in noticeable speed loss!
"memcpy_qwc_loop1_%=:\n"
"prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)
"movq mm0,[%[src]+0]\n" // read 64 bits
"movq mm1,[%[src]+8]\n"
"movq mm2,[%[src]+16]\n"
"movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache
"movq [%[dest]+8], mm1\n"
"movq mm3,[%[src]+24]\n"
"movq [%[dest]+16], mm2\n"
"movq [%[dest]+24], mm3\n"
"add %[src],32\n" // update source pointer
"add %[dest],32\n" // update destination pointer
"sub %[qwc],2\n"
"jg memcpy_qwc_loop2_%=\n" // last 64-byte block?
"memcpy_qwc_1_%=:\n"
"cmp %[qwc],0\n"
"jne memcpy_qwc_final_%=\n"
"movq mm0,[%[src]]\n"
"movq mm1,[%[src]+8]\n"
"movq [%[dest]], mm0\n"
"movq [%[dest]+8], mm1\n"
"memcpy_qwc_final_%=:\n"
"emms\n" // clean up the MMX state
".att_syntax\n"
: "=&r"(dest), "=&r"(src), "=&r"(qwc)
: [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc)
: "memory", "mm0", "mm1", "mm2", "mm3"
);
}
#endif
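The constraint list at the end of the block above is what ties the C variables to registers: the outputs are clobber-safe registers, the bracketed inputs are pinned to them via the matching digits, and the "memory" clobber stops GCC from caching values across the asm. A minimal example of the same mechanism, using the more compact "+" read-write form and a rep movsb body (hypothetical helper, x86 only, not taken from this file):

#include <cstddef>

static inline void copy_bytes_rep_movsb(void* dst, const void* src, size_t n)
{
    __asm__ __volatile__(
        "rep movsb"
        : "+D"(dst), "+S"(src), "+c"(n)   // edi/rdi = dest, esi/rsi = src, ecx/rcx = count
        :
        : "memory");                      // the asm writes memory GCC cannot see
}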