mirror of https://github.com/PCSX2/pcsx2.git
common: remove old memcpy implementation
PCSX2 now uses the standard memcpy (thanks to xsacha)
This commit is contained in:
parent
4d818f6cd9
commit
69e88ffed0
@ -32,10 +32,6 @@ extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
void _memset16_unaligned( void* dest, u16 data, size_t size );

// MemcpyVibes.cpp functions
extern void memcpy_vibes(void * dest, const void * src, int size);
extern void gen_memcpy_vibes();

#define memcpy_fast memcpy
#define memcpy_aligned(d,s,c) memcpy(d,s,c)
#define memcpy_const memcpy
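With the old routines gone, the aliases in the hunk above make every former fast-path call resolve to the standard library. A minimal sketch of what call sites now compile to (copy_example is an illustrative name, not code from the repository):

    #include <string.h>
    #include <stddef.h>

    #define memcpy_fast memcpy
    #define memcpy_aligned(d,s,c) memcpy(d,s,c)
    #define memcpy_const memcpy

    void copy_example(void* dst, const void* src, size_t bytes)
    {
        memcpy_fast(dst, src, bytes);    // expands to memcpy(dst, src, bytes)
        memcpy_aligned(dst, src, bytes); // likewise; alignment is no longer special-cased
    }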
@ -128,7 +128,6 @@ set(UtilitiesSources
	wxAppWithHelpers.cpp
	wxGuiTools.cpp
	wxHelpers.cpp
	x86/MemcpyVibes.cpp
	)

# variable with all headers of this library
@ -31,290 +31,19 @@
3dsdk.support@amd.com
******************************************************************************/

// GH: AMD memcpy was removed. The remaining part (memcmp_mmx) is likely from Zerofrog.
// Hopefully memcmp_mmx will be dropped in the future.

#include "PrecompiledHeader.h"

#ifdef _MSC_VER
#pragma warning(disable:4414)
#endif

/*****************************************************************************
MEMCPY_AMD.CPP
******************************************************************************/

// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before
// calling!

#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
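The three thresholds above amount to a size-based dispatch. A hedged C++ sketch of the selection logic (the enum and function names are illustrative; the real dispatch happens inside the assembly below):

    #include <cstddef>

    enum CopyStrategy { kTinyMovsd, kInCacheMovq, kUncachedMovntq };

    // Mirrors the comparisons in memcpy_amd_: below 64 bytes use the movsd tail,
    // below IN_CACHE_COPY use cached movq blocks, otherwise stream with movntq.
    static CopyStrategy pick_strategy(std::size_t bytes)
    {
        if (bytes < 64)       return kTinyMovsd;    // TINY_BLOCK_COPY
        if (bytes < 2 * 1024) return kInCacheMovq;  // IN_CACHE_COPY
        return kUncachedMovntq;
    }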
// Inline assembly syntax for use with Visual C++

#if defined(_MSC_VER)

// Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs.
__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
{
	__asm
	{
		push edi
		push esi

		mov edi, ecx ; destination
		mov esi, edx ; source
		mov ecx, [esp+12] ; number of bytes to copy
		mov eax, ecx ; keep a copy of count

		cld
		cmp eax, TINY_BLOCK_COPY
		jb $memcpy_ic_3 ; tiny? skip mmx copy

		cmp eax, 32*1024 ; don't align between 32k-64k because
		jbe $memcpy_do_align ; it appears to be slower
		cmp eax, 64*1024
		jbe $memcpy_align_done

	$memcpy_do_align:
		mov eax, 8 ; a trick that's faster than rep movsb...
		sub eax, edi ; align destination to qword
		and eax, 111b ; get the low bits
		sub ecx, eax ; update copy count
		neg eax ; set up to jump into the array
		add eax, offset $memcpy_align_done
		jmp eax ; jump to array of movsb's

	align 4
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb

	$memcpy_align_done: ; destination is dword aligned
		mov eax, ecx ; number of bytes left to copy
		shr eax, 6 ; get 64-byte block count
		jz $memcpy_ic_2 ; finish the last few bytes

		cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
		jae $memcpy_uc_test

	// This is small block copy that uses the MMX registers to copy 8 bytes
	// at a time. It uses the "unrolled loop" optimization, and also uses
	// the software prefetch instruction to get the data into the cache.
	align 16
	$memcpy_ic_1: ; 64-byte block copies, in-cache copy

		prefetchnta [esi + (200*64/34+192)] ; start reading ahead

		movq mm0, [esi+0] ; read 64 bits
		movq mm1, [esi+8]
		movq [edi+0], mm0 ; write 64 bits
		movq [edi+8], mm1 ; note: the normal movq writes the
		movq mm2, [esi+16] ; data to cache; a cache line will be
		movq mm3, [esi+24] ; allocated as needed, to store the data
		movq [edi+16], mm2
		movq [edi+24], mm3
		movq mm0, [esi+32]
		movq mm1, [esi+40]
		movq [edi+32], mm0
		movq [edi+40], mm1
		movq mm2, [esi+48]
		movq mm3, [esi+56]
		movq [edi+48], mm2
		movq [edi+56], mm3

		add esi, 64 ; update source pointer
		add edi, 64 ; update destination pointer
		sub eax, 1
		jnz $memcpy_ic_1 ; last 64-byte block?

	$memcpy_ic_2:
		mov eax, ecx ; has valid low 6 bits of the byte count
	$memcpy_ic_3:
		shr eax, 2 ; dword count
		and eax, 1111b ; only look at the "remainder" bits
		neg eax ; set up to jump into the array
		add eax, offset $memcpy_last_few
		jmp eax ; jump to array of movsd's

	$memcpy_uc_test:
		or eax, eax ; tail end of block prefetch will jump here
		jz $memcpy_ic_2 ; no more 64-byte blocks left

	// For larger blocks, which will spill beyond the cache, it's faster to
	// use the Streaming Store instruction MOVNTQ. This write instruction
	// bypasses the cache and writes straight to main memory. This code also
	// uses the software prefetch instruction to pre-read the data.

	align 16
	$memcpy_uc_1: ; 64-byte blocks, uncached copy

		prefetchnta [esi + (200*64/34+192)] ; start reading ahead

		movq mm0,[esi+0] ; read 64 bits
		add edi,64 ; update destination pointer
		movq mm1,[esi+8]
		add esi,64 ; update source pointer
		movq mm2,[esi-48]
		movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
		movq mm0,[esi-40] ; note: movntq also prevents the CPU
		movntq [edi-56], mm1 ; from READING the destination address
		movq mm1,[esi-32] ; into the cache, only to be over-written
		movntq [edi-48], mm2 ; so that also helps performance
		movq mm2,[esi-24]
		movntq [edi-40], mm0
		movq mm0,[esi-16]
		movntq [edi-32], mm1
		movq mm1,[esi-8]
		movntq [edi-24], mm2
		movntq [edi-16], mm0
		movntq [edi-8], mm1

		sub eax, 1
		jnz $memcpy_uc_1 ; last 64-byte block?

		jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)

	// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been
	// disabled to help keep the code cache footprint of memcpy_fast to a minimum.

	// The smallest copy uses the X86 "movsd" instruction, in an optimized
	// form which is an "unrolled loop". Then it handles the last few bytes.
	align 16
		movsd
		movsd ; perform last 1-15 dword copies
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd ; perform last 1-7 dword copies
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd

	$memcpy_last_few: ; dword aligned from before movsd's
		and ecx, 11b ; the last few cows must come home
		jz $memcpy_final ; no more, let's leave
		rep movsb ; the last 1, 2, or 3 bytes

	$memcpy_final:
		pop esi
		pop edi

		emms ; clean up the MMX state
		sfence ; flush the write buffer
		//mov eax, [dest] ; ret value = destination pointer

		ret 4
	}
}
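The comments above describe why movntq is used for large blocks: non-temporal stores skip the cache and avoid reading the destination line only to overwrite it. A hedged sketch of the same idea with SSE2 intrinsics, as a modern analogue of the movntq loop (the function name and the 16-byte-alignment assumption are mine, not from this file):

    #include <cstddef>
    #include <emmintrin.h>   // _mm_load_si128, _mm_stream_si128; _mm_sfence comes via xmmintrin.h

    // Streaming-copy sketch: assumes dest/src are 16-byte aligned and bytes is a
    // multiple of 16. Non-temporal stores bypass the cache, so an sfence is issued
    // afterwards, just as the original routine does.
    static void stream_copy(void* dest, const void* src, std::size_t bytes)
    {
        __m128i* d = static_cast<__m128i*>(dest);
        const __m128i* s = static_cast<const __m128i*>(src);
        for (std::size_t i = 0; i < bytes / 16; ++i)
            _mm_stream_si128(d + i, _mm_load_si128(s + i));
        _mm_sfence();
    }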

// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
__fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
	// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
	// registers will improve copy performance, because they won't. Use of XMMs is only
	// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
	// and even then the benefits are typically minimal (sometimes slower depending on the
	// amount of data being copied).
	//
	// Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them.
	// --air

	// Linux Conversion note:
	// This code would benefit nicely from having inline-able GAS syntax, since it should
	// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
	// And it's called enough times to probably merit the extra effort to ensure proper
	// optimization. --air

	__asm
	{
		mov ecx, dest
		mov edx, src
		mov eax, qwc ; keep a copy of count
		shr eax, 1
		jz $memcpy_qwc_1 ; only one 16 byte block to copy?

		cmp eax, IN_CACHE_COPY/32
		jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air)

	$memcpy_qwc_loop2: ; 32-byte blocks, uncached copy
		prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)

		movq mm0,[edx+0] ; read 64 bits
		movq mm1,[edx+8]
		movq mm2,[edx+16]
		movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache
		movntq [ecx+8], mm1
		movq mm3,[edx+24]
		movntq [ecx+16], mm2
		movntq [ecx+24], mm3

		add edx,32 ; update source pointer
		add ecx,32 ; update destination pointer
		sub eax,1
		jnz $memcpy_qwc_loop2 ; last 32-byte block?
		sfence ; flush the write buffer
		jmp $memcpy_qwc_1

	; 32-byte blocks, cached!
	; This *is* important. Removing this and using exclusively non-temporal stores
	; results in noticeable speed loss!

	$memcpy_qwc_loop1:
		prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)

		movq mm0,[edx+0] ; read 64 bits
		movq mm1,[edx+8]
		movq mm2,[edx+16]
		movq [ecx+0], mm0 ; write 64 bits (cached store)
		movq [ecx+8], mm1
		movq mm3,[edx+24]
		movq [ecx+16], mm2
		movq [ecx+24], mm3

		add edx,32 ; update source pointer
		add ecx,32 ; update destination pointer
		sub eax,1
		jnz $memcpy_qwc_loop1 ; last 32-byte block?

	$memcpy_qwc_1:
		test qwc,1
		jz $memcpy_qwc_final
		movq mm0,[edx]
		movq mm1,[edx+8]
		movq [ecx], mm0
		movq [ecx+8], mm1

	$memcpy_qwc_final:
		emms ; clean up the MMX state
	}
}
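memcpy_amd_qwc counts in 128-bit quadwords, so its contract is simply a copy of qwc * 16 bytes; with this commit that reduces to a standard-library call. A sketch of the equivalent (the portable name is illustrative):

    #include <cstring>
    #include <cstddef>

    // Same contract as memcpy_amd_qwc: qwc is a count of 16-byte quadwords.
    inline void memcpy_qwc(void* dest, const void* src, std::size_t qwc)
    {
        std::memcpy(dest, src, qwc * 16);
    }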

// mmx mem-compare implementation, size has to be a multiple of 8
// returns 0 if equal, nonzero value if not equal
// ~10 times faster than standard memcmp
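Note that memcmp_mmx keeps only the equality half of memcmp's contract: zero means equal, any nonzero value means different, and no ordering is reported. A hedged wrapper showing an equivalent use of the standard library for callers that only test equality:

    #include <cstring>

    // Equality-only comparison: 0 when equal, 1 when not. The MMX version
    // additionally requires size to be a multiple of 8.
    inline int memcmp_equal_only(const void* a, const void* b, std::size_t size)
    {
        return std::memcmp(a, b, size) != 0;
    }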
@ -489,112 +218,4 @@ End:
}
}

// returns the xor of all elements, cmpsize has to be mult of 8
void memxor_mmx(void* dst, const void* src1, int cmpsize)
{
	pxAssert( (cmpsize&7) == 0 );

	__asm {
		mov ecx, cmpsize
		mov eax, src1
		mov edx, dst

		cmp ecx, 64
		jl Setup4

		movq mm0, [eax]
		movq mm1, [eax+8]
		movq mm2, [eax+16]
		movq mm3, [eax+24]
		movq mm4, [eax+32]
		movq mm5, [eax+40]
		movq mm6, [eax+48]
		movq mm7, [eax+56]
		sub ecx, 64
		add eax, 64
		cmp ecx, 64
		jl End8

	Cmp8:
		pxor mm0, [eax]
		pxor mm1, [eax+8]
		pxor mm2, [eax+16]
		pxor mm3, [eax+24]
		pxor mm4, [eax+32]
		pxor mm5, [eax+40]
		pxor mm6, [eax+48]
		pxor mm7, [eax+56]

		sub ecx, 64
		add eax, 64
		cmp ecx, 64
		jge Cmp8

	End8:
		pxor mm0, mm4
		pxor mm1, mm5
		pxor mm2, mm6
		pxor mm3, mm7

		cmp ecx, 32
		jl End4
		pxor mm0, [eax]
		pxor mm1, [eax+8]
		pxor mm2, [eax+16]
		pxor mm3, [eax+24]
		sub ecx, 32
		add eax, 32
		jmp End4

	Setup4:
		cmp ecx, 32
		jl Setup2

		movq mm0, [eax]
		movq mm1, [eax+8]
		movq mm2, [eax+16]
		movq mm3, [eax+24]
		sub ecx, 32
		add eax, 32

	End4:
		pxor mm0, mm2
		pxor mm1, mm3

		cmp ecx, 16
		jl End2
		pxor mm0, [eax]
		pxor mm1, [eax+8]
		sub ecx, 16
		add eax, 16
		jmp End2

	Setup2:
		cmp ecx, 16
		jl Setup1

		movq mm0, [eax]
		movq mm1, [eax+8]
		sub ecx, 16
		add eax, 16

	End2:
		pxor mm0, mm1

		cmp ecx, 8
		jl End1
		pxor mm0, [eax]
	End1:
		movq [edx], mm0
		jmp End

	Setup1:
		movq mm0, [eax]
		movq [edx], mm0
	End:
		emms
	}
}

#endif
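memxor_mmx folds every 64-bit element of src1 together with XOR and writes the single 8-byte result to dst (cmpsize must be a multiple of 8). A portable sketch of that contract (the name and the memcpy-based unaligned loads are mine):

    #include <cstdint>
    #include <cstring>

    inline void memxor_portable(void* dst, const void* src1, int cmpsize)
    {
        uint64_t acc = 0;
        const unsigned char* p = static_cast<const unsigned char*>(src1);
        for (int i = 0; i < cmpsize; i += 8)
        {
            uint64_t v;
            std::memcpy(&v, p + i, 8);   // unaligned-safe 64-bit load
            acc ^= v;
        }
        std::memcpy(dst, &acc, 8);       // single 64-bit result, like movq [edx], mm0
    }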
@ -1,250 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
 * Copyright (C) 2002-2010 PCSX2 Dev Team
 *
 * PCSX2 is free software: you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Found-
 * ation, either version 3 of the License, or (at your option) any later version.
 *
 * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with PCSX2.
 * If not, see <http://www.gnu.org/licenses/>.
 */

#include "PrecompiledHeader.h"
#include "x86emitter/x86emitter.h"
#include <xmmintrin.h>

using namespace x86Emitter;

// Max Number of qwc supported
#define _maxSize 0x400

typedef void (__fastcall *_memCpyCall)(void*, void*);
__aligned16 _memCpyCall _memcpy_vibes[_maxSize+1];

#if 1

// this version uses SSE intrinsics to perform an inline copy. MSVC disasm shows pretty
// decent code generation on whole, but it hasn't been benchmarked at all yet --air
__fi void memcpy_vibes(void * dest, const void * src, int size) {

	float (*destxmm)[4] = (float(*)[4])dest, (*srcxmm)[4] = (float(*)[4])src;
	size_t count = size & ~15, extra = size & 15;

	destxmm -= 8 - extra, srcxmm -= 8 - extra;
	switch (extra) {
		do {
			destxmm += 16, srcxmm += 16, count -= 16;
			_mm_store_ps(&destxmm[-8][0], _mm_load_ps(&srcxmm[-8][0]));
		case 15:
			_mm_store_ps(&destxmm[-7][0], _mm_load_ps(&srcxmm[-7][0]));
		case 14:
			_mm_store_ps(&destxmm[-6][0], _mm_load_ps(&srcxmm[-6][0]));
		case 13:
			_mm_store_ps(&destxmm[-5][0], _mm_load_ps(&srcxmm[-5][0]));
		case 12:
			_mm_store_ps(&destxmm[-4][0], _mm_load_ps(&srcxmm[-4][0]));
		case 11:
			_mm_store_ps(&destxmm[-3][0], _mm_load_ps(&srcxmm[-3][0]));
		case 10:
			_mm_store_ps(&destxmm[-2][0], _mm_load_ps(&srcxmm[-2][0]));
		case 9:
			_mm_store_ps(&destxmm[-1][0], _mm_load_ps(&srcxmm[-1][0]));
		case 8:
			_mm_store_ps(&destxmm[ 0][0], _mm_load_ps(&srcxmm[ 0][0]));
		case 7:
			_mm_store_ps(&destxmm[ 1][0], _mm_load_ps(&srcxmm[ 1][0]));
		case 6:
			_mm_store_ps(&destxmm[ 2][0], _mm_load_ps(&srcxmm[ 2][0]));
		case 5:
			_mm_store_ps(&destxmm[ 3][0], _mm_load_ps(&srcxmm[ 3][0]));
		case 4:
			_mm_store_ps(&destxmm[ 4][0], _mm_load_ps(&srcxmm[ 4][0]));
		case 3:
			_mm_store_ps(&destxmm[ 5][0], _mm_load_ps(&srcxmm[ 5][0]));
		case 2:
			_mm_store_ps(&destxmm[ 6][0], _mm_load_ps(&srcxmm[ 6][0]));
		case 1:
			_mm_store_ps(&destxmm[ 7][0], _mm_load_ps(&srcxmm[ 7][0]));
		case 0: NULL;
		} while (count);
	}
}
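The switch/do-while above is a Duff's-device-style unrolled copy: size counts 16-byte blocks, extra selects the entry point into the unrolled body, and the pointer bias (destxmm -= 8 - extra) makes each case label address the right block. A plain, non-unrolled sketch of the same operation (the function name is mine; like the original it assumes 16-byte aligned pointers):

    #include <xmmintrin.h>

    inline void copy_blocks16(void* dest, const void* src, int blocks)
    {
        float (*d)[4] = (float(*)[4])dest;
        const float (*s)[4] = (const float(*)[4])src;
        for (int i = 0; i < blocks; ++i)
            _mm_store_ps(&d[i][0], _mm_load_ps(&s[i][0])); // one 16-byte block per iteration
    }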

#else
#if 1
// This version creates one function with a lot of movaps
// It jumps to the correct movaps entry-point while adding
// the proper offset for adjustment...

static __pagealigned u8 _memCpyExec[__pagesize*16];

void gen_memcpy_vibes() {
	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
	memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
	xSetPtr(_memCpyExec);

	int off = -(((_maxSize & 0xf) - 7) << 4);
	for (int i = _maxSize, x = 0; i > 0; i--, x=(x+1)&7, off+=16) {

		_memcpy_vibes[i] = (_memCpyCall)xGetPtr();

		if (off >= 128) {
			off = -128;
			xADD(edx, 256);
			xADD(ecx, 256);
		}
		const xRegisterSSE xmm_t(x);
		xMOVAPS (xmm_t, ptr32[edx+off]);
		xMOVNTPS(ptr32[ecx+off], xmm_t);
	}

	_memcpy_vibes[0] = (_memCpyCall)xGetPtr();

	xRET();
	pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));

	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}

__fi void memcpy_vibes(void * dest, const void * src, int size) {
	int offset = ((size & 0xf) - 7) << 4;
	_memcpy_vibes[size]((void*)((uptr)dest + offset), (void*)((uptr)src + offset));
}

#else

// This version creates '_maxSize' number of different functions,
// and calls the appropriate one...

static __pagealigned u8 _memCpyExec[__pagesize*_maxSize*2];

void gen_memcpy_vibes() {
	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
	memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
	xSetPtr(_memCpyExec);

	for (int i = 0; i < _maxSize+1; i++)
	{
		int off = 0;
		_memcpy_vibes[i] = (_memCpyCall)xGetAlignedCallTarget();

		for (int j = 0, x = 0; j < i; j++, x=(x+1)&7, off+=16) {
			if (off >= 128) {
				off = -128;
				xADD(edx, 256);
				xADD(ecx, 256);
			}
			const xRegisterSSE xmm_t(x);
			xMOVAPS(xmm_t, ptr32[edx+off]);
			xMOVAPS(ptr32[ecx+off], xmm_t);
		}

		xRET();
		pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
	}

	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}

__fi void memcpy_vibes(void * dest, const void * src, int size) {
	_memcpy_vibes[size](dest, src);
}

#endif
#endif
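Both generated variants above fill the same _memcpy_vibes table: one function pointer per supported block count, indexed by size at the call site. A conceptual sketch of that dispatch with ordinary functions (the names are illustrative; the real table is filled with emitter-generated code):

    #include <cstring>

    typedef void (*CopyFn)(void* dest, const void* src);

    static void copy_0_blocks(void*, const void*)     {}
    static void copy_1_block (void* d, const void* s) { std::memcpy(d, s, 16); }
    static void copy_2_blocks(void* d, const void* s) { std::memcpy(d, s, 32); }

    // One entry per supported block count, like _memcpy_vibes[_maxSize+1].
    static const CopyFn copy_table[] = { copy_0_blocks, copy_1_block, copy_2_blocks };

    inline void copy_dispatch(void* dest, const void* src, int blocks)
    {
        copy_table[blocks](dest, src); // same idea as _memcpy_vibes[size](dest, src)
    }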

// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment,
// to get around compilation issues with having it in the headers.
#ifdef __linux__

// This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
__fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
	// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
	// registers will improve copy performance, because they won't. Use of XMMs is only
	// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
	// and even then the benefits are typically minimal (sometimes slower depending on the
	// amount of data being copied).
	//
	// Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them.
	// --air

	// Linux Conversion note:
	// This code would benefit nicely from having inline-able GAS syntax, since it should
	// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
	// And it's called enough times to probably merit the extra effort to ensure proper
	// optimization. --air

	__asm__ __volatile__
	(
		".intel_syntax noprefix\n"
		"sub %[qwc], 1\n"             // dec the counter to ease the count of 16-byte blocks later (optimization)
		                              // Note: after this line, the real value of the counter is %[qwc] + 1
		"jle memcpy_qwc_1_%=\n"       // only one 16 byte block to copy? Or nothing.

		"cmp %[qwc], 127\n"           // "IN_CACHE_COPY/16"
		"jb memcpy_qwc_loop1_%=\n"    // small copies should be cached (definite speedup --air)

		"memcpy_qwc_loop2_%=:\n"      // 32-byte blocks, uncached copy
		"prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)

		"movq mm0,[%[src]+0]\n"       // read 64 bits
		"movq mm1,[%[src]+8]\n"
		"movq mm2,[%[src]+16]\n"
		"movntq [%[dest]+0], mm0\n"   // write 64 bits, bypassing the cache
		"movntq [%[dest]+8], mm1\n"
		"movq mm3,[%[src]+24]\n"
		"movntq [%[dest]+16], mm2\n"
		"movntq [%[dest]+24], mm3\n"

		"add %[src],32\n"             // update source pointer
		"add %[dest],32\n"            // update destination pointer
		"sub %[qwc],2\n"
		"jg memcpy_qwc_loop2_%=\n"    // last 32-byte block?
		"sfence\n"                    // flush the write buffer
		"jmp memcpy_qwc_1_%=\n"

		// 32-byte blocks, cached!
		// This *is* important. Removing this and using exclusively non-temporal stores
		// results in noticeable speed loss!

		"memcpy_qwc_loop1_%=:\n"
		"prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)

		"movq mm0,[%[src]+0]\n"       // read 64 bits
		"movq mm1,[%[src]+8]\n"
		"movq mm2,[%[src]+16]\n"
		"movq [%[dest]+0], mm0\n"     // write 64 bits (cached store)
		"movq [%[dest]+8], mm1\n"
		"movq mm3,[%[src]+24]\n"
		"movq [%[dest]+16], mm2\n"
		"movq [%[dest]+24], mm3\n"

		"add %[src],32\n"             // update source pointer
		"add %[dest],32\n"            // update destination pointer
		"sub %[qwc],2\n"
		"jg memcpy_qwc_loop2_%=\n"    // last 32-byte block?

		"memcpy_qwc_1_%=:\n"
		"cmp %[qwc],0\n"
		"jne memcpy_qwc_final_%=\n"
		"movq mm0,[%[src]]\n"
		"movq mm1,[%[src]+8]\n"
		"movq [%[dest]], mm0\n"
		"movq [%[dest]+8], mm1\n"

		"memcpy_qwc_final_%=:\n"
		"emms\n"                      // clean up the MMX state
		".att_syntax\n"
		: "=&r"(dest), "=&r"(src), "=&r"(qwc)
		: [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc)
		: "memory", "mm0", "mm1", "mm2", "mm3"
	);
}
#endif