diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h
index 9f4df897ae..b3754d8725 100644
--- a/common/include/Utilities/MemcpyFast.h
+++ b/common/include/Utilities/MemcpyFast.h
@@ -32,10 +32,6 @@ extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
void _memset16_unaligned( void* dest, u16 data, size_t size );
-// MemcpyVibes.cpp functions
-extern void memcpy_vibes(void * dest, const void * src, int size);
-extern void gen_memcpy_vibes();
-
#define memcpy_fast memcpy
#define memcpy_aligned(d,s,c) memcpy(d,s,c)
#define memcpy_const memcpy
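With the memcpy_vibes declarations gone, callers are left with the macros above, which now map straight onto the standard CRT memcpy. As a minimal sketch (not part of the patch) of what a former memcpy_vibes call site could look like after this change, assuming the size argument was a count of 128-bit quadwords as in the removed implementation; the wrapper name is hypothetical:

    #include <cstring>

    // Hypothetical helper, for illustration only: the removed memcpy_vibes took a
    // quadword (16-byte) count, so the equivalent plain copy is qwc * 16 bytes.
    static void copy_qwc(void* dest, const void* src, size_t qwc)
    {
        memcpy_fast(dest, src, qwc * 16); // memcpy_fast now expands to memcpy
    }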
diff --git a/common/src/Utilities/CMakeLists.txt b/common/src/Utilities/CMakeLists.txt
index 9e14b095b5..0765a33e54 100644
--- a/common/src/Utilities/CMakeLists.txt
+++ b/common/src/Utilities/CMakeLists.txt
@@ -128,7 +128,6 @@ set(UtilitiesSources
wxAppWithHelpers.cpp
wxGuiTools.cpp
wxHelpers.cpp
- x86/MemcpyVibes.cpp
)
# variable with all headers of this library
diff --git a/common/src/Utilities/x86/MemcpyFast.cpp b/common/src/Utilities/x86/MemcpyFast.cpp
index 2ae3f34fb2..afae09d709 100644
--- a/common/src/Utilities/x86/MemcpyFast.cpp
+++ b/common/src/Utilities/x86/MemcpyFast.cpp
@@ -31,290 +31,19 @@
3dsdk.support@amd.com
******************************************************************************/
+// GH: AMD memcpy was removed. The remaining part (memcmp_mmx) is likely from Zerofrog.
+// Hopefully memcmp_mmx will be dropped in the future.
+
#include "PrecompiledHeader.h"
#ifdef _MSC_VER
#pragma warning(disable:4414)
#endif
-/*****************************************************************************
-MEMCPY_AMD.CPP
-******************************************************************************/
-
-// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
-// "Streaming Store"), and also uses the software prefetch instructions,
-// be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before
-// calling!
-
-#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
-// The smallest copy uses the X86 "movsd" instruction, in an optimized
-// form which is an "unrolled loop".
-
-#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch
-// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
-// also using the "unrolled loop" optimization. This code uses
-// the software prefetch instruction to get the data into the cache.
-
-#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
-// For larger blocks, which will spill beyond the cache, it's faster to
-// use the Streaming Store instruction MOVNTQ. This write instruction
-// bypasses the cache and writes straight to main memory. This code also
-// uses the software prefetch instruction to pre-read the data.
-// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
-
// Inline assembly syntax for use with Visual C++
#if defined(_MSC_VER)
-
-// Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs.
-__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
-{
- __asm
- {
- push edi
- push esi
-
- mov edi, ecx ; destination
- mov esi, edx ; source
- mov ecx, [esp+12] ; number of bytes to copy
- mov eax, ecx ; keep a copy of count
-
- cld
- cmp eax, TINY_BLOCK_COPY
- jb $memcpy_ic_3 ; tiny? skip mmx copy
-
- cmp eax, 32*1024 ; don't align between 32k-64k because
- jbe $memcpy_do_align ; it appears to be slower
- cmp eax, 64*1024
- jbe $memcpy_align_done
-
-$memcpy_do_align:
- mov eax, 8 ; a trick that's faster than rep movsb...
- sub eax, edi ; align destination to qword
- and eax, 111b ; get the low bits
- sub ecx, eax ; update copy count
- neg eax ; set up to jump into the array
- add eax, offset $memcpy_align_done
- jmp eax ; jump to array of movsb's
-
-align 4
- movsb
- movsb
- movsb
- movsb
- movsb
- movsb
- movsb
- movsb
-
-$memcpy_align_done: ; destination is dword aligned
- mov eax, ecx ; number of bytes left to copy
- shr eax, 6 ; get 64-byte block count
- jz $memcpy_ic_2 ; finish the last few bytes
-
- cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
- jae $memcpy_uc_test
-
-// This is small block copy that uses the MMX registers to copy 8 bytes
-// at a time. It uses the "unrolled loop" optimization, and also uses
-// the software prefetch instruction to get the data into the cache.
-align 16
-$memcpy_ic_1: ; 64-byte block copies, in-cache copy
-
- prefetchnta [esi + (200*64/34+192)] ; start reading ahead
-
- movq mm0, [esi+0] ; read 64 bits
- movq mm1, [esi+8]
- movq [edi+0], mm0 ; write 64 bits
- movq [edi+8], mm1 ; note: the normal movq writes the
- movq mm2, [esi+16] ; data to cache; a cache line will be
- movq mm3, [esi+24] ; allocated as needed, to store the data
- movq [edi+16], mm2
- movq [edi+24], mm3
- movq mm0, [esi+32]
- movq mm1, [esi+40]
- movq [edi+32], mm0
- movq [edi+40], mm1
- movq mm2, [esi+48]
- movq mm3, [esi+56]
- movq [edi+48], mm2
- movq [edi+56], mm3
-
- add esi, 64 ; update source pointer
- add edi, 64 ; update destination pointer
- sub eax, 1
- jnz $memcpy_ic_1 ; last 64-byte block?
-
-$memcpy_ic_2:
- mov eax, ecx ; has valid low 6 bits of the byte count
-$memcpy_ic_3:
- shr eax, 2 ; dword count
- and eax, 1111b ; only look at the "remainder" bits
- neg eax ; set up to jump into the array
- add eax, offset $memcpy_last_few
- jmp eax ; jump to array of movsd's
-
-$memcpy_uc_test:
- or eax, eax ; tail end of block prefetch will jump here
- jz $memcpy_ic_2 ; no more 64-byte blocks left
-
-// For larger blocks, which will spill beyond the cache, it's faster to
-// use the Streaming Store instruction MOVNTQ. This write instruction
-// bypasses the cache and writes straight to main memory. This code also
-// uses the software prefetch instruction to pre-read the data.
-
-align 16
-$memcpy_uc_1: ; 64-byte blocks, uncached copy
-
- prefetchnta [esi + (200*64/34+192)] ; start reading ahead
-
- movq mm0,[esi+0] ; read 64 bits
- add edi,64 ; update destination pointer
- movq mm1,[esi+8]
- add esi,64 ; update source pointer
- movq mm2,[esi-48]
- movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
- movq mm0,[esi-40] ; note: movntq also prevents the CPU
- movntq [edi-56], mm1 ; from READING the destination address
- movq mm1,[esi-32] ; into the cache, only to be over-written
- movntq [edi-48], mm2 ; so that also helps performance
- movq mm2,[esi-24]
- movntq [edi-40], mm0
- movq mm0,[esi-16]
- movntq [edi-32], mm1
- movq mm1,[esi-8]
- movntq [edi-24], mm2
- movntq [edi-16], mm0
- movntq [edi-8], mm1
-
- sub eax, 1
- jnz $memcpy_uc_1 ; last 64-byte block?
-
- jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
-
-// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been
-// disabled to help keep the code cache footprint of memcpy_fast to a minimum.
-
-// The smallest copy uses the X86 "movsd" instruction, in an optimized
-// form which is an "unrolled loop". Then it handles the last few bytes.
-align 16
- movsd
- movsd ; perform last 1-15 dword copies
- movsd
- movsd
- movsd
- movsd
- movsd
- movsd
- movsd
- movsd ; perform last 1-7 dword copies
- movsd
- movsd
- movsd
- movsd
- movsd
- movsd
-
-$memcpy_last_few: ; dword aligned from before movsd's
- and ecx, 11b ; the last few cows must come home
- jz $memcpy_final ; no more, let's leave
- rep movsb ; the last 1, 2, or 3 bytes
-
-$memcpy_final:
- pop esi
- pop edi
-
- emms ; clean up the MMX state
- sfence ; flush the write buffer
- //mov eax, [dest] ; ret value = destination pointer
-
- ret 4
- }
-}
-
-// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
-__fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
-{
- // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
- // registers will improve copy performance, because they won't. Use of XMMs is only
- // warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
- // and even then the benefits are typically minimal (sometimes slower depending on the
- // amount of data being copied).
- //
- // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them.
- // --air
-
- // Linux Conversion note:
- // This code would benefit nicely from having inline-able GAS syntax, since it should
- // allow GCC to optimize the first 3 instructions out of existence in many scenarios.
- // And its called enough times to probably merit the extra effort to ensure proper
- // optimization. --air
-
- __asm
- {
- mov ecx, dest
- mov edx, src
- mov eax, qwc ; keep a copy of count
- shr eax, 1
- jz $memcpy_qwc_1 ; only one 16 byte block to copy?
-
- cmp eax, IN_CACHE_COPY/32
- jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air)
-
-$memcpy_qwc_loop2: ; 32-byte blocks, uncached copy
- prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
-
- movq mm0,[edx+0] ; read 64 bits
- movq mm1,[edx+8]
- movq mm2,[edx+16]
- movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache
- movntq [ecx+8], mm1
- movq mm3,[edx+24]
- movntq [ecx+16], mm2
- movntq [ecx+24], mm3
-
- add edx,32 ; update source pointer
- add ecx,32 ; update destination pointer
- sub eax,1
- jnz $memcpy_qwc_loop2 ; last 64-byte block?
- sfence ; flush the write buffer
- jmp $memcpy_qwc_1
-
-; 32-byte blocks, cached!
-; This *is* important. Removing this and using exclusively non-temporal stores
-; results in noticeable speed loss!
-
-$memcpy_qwc_loop1:
- prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
-
- movq mm0,[edx+0] ; read 64 bits
- movq mm1,[edx+8]
- movq mm2,[edx+16]
- movq [ecx+0], mm0 ; write 64 bits, bypassing the cache
- movq [ecx+8], mm1
- movq mm3,[edx+24]
- movq [ecx+16], mm2
- movq [ecx+24], mm3
-
- add edx,32 ; update source pointer
- add ecx,32 ; update destination pointer
- sub eax,1
- jnz $memcpy_qwc_loop1 ; last 64-byte block?
-
-$memcpy_qwc_1:
- test qwc,1
- jz $memcpy_qwc_final
- movq mm0,[edx]
- movq mm1,[edx+8]
- movq [ecx], mm0
- movq [ecx+8], mm1
-
-$memcpy_qwc_final:
- emms ; clean up the MMX state
- }
-}
-
// mmx mem-compare implementation, size has to be a multiple of 8
// returns 0 if equal, nonzero value if not equal
// ~10 times faster than standard memcmp
@@ -489,112 +218,4 @@ End:
}
}
-
-// returns the xor of all elements, cmpsize has to be mult of 8
-void memxor_mmx(void* dst, const void* src1, int cmpsize)
-{
- pxAssert( (cmpsize&7) == 0 );
-
- __asm {
- mov ecx, cmpsize
- mov eax, src1
- mov edx, dst
-
- cmp ecx, 64
- jl Setup4
-
- movq mm0, [eax]
- movq mm1, [eax+8]
- movq mm2, [eax+16]
- movq mm3, [eax+24]
- movq mm4, [eax+32]
- movq mm5, [eax+40]
- movq mm6, [eax+48]
- movq mm7, [eax+56]
- sub ecx, 64
- add eax, 64
- cmp ecx, 64
- jl End8
-
-Cmp8:
- pxor mm0, [eax]
- pxor mm1, [eax+8]
- pxor mm2, [eax+16]
- pxor mm3, [eax+24]
- pxor mm4, [eax+32]
- pxor mm5, [eax+40]
- pxor mm6, [eax+48]
- pxor mm7, [eax+56]
-
- sub ecx, 64
- add eax, 64
- cmp ecx, 64
- jge Cmp8
-
-End8:
- pxor mm0, mm4
- pxor mm1, mm5
- pxor mm2, mm6
- pxor mm3, mm7
-
- cmp ecx, 32
- jl End4
- pxor mm0, [eax]
- pxor mm1, [eax+8]
- pxor mm2, [eax+16]
- pxor mm3, [eax+24]
- sub ecx, 32
- add eax, 32
- jmp End4
-
-Setup4:
- cmp ecx, 32
- jl Setup2
-
- movq mm0, [eax]
- movq mm1, [eax+8]
- movq mm2, [eax+16]
- movq mm3, [eax+24]
- sub ecx, 32
- add eax, 32
-
-End4:
- pxor mm0, mm2
- pxor mm1, mm3
-
- cmp ecx, 16
- jl End2
- pxor mm0, [eax]
- pxor mm1, [eax+8]
- sub ecx, 16
- add eax, 16
- jmp End2
-
-Setup2:
- cmp ecx, 16
- jl Setup1
-
- movq mm0, [eax]
- movq mm1, [eax+8]
- sub ecx, 16
- add eax, 16
-
-End2:
- pxor mm0, mm1
-
- cmp ecx, 8
- jl End1
- pxor mm0, [eax]
-End1:
- movq [edx], mm0
- jmp End
-
-Setup1:
- movq mm0, [eax]
- movq [edx], mm0
-End:
- emms
- }
-}
-
#endif
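The new comment at the top of this file notes that memcmp_mmx will hopefully be dropped as well. A minimal sketch, under the documented contract (cmpsize must be a multiple of 8; the return value is 0 when equal and nonzero otherwise), of what a portable stand-in could look like; this helper is an assumption for illustration and not part of the patch:

    #include <cstring>
    #include <cassert>

    // Hypothetical portable replacement keeping memcmp_mmx's contract:
    // cmpsize must be a multiple of 8; returns 0 when equal, nonzero when not.
    static u8 memcmp_portable(const void* src1, const void* src2, int cmpsize)
    {
        assert((cmpsize & 7) == 0);
        return std::memcmp(src1, src2, cmpsize) != 0 ? 1 : 0;
    }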
diff --git a/common/src/Utilities/x86/MemcpyVibes.cpp b/common/src/Utilities/x86/MemcpyVibes.cpp
deleted file mode 100644
index 6d4ff9f934..0000000000
--- a/common/src/Utilities/x86/MemcpyVibes.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-/* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2010 PCSX2 Dev Team
- *
- * PCSX2 is free software: you can redistribute it and/or modify it under the terms
- * of the GNU Lesser General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with PCSX2.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "PrecompiledHeader.h"
-#include "x86emitter/x86emitter.h"
-#include <xmmintrin.h>
-
-using namespace x86Emitter;
-
-// Max Number of qwc supported
-#define _maxSize 0x400
-
-typedef void (__fastcall *_memCpyCall)(void*, void*);
-__aligned16 _memCpyCall _memcpy_vibes[_maxSize+1];
-
-#if 1
-
-// this version uses SSE intrinsics to perform an inline copy. MSVC disasm shows pretty
-// decent code generation on whole, but it hasn't been benchmarked at all yet --air
-__fi void memcpy_vibes(void * dest, const void * src, int size) {
-
- float (*destxmm)[4] = (float(*)[4])dest, (*srcxmm)[4] = (float(*)[4])src;
- size_t count = size & ~15, extra = size & 15;
-
- destxmm -= 8 - extra, srcxmm -= 8 - extra;
- switch (extra) {
- do {
- destxmm += 16, srcxmm += 16, count -= 16;
- _mm_store_ps(&destxmm[-8][0], _mm_load_ps(&srcxmm[-8][0]));
- case 15:
- _mm_store_ps(&destxmm[-7][0], _mm_load_ps(&srcxmm[-7][0]));
- case 14:
- _mm_store_ps(&destxmm[-6][0], _mm_load_ps(&srcxmm[-6][0]));
- case 13:
- _mm_store_ps(&destxmm[-5][0], _mm_load_ps(&srcxmm[-5][0]));
- case 12:
- _mm_store_ps(&destxmm[-4][0], _mm_load_ps(&srcxmm[-4][0]));
- case 11:
- _mm_store_ps(&destxmm[-3][0], _mm_load_ps(&srcxmm[-3][0]));
- case 10:
- _mm_store_ps(&destxmm[-2][0], _mm_load_ps(&srcxmm[-2][0]));
- case 9:
- _mm_store_ps(&destxmm[-1][0], _mm_load_ps(&srcxmm[-1][0]));
- case 8:
- _mm_store_ps(&destxmm[ 0][0], _mm_load_ps(&srcxmm[ 0][0]));
- case 7:
- _mm_store_ps(&destxmm[ 1][0], _mm_load_ps(&srcxmm[ 1][0]));
- case 6:
- _mm_store_ps(&destxmm[ 2][0], _mm_load_ps(&srcxmm[ 2][0]));
- case 5:
- _mm_store_ps(&destxmm[ 3][0], _mm_load_ps(&srcxmm[ 3][0]));
- case 4:
- _mm_store_ps(&destxmm[ 4][0], _mm_load_ps(&srcxmm[ 4][0]));
- case 3:
- _mm_store_ps(&destxmm[ 5][0], _mm_load_ps(&srcxmm[ 5][0]));
- case 2:
- _mm_store_ps(&destxmm[ 6][0], _mm_load_ps(&srcxmm[ 6][0]));
- case 1:
- _mm_store_ps(&destxmm[ 7][0], _mm_load_ps(&srcxmm[ 7][0]));
- case 0: NULL;
- } while (count);
- }
-}
-
-#else
-#if 1
-// This version creates one function with a lot of movaps
-// It jumps to the correct movaps entry-point while adding
-// the proper offset for adjustment...
-
-static __pagealigned u8 _memCpyExec[__pagesize*16];
-
-void gen_memcpy_vibes() {
- HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
- memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
- xSetPtr(_memCpyExec);
-
- int off =-(((_maxSize & 0xf) - 7) << 4);
- for (int i = _maxSize, x = 0; i > 0; i--, x=(x+1)&7, off+=16) {
-
- _memcpy_vibes[i] = (_memCpyCall)xGetPtr();
-
- if (off >= 128) {
- off = -128;
- xADD(edx, 256);
- xADD(ecx, 256);
- }
- const xRegisterSSE xmm_t(x);
- xMOVAPS (xmm_t, ptr32[edx+off]);
- xMOVNTPS(ptr32[ecx+off], xmm_t);
- }
-
- _memcpy_vibes[0] = (_memCpyCall)xGetPtr();
-
- xRET();
- pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
-
- HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
-}
-
-__fi void memcpy_vibes(void * dest, const void * src, int size) {
- int offset = ((size & 0xf) - 7) << 4;
- _memcpy_vibes[size]((void*)((uptr)dest + offset), (void*)((uptr)src + offset));
-}
-
-#else
-
-// This version creates '_maxSize' number of different functions,
-// and calls the appropriate one...
-
-static __pagealigned u8 _memCpyExec[__pagesize*_maxSize*2];
-
-void gen_memcpy_vibes() {
- HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
- memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
- xSetPtr(_memCpyExec);
-
- for (int i = 0; i < _maxSize+1; i++)
- {
- int off = 0;
- _memcpy_vibes[i] = (_memCpyCall)xGetAlignedCallTarget();
-
- for (int j = 0, x = 0; j < i; j++, x=(x+1)&7, off+=16) {
- if (off >= 128) {
- off = -128;
- xADD(edx, 256);
- xADD(ecx, 256);
- }
- const xRegisterSSE xmm_t(x);
- xMOVAPS(xmm_t, ptr32[edx+off]);
- xMOVAPS(ptr32[ecx+off], xmm_t);
- }
-
- xRET();
- pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
- }
-
- HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
-}
-
-__fi void memcpy_vibes(void * dest, const void * src, int size) {
- _memcpy_vibes[size](dest, src);
-}
-
-#endif
-#endif
-
-// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment,
-// to get around compilation issues with having it in the headers.
-#ifdef __linux__
-
- // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
- // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
- __fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
- {
- // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
- // registers will improve copy performance, because they won't. Use of XMMs is only
- // warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
- // and even then the benefits are typically minimal (sometimes slower depending on the
- // amount of data being copied).
- //
- // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them.
- // --air
-
- // Linux Conversion note:
- // This code would benefit nicely from having inline-able GAS syntax, since it should
- // allow GCC to optimize the first 3 instructions out of existence in many scenarios.
- // And its called enough times to probably merit the extra effort to ensure proper
- // optimization. --air
-
- __asm__ __volatile__
- (
- ".intel_syntax noprefix\n"
- "sub %[qwc], 1\n" // dec the counter to ease the count of 16bytes block later (optimization)
- // Note after this line, real value of the counter is %[qwc] + 1
- "jle memcpy_qwc_1_%=\n" // only one 16 byte block to copy? Or nothing.
-
- "cmp %[qwc], 127\n" // "IN_CACHE_COPY/16"
- "jb memcpy_qwc_loop1_%=\n" // small copies should be cached (definite speedup --air)
-
- "memcpy_qwc_loop2_%=:\n" // 32-byte blocks, uncached copy
- "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)
-
- "movq mm0,[%[src]+0]\n" // read 64 bits
- "movq mm1,[%[src]+8]\n"
- "movq mm2,[%[src]+16]\n"
- "movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache
- "movntq [%[dest]+8], mm1\n"
- "movq mm3,[%[src]+24]\n"
- "movntq [%[dest]+16], mm2\n"
- "movntq [%[dest]+24], mm3\n"
-
- "add %[src],32\n" // update source pointer
- "add %[dest],32\n" // update destination pointer
- "sub %[qwc],2\n"
- "jg memcpy_qwc_loop2_%=\n" // last 64-byte block?
- "sfence\n" // flush the write buffer
- "jmp memcpy_qwc_1_%=\n"
-
- // 32-byte blocks, cached!
- // This *is* important. Removing this and using exclusively non-temporal stores
- // results in noticeable speed loss!
-
- "memcpy_qwc_loop1_%=:\n"
- "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)
-
- "movq mm0,[%[src]+0]\n" // read 64 bits
- "movq mm1,[%[src]+8]\n"
- "movq mm2,[%[src]+16]\n"
- "movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache
- "movq [%[dest]+8], mm1\n"
- "movq mm3,[%[src]+24]\n"
- "movq [%[dest]+16], mm2\n"
- "movq [%[dest]+24], mm3\n"
-
- "add %[src],32\n" // update source pointer
- "add %[dest],32\n" // update destination pointer
- "sub %[qwc],2\n"
- "jg memcpy_qwc_loop2_%=\n" // last 64-byte block?
-
- "memcpy_qwc_1_%=:\n"
- "cmp %[qwc],0\n"
- "jne memcpy_qwc_final_%=\n"
- "movq mm0,[%[src]]\n"
- "movq mm1,[%[src]+8]\n"
- "movq [%[dest]], mm0\n"
- "movq [%[dest]+8], mm1\n"
-
- "memcpy_qwc_final_%=:\n"
- "emms\n" // clean up the MMX state
- ".att_syntax\n"
- : "=&r"(dest), "=&r"(src), "=&r"(qwc)
- : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc)
- : "memory", "mm0", "mm1", "mm2", "mm3"
- );
- }
-#endif
-
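For reference, the removed memcpy_amd_qwc copied in 128-bit quadword units and required no particular alignment of source or destination. A rough modern equivalent using unaligned SSE2 intrinsics, offered only as a sketch of what the deleted MMX routine did; the function name is hypothetical and this code is not part of the patch:

    #include <emmintrin.h> // SSE2

    // Sketch of a quadword copy: qwc is the number of 128-bit blocks to move.
    // Unaligned loads/stores match the removed routine's "no alignment needed" contract.
    static void memcpy_qwc_sse2(void* dest, const void* src, size_t qwc)
    {
        __m128i*       d = static_cast<__m128i*>(dest);
        const __m128i* s = static_cast<const __m128i*>(src);
        for (size_t i = 0; i < qwc; ++i)
            _mm_storeu_si128(d + i, _mm_loadu_si128(s + i));
    }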