From 69e88ffed009b4b0f312d81d6f4623f8fbd56d80 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sun, 26 Oct 2014 22:33:42 +0100 Subject: [PATCH] common: remove old memcpy implementation PCSX2 used standard memcpy now (thanks to xsacha) --- common/include/Utilities/MemcpyFast.h | 4 - common/src/Utilities/CMakeLists.txt | 1 - common/src/Utilities/x86/MemcpyFast.cpp | 385 +---------------------- common/src/Utilities/x86/MemcpyVibes.cpp | 250 --------------- 4 files changed, 3 insertions(+), 637 deletions(-) delete mode 100644 common/src/Utilities/x86/MemcpyVibes.cpp diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h index 9f4df897ae..b3754d8725 100644 --- a/common/include/Utilities/MemcpyFast.h +++ b/common/include/Utilities/MemcpyFast.h @@ -32,10 +32,6 @@ extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize); // Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason. void _memset16_unaligned( void* dest, u16 data, size_t size ); -// MemcpyVibes.cpp functions -extern void memcpy_vibes(void * dest, const void * src, int size); -extern void gen_memcpy_vibes(); - #define memcpy_fast memcpy #define memcpy_aligned(d,s,c) memcpy(d,s,c) #define memcpy_const memcpy diff --git a/common/src/Utilities/CMakeLists.txt b/common/src/Utilities/CMakeLists.txt index 9e14b095b5..0765a33e54 100644 --- a/common/src/Utilities/CMakeLists.txt +++ b/common/src/Utilities/CMakeLists.txt @@ -128,7 +128,6 @@ set(UtilitiesSources wxAppWithHelpers.cpp wxGuiTools.cpp wxHelpers.cpp - x86/MemcpyVibes.cpp ) # variable with all headers of this library diff --git a/common/src/Utilities/x86/MemcpyFast.cpp b/common/src/Utilities/x86/MemcpyFast.cpp index 2ae3f34fb2..afae09d709 100644 --- a/common/src/Utilities/x86/MemcpyFast.cpp +++ b/common/src/Utilities/x86/MemcpyFast.cpp @@ -31,290 +31,19 @@ 3dsdk.support@amd.com ******************************************************************************/ +// GH: AMD memcpy was removed. The remaining part (memcmp_mmx) is likely from Zerofrog. +// Hopefully memcmp_mmx will be dropped in the future. + #include "PrecompiledHeader.h" #ifdef _MSC_VER #pragma warning(disable:4414) #endif -/***************************************************************************** -MEMCPY_AMD.CPP -******************************************************************************/ - -// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or -// "Streaming Store"), and also uses the software prefetch instructions, -// be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before -// calling! - -#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy -// The smallest copy uses the X86 "movsd" instruction, in an optimized -// form which is an "unrolled loop". - -#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch -// Next is a copy that uses the MMX registers to copy 8 bytes at a time, -// also using the "unrolled loop" optimization. This code uses -// the software prefetch instruction to get the data into the cache. - -#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch -// For larger blocks, which will spill beyond the cache, it's faster to -// use the Streaming Store instruction MOVNTQ. This write instruction -// bypasses the cache and writes straight to main memory. This code also -// uses the software prefetch instruction to pre-read the data. 
-// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE" - // Inline assembly syntax for use with Visual C++ #if defined(_MSC_VER) - -// Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs. -__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n) -{ - __asm - { - push edi - push esi - - mov edi, ecx ; destination - mov esi, edx ; source - mov ecx, [esp+12] ; number of bytes to copy - mov eax, ecx ; keep a copy of count - - cld - cmp eax, TINY_BLOCK_COPY - jb $memcpy_ic_3 ; tiny? skip mmx copy - - cmp eax, 32*1024 ; dont align between 32k-64k because - jbe $memcpy_do_align ; it appears to be slower - cmp eax, 64*1024 - jbe $memcpy_align_done - -$memcpy_do_align: - mov eax, 8 ; a trick that s faster than rep movsb... - sub eax, edi ; align destination to qword - and eax, 111b ; get the low bits - sub ecx, eax ; update copy count - neg eax ; set up to jump into the array - add eax, offset $memcpy_align_done - jmp eax ; jump to array of movsb s - -align 4 - movsb - movsb - movsb - movsb - movsb - movsb - movsb - movsb - -$memcpy_align_done: ; destination is dword aligned - mov eax, ecx ; number of bytes left to copy - shr eax, 6 ; get 64-byte block count - jz $memcpy_ic_2 ; finish the last few bytes - - cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy - jae $memcpy_uc_test - -// This is small block copy that uses the MMX registers to copy 8 bytes -// at a time. It uses the "unrolled loop" optimization, and also uses -// the software prefetch instruction to get the data into the cache. -align 16 -$memcpy_ic_1: ; 64-byte block copies, in-cache copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0, [esi+0] ; read 64 bits - movq mm1, [esi+8] - movq [edi+0], mm0 ; write 64 bits - movq [edi+8], mm1 ; note: the normal movq writes the - movq mm2, [esi+16] ; data to cache; a cache line will be - movq mm3, [esi+24] ; allocated as needed, to store the data - movq [edi+16], mm2 - movq [edi+24], mm3 - movq mm0, [esi+32] - movq mm1, [esi+40] - movq [edi+32], mm0 - movq [edi+40], mm1 - movq mm2, [esi+48] - movq mm3, [esi+56] - movq [edi+48], mm2 - movq [edi+56], mm3 - - add esi, 64 ; update source pointer - add edi, 64 ; update destination pointer - sub eax, 1 - jnz $memcpy_ic_1 ; last 64-byte block? - -$memcpy_ic_2: - mov eax, ecx ; has valid low 6 bits of the byte count -$memcpy_ic_3: - shr eax, 2 ; dword count - and eax, 1111b ; only look at the "remainder" bits - neg eax ; set up to jump into the array - add eax, offset $memcpy_last_few - jmp eax ; jump to array of movsd s - -$memcpy_uc_test: - or eax, eax ; tail end of block prefetch will jump here - jz $memcpy_ic_2 ; no more 64-byte blocks left - -// For larger blocks, which will spill beyond the cache, it's faster to -// use the Streaming Store instruction MOVNTQ. This write instruction -// bypasses the cache and writes straight to main memory. This code also -// uses the software prefetch instruction to pre-read the data. 
- -align 16 -$memcpy_uc_1: ; 64-byte blocks, uncached copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0,[esi+0] ; read 64 bits - add edi,64 ; update destination pointer - movq mm1,[esi+8] - add esi,64 ; update source pointer - movq mm2,[esi-48] - movntq [edi-64], mm0 ; write 64 bits, bypassing the cache - movq mm0,[esi-40] ; note: movntq also prevents the CPU - movntq [edi-56], mm1 ; from READING the destination address - movq mm1,[esi-32] ; into the cache, only to be over-written - movntq [edi-48], mm2 ; so that also helps performance - movq mm2,[esi-24] - movntq [edi-40], mm0 - movq mm0,[esi-16] - movntq [edi-32], mm1 - movq mm1,[esi-8] - movntq [edi-24], mm2 - movntq [edi-16], mm0 - movntq [edi-8], mm1 - - sub eax, 1 - jnz $memcpy_uc_1 ; last 64-byte block? - - jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed) - -// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been -// disabled to help keep the code cache footprint of memcpy_fast to a minimum. - -// The smallest copy uses the X86 "movsd" instruction, in an optimized -// form which is an "unrolled loop". Then it handles the last few bytes. -align 16 - movsd - movsd ; perform last 1-15 dword copies - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd ; perform last 1-7 dword copies - movsd - movsd - movsd - movsd - movsd - movsd - -$memcpy_last_few: ; dword aligned from before movsd s - and ecx, 11b ; the last few cows must come home - jz $memcpy_final ; no more, let s leave - rep movsb ; the last 1, 2, or 3 bytes - -$memcpy_final: - pop esi - pop edi - - emms ; clean up the MMX state - sfence ; flush the write buffer - //mov eax, [dest] ; ret value = destination pointer - - ret 4 - } -} - -// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned. -__fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) -{ - // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM - // registers will improve copy performance, because they won't. Use of XMMs is only - // warranted in situations where both source and dest are guaranteed aligned to 16 bytes, - // and even then the benefits are typically minimal (sometimes slower depending on the - // amount of data being copied). - // - // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them. - // --air - - // Linux Conversion note: - // This code would benefit nicely from having inline-able GAS syntax, since it should - // allow GCC to optimize the first 3 instructions out of existence in many scenarios. - // And its called enough times to probably merit the extra effort to ensure proper - // optimization. --air - - __asm - { - mov ecx, dest - mov edx, src - mov eax, qwc ; keep a copy of count - shr eax, 1 - jz $memcpy_qwc_1 ; only one 16 byte block to copy? - - cmp eax, IN_CACHE_COPY/32 - jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air) - -$memcpy_qwc_loop2: ; 32-byte blocks, uncached copy - prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air) - - movq mm0,[edx+0] ; read 64 bits - movq mm1,[edx+8] - movq mm2,[edx+16] - movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache - movntq [ecx+8], mm1 - movq mm3,[edx+24] - movntq [ecx+16], mm2 - movntq [ecx+24], mm3 - - add edx,32 ; update source pointer - add ecx,32 ; update destination pointer - sub eax,1 - jnz $memcpy_qwc_loop2 ; last 64-byte block? 
- sfence ; flush the write buffer - jmp $memcpy_qwc_1 - -; 32-byte blocks, cached! -; This *is* important. Removing this and using exclusively non-temporal stores -; results in noticable speed loss! - -$memcpy_qwc_loop1: - prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air) - - movq mm0,[edx+0] ; read 64 bits - movq mm1,[edx+8] - movq mm2,[edx+16] - movq [ecx+0], mm0 ; write 64 bits, bypassing the cache - movq [ecx+8], mm1 - movq mm3,[edx+24] - movq [ecx+16], mm2 - movq [ecx+24], mm3 - - add edx,32 ; update source pointer - add ecx,32 ; update destination pointer - sub eax,1 - jnz $memcpy_qwc_loop1 ; last 64-byte block? - -$memcpy_qwc_1: - test qwc,1 - jz $memcpy_qwc_final - movq mm0,[edx] - movq mm1,[edx+8] - movq [ecx], mm0 - movq [ecx+8], mm1 - -$memcpy_qwc_final: - emms ; clean up the MMX state - } -} - // mmx mem-compare implementation, size has to be a multiple of 8 // returns 0 is equal, nonzero value if not equal // ~10 times faster than standard memcmp @@ -489,112 +218,4 @@ End: } } - -// returns the xor of all elements, cmpsize has to be mult of 8 -void memxor_mmx(void* dst, const void* src1, int cmpsize) -{ - pxAssert( (cmpsize&7) == 0 ); - - __asm { - mov ecx, cmpsize - mov eax, src1 - mov edx, dst - - cmp ecx, 64 - jl Setup4 - - movq mm0, [eax] - movq mm1, [eax+8] - movq mm2, [eax+16] - movq mm3, [eax+24] - movq mm4, [eax+32] - movq mm5, [eax+40] - movq mm6, [eax+48] - movq mm7, [eax+56] - sub ecx, 64 - add eax, 64 - cmp ecx, 64 - jl End8 - -Cmp8: - pxor mm0, [eax] - pxor mm1, [eax+8] - pxor mm2, [eax+16] - pxor mm3, [eax+24] - pxor mm4, [eax+32] - pxor mm5, [eax+40] - pxor mm6, [eax+48] - pxor mm7, [eax+56] - - sub ecx, 64 - add eax, 64 - cmp ecx, 64 - jge Cmp8 - -End8: - pxor mm0, mm4 - pxor mm1, mm5 - pxor mm2, mm6 - pxor mm3, mm7 - - cmp ecx, 32 - jl End4 - pxor mm0, [eax] - pxor mm1, [eax+8] - pxor mm2, [eax+16] - pxor mm3, [eax+24] - sub ecx, 32 - add eax, 32 - jmp End4 - -Setup4: - cmp ecx, 32 - jl Setup2 - - movq mm0, [eax] - movq mm1, [eax+8] - movq mm2, [eax+16] - movq mm3, [eax+24] - sub ecx, 32 - add eax, 32 - -End4: - pxor mm0, mm2 - pxor mm1, mm3 - - cmp ecx, 16 - jl End2 - pxor mm0, [eax] - pxor mm1, [eax+8] - sub ecx, 16 - add eax, 16 - jmp End2 - -Setup2: - cmp ecx, 16 - jl Setup1 - - movq mm0, [eax] - movq mm1, [eax+8] - sub ecx, 16 - add eax, 16 - -End2: - pxor mm0, mm1 - - cmp ecx, 8 - jl End1 - pxor mm0, [eax] -End1: - movq [edx], mm0 - jmp End - -Setup1: - movq mm0, [eax] - movq [edx], mm0 -End: - emms - } -} - #endif diff --git a/common/src/Utilities/x86/MemcpyVibes.cpp b/common/src/Utilities/x86/MemcpyVibes.cpp deleted file mode 100644 index 6d4ff9f934..0000000000 --- a/common/src/Utilities/x86/MemcpyVibes.cpp +++ /dev/null @@ -1,250 +0,0 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2010 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . 
- */ - -#include "PrecompiledHeader.h" -#include "x86emitter/x86emitter.h" -#include - -using namespace x86Emitter; - -// Max Number of qwc supported -#define _maxSize 0x400 - -typedef void (__fastcall *_memCpyCall)(void*, void*); -__aligned16 _memCpyCall _memcpy_vibes[_maxSize+1]; - -#if 1 - -// this version uses SSE intrinsics to perform an inline copy. MSVC disasm shows pretty -// decent code generation on whole, but it hasn't been benchmarked at all yet --air -__fi void memcpy_vibes(void * dest, const void * src, int size) { - - float (*destxmm)[4] = (float(*)[4])dest, (*srcxmm)[4] = (float(*)[4])src; - size_t count = size & ~15, extra = size & 15; - - destxmm -= 8 - extra, srcxmm -= 8 - extra; - switch (extra) { - do { - destxmm += 16, srcxmm += 16, count -= 16; - _mm_store_ps(&destxmm[-8][0], _mm_load_ps(&srcxmm[-8][0])); - case 15: - _mm_store_ps(&destxmm[-7][0], _mm_load_ps(&srcxmm[-7][0])); - case 14: - _mm_store_ps(&destxmm[-6][0], _mm_load_ps(&srcxmm[-6][0])); - case 13: - _mm_store_ps(&destxmm[-5][0], _mm_load_ps(&srcxmm[-5][0])); - case 12: - _mm_store_ps(&destxmm[-4][0], _mm_load_ps(&srcxmm[-4][0])); - case 11: - _mm_store_ps(&destxmm[-3][0], _mm_load_ps(&srcxmm[-3][0])); - case 10: - _mm_store_ps(&destxmm[-2][0], _mm_load_ps(&srcxmm[-2][0])); - case 9: - _mm_store_ps(&destxmm[-1][0], _mm_load_ps(&srcxmm[-1][0])); - case 8: - _mm_store_ps(&destxmm[ 0][0], _mm_load_ps(&srcxmm[ 0][0])); - case 7: - _mm_store_ps(&destxmm[ 1][0], _mm_load_ps(&srcxmm[ 1][0])); - case 6: - _mm_store_ps(&destxmm[ 2][0], _mm_load_ps(&srcxmm[ 2][0])); - case 5: - _mm_store_ps(&destxmm[ 3][0], _mm_load_ps(&srcxmm[ 3][0])); - case 4: - _mm_store_ps(&destxmm[ 4][0], _mm_load_ps(&srcxmm[ 4][0])); - case 3: - _mm_store_ps(&destxmm[ 5][0], _mm_load_ps(&srcxmm[ 5][0])); - case 2: - _mm_store_ps(&destxmm[ 6][0], _mm_load_ps(&srcxmm[ 6][0])); - case 1: - _mm_store_ps(&destxmm[ 7][0], _mm_load_ps(&srcxmm[ 7][0])); - case 0: NULL; - } while (count); - } -} - -#else -#if 1 -// This version creates one function with a lot of movaps -// It jumps to the correct movaps entry-point while adding -// the proper offset for adjustment... - -static __pagealigned u8 _memCpyExec[__pagesize*16]; - -void gen_memcpy_vibes() { - HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false); - memset (_memCpyExec, 0xcc, sizeof(_memCpyExec)); - xSetPtr(_memCpyExec); - - int off =-(((_maxSize & 0xf) - 7) << 4); - for (int i = _maxSize, x = 0; i > 0; i--, x=(x+1)&7, off+=16) { - - _memcpy_vibes[i] = (_memCpyCall)xGetPtr(); - - if (off >= 128) { - off = -128; - xADD(edx, 256); - xADD(ecx, 256); - } - const xRegisterSSE xmm_t(x); - xMOVAPS (xmm_t, ptr32[edx+off]); - xMOVNTPS(ptr32[ecx+off], xmm_t); - } - - _memcpy_vibes[0] = (_memCpyCall)xGetPtr(); - - xRET(); - pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec)); - - HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true); -} - -__fi void memcpy_vibes(void * dest, const void * src, int size) { - int offset = ((size & 0xf) - 7) << 4; - _memcpy_vibes[size]((void*)((uptr)dest + offset), (void*)((uptr)src + offset)); -} - -#else - -// This version creates '_maxSize' number of different functions, -// and calls the appropriate one... 
- -static __pagealigned u8 _memCpyExec[__pagesize*_maxSize*2]; - -void gen_memcpy_vibes() { - HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false); - memset (_memCpyExec, 0xcc, sizeof(_memCpyExec)); - xSetPtr(_memCpyExec); - - for (int i = 0; i < _maxSize+1; i++) - { - int off = 0; - _memcpy_vibes[i] = (_memCpyCall)xGetAlignedCallTarget(); - - for (int j = 0, x = 0; j < i; j++, x=(x+1)&7, off+=16) { - if (off >= 128) { - off = -128; - xADD(edx, 256); - xADD(ecx, 256); - } - const xRegisterSSE xmm_t(x); - xMOVAPS(xmm_t, ptr32[edx+off]); - xMOVAPS(ptr32[ecx+off], xmm_t); - } - - xRET(); - pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec)); - } - - HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true); -} - -__fi void memcpy_vibes(void * dest, const void * src, int size) { - _memcpy_vibes[size](dest, src); -} - -#endif -#endif - -// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment, -// to get around compilation issues with having it in the headers. -#ifdef __linux__ - - // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now. - // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned. - __fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) - { - // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM - // registers will improve copy performance, because they won't. Use of XMMs is only - // warranted in situations where both source and dest are guaranteed aligned to 16 bytes, - // and even then the benefits are typically minimal (sometimes slower depending on the - // amount of data being copied). - // - // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them. - // --air - - // Linux Conversion note: - // This code would benefit nicely from having inline-able GAS syntax, since it should - // allow GCC to optimize the first 3 instructions out of existence in many scenarios. - // And its called enough times to probably merit the extra effort to ensure proper - // optimization. --air - - __asm__ __volatile__ - ( - ".intel_syntax noprefix\n" - "sub %[qwc], 1\n" // dec the counter to ease the count of 16bytes block later (optimization) - // Note after this line, real value of the counter is %[qwc] + 1 - "jle memcpy_qwc_1_%=\n" // only one 16 byte block to copy? Or nothing. - - "cmp %[qwc], 127\n" // "IN_CACHE_COPY/16" - "jb memcpy_qwc_loop1_%=\n" // small copies should be cached (definite speedup --air) - - "memcpy_qwc_loop2_%=:\n" // 32-byte blocks, uncached copy - "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) - - "movq mm0,[%[src]+0]\n" // read 64 bits - "movq mm1,[%[src]+8]\n" - "movq mm2,[%[src]+16]\n" - "movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache - "movntq [%[dest]+8], mm1\n" - "movq mm3,[%[src]+24]\n" - "movntq [%[dest]+16], mm2\n" - "movntq [%[dest]+24], mm3\n" - - "add %[src],32\n" // update source pointer - "add %[dest],32\n" // update destination pointer - "sub %[qwc],2\n" - "jg memcpy_qwc_loop2_%=\n" // last 64-byte block? - "sfence\n" // flush the write buffer - "jmp memcpy_qwc_1_%=\n" - - // 32-byte blocks, cached! - // This *is* important. Removing this and using exclusively non-temporal stores - // results in noticeable speed loss! - - "memcpy_qwc_loop1_%=:\n" - "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! 
--air) - - "movq mm0,[%[src]+0]\n" // read 64 bits - "movq mm1,[%[src]+8]\n" - "movq mm2,[%[src]+16]\n" - "movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache - "movq [%[dest]+8], mm1\n" - "movq mm3,[%[src]+24]\n" - "movq [%[dest]+16], mm2\n" - "movq [%[dest]+24], mm3\n" - - "add %[src],32\n" // update source pointer - "add %[dest],32\n" // update destination pointer - "sub %[qwc],2\n" - "jg memcpy_qwc_loop2_%=\n" // last 64-byte block? - - "memcpy_qwc_1_%=:\n" - "cmp %[qwc],0\n" - "jne memcpy_qwc_final_%=\n" - "movq mm0,[%[src]]\n" - "movq mm1,[%[src]+8]\n" - "movq [%[dest]], mm0\n" - "movq [%[dest]+8], mm1\n" - - "memcpy_qwc_final_%=:\n" - "emms\n" // clean up the MMX state - ".att_syntax\n" - : "=&r"(dest), "=&r"(src), "=&r"(qwc) - : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc) - : "memory", "mm0", "mm1", "mm2", "mm3" - ); - } -#endif -
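
For reference, the core technique the deleted AMD routine relied on — software prefetch plus non-temporal ("streaming") stores that bypass the cache for blocks too large to stay resident — can be expressed today with SSE2 intrinsics instead of inline MMX assembly. The sketch below is illustrative only and not part of the patch; the function name, the 4 KB cutoff (mirroring the old UNCACHED_COPY limit), and the prefetch distance are assumptions, and both pointers are assumed 16-byte aligned with a size that is a multiple of 16.

// streaming_copy_sketch.cpp -- illustrative only, not PCSX2 code.
#include <cstring>
#include <cstddef>
#include <emmintrin.h>   // SSE2 intrinsics (also pulls in _mm_prefetch / _mm_sfence)

static void streaming_copy(void* dest, const void* src, size_t size)
{
    const size_t kUncachedThreshold = 4 * 1024;  // assumed cutoff, like the old UNCACHED_COPY

    if (size < kUncachedThreshold) {
        std::memcpy(dest, src, size);            // small copies: let the cache do its job
        return;
    }

    __m128i*       d = static_cast<__m128i*>(dest);
    const __m128i* s = static_cast<const __m128i*>(src);

    for (size_t i = 0; i < size / 16; ++i) {
        // Read ahead, then store with a non-temporal write that bypasses the cache --
        // the same idea as the deleted prefetchnta/movntq loop.
        _mm_prefetch(reinterpret_cast<const char*>(s + i) + 512, _MM_HINT_NTA);
        _mm_stream_si128(d + i, _mm_load_si128(s + i));
    }
    _mm_sfence();                                // drain the write-combining buffers
}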
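
The deleted memcpy_amd_qwc (both the MSVC inline-assembly version and the Linux GAS version above) took its count in 128-bit quadwords (QWCs) rather than bytes. After this patch, a caller that still thinks in QWCs would simply scale the count and rely on the standard library, roughly as in this sketch; the wrapper name is hypothetical and chosen only for the example.

#include <cstring>
#include <cstddef>

// Hypothetical helper: copy 'qwc' quadwords (128 bits / 16 bytes each)
// with standard memcpy, which is what the patch standardizes on.
static inline void memcpy_qwc_sketch(void* dest, const void* src, size_t qwc)
{
    std::memcpy(dest, src, qwc * 16);
}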
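
The one routine the patch leaves behind in MemcpyFast.cpp is memcmp_mmx, whose documented contract is narrow: the compare size must be a multiple of 8, and the return value is 0 when the blocks match, nonzero otherwise. If it is eventually dropped, as the new comment hopes, a portable stand-in with the same contract could be as small as the hypothetical sketch below.

#include <cstring>
#include <cstdint>
#include <cassert>

// Same contract as memcmp_mmx: cmpsize must be a multiple of 8;
// returns 0 if the blocks are equal, a nonzero value otherwise.
static uint8_t memcmp_mmx_sketch(const void* src1, const void* src2, int cmpsize)
{
    assert((cmpsize & 7) == 0);
    return std::memcmp(src1, src2, static_cast<size_t>(cmpsize)) != 0 ? 1 : 0;
}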