From d1e0922417b4c69c2b66311023ba9464235428c7 Mon Sep 17 00:00:00 2001
From: arcum42
Date: Tue, 13 Jul 2010 10:48:35 +0000
Subject: [PATCH] ReorderingMTGS: Initial Linux version of memcpy_amd_qwc.
 Disabled for now, till I get a chance to look it over better.

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3477 96395faa-99c1-11dd-bbfe-3dabce05a288
---
 common/build/Utilities/Utilities.cbp  |  4 ++
 common/include/Utilities/MemcpyFast.h | 92 +++++++++++++++++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/common/build/Utilities/Utilities.cbp b/common/build/Utilities/Utilities.cbp
index 0e98314365..0d7fbbe596 100644
--- a/common/build/Utilities/Utilities.cbp
+++ b/common/build/Utilities/Utilities.cbp
@@ -208,6 +208,10 @@
+
+
diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h
index d78f9f8a8e..9fc9331651 100644
--- a/common/include/Utilities/MemcpyFast.h
+++ b/common/include/Utilities/MemcpyFast.h
@@ -23,6 +23,95 @@
 extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
 extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize);
 
+#if 0
+    // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
+    // Quadword Copy! Count is in QWCs (128 bits).  Neither source nor dest need to be aligned.
+    static __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
+    {
+        // Optimization Analysis: This code is *nearly* optimal.  Do not think that using XMM
+        // registers will improve copy performance, because they won't.  Use of XMMs is only
+        // warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
+        // and even then the benefits are typically minimal (sometimes slower depending on the
+        // amount of data being copied).
+        //
+        // Thus: MMX are alignment safe, fast, and widely available.  Lets just stick with them.
+        //   --air
+
+        // Linux Conversion note:
+        //  This code would benefit nicely from having inline-able GAS syntax, since it should
+        //  allow GCC to optimize the first 3 instructions out of existence in many scenarios.
+        //  And its called enough times to probably merit the extra effort to ensure proper
+        //  optimization. --air
+
+        __asm__
+        (
+            ".intel_syntax noprefix\n"
+            "mov ecx, [%[dest]]\n"
+            "mov edx, [%[src]]\n"
+            "mov eax, [%[qwc]]\n"           // keep a copy of count
+            "shr eax, 1\n"
+            "jz memcpy_qwc_1\n"             // only one 16 byte block to copy?
+
+            "cmp eax, 64\n"                 // "IN_CACHE_COPY/32"
+            "jb memcpy_qwc_loop1\n"         // small copies should be cached (definite speedup --air)
+
+            "memcpy_qwc_loop2:\n"           // 32-byte blocks, uncached copy
+            "prefetchnta [edx + 568]\n"     // start reading ahead (tested: it helps! --air)
+
+            "movq mm0,[edx+0]\n"            // read 64 bits
+            "movq mm1,[edx+8]\n"
+            "movq mm2,[edx+16]\n"
+            "movntq [ecx+0], mm0\n"         // write 64 bits, bypassing the cache
+            "movntq [ecx+8], mm1\n"
+            "movq mm3,[edx+24]\n"
+            "movntq [ecx+16], mm2\n"
+            "movntq [ecx+24], mm3\n"
+
+            "add edx,32\n"                  // update source pointer
+            "add ecx,32\n"                  // update destination pointer
+            "sub eax,1\n"
+            "jnz memcpy_qwc_loop2\n"        // last 64-byte block?
+            "sfence\n"                      // flush the write buffer
+            "jmp memcpy_qwc_1\n"
+
+            // 32-byte blocks, cached!
+            // This *is* important.  Removing this and using exclusively non-temporal stores
+            // results in noticable speed loss!
+
+            "memcpy_qwc_loop1:\n"
+            "prefetchnta [edx + 568]\n"     // start reading ahead (tested: it helps! --air)
+
+            "movq mm0,[edx+0]\n"            // read 64 bits
+            "movq mm1,[edx+8]\n"
+            "movq mm2,[edx+16]\n"
+            "movq [ecx+0], mm0\n"           // write 64 bits, bypassing the cache
+            "movq [ecx+8], mm1\n"
+            "movq mm3,[edx+24]\n"
+            "movq [ecx+16], mm2\n"
+            "movq [ecx+24], mm3\n"
+
+            "add edx,32\n"                  // update source pointer
+            "add ecx,32\n"                  // update destination pointer
+            "sub eax,1\n"
+            "jnz memcpy_qwc_loop1\n"        // last 64-byte block?
+
+            "memcpy_qwc_1:\n"
+            "test [%[qwc]],dword ptr 1\n"
+            "jz memcpy_qwc_final\n"
+            "movq mm0,[edx]\n"
+            "movq mm1,[edx+8]\n"
+            "movq [ecx], mm0\n"
+            "movq [ecx+8], mm1\n"
+
+            "memcpy_qwc_final:\n"
+            "emms\n"                        // clean up the MMX state
+            ".att_syntax\n"
+            : "=r"(dest), "=r"(src), "=r"(qwc)
+            : [dest]"r"(dest), [src]"r"(src), [qwc]"r"(qwc)
+            //: Needs a clobber list here
+        );
+    }
+#endif
 
 #else
 # include "win_memzero.h"
@@ -41,8 +130,11 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
 #define memcpy_aligned(d,s,c)   memcpy_amd_(d,s,c)  // Memcpy with 16-byte Aligned addresses
 #define memcpy_const    memcpy_amd_     // Memcpy with constant size
 #define memcpy_constA   memcpy_amd_     // Memcpy with constant size and 16-byte aligned
+
+//#define memcpy_qwc(d,s,c)   memcpy_amd_qwc(d,s,c)
 #ifndef __LINUX__
 #define memcpy_qwc(d,s,c)   memcpy_amd_qwc(d,s,c)
 #else
 #define memcpy_qwc(d,s,c)   memcpy_amd_(d,s,c*16)
+//#define memcpy_qwc(d,s,c)   memcpy_amd_qwc(d,s,c)
 #endif
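The block above is committed disabled (#if 0), and its own comments flag what still has to change before GCC will accept it: there is no clobber list, and operands written as "[%[dest]]" will not assemble once GCC substitutes AT&T-style register names under ".intel_syntax noprefix"; listing dest/src/qwc as both outputs and separate inputs is also redundant. Below is a minimal sketch of one way the routine could be expressed for GCC on a 32-bit x86 target. The function name, the register pinning ("c", "d" and "S" constraints), the numeric local labels (which survive the function being inlined more than once) and the clobber list are choices made for this sketch, not the code the branch eventually committed. Pinning the operands to ecx/edx/esi lets the Intel-syntax body name registers directly, which avoids operand substitution entirely and lets GCC drop the extra pointer reloads the "Linux Conversion note" complains about.

// Sketch only: assumes a 32-bit x86 CPU with SSE's MMX extensions
// (prefetchnta/movntq/sfence).  Not the committed PCSX2 implementation.
#include <stddef.h>

static inline void memcpy_amd_qwc_sketch(void *dest, const void *src, size_t qwc)
{
    __asm__ __volatile__
    (
        ".intel_syntax noprefix\n\t"
        "mov     eax, esi\n\t"            // working copy of the QWC count
        "shr     eax, 1\n\t"              // eax = number of 32-byte blocks
        "jz      2f\n\t"                  // zero or one QWC -> just the tail copy

        "cmp     eax, 64\n\t"             // ~2KB threshold (IN_CACHE_COPY/32)
        "jb      1f\n\t"                  // small copies: keep them in the cache

        "0:\n\t"                          // uncached 32-byte blocks
        "prefetchnta [edx + 568]\n\t"     // read ahead of the copy
        "movq    mm0, [edx]\n\t"
        "movq    mm1, [edx + 8]\n\t"
        "movq    mm2, [edx + 16]\n\t"
        "movq    mm3, [edx + 24]\n\t"
        "movntq  [ecx], mm0\n\t"          // non-temporal stores bypass the cache
        "movntq  [ecx + 8], mm1\n\t"
        "movntq  [ecx + 16], mm2\n\t"
        "movntq  [ecx + 24], mm3\n\t"
        "add     edx, 32\n\t"
        "add     ecx, 32\n\t"
        "sub     eax, 1\n\t"
        "jnz     0b\n\t"
        "sfence\n\t"                      // order the streaming stores
        "jmp     2f\n\t"

        "1:\n\t"                          // cached 32-byte blocks
        "prefetchnta [edx + 568]\n\t"
        "movq    mm0, [edx]\n\t"
        "movq    mm1, [edx + 8]\n\t"
        "movq    mm2, [edx + 16]\n\t"
        "movq    mm3, [edx + 24]\n\t"
        "movq    [ecx], mm0\n\t"          // plain stores stay in the cache
        "movq    [ecx + 8], mm1\n\t"
        "movq    [ecx + 16], mm2\n\t"
        "movq    [ecx + 24], mm3\n\t"
        "add     edx, 32\n\t"
        "add     ecx, 32\n\t"
        "sub     eax, 1\n\t"
        "jnz     1b\n\t"

        "2:\n\t"
        "test    esi, 1\n\t"              // odd QWC count? copy the last 16 bytes
        "jz      3f\n\t"
        "movq    mm0, [edx]\n\t"
        "movq    mm1, [edx + 8]\n\t"
        "movq    [ecx], mm0\n\t"
        "movq    [ecx + 8], mm1\n\t"

        "3:\n\t"
        "emms\n\t"                        // leave the FPU usable again
        ".att_syntax prefix\n\t"
        : "+c"(dest), "+d"(src)           // ecx = dest, edx = src (both advanced)
        : "S"(qwc)                        // esi = QWC count, read-only
        : "eax", "mm0", "mm1", "mm2", "mm3", "memory"
    );
}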
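A quick, purely illustrative sanity check of the sketch (compiled 32-bit, e.g. with g++ -m32, on an SSE-capable CPU): the buffer size is chosen so the copy exercises both the non-temporal loop (64 or more 32-byte blocks) and the odd trailing quadword.

#include <cstddef>
#include <cstdio>
#include <cstring>
// assumes memcpy_amd_qwc_sketch from the sketch above is in scope

int main()
{
    // 129 QWCs = 2064 bytes: 64 full 32-byte blocks (hits the uncached
    // threshold) plus one odd trailing quadword.
    static unsigned char src[129 * 16], dst[129 * 16];
    for (size_t i = 0; i < sizeof(src); ++i)
        src[i] = static_cast<unsigned char>(i * 7 + 3);

    memcpy_amd_qwc_sketch(dst, src, 129);

    std::printf("%s\n", std::memcmp(dst, src, sizeof(src)) == 0 ? "match" : "MISMATCH");
    return 0;
}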