ReorderingMTGS: More tweaks to asm memcpy files (made code changes to Linux side, comment changes to Win32 side).

Linux Devs: Let's get this memcpy thing finalized, if it's not already. I'd like to merge the current state of this branch into trunk as soon as possible, since it's currently looking very stable and has been, up to this point, a code cleanup and stabilization project. (More invasive changes are coming soon.)

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3518 96395faa-99c1-11dd-bbfe-3dabce05a288
Jake.Stine 2010-07-17 15:03:45 +00:00
parent 1c9cefd778
commit 2d4c7aaa25
3 changed files with 102 additions and 106 deletions

View File

@ -42,10 +42,12 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
extern void memcpy_vibes(void * dest, const void * src, int size);
extern void gen_memcpy_vibes();
#define memcpy_fast memcpy_amd_ // Fast memcpy
#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c) // Memcpy with 16-byte Aligned addresses
#define memcpy_const memcpy_amd_ // Memcpy with constant size
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
#define memcpy_qwc_ memcpy_vibes // Memcpy in aligned qwc increments, with 0x400 qwc or less
#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c)
// Useful alternative if we think memcpy_amd_qwc is buggy
//#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16)
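Since the count argument conventions differ between these macros (bytes for memcpy_fast, 128-bit quadwords for memcpy_qwc), here is a minimal usage sketch; the function name and buffer types are placeholders, not code from this commit:

#include <cstdint>

// Hypothetical call site: copy 1 KiB of data expressed as 64 quadwords.
void CopyPacket_example(uint8_t* dest, const uint8_t* src)
{
	memcpy_qwc(dest, src, 64);          // count is in QWCs (64 * 16 = 1024 bytes)
	//memcpy_fast(dest, src, 64 * 16);  // equivalent byte-count form via the fallback above
}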

View File

@ -41,12 +41,10 @@
MEMCPY_AMD.CPP
******************************************************************************/
// Very optimized memcpy() routine for AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before
// calling!
#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
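For readers skimming the asm, a rough C++ outline of the size-tier dispatch described above may help. It is a sketch only: the tiny-copy threshold matches TINY_BLOCK_COPY, while the cached/uncached split values are taken from the AMD reference code this file is based on and are assumptions here; every branch falls back to std::memcpy so the sketch stays portable.

#include <cstring>
#include <cstddef>

static const size_t kTinyBlockCopy = 64;          // below this: string-instruction ("movsd") copy
static const size_t kInCacheCopy   = 64 * 1024;   // assumed: below this, MMX loads + ordinary cached stores
static const size_t kUncachedCopy  = 197 * 1024;  // assumed: below this, MMX loads + movntq streaming stores

void memcpy_tiered_sketch(void* dest, const void* src, size_t n)
{
	// Each branch stands in for one of the four asm copy strategies; the real
	// routine selects them with compares and jumps on the byte count.
	if (n < kTinyBlockCopy)
		std::memcpy(dest, src, n);   // tiny copy
	else if (n < kInCacheCopy)
		std::memcpy(dest, src, n);   // medium copy, data left in cache
	else if (n < kUncachedCopy)
		std::memcpy(dest, src, n);   // large copy, cache-bypassing stores
	else
		std::memcpy(dest, src, n);   // huge copy, block prefetch + streaming stores
}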
@ -68,10 +66,8 @@ MEMCPY_AMD.CPP
#if defined(_MSC_VER)
// Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs.
__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
{
__asm
@ -92,6 +88,7 @@ __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_
jbe $memcpy_do_align ; it appears to be slower
cmp eax, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov eax, 8 ; a trick that's faster than rep movsb...
sub eax, edi ; align destination to qword
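As a side note on the hunk above: the alignment preamble only runs for copies larger than 64 KiB (smaller ones skip it because, per the comment, the fixup measured slower). A small sketch of the head-byte count it computes, assuming the usual mask-to-low-3-bits step that follows later in the full routine:

#include <cstdint>
#include <cstddef>

// Bytes to copy one at a time so that dest becomes 8-byte (qword) aligned;
// mirrors "mov eax, 8 / sub eax, edi" plus the low-3-bit mask applied later.
static size_t bytes_until_qword_aligned(const void* dest)
{
	return (8 - (reinterpret_cast<uintptr_t>(dest) & 7)) & 7;
}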

View File

@ -155,99 +155,96 @@ __forceinline void memcpy_vibes(void * dest, const void * src, int size) {
}
#endif
#endif
// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment,
// to get around compilation issues with having it in the headers.
#ifdef __LINUX__
// This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
__forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
// registers will improve copy performance, because they won't. Use of XMMs is only
// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
// and even then the benefits are typically minimal (sometimes slower depending on the
// amount of data being copied).
//
// Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them.
// --air
// Linux Conversion note:
// This code would benefit nicely from having inline-able GAS syntax, since it should
// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
// And it's called enough times to probably merit the extra effort to ensure proper
// optimization. --air
__asm__
(
".intel_syntax noprefix\n"
"mov eax, %[qwc]\n" // keep a copy of count for looping
"shr eax, 1\n"
"jz memcpy_qwc_1\n" // only one 16 byte block to copy?
"cmp eax, 64\n" // "IN_CACHE_COPY/32"
"jb memcpy_qwc_loop1\n" // small copies should be cached (definite speedup --air)
"memcpy_qwc_loop2:\n" // 32-byte blocks, uncached copy
"prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)
"movq mm0,[%[src]+0]\n" // read 64 bits
"movq mm1,[%[src]+8]\n"
"movq mm2,[%[src]+16]\n"
"movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache
"movntq [%[dest]+8], mm1\n"
"movq mm3,[%[src]+24]\n"
"movntq [%[dest]+16], mm2\n"
"movntq [%[dest]+24], mm3\n"
"add %[src],32\n" // update source pointer
"add %[dest],32\n" // update destination pointer
"sub eax,1\n"
"jnz memcpy_qwc_loop2\n" // last 64-byte block?
"sfence\n" // flush the write buffer
"jmp memcpy_qwc_1\n"
// 32-byte blocks, cached!
// This *is* important. Removing this and using exclusively non-temporal stores
// results in noticeable speed loss!
"memcpy_qwc_loop1:\n"
"prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)
"movq mm0,[%[src]+0]\n" // read 64 bits
"movq mm1,[%[src]+8]\n"
"movq mm2,[%[src]+16]\n"
"movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache
"movq [%[dest]+8], mm1\n"
"movq mm3,[%[src]+24]\n"
"movq [%[dest]+16], mm2\n"
"movq [%[dest]+24], mm3\n"
"add %[src],32\n" // update source pointer
"add %[dest],32\n" // update destination pointer
"sub eax,1\n"
"jnz memcpy_qwc_loop1\n" // last 64-byte block?
"memcpy_qwc_1:\n"
"testl %[qwc],1\n"
"jz memcpy_qwc_final\n"
"movq mm0,[%[src]]\n"
"movq mm1,[%[src]+8]\n"
"movq [%[dest]], mm0\n"
"movq [%[dest]+8], mm1\n"
"memcpy_qwc_final:\n"
"emms\n" // clean up the MMX state
".att_syntax\n"
: "=&r"(dest), "=&r"(src), "=&r"(qwc)
: [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc)
: "memory", "eax", "mm0", "mm1", "mm2", "mm3"
);
}
#endif
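For readers who prefer intrinsics over GAS, a rough C++ rendering of the same structure follows. It is illustrative only, not the routine PCSX2 builds: the MMX intrinsics used here (_mm_stream_pi, _mm_empty, etc.) are unavailable on 64-bit MSVC, and no attempt is made to mirror the register constraints of the inline asm.

#include <mmintrin.h>   // __m64, _mm_empty
#include <xmmintrin.h>  // _mm_stream_pi, _mm_prefetch, _mm_sfence
#include <cstddef>

static void memcpy_qwc_sketch(void* dest, const void* src, size_t qwc)
{
	const __m64* s = static_cast<const __m64*>(src);
	__m64*       d = static_cast<__m64*>(dest);

	size_t pairs = qwc >> 1;                   // 32-byte (2 QWC) iterations
	const bool streaming = (pairs >= 64);      // the asm's "cmp eax, 64" threshold (2 KiB and up)

	for (; pairs != 0; --pairs, s += 4, d += 4)
	{
		_mm_prefetch(reinterpret_cast<const char*>(s) + 568, _MM_HINT_NTA);
		__m64 m0 = s[0], m1 = s[1], m2 = s[2], m3 = s[3];
		if (streaming)
		{
			// Large copies: movntq-style non-temporal stores that bypass the cache.
			_mm_stream_pi(d + 0, m0); _mm_stream_pi(d + 1, m1);
			_mm_stream_pi(d + 2, m2); _mm_stream_pi(d + 3, m3);
		}
		else
		{
			// Small copies: ordinary stores, keeping the data cached.
			d[0] = m0; d[1] = m1; d[2] = m2; d[3] = m3;
		}
	}
	if (streaming)
		_mm_sfence();                          // flush the write-combining buffers

	if (qwc & 1)                               // one trailing 16-byte block
	{
		d[0] = s[0];
		d[1] = s[1];
	}
	_mm_empty();                               // emms: restore the x87/MMX state
}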