mirror of https://github.com/PCSX2/pcsx2.git
ReorderingMTGS: More tweaks to asm memcpy files (made code changes to the Linux side, comment changes to the Win32 side).

Linux devs: let's get this memcpy thing finalized, if it isn't already. I'd like to merge the current state of this branch into trunk as soon as possible, since it's currently looking very stable and has been, up to this point, a code cleanup and stabilization project. (More invasive changes coming soon.)

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3518 96395faa-99c1-11dd-bbfe-3dabce05a288
parent 1c9cefd778
commit 2d4c7aaa25
@@ -42,10 +42,12 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
 extern void memcpy_vibes(void * dest, const void * src, int size);
 extern void gen_memcpy_vibes();
 
 #define memcpy_fast             memcpy_amd_            // Fast memcpy
 #define memcpy_aligned(d,s,c)   memcpy_amd_(d,s,c)     // Memcpy with 16-byte Aligned addresses
 #define memcpy_const            memcpy_amd_            // Memcpy with constant size
 #define memcpy_constA           memcpy_amd_            // Memcpy with constant size and 16-byte aligned
 #define memcpy_qwc_             memcpy_vibes           // Memcpy in aligned qwc increments, with 0x400 qwc or less
 #define memcpy_qwc(d,s,c)       memcpy_amd_qwc(d,s,c)
 
+// Useful alternative if we think memcpy_amd_qwc is buggy
 //#define memcpy_qwc(d,s,c)     memcpy_amd_(d,s,c*16)
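Note on usage: the count passed to memcpy_qwc is in 128-bit quadwords (QWCs), not bytes, which is why the commented-out fallback above multiplies by 16. A minimal, self-contained sketch of the call pattern (illustrative only; the fallback define stands in for the real memcpy_amd_qwc so the snippet compiles on its own):

    #include <cstring>
    #include <cstddef>

    // Fallback form taken from the commented-out define above; the real build
    // maps memcpy_qwc to memcpy_amd_qwc instead.
    #define memcpy_qwc(d, s, c)   std::memcpy((d), (s), (c) * 16)

    // Hypothetical helper: copies 'bytes' (assumed to be a multiple of 16).
    void CopyQwords(void* dst, const void* src, size_t bytes)
    {
        size_t qwc = bytes / 16;        // convert bytes to 128-bit quadword count
        memcpy_qwc(dst, src, qwc);
    }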
@@ -41,12 +41,10 @@
   MEMCPY_AMD.CPP
 ******************************************************************************/
 
-// Very optimized memcpy() routine for AMD Athlon and Duron family.
-// This code uses any of FOUR different basic copy methods, depending
-// on the transfer size.
 // NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
 // "Streaming Store"), and also uses the software prefetch instructions,
-// be sure you're running on Athlon/Duron or other recent CPU before calling!
+// be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before
+// calling!
 
 #define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
 // The smallest copy uses the X86 "movsd" instruction, in an optimized
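The NOTE above is the core idea of the routine: large copies use MOVNTQ non-temporal ("streaming") stores together with software prefetch so the copied data bypasses the cache. A hedged C++ illustration of the same pattern using MMX/SSE intrinsics instead of raw mnemonics (the function name and the 512-byte prefetch distance are illustrative, not from this file; assumes a 32-bit-era toolchain where MMX intrinsics are available):

    #include <cstddef>
    #include <mmintrin.h>    // __m64, _mm_empty
    #include <xmmintrin.h>   // _mm_stream_pi, _mm_prefetch, _mm_sfence

    // Streams 'bytes' (assumed to be a multiple of 32) from src to dest using
    // non-temporal 64-bit stores, mirroring the MOVNTQ/PREFETCHNTA pattern.
    void stream_copy32(void* dest, const void* src, size_t bytes)
    {
        __m64*       d = static_cast<__m64*>(dest);
        const __m64* s = static_cast<const __m64*>(src);

        for (size_t i = 0; i < bytes / 32; ++i)
        {
            _mm_prefetch(reinterpret_cast<const char*>(s) + 512, _MM_HINT_NTA); // read ahead
            __m64 a = s[0], b = s[1], c = s[2], e = s[3];
            _mm_stream_pi(d + 0, a);   // MOVNTQ: write 64 bits, bypassing the cache
            _mm_stream_pi(d + 1, b);
            _mm_stream_pi(d + 2, c);
            _mm_stream_pi(d + 3, e);
            s += 4;
            d += 4;
        }
        _mm_sfence();   // flush the write-combining buffers
        _mm_empty();    // EMMS: clean up the MMX state
    }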
@@ -68,10 +66,8 @@ MEMCPY_AMD.CPP
 
 #if defined(_MSC_VER)
 
-// --------------------------------------------------------------------------------------
-//   Fast memcpy as coded by AMD, and then improved by air.
-// --------------------------------------------------------------------------------------
 
+// Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs.
 __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
 {
     __asm
@@ -92,6 +88,7 @@ __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_
     jbe     $memcpy_do_align    ; it appears to be slower
     cmp     eax, 64*1024
     jbe     $memcpy_align_done
 
 $memcpy_do_align:
     mov     eax, 8              ; a trick that's faster than rep movsb...
     sub     eax, edi            ; align destination to qword
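For context, the $memcpy_do_align path above copies just enough leading bytes to bring the destination up to an 8-byte (qword) boundary before the fast loops run; the "8 minus the low pointer bits" arithmetic is the whole trick. A hedged C sketch of that calculation (illustrative helper, not part of the file):

    #include <cstdint>
    #include <cstddef>

    // Bytes needed to advance 'p' to the next 8-byte boundary; 0 if already aligned.
    // Sketch of the "mov eax, 8 / sub eax, edi" idea with the low bits masked off.
    static size_t bytes_to_qword_align(const void* p)
    {
        return (8 - (reinterpret_cast<uintptr_t>(p) & 7)) & 7;
    }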
@@ -155,99 +155,96 @@ __forceinline void memcpy_vibes(void * dest, const void * src, int size) {
 }
 
 #endif
 #endif
 
 // Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment,
 // to get around compilation issues with having it in the headers.
 #ifdef __LINUX__
 
 // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
 // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
 __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
 {
     // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
     // registers will improve copy performance, because they won't. Use of XMMs is only
     // warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
     // and even then the benefits are typically minimal (sometimes slower depending on the
     // amount of data being copied).
     //
     // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them.
     //   --air
 
     // Linux Conversion note:
     //  This code would benefit nicely from having inline-able GAS syntax, since it should
     //  allow GCC to optimize the first 3 instructions out of existence in many scenarios.
     //  And its called enough times to probably merit the extra effort to ensure proper
     //  optimization. --air
 
     __asm__
     (
         ".intel_syntax noprefix\n"
-            //"mov ecx, [%[dest]]\n"
-            //"mov edx, [%[src]]\n"
-            //"mov eax, [%[qwc]]\n"          // keep a copy of count
-            "mov eax, %[qwc]\n"
-            "shr eax, 1\n"
-            "jz memcpy_qwc_1\n"              // only one 16 byte block to copy?
-
-            "cmp %[qwc], 64\n"               // "IN_CACHE_COPY/32"
-            "jb memcpy_qwc_loop1\n"          // small copies should be cached (definite speedup --air)
+            "mov eax, %[qwc]\n"              // keep a copy of count for looping
+            "shr eax, 1\n"
+            "jz memcpy_qwc_1\n"              // only one 16 byte block to copy?
+
+            "cmp eax, 64\n"                  // "IN_CACHE_COPY/32"
+            "jb memcpy_qwc_loop1\n"          // small copies should be cached (definite speedup --air)
 
     "memcpy_qwc_loop2:\n"                    // 32-byte blocks, uncached copy
         "prefetchnta [%[src] + 568]\n"       // start reading ahead (tested: it helps! --air)
 
         "movq mm0,[%[src]+0]\n"              // read 64 bits
         "movq mm1,[%[src]+8]\n"
         "movq mm2,[%[src]+16]\n"
         "movntq [%[dest]+0], mm0\n"          // write 64 bits, bypassing the cache
         "movntq [%[dest]+8], mm1\n"
         "movq mm3,[%[src]+24]\n"
         "movntq [%[dest]+16], mm2\n"
         "movntq [%[dest]+24], mm3\n"
 
         "add %[src],32\n"                    // update source pointer
         "add %[dest],32\n"                   // update destination pointer
         "sub eax,1\n"
         "jnz memcpy_qwc_loop2\n"             // last 64-byte block?
         "sfence\n"                           // flush the write buffer
         "jmp memcpy_qwc_1\n"
 
     // 32-byte blocks, cached!
     // This *is* important. Removing this and using exclusively non-temporal stores
     // results in noticeable speed loss!
 
     "memcpy_qwc_loop1:\n"
         "prefetchnta [%[src] + 568]\n"       // start reading ahead (tested: it helps! --air)
 
         "movq mm0,[%[src]+0]\n"              // read 64 bits
         "movq mm1,[%[src]+8]\n"
         "movq mm2,[%[src]+16]\n"
         "movq [%[dest]+0], mm0\n"            // write 64 bits, bypassing the cache
         "movq [%[dest]+8], mm1\n"
         "movq mm3,[%[src]+24]\n"
         "movq [%[dest]+16], mm2\n"
         "movq [%[dest]+24], mm3\n"
 
         "add %[src],32\n"                    // update source pointer
         "add %[dest],32\n"                   // update destination pointer
         "sub eax,1\n"
         "jnz memcpy_qwc_loop1\n"             // last 64-byte block?
 
     "memcpy_qwc_1:\n"
-        "test %[qwc],1\n"
+        "testl %[qwc],1\n"
         "jz memcpy_qwc_final\n"
         "movq mm0,[%[src]]\n"
         "movq mm1,[%[src]+8]\n"
         "movq [%[dest]], mm0\n"
         "movq [%[dest]+8], mm1\n"
 
     "memcpy_qwc_final:\n"
         "emms\n"                             // clean up the MMX state
         ".att_syntax\n"
             : "=&r"(dest), "=&r"(src), "=&r"(qwc)
             : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc)
             : "memory", "eax", "mm0", "mm1", "mm2", "mm3"
     );
 }
 
 #endif
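To summarize the Linux-side routine above for anyone reviewing the GAS conversion: it copies qwc 128-bit quadwords, walking 32 bytes per loop iteration, choosing uncached movntq stores for large blocks and cached movq stores for small ones, then handling a trailing odd quadword. A plain-C reference sketch of just those semantics (no prefetch or non-temporal behaviour; for comparison only, not the project's implementation):

    #include <cstring>
    #include <cstdint>
    #include <cstddef>

    // Reference-only sketch: copies 'qwc' 128-bit quadwords from src to dest.
    // The real memcpy_amd_qwc streams 32 bytes per iteration with MMX moves,
    // switching between cached (movq) and uncached (movntq) stores at the
    // IN_CACHE_COPY threshold, and finishes with one trailing quadword if qwc is odd.
    static void memcpy_qwc_reference(void* dest, const void* src, size_t qwc)
    {
        uint8_t*       d = static_cast<uint8_t*>(dest);
        const uint8_t* s = static_cast<const uint8_t*>(src);

        size_t pairs = qwc >> 1;           // 32-byte blocks, as in the main loops
        for (size_t i = 0; i < pairs; ++i, d += 32, s += 32)
            std::memcpy(d, s, 32);

        if (qwc & 1)                       // trailing 16-byte quadword
            std::memcpy(d, s, 16);
    }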