common: remove old memcpy implementation

PCSX2 uses standard memcpy now (thanks to xsacha)
Gregory Hainaut 2014-10-26 22:33:42 +01:00
parent 4d818f6cd9
commit 69e88ffed0
4 changed files with 3 additions and 637 deletions


@@ -32,10 +32,6 @@ extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
void _memset16_unaligned( void* dest, u16 data, size_t size );
// MemcpyVibes.cpp functions
extern void memcpy_vibes(void * dest, const void * src, int size);
extern void gen_memcpy_vibes();
#define memcpy_fast memcpy
#define memcpy_aligned(d,s,c) memcpy(d,s,c)
#define memcpy_const memcpy


@@ -128,7 +128,6 @@ set(UtilitiesSources
wxAppWithHelpers.cpp
wxGuiTools.cpp
wxHelpers.cpp
x86/MemcpyVibes.cpp
)
# variable with all headers of this library


@@ -31,290 +31,19 @@
3dsdk.support@amd.com
******************************************************************************/
// GH: AMD memcpy was removed. The remaining part (memcmp_mmx) is likely from Zerofrog.
// Hopefully memcmp_mmx will be dropped in the future.
#include "PrecompiledHeader.h" #include "PrecompiledHeader.h"
#ifdef _MSC_VER #ifdef _MSC_VER
#pragma warning(disable:4414) #pragma warning(disable:4414)
#endif #endif
/*****************************************************************************
MEMCPY_AMD.CPP
******************************************************************************/
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before
// calling!
#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".
#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.
#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
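The three thresholds above describe a tiered strategy: a movsd-style copy for tiny blocks, cached MMX moves with software prefetch for mid-size blocks, and non-temporal MOVNTQ stores for blocks that would spill out of the cache. Below is a minimal C++ sketch of that dispatch using SSE2 intrinsics instead of the removed MMX assembly; the helper name, the in-sketch threshold constant, and the assumption that dest is 16-byte aligned for the streaming stores are mine, not the source's.

#include <emmintrin.h>   // SSE2: _mm_loadu_si128, _mm_stream_si128, _mm_sfence
#include <cstring>
#include <cstddef>

static void memcpy_tiered_sketch(void* dest, const void* src, size_t n)
{
    const size_t kUncachedCopy = 4 * 1024;     // mirrors UNCACHED_COPY above
    if (n < kUncachedCopy) {
        std::memcpy(dest, src, n);             // tiny / in-cache cases
        return;
    }
    char* d = static_cast<char*>(dest);
    const char* s = static_cast<const char*>(src);
    size_t blocks = n / 16;
    for (size_t i = 0; i < blocks; i++) {
        __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i * 16));
        _mm_stream_si128(reinterpret_cast<__m128i*>(d + i * 16), v);  // bypass the cache
    }
    std::memcpy(d + blocks * 16, s + blocks * 16, n % 16);            // tail bytes
    _mm_sfence();                              // drain the write-combining buffers
}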
// Inline assembly syntax for use with Visual C++
#if defined(_MSC_VER)
// Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs.
__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
{
__asm
{
push edi
push esi
mov edi, ecx ; destination
mov esi, edx ; source
mov ecx, [esp+12] ; number of bytes to copy
mov eax, ecx ; keep a copy of count
cld
cmp eax, TINY_BLOCK_COPY
jb $memcpy_ic_3 ; tiny? skip mmx copy
cmp eax, 32*1024 ; don't align between 32k-64k because
jbe $memcpy_do_align ; it appears to be slower
cmp eax, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov eax, 8 ; a trick that's faster than rep movsb...
sub eax, edi ; align destination to qword
and eax, 111b ; get the low bits
sub ecx, eax ; update copy count
neg eax ; set up to jump into the array
add eax, offset $memcpy_align_done
jmp eax ; jump to array of movsb's
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
$memcpy_align_done: ; destination is dword aligned
mov eax, ecx ; number of bytes left to copy
shr eax, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes
cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
jae $memcpy_uc_test
// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1: ; 64-byte block copies, in-cache copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0, [esi+0] ; read 64 bits
movq mm1, [esi+8]
movq [edi+0], mm0 ; write 64 bits
movq [edi+8], mm1 ; note: the normal movq writes the
movq mm2, [esi+16] ; data to cache; a cache line will be
movq mm3, [esi+24] ; allocated as needed, to store the data
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
sub eax, 1
jnz $memcpy_ic_1 ; last 64-byte block?
$memcpy_ic_2:
mov eax, ecx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
shr eax, 2 ; dword count
and eax, 1111b ; only look at the "remainder" bits
neg eax ; set up to jump into the array
add eax, offset $memcpy_last_few
jmp eax ; jump to array of movsd's
$memcpy_uc_test:
or eax, eax ; tail end of block prefetch will jump here
jz $memcpy_ic_2 ; no more 64-byte blocks left
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0,[esi+0] ; read 64 bits
add edi,64 ; update destination pointer
movq mm1,[esi+8]
add esi,64 ; update source pointer
movq mm2,[esi-48]
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
movq mm0,[esi-40] ; note: movntq also prevents the CPU
movntq [edi-56], mm1 ; from READING the destination address
movq mm1,[esi-32] ; into the cache, only to be over-written
movntq [edi-48], mm2 ; so that also helps performance
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
movntq [edi-8], mm1
sub eax, 1
jnz $memcpy_uc_1 ; last 64-byte block?
jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been
// disabled to help keep the code cache footprint of memcpy_fast to a minimum.
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
align 16
movsd
movsd ; perform last 1-15 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd ; perform last 1-7 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
$memcpy_last_few: ; dword aligned from before movsd's
and ecx, 11b ; the last few cows must come home
jz $memcpy_final ; no more, let's leave
rep movsb ; the last 1, 2, or 3 bytes
$memcpy_final:
pop esi
pop edi
emms ; clean up the MMX state
sfence ; flush the write buffer
//mov eax, [dest] ; ret value = destination pointer
ret 4
}
}
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
__fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
// registers will improve copy performance, because they won't. Use of XMMs is only
// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
// and even then the benefits are typically minimal (sometimes slower depending on the
// amount of data being copied).
//
// Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them.
// --air
// Linux Conversion note:
// This code would benefit nicely from having inline-able GAS syntax, since it should
// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
// And it's called enough times to probably merit the extra effort to ensure proper
// optimization. --air
__asm
{
mov ecx, dest
mov edx, src
mov eax, qwc ; keep a copy of count
shr eax, 1
jz $memcpy_qwc_1 ; only one 16 byte block to copy?
cmp eax, IN_CACHE_COPY/32
jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air)
$memcpy_qwc_loop2: ; 32-byte blocks, uncached copy
prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
movq mm0,[edx+0] ; read 64 bits
movq mm1,[edx+8]
movq mm2,[edx+16]
movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache
movntq [ecx+8], mm1
movq mm3,[edx+24]
movntq [ecx+16], mm2
movntq [ecx+24], mm3
add edx,32 ; update source pointer
add ecx,32 ; update destination pointer
sub eax,1
jnz $memcpy_qwc_loop2 ; last 64-byte block?
sfence ; flush the write buffer
jmp $memcpy_qwc_1
; 32-byte blocks, cached!
; This *is* important. Removing this and using exclusively non-temporal stores
; results in noticeable speed loss!
$memcpy_qwc_loop1:
prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
movq mm0,[edx+0] ; read 64 bits
movq mm1,[edx+8]
movq mm2,[edx+16]
movq [ecx+0], mm0 ; write 64 bits, bypassing the cache
movq [ecx+8], mm1
movq mm3,[edx+24]
movq [ecx+16], mm2
movq [ecx+24], mm3
add edx,32 ; update source pointer
add ecx,32 ; update destination pointer
sub eax,1
jnz $memcpy_qwc_loop1 ; last 64-byte block?
$memcpy_qwc_1:
test qwc,1
jz $memcpy_qwc_final
movq mm0,[edx]
movq mm1,[edx+8]
movq [ecx], mm0
movq [ecx+8], mm1
$memcpy_qwc_final:
emms ; clean up the MMX state
}
}
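For reference, the contract of memcpy_amd_qwc (count in 128-bit quadwords, no alignment requirement on either pointer) can be expressed with unaligned SSE2 loads and stores. This is only an illustration of the interface, not the MMX routine PCSX2 shipped, and the helper name is made up.

#include <emmintrin.h>
#include <cstddef>

static void memcpy_qwc_sketch(void* dest, const void* src, size_t qwc)
{
    __m128i*       d = static_cast<__m128i*>(dest);
    const __m128i* s = static_cast<const __m128i*>(src);
    for (size_t i = 0; i < qwc; i++)                      // one 128-bit block per count
        _mm_storeu_si128(d + i, _mm_loadu_si128(s + i));
}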
// mmx mem-compare implementation, size has to be a multiple of 8
// returns 0 if equal, nonzero value if not equal
// ~10 times faster than standard memcmp
@@ -489,112 +218,4 @@ End:
}
}
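The memcmp_mmx contract described above (size a multiple of 8, 0 on equality, nonzero otherwise) boils down to a quadword-wise compare. A plain C++ sketch of that contract follows, with a hypothetical name and no claim to the MMX routine's speed.

#include <cstdint>
#include <cstring>

static uint8_t memcmp_qwords_sketch(const void* src1, const void* src2, int cmpsize)
{
    const uint8_t* a = static_cast<const uint8_t*>(src1);
    const uint8_t* b = static_cast<const uint8_t*>(src2);
    for (int i = 0; i < cmpsize; i += 8) {
        uint64_t qa, qb;
        std::memcpy(&qa, a + i, sizeof(qa));   // alignment-safe 64-bit loads
        std::memcpy(&qb, b + i, sizeof(qb));
        if (qa != qb)
            return 1;                          // nonzero: buffers differ
    }
    return 0;                                  // equal
}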
// returns the xor of all elements, cmpsize has to be mult of 8
void memxor_mmx(void* dst, const void* src1, int cmpsize)
{
pxAssert( (cmpsize&7) == 0 );
__asm {
mov ecx, cmpsize
mov eax, src1
mov edx, dst
cmp ecx, 64
jl Setup4
movq mm0, [eax]
movq mm1, [eax+8]
movq mm2, [eax+16]
movq mm3, [eax+24]
movq mm4, [eax+32]
movq mm5, [eax+40]
movq mm6, [eax+48]
movq mm7, [eax+56]
sub ecx, 64
add eax, 64
cmp ecx, 64
jl End8
Cmp8:
pxor mm0, [eax]
pxor mm1, [eax+8]
pxor mm2, [eax+16]
pxor mm3, [eax+24]
pxor mm4, [eax+32]
pxor mm5, [eax+40]
pxor mm6, [eax+48]
pxor mm7, [eax+56]
sub ecx, 64
add eax, 64
cmp ecx, 64
jge Cmp8
End8:
pxor mm0, mm4
pxor mm1, mm5
pxor mm2, mm6
pxor mm3, mm7
cmp ecx, 32
jl End4
pxor mm0, [eax]
pxor mm1, [eax+8]
pxor mm2, [eax+16]
pxor mm3, [eax+24]
sub ecx, 32
add eax, 32
jmp End4
Setup4:
cmp ecx, 32
jl Setup2
movq mm0, [eax]
movq mm1, [eax+8]
movq mm2, [eax+16]
movq mm3, [eax+24]
sub ecx, 32
add eax, 32
End4:
pxor mm0, mm2
pxor mm1, mm3
cmp ecx, 16
jl End2
pxor mm0, [eax]
pxor mm1, [eax+8]
sub ecx, 16
add eax, 16
jmp End2
Setup2:
cmp ecx, 16
jl Setup1
movq mm0, [eax]
movq mm1, [eax+8]
sub ecx, 16
add eax, 16
End2:
pxor mm0, mm1
cmp ecx, 8
jl End1
pxor mm0, [eax]
End1:
movq [edx], mm0
jmp End
Setup1:
movq mm0, [eax]
movq [edx], mm0
End:
emms
}
}
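What memxor_mmx computes, stripped of the MMX unrolling above, is an XOR fold of src1 in 64-bit chunks with the single 8-byte result written to dst. A portable sketch under that reading (hypothetical name, not the removed routine):

#include <cstdint>
#include <cstring>

static void memxor_sketch(void* dst, const void* src1, int cmpsize)
{
    uint64_t acc = 0;
    const uint8_t* p = static_cast<const uint8_t*>(src1);
    for (int i = 0; i < cmpsize; i += 8) {     // cmpsize must be a multiple of 8
        uint64_t chunk;
        std::memcpy(&chunk, p + i, sizeof(chunk));
        acc ^= chunk;
    }
    std::memcpy(dst, &acc, sizeof(acc));       // single 64-bit XOR of all elements
}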
#endif


@@ -1,250 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2010 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "x86emitter/x86emitter.h"
#include <xmmintrin.h>
using namespace x86Emitter;
// Max Number of qwc supported
#define _maxSize 0x400
typedef void (__fastcall *_memCpyCall)(void*, void*);
__aligned16 _memCpyCall _memcpy_vibes[_maxSize+1];
#if 1
// this version uses SSE intrinsics to perform an inline copy. MSVC disasm shows pretty
// decent code generation on whole, but it hasn't been benchmarked at all yet --air
__fi void memcpy_vibes(void * dest, const void * src, int size) {
float (*destxmm)[4] = (float(*)[4])dest, (*srcxmm)[4] = (float(*)[4])src;
size_t count = size & ~15, extra = size & 15;
destxmm -= 8 - extra, srcxmm -= 8 - extra;
switch (extra) {
do {
destxmm += 16, srcxmm += 16, count -= 16;
_mm_store_ps(&destxmm[-8][0], _mm_load_ps(&srcxmm[-8][0]));
case 15:
_mm_store_ps(&destxmm[-7][0], _mm_load_ps(&srcxmm[-7][0]));
case 14:
_mm_store_ps(&destxmm[-6][0], _mm_load_ps(&srcxmm[-6][0]));
case 13:
_mm_store_ps(&destxmm[-5][0], _mm_load_ps(&srcxmm[-5][0]));
case 12:
_mm_store_ps(&destxmm[-4][0], _mm_load_ps(&srcxmm[-4][0]));
case 11:
_mm_store_ps(&destxmm[-3][0], _mm_load_ps(&srcxmm[-3][0]));
case 10:
_mm_store_ps(&destxmm[-2][0], _mm_load_ps(&srcxmm[-2][0]));
case 9:
_mm_store_ps(&destxmm[-1][0], _mm_load_ps(&srcxmm[-1][0]));
case 8:
_mm_store_ps(&destxmm[ 0][0], _mm_load_ps(&srcxmm[ 0][0]));
case 7:
_mm_store_ps(&destxmm[ 1][0], _mm_load_ps(&srcxmm[ 1][0]));
case 6:
_mm_store_ps(&destxmm[ 2][0], _mm_load_ps(&srcxmm[ 2][0]));
case 5:
_mm_store_ps(&destxmm[ 3][0], _mm_load_ps(&srcxmm[ 3][0]));
case 4:
_mm_store_ps(&destxmm[ 4][0], _mm_load_ps(&srcxmm[ 4][0]));
case 3:
_mm_store_ps(&destxmm[ 5][0], _mm_load_ps(&srcxmm[ 5][0]));
case 2:
_mm_store_ps(&destxmm[ 6][0], _mm_load_ps(&srcxmm[ 6][0]));
case 1:
_mm_store_ps(&destxmm[ 7][0], _mm_load_ps(&srcxmm[ 7][0]));
case 0: NULL;
} while (count);
}
}
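The switch-wrapped do/while above is a Duff's-device layout: execution jumps into the middle of the unrolled body so the remainder blocks and the main loop share the same stores. A smaller illustration of the same control flow on 32-bit words (hypothetical helper, unrolled by 4 instead of 16):

#include <cstdint>
#include <cstddef>

static void copy_words_duff(uint32_t* dest, const uint32_t* src, size_t count)
{
    if (count == 0)
        return;
    size_t passes = (count + 3) / 4;        // full trips through the unrolled body
    switch (count % 4) {                    // enter mid-body to handle the remainder
    case 0: do { *dest++ = *src++;
    case 3:      *dest++ = *src++;
    case 2:      *dest++ = *src++;
    case 1:      *dest++ = *src++;
            } while (--passes > 0);
    }
}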
#else
#if 1
// This version creates one function with a lot of movaps
// It jumps to the correct movaps entry-point while adding
// the proper offset for adjustment...
static __pagealigned u8 _memCpyExec[__pagesize*16];
void gen_memcpy_vibes() {
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
xSetPtr(_memCpyExec);
int off =-(((_maxSize & 0xf) - 7) << 4);
for (int i = _maxSize, x = 0; i > 0; i--, x=(x+1)&7, off+=16) {
_memcpy_vibes[i] = (_memCpyCall)xGetPtr();
if (off >= 128) {
off = -128;
xADD(edx, 256);
xADD(ecx, 256);
}
const xRegisterSSE xmm_t(x);
xMOVAPS (xmm_t, ptr32[edx+off]);
xMOVNTPS(ptr32[ecx+off], xmm_t);
}
_memcpy_vibes[0] = (_memCpyCall)xGetPtr();
xRET();
pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}
__fi void memcpy_vibes(void * dest, const void * src, int size) {
int offset = ((size & 0xf) - 7) << 4;
_memcpy_vibes[size]((void*)((uptr)dest + offset), (void*)((uptr)src + offset));
}
#else
// This version creates '_maxSize' number of different functions,
// and calls the appropriate one...
static __pagealigned u8 _memCpyExec[__pagesize*_maxSize*2];
void gen_memcpy_vibes() {
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
xSetPtr(_memCpyExec);
for (int i = 0; i < _maxSize+1; i++)
{
int off = 0;
_memcpy_vibes[i] = (_memCpyCall)xGetAlignedCallTarget();
for (int j = 0, x = 0; j < i; j++, x=(x+1)&7, off+=16) {
if (off >= 128) {
off = -128;
xADD(edx, 256);
xADD(ecx, 256);
}
const xRegisterSSE xmm_t(x);
xMOVAPS(xmm_t, ptr32[edx+off]);
xMOVAPS(ptr32[ecx+off], xmm_t);
}
xRET();
pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
}
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}
__fi void memcpy_vibes(void * dest, const void * src, int size) {
_memcpy_vibes[size](dest, src);
}
#endif
#endif
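Both generated variants above share one idea: a table indexed by the qwc count, so the caller jumps straight to a copy body whose length is already known and pays no per-call size test or loop bookkeeping. A portable sketch of that dispatch, with a templated copy standing in for the emitted movaps chains (the names and the tiny table size are mine):

#include <emmintrin.h>
#include <cstddef>

typedef void (*CopyFn)(void* dest, const void* src);

template <size_t QWC>
static void copy_fixed(void* dest, const void* src)
{
    __m128i*       d = static_cast<__m128i*>(dest);
    const __m128i* s = static_cast<const __m128i*>(src);
    for (size_t i = 0; i < QWC; i++)        // trip count known at compile time
        _mm_storeu_si128(d + i, _mm_loadu_si128(s + i));
}

static const CopyFn copy_table[] = {
    copy_fixed<0>, copy_fixed<1>, copy_fixed<2>, copy_fixed<3>, copy_fixed<4>,
};

static void memcpy_dispatch_sketch(void* dest, const void* src, size_t qwc)
{
    copy_table[qwc](dest, src);             // qwc must be below the table size
}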
// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment,
// to get around compilation issues with having it in the headers.
#ifdef __linux__
// This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
__fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
// registers will improve copy performance, because they won't. Use of XMMs is only
// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
// and even then the benefits are typically minimal (sometimes slower depending on the
// amount of data being copied).
//
// Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them.
// --air
// Linux Conversion note:
// This code would benefit nicely from having inline-able GAS syntax, since it should
// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
// And it's called enough times to probably merit the extra effort to ensure proper
// optimization. --air
__asm__ __volatile__
(
".intel_syntax noprefix\n"
"sub %[qwc], 1\n" // dec the counter to ease the count of 16bytes block later (optimization)
// Note after this line, real value of the counter is %[qwc] + 1
"jle memcpy_qwc_1_%=\n" // only one 16 byte block to copy? Or nothing.
"cmp %[qwc], 127\n" // "IN_CACHE_COPY/16"
"jb memcpy_qwc_loop1_%=\n" // small copies should be cached (definite speedup --air)
"memcpy_qwc_loop2_%=:\n" // 32-byte blocks, uncached copy
"prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)
"movq mm0,[%[src]+0]\n" // read 64 bits
"movq mm1,[%[src]+8]\n"
"movq mm2,[%[src]+16]\n"
"movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache
"movntq [%[dest]+8], mm1\n"
"movq mm3,[%[src]+24]\n"
"movntq [%[dest]+16], mm2\n"
"movntq [%[dest]+24], mm3\n"
"add %[src],32\n" // update source pointer
"add %[dest],32\n" // update destination pointer
"sub %[qwc],2\n"
"jg memcpy_qwc_loop2_%=\n" // last 64-byte block?
"sfence\n" // flush the write buffer
"jmp memcpy_qwc_1_%=\n"
// 32-byte blocks, cached!
// This *is* important. Removing this and using exclusively non-temporal stores
// results in noticeable speed loss!
"memcpy_qwc_loop1_%=:\n"
"prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)
"movq mm0,[%[src]+0]\n" // read 64 bits
"movq mm1,[%[src]+8]\n"
"movq mm2,[%[src]+16]\n"
"movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache
"movq [%[dest]+8], mm1\n"
"movq mm3,[%[src]+24]\n"
"movq [%[dest]+16], mm2\n"
"movq [%[dest]+24], mm3\n"
"add %[src],32\n" // update source pointer
"add %[dest],32\n" // update destination pointer
"sub %[qwc],2\n"
"jg memcpy_qwc_loop2_%=\n" // last 64-byte block?
"memcpy_qwc_1_%=:\n"
"cmp %[qwc],0\n"
"jne memcpy_qwc_final_%=\n"
"movq mm0,[%[src]]\n"
"movq mm1,[%[src]+8]\n"
"movq [%[dest]], mm0\n"
"movq [%[dest]+8], mm1\n"
"memcpy_qwc_final_%=:\n"
"emms\n" // clean up the MMX state
".att_syntax\n"
: "=&r"(dest), "=&r"(src), "=&r"(qwc)
: [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc)
: "memory", "mm0", "mm1", "mm2", "mm3"
);
}
#endif
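The constraint list at the end of the block above is what ties the C variables to registers: the outputs are clobber-safe registers, the bracketed inputs are pinned to them via the matching digits, and the "memory" clobber stops GCC from caching values across the asm. A minimal example of the same mechanism, using the more compact "+" read-write form and a rep movsb body (hypothetical helper, x86 only, not taken from this file):

#include <cstddef>

static inline void copy_bytes_rep_movsb(void* dst, const void* src, size_t n)
{
    __asm__ __volatile__(
        "rep movsb"
        : "+D"(dst), "+S"(src), "+c"(n)   // edi/rdi = dest, esi/rsi = src, ecx/rcx = count
        :
        : "memory");                      // the asm writes memory GCC cannot see
}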