mirror of https://github.com/PCSX2/pcsx2.git
common: remove old memcpy implementation
PCSX2 now uses the standard memcpy (thanks to xsacha)
This commit is contained in:
parent
4d818f6cd9
commit
69e88ffed0
@ -32,10 +32,6 @@ extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
void _memset16_unaligned( void* dest, u16 data, size_t size );

// MemcpyVibes.cpp functions
extern void memcpy_vibes(void * dest, const void * src, int size);
extern void gen_memcpy_vibes();

#define memcpy_fast memcpy
#define memcpy_aligned(d,s,c) memcpy(d,s,c)
#define memcpy_const memcpy
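With the old routines gone, the aliases in the hunk above make every former fast-path call resolve to the standard library. A minimal sketch of what call sites now compile to (copy_example is an illustrative name, not code from the repository):

    #include <string.h>
    #include <stddef.h>

    #define memcpy_fast memcpy
    #define memcpy_aligned(d,s,c) memcpy(d,s,c)
    #define memcpy_const memcpy

    void copy_example(void* dst, const void* src, size_t bytes)
    {
        memcpy_fast(dst, src, bytes);    // expands to memcpy(dst, src, bytes)
        memcpy_aligned(dst, src, bytes); // likewise; alignment is no longer special-cased
    }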
@ -128,7 +128,6 @@ set(UtilitiesSources
	wxAppWithHelpers.cpp
	wxGuiTools.cpp
	wxHelpers.cpp
	x86/MemcpyVibes.cpp
	)

# variable with all headers of this library
@ -31,290 +31,19 @@
3dsdk.support@amd.com
******************************************************************************/

// GH: AMD memcpy was removed. The remaining part (memcmp_mmx) is likely from Zerofrog.
// Hopefully memcmp_mmx will be dropped in the future.

#include "PrecompiledHeader.h"

#ifdef _MSC_VER
#pragma warning(disable:4414)
#endif

/*****************************************************************************
MEMCPY_AMD.CPP
******************************************************************************/

// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before
// calling!

#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
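The three thresholds above amount to a size-based dispatch. A hedged C++ sketch of the selection logic (the enum and function names are illustrative; the real dispatch happens inside the assembly below):

    #include <cstddef>

    enum CopyStrategy { kTinyMovsd, kInCacheMovq, kUncachedMovntq };

    // Mirrors the comparisons in memcpy_amd_: below 64 bytes use the movsd tail,
    // below IN_CACHE_COPY use cached movq blocks, otherwise stream with movntq.
    static CopyStrategy pick_strategy(std::size_t bytes)
    {
        if (bytes < 64)       return kTinyMovsd;    // TINY_BLOCK_COPY
        if (bytes < 2 * 1024) return kInCacheMovq;  // IN_CACHE_COPY
        return kUncachedMovntq;
    }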
// Inline assembly syntax for use with Visual C++

#if defined(_MSC_VER)

// Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs.
__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
{
	__asm
	{
		push edi
		push esi

		mov edi, ecx ; destination
		mov esi, edx ; source
		mov ecx, [esp+12] ; number of bytes to copy
		mov eax, ecx ; keep a copy of count

		cld
		cmp eax, TINY_BLOCK_COPY
		jb $memcpy_ic_3 ; tiny? skip mmx copy

		cmp eax, 32*1024 ; don't align between 32k-64k because
		jbe $memcpy_do_align ; it appears to be slower
		cmp eax, 64*1024
		jbe $memcpy_align_done

	$memcpy_do_align:
		mov eax, 8 ; a trick that's faster than rep movsb...
		sub eax, edi ; align destination to qword
		and eax, 111b ; get the low bits
		sub ecx, eax ; update copy count
		neg eax ; set up to jump into the array
		add eax, offset $memcpy_align_done
		jmp eax ; jump to array of movsb's

	align 4
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb

	$memcpy_align_done: ; destination is dword aligned
		mov eax, ecx ; number of bytes left to copy
		shr eax, 6 ; get 64-byte block count
		jz $memcpy_ic_2 ; finish the last few bytes

		cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
		jae $memcpy_uc_test

	// This is small block copy that uses the MMX registers to copy 8 bytes
	// at a time. It uses the "unrolled loop" optimization, and also uses
	// the software prefetch instruction to get the data into the cache.
	align 16
	$memcpy_ic_1: ; 64-byte block copies, in-cache copy

		prefetchnta [esi + (200*64/34+192)] ; start reading ahead

		movq mm0, [esi+0] ; read 64 bits
		movq mm1, [esi+8]
		movq [edi+0], mm0 ; write 64 bits
		movq [edi+8], mm1 ; note: the normal movq writes the
		movq mm2, [esi+16] ; data to cache; a cache line will be
		movq mm3, [esi+24] ; allocated as needed, to store the data
		movq [edi+16], mm2
		movq [edi+24], mm3
		movq mm0, [esi+32]
		movq mm1, [esi+40]
		movq [edi+32], mm0
		movq [edi+40], mm1
		movq mm2, [esi+48]
		movq mm3, [esi+56]
		movq [edi+48], mm2
		movq [edi+56], mm3

		add esi, 64 ; update source pointer
		add edi, 64 ; update destination pointer
		sub eax, 1
		jnz $memcpy_ic_1 ; last 64-byte block?

	$memcpy_ic_2:
		mov eax, ecx ; has valid low 6 bits of the byte count
	$memcpy_ic_3:
		shr eax, 2 ; dword count
		and eax, 1111b ; only look at the "remainder" bits
		neg eax ; set up to jump into the array
		add eax, offset $memcpy_last_few
		jmp eax ; jump to array of movsd's

	$memcpy_uc_test:
		or eax, eax ; tail end of block prefetch will jump here
		jz $memcpy_ic_2 ; no more 64-byte blocks left

	// For larger blocks, which will spill beyond the cache, it's faster to
	// use the Streaming Store instruction MOVNTQ. This write instruction
	// bypasses the cache and writes straight to main memory. This code also
	// uses the software prefetch instruction to pre-read the data.

	align 16
	$memcpy_uc_1: ; 64-byte blocks, uncached copy

		prefetchnta [esi + (200*64/34+192)] ; start reading ahead

		movq mm0,[esi+0] ; read 64 bits
		add edi,64 ; update destination pointer
		movq mm1,[esi+8]
		add esi,64 ; update source pointer
		movq mm2,[esi-48]
		movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
		movq mm0,[esi-40] ; note: movntq also prevents the CPU
		movntq [edi-56], mm1 ; from READING the destination address
		movq mm1,[esi-32] ; into the cache, only to be over-written
		movntq [edi-48], mm2 ; so that also helps performance
		movq mm2,[esi-24]
		movntq [edi-40], mm0
		movq mm0,[esi-16]
		movntq [edi-32], mm1
		movq mm1,[esi-8]
		movntq [edi-24], mm2
		movntq [edi-16], mm0
		movntq [edi-8], mm1

		sub eax, 1
		jnz $memcpy_uc_1 ; last 64-byte block?

		jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)

	// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been
	// disabled to help keep the code cache footprint of memcpy_fast to a minimum.

	// The smallest copy uses the X86 "movsd" instruction, in an optimized
	// form which is an "unrolled loop". Then it handles the last few bytes.
	align 16
		movsd
		movsd ; perform last 1-15 dword copies
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd ; perform last 1-7 dword copies
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd

	$memcpy_last_few: ; dword aligned from before movsd's
		and ecx, 11b ; the last few cows must come home
		jz $memcpy_final ; no more, let's leave
		rep movsb ; the last 1, 2, or 3 bytes

	$memcpy_final:
		pop esi
		pop edi

		emms ; clean up the MMX state
		sfence ; flush the write buffer
		//mov eax, [dest] ; ret value = destination pointer

		ret 4
	}
}
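The comments above describe why movntq is used for large blocks: non-temporal stores skip the cache and avoid reading the destination line only to overwrite it. A hedged sketch of the same idea with SSE2 intrinsics, as a modern analogue of the movntq loop (the function name and the 16-byte-alignment assumption are mine, not from this file):

    #include <cstddef>
    #include <emmintrin.h>   // _mm_load_si128, _mm_stream_si128; _mm_sfence comes via xmmintrin.h

    // Streaming-copy sketch: assumes dest/src are 16-byte aligned and bytes is a
    // multiple of 16. Non-temporal stores bypass the cache, so an sfence is issued
    // afterwards, just as the original routine does.
    static void stream_copy(void* dest, const void* src, std::size_t bytes)
    {
        __m128i* d = static_cast<__m128i*>(dest);
        const __m128i* s = static_cast<const __m128i*>(src);
        for (std::size_t i = 0; i < bytes / 16; ++i)
            _mm_stream_si128(d + i, _mm_load_si128(s + i));
        _mm_sfence();
    }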

// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
__fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
	// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
	// registers will improve copy performance, because they won't. Use of XMMs is only
	// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
	// and even then the benefits are typically minimal (sometimes slower depending on the
	// amount of data being copied).
	//
	// Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them.
	// --air

	// Linux Conversion note:
	// This code would benefit nicely from having inline-able GAS syntax, since it should
	// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
	// And it's called enough times to probably merit the extra effort to ensure proper
	// optimization. --air

	__asm
	{
		mov ecx, dest
		mov edx, src
		mov eax, qwc ; keep a copy of count
		shr eax, 1
		jz $memcpy_qwc_1 ; only one 16 byte block to copy?

		cmp eax, IN_CACHE_COPY/32
		jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air)

	$memcpy_qwc_loop2: ; 32-byte blocks, uncached copy
		prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)

		movq mm0,[edx+0] ; read 64 bits
		movq mm1,[edx+8]
		movq mm2,[edx+16]
		movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache
		movntq [ecx+8], mm1
		movq mm3,[edx+24]
		movntq [ecx+16], mm2
		movntq [ecx+24], mm3

		add edx,32 ; update source pointer
		add ecx,32 ; update destination pointer
		sub eax,1
		jnz $memcpy_qwc_loop2 ; last 32-byte block?
		sfence ; flush the write buffer
		jmp $memcpy_qwc_1

	; 32-byte blocks, cached!
	; This *is* important. Removing this and using exclusively non-temporal stores
	; results in noticeable speed loss!

	$memcpy_qwc_loop1:
		prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)

		movq mm0,[edx+0] ; read 64 bits
		movq mm1,[edx+8]
		movq mm2,[edx+16]
		movq [ecx+0], mm0 ; write 64 bits (cached store)
		movq [ecx+8], mm1
		movq mm3,[edx+24]
		movq [ecx+16], mm2
		movq [ecx+24], mm3

		add edx,32 ; update source pointer
		add ecx,32 ; update destination pointer
		sub eax,1
		jnz $memcpy_qwc_loop1 ; last 32-byte block?

	$memcpy_qwc_1:
		test qwc,1
		jz $memcpy_qwc_final
		movq mm0,[edx]
		movq mm1,[edx+8]
		movq [ecx], mm0
		movq [ecx+8], mm1

	$memcpy_qwc_final:
		emms ; clean up the MMX state
	}
}
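memcpy_amd_qwc counts in 128-bit quadwords, so its contract is simply a copy of qwc * 16 bytes; with this commit that reduces to a standard-library call. A sketch of the equivalent (the portable name is illustrative):

    #include <cstring>
    #include <cstddef>

    // Same contract as memcpy_amd_qwc: qwc is a count of 16-byte quadwords.
    inline void memcpy_qwc(void* dest, const void* src, std::size_t qwc)
    {
        std::memcpy(dest, src, qwc * 16);
    }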

// mmx mem-compare implementation, size has to be a multiple of 8
// returns 0 if equal, nonzero value if not equal
// ~10 times faster than standard memcmp
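Note that memcmp_mmx keeps only the equality half of memcmp's contract: zero means equal, any nonzero value means different, and no ordering is reported. A hedged wrapper showing an equivalent use of the standard library for callers that only test equality:

    #include <cstring>

    // Equality-only comparison: 0 when equal, 1 when not. The MMX version
    // additionally requires size to be a multiple of 8.
    inline int memcmp_equal_only(const void* a, const void* b, std::size_t size)
    {
        return std::memcmp(a, b, size) != 0;
    }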
@ -489,112 +218,4 @@ End:
}
}

// returns the xor of all elements, cmpsize has to be mult of 8
void memxor_mmx(void* dst, const void* src1, int cmpsize)
{
	pxAssert( (cmpsize&7) == 0 );

	__asm {
		mov ecx, cmpsize
		mov eax, src1
		mov edx, dst

		cmp ecx, 64
		jl Setup4

		movq mm0, [eax]
		movq mm1, [eax+8]
		movq mm2, [eax+16]
		movq mm3, [eax+24]
		movq mm4, [eax+32]
		movq mm5, [eax+40]
		movq mm6, [eax+48]
		movq mm7, [eax+56]
		sub ecx, 64
		add eax, 64
		cmp ecx, 64
		jl End8

	Cmp8:
		pxor mm0, [eax]
		pxor mm1, [eax+8]
		pxor mm2, [eax+16]
		pxor mm3, [eax+24]
		pxor mm4, [eax+32]
		pxor mm5, [eax+40]
		pxor mm6, [eax+48]
		pxor mm7, [eax+56]

		sub ecx, 64
		add eax, 64
		cmp ecx, 64
		jge Cmp8

	End8:
		pxor mm0, mm4
		pxor mm1, mm5
		pxor mm2, mm6
		pxor mm3, mm7

		cmp ecx, 32
		jl End4
		pxor mm0, [eax]
		pxor mm1, [eax+8]
		pxor mm2, [eax+16]
		pxor mm3, [eax+24]
		sub ecx, 32
		add eax, 32
		jmp End4

	Setup4:
		cmp ecx, 32
		jl Setup2

		movq mm0, [eax]
		movq mm1, [eax+8]
		movq mm2, [eax+16]
		movq mm3, [eax+24]
		sub ecx, 32
		add eax, 32

	End4:
		pxor mm0, mm2
		pxor mm1, mm3

		cmp ecx, 16
		jl End2
		pxor mm0, [eax]
		pxor mm1, [eax+8]
		sub ecx, 16
		add eax, 16
		jmp End2

	Setup2:
		cmp ecx, 16
		jl Setup1

		movq mm0, [eax]
		movq mm1, [eax+8]
		sub ecx, 16
		add eax, 16

	End2:
		pxor mm0, mm1

		cmp ecx, 8
		jl End1
		pxor mm0, [eax]
	End1:
		movq [edx], mm0
		jmp End

	Setup1:
		movq mm0, [eax]
		movq [edx], mm0
	End:
		emms
	}
}

#endif
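memxor_mmx folds every 64-bit element of src1 together with XOR and writes the single 8-byte result to dst (cmpsize must be a multiple of 8). A portable sketch of that contract (the name and the memcpy-based unaligned loads are mine):

    #include <cstdint>
    #include <cstring>

    inline void memxor_portable(void* dst, const void* src1, int cmpsize)
    {
        uint64_t acc = 0;
        const unsigned char* p = static_cast<const unsigned char*>(src1);
        for (int i = 0; i < cmpsize; i += 8)
        {
            uint64_t v;
            std::memcpy(&v, p + i, 8);   // unaligned-safe 64-bit load
            acc ^= v;
        }
        std::memcpy(dst, &acc, 8);       // single 64-bit result, like movq [edx], mm0
    }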
@ -1,250 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
 * Copyright (C) 2002-2010 PCSX2 Dev Team
 *
 * PCSX2 is free software: you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Found-
 * ation, either version 3 of the License, or (at your option) any later version.
 *
 * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with PCSX2.
 * If not, see <http://www.gnu.org/licenses/>.
 */

#include "PrecompiledHeader.h"
#include "x86emitter/x86emitter.h"
#include <xmmintrin.h>

using namespace x86Emitter;

// Max Number of qwc supported
#define _maxSize 0x400

typedef void (__fastcall *_memCpyCall)(void*, void*);
__aligned16 _memCpyCall _memcpy_vibes[_maxSize+1];

#if 1

// this version uses SSE intrinsics to perform an inline copy. MSVC disasm shows pretty
// decent code generation on whole, but it hasn't been benchmarked at all yet --air
__fi void memcpy_vibes(void * dest, const void * src, int size) {

	float (*destxmm)[4] = (float(*)[4])dest, (*srcxmm)[4] = (float(*)[4])src;
	size_t count = size & ~15, extra = size & 15;

	destxmm -= 8 - extra, srcxmm -= 8 - extra;
	switch (extra) {
		do {
			destxmm += 16, srcxmm += 16, count -= 16;
			_mm_store_ps(&destxmm[-8][0], _mm_load_ps(&srcxmm[-8][0]));
		case 15:
			_mm_store_ps(&destxmm[-7][0], _mm_load_ps(&srcxmm[-7][0]));
		case 14:
			_mm_store_ps(&destxmm[-6][0], _mm_load_ps(&srcxmm[-6][0]));
		case 13:
			_mm_store_ps(&destxmm[-5][0], _mm_load_ps(&srcxmm[-5][0]));
		case 12:
			_mm_store_ps(&destxmm[-4][0], _mm_load_ps(&srcxmm[-4][0]));
		case 11:
			_mm_store_ps(&destxmm[-3][0], _mm_load_ps(&srcxmm[-3][0]));
		case 10:
			_mm_store_ps(&destxmm[-2][0], _mm_load_ps(&srcxmm[-2][0]));
		case 9:
			_mm_store_ps(&destxmm[-1][0], _mm_load_ps(&srcxmm[-1][0]));
		case 8:
			_mm_store_ps(&destxmm[ 0][0], _mm_load_ps(&srcxmm[ 0][0]));
		case 7:
			_mm_store_ps(&destxmm[ 1][0], _mm_load_ps(&srcxmm[ 1][0]));
		case 6:
			_mm_store_ps(&destxmm[ 2][0], _mm_load_ps(&srcxmm[ 2][0]));
		case 5:
			_mm_store_ps(&destxmm[ 3][0], _mm_load_ps(&srcxmm[ 3][0]));
		case 4:
			_mm_store_ps(&destxmm[ 4][0], _mm_load_ps(&srcxmm[ 4][0]));
		case 3:
			_mm_store_ps(&destxmm[ 5][0], _mm_load_ps(&srcxmm[ 5][0]));
		case 2:
			_mm_store_ps(&destxmm[ 6][0], _mm_load_ps(&srcxmm[ 6][0]));
		case 1:
			_mm_store_ps(&destxmm[ 7][0], _mm_load_ps(&srcxmm[ 7][0]));
		case 0: NULL;
		} while (count);
	}
}
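The switch/do-while above is a Duff's-device-style unrolled copy: size counts 16-byte blocks, extra selects the entry point into the unrolled body, and the pointer bias (destxmm -= 8 - extra) makes each case label address the right block. A plain, non-unrolled sketch of the same operation (the function name is mine; like the original it assumes 16-byte aligned pointers):

    #include <xmmintrin.h>

    inline void copy_blocks16(void* dest, const void* src, int blocks)
    {
        float (*d)[4] = (float(*)[4])dest;
        const float (*s)[4] = (const float(*)[4])src;
        for (int i = 0; i < blocks; ++i)
            _mm_store_ps(&d[i][0], _mm_load_ps(&s[i][0])); // one 16-byte block per iteration
    }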

#else
#if 1
// This version creates one function with a lot of movaps
// It jumps to the correct movaps entry-point while adding
// the proper offset for adjustment...

static __pagealigned u8 _memCpyExec[__pagesize*16];

void gen_memcpy_vibes() {
	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
	memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
	xSetPtr(_memCpyExec);

	int off = -(((_maxSize & 0xf) - 7) << 4);
	for (int i = _maxSize, x = 0; i > 0; i--, x=(x+1)&7, off+=16) {

		_memcpy_vibes[i] = (_memCpyCall)xGetPtr();

		if (off >= 128) {
			off = -128;
			xADD(edx, 256);
			xADD(ecx, 256);
		}
		const xRegisterSSE xmm_t(x);
		xMOVAPS (xmm_t, ptr32[edx+off]);
		xMOVNTPS(ptr32[ecx+off], xmm_t);
	}

	_memcpy_vibes[0] = (_memCpyCall)xGetPtr();

	xRET();
	pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));

	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}

__fi void memcpy_vibes(void * dest, const void * src, int size) {
	int offset = ((size & 0xf) - 7) << 4;
	_memcpy_vibes[size]((void*)((uptr)dest + offset), (void*)((uptr)src + offset));
}

#else

// This version creates '_maxSize' number of different functions,
// and calls the appropriate one...

static __pagealigned u8 _memCpyExec[__pagesize*_maxSize*2];

void gen_memcpy_vibes() {
	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
	memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
	xSetPtr(_memCpyExec);

	for (int i = 0; i < _maxSize+1; i++)
	{
		int off = 0;
		_memcpy_vibes[i] = (_memCpyCall)xGetAlignedCallTarget();

		for (int j = 0, x = 0; j < i; j++, x=(x+1)&7, off+=16) {
			if (off >= 128) {
				off = -128;
				xADD(edx, 256);
				xADD(ecx, 256);
			}
			const xRegisterSSE xmm_t(x);
			xMOVAPS(xmm_t, ptr32[edx+off]);
			xMOVAPS(ptr32[ecx+off], xmm_t);
		}

		xRET();
		pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
	}

	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}

__fi void memcpy_vibes(void * dest, const void * src, int size) {
	_memcpy_vibes[size](dest, src);
}

#endif
#endif
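Both generated variants above fill the same _memcpy_vibes table: one function pointer per supported block count, indexed by size at the call site. A conceptual sketch of that dispatch with ordinary functions (the names are illustrative; the real table is filled with emitter-generated code):

    #include <cstring>

    typedef void (*CopyFn)(void* dest, const void* src);

    static void copy_0_blocks(void*, const void*)     {}
    static void copy_1_block (void* d, const void* s) { std::memcpy(d, s, 16); }
    static void copy_2_blocks(void* d, const void* s) { std::memcpy(d, s, 32); }

    // One entry per supported block count, like _memcpy_vibes[_maxSize+1].
    static const CopyFn copy_table[] = { copy_0_blocks, copy_1_block, copy_2_blocks };

    inline void copy_dispatch(void* dest, const void* src, int blocks)
    {
        copy_table[blocks](dest, src); // same idea as _memcpy_vibes[size](dest, src)
    }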

// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment,
// to get around compilation issues with having it in the headers.
#ifdef __linux__

// This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
__fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
	// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
	// registers will improve copy performance, because they won't. Use of XMMs is only
	// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
	// and even then the benefits are typically minimal (sometimes slower depending on the
	// amount of data being copied).
	//
	// Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them.
	// --air

	// Linux Conversion note:
	// This code would benefit nicely from having inline-able GAS syntax, since it should
	// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
	// And it's called enough times to probably merit the extra effort to ensure proper
	// optimization. --air

	__asm__ __volatile__
	(
		".intel_syntax noprefix\n"
		"sub %[qwc], 1\n"             // dec the counter to ease the count of 16-byte blocks later (optimization)
		                              // Note: after this line, the real value of the counter is %[qwc] + 1
		"jle memcpy_qwc_1_%=\n"       // only one 16 byte block to copy? Or nothing.

		"cmp %[qwc], 127\n"           // "IN_CACHE_COPY/16"
		"jb memcpy_qwc_loop1_%=\n"    // small copies should be cached (definite speedup --air)

		"memcpy_qwc_loop2_%=:\n"      // 32-byte blocks, uncached copy
		"prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)

		"movq mm0,[%[src]+0]\n"       // read 64 bits
		"movq mm1,[%[src]+8]\n"
		"movq mm2,[%[src]+16]\n"
		"movntq [%[dest]+0], mm0\n"   // write 64 bits, bypassing the cache
		"movntq [%[dest]+8], mm1\n"
		"movq mm3,[%[src]+24]\n"
		"movntq [%[dest]+16], mm2\n"
		"movntq [%[dest]+24], mm3\n"

		"add %[src],32\n"             // update source pointer
		"add %[dest],32\n"            // update destination pointer
		"sub %[qwc],2\n"
		"jg memcpy_qwc_loop2_%=\n"    // last 32-byte block?
		"sfence\n"                    // flush the write buffer
		"jmp memcpy_qwc_1_%=\n"

		// 32-byte blocks, cached!
		// This *is* important. Removing this and using exclusively non-temporal stores
		// results in noticeable speed loss!

		"memcpy_qwc_loop1_%=:\n"
		"prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)

		"movq mm0,[%[src]+0]\n"       // read 64 bits
		"movq mm1,[%[src]+8]\n"
		"movq mm2,[%[src]+16]\n"
		"movq [%[dest]+0], mm0\n"     // write 64 bits (cached store)
		"movq [%[dest]+8], mm1\n"
		"movq mm3,[%[src]+24]\n"
		"movq [%[dest]+16], mm2\n"
		"movq [%[dest]+24], mm3\n"

		"add %[src],32\n"             // update source pointer
		"add %[dest],32\n"            // update destination pointer
		"sub %[qwc],2\n"
		"jg memcpy_qwc_loop2_%=\n"    // last 32-byte block?

		"memcpy_qwc_1_%=:\n"
		"cmp %[qwc],0\n"
		"jne memcpy_qwc_final_%=\n"
		"movq mm0,[%[src]]\n"
		"movq mm1,[%[src]+8]\n"
		"movq [%[dest]], mm0\n"
		"movq [%[dest]+8], mm1\n"

		"memcpy_qwc_final_%=:\n"
		"emms\n"                      // clean up the MMX state
		".att_syntax\n"
		: "=&r"(dest), "=&r"(src), "=&r"(qwc)
		: [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc)
		: "memory", "mm0", "mm1", "mm2", "mm3"
	);
}
#endif