mirror of https://github.com/PCSX2/pcsx2.git
common: remove old memcpy implementation
PCSX2 now uses the standard memcpy (thanks to xsacha)
parent 4d818f6cd9
commit 69e88ffed0
@@ -32,10 +32,6 @@ extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
  // Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
  void _memset16_unaligned( void* dest, u16 data, size_t size );

- // MemcpyVibes.cpp functions
- extern void memcpy_vibes(void * dest, const void * src, int size);
- extern void gen_memcpy_vibes();
-
  #define memcpy_fast memcpy
  #define memcpy_aligned(d,s,c) memcpy(d,s,c)
  #define memcpy_const memcpy
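With this change the project-specific copy wrappers compile straight down to the C runtime. A minimal illustration of what a call site now reduces to (the function and buffer names are made up for the example):

#include <string.h>

#define memcpy_fast memcpy                  // as defined in the header above
#define memcpy_aligned(d, s, c) memcpy(d, s, c)

// Hypothetical call site: both wrappers now expand to a plain memcpy call,
// so the compiler's / libc's optimized memcpy does all the work.
void copy_block(void* dst, const void* src, size_t bytes)
{
    memcpy_fast(dst, src, bytes);           // expands to memcpy(dst, src, bytes)
}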
@@ -128,7 +128,6 @@ set(UtilitiesSources
  wxAppWithHelpers.cpp
  wxGuiTools.cpp
  wxHelpers.cpp
- x86/MemcpyVibes.cpp
  )

  # variable with all headers of this library
@@ -31,290 +31,19 @@
  3dsdk.support@amd.com
  ******************************************************************************/

+ // GH: AMD memcpy was removed. The remaining part (memcmp_mmx) is likely from Zerofrog.
+ // Hopefully memcmp_mmx will be dropped in the future.

  #include "PrecompiledHeader.h"

  #ifdef _MSC_VER
  #pragma warning(disable:4414)
  #endif

- /*****************************************************************************
- MEMCPY_AMD.CPP
- ******************************************************************************/
-
- // NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
- // "Streaming Store"), and also uses the software prefetch instructions,
- // be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before
- // calling!
-
- #define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
- // The smallest copy uses the X86 "movsd" instruction, in an optimized
- // form which is an "unrolled loop".
-
- #define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch
- // Next is a copy that uses the MMX registers to copy 8 bytes at a time,
- // also using the "unrolled loop" optimization. This code uses
- // the software prefetch instruction to get the data into the cache.
-
- #define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
- // For larger blocks, which will spill beyond the cache, it's faster to
- // use the Streaming Store instruction MOVNTQ. This write instruction
- // bypasses the cache and writes straight to main memory. This code also
- // uses the software prefetch instruction to pre-read the data.
- // USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
-
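Those three thresholds encode a size-class strategy: a tiny unrolled copy, an in-cache MMX copy with software prefetch, and a streaming (movntq) copy for blocks that would spill the cache. A hedged C++ sketch of that selection logic follows; it is illustrative only (the helper name is invented) and uses SSE2 intrinsics rather than the removed MMX assembly:

#include <cstring>
#include <cstdint>
#include <emmintrin.h>   // SSE2: _mm_loadu_si128 / _mm_stream_si128

constexpr std::size_t kInCacheCopy = 2 * 1024;   // mirrors IN_CACHE_COPY above

// Illustrative dispatch: small/medium blocks go through the cache, large
// blocks use non-temporal stores so the copy does not evict useful data.
static void copy_dispatch_sketch(void* dest, const void* src, std::size_t n)
{
    if (n < kInCacheCopy) {
        std::memcpy(dest, src, n);               // tiny/medium: plain cached copy
        return;
    }

    // Large block: stream 16 bytes at a time past the cache. Note that
    // _mm_stream_si128 needs a 16-byte aligned destination; the removed
    // assembly used 8-byte movntq, which has no alignment requirement.
    auto*       d      = static_cast<__m128i*>(dest);
    const auto* s      = static_cast<const __m128i*>(src);
    std::size_t blocks = n / 16;
    for (std::size_t i = 0; i < blocks; ++i)
        _mm_stream_si128(d + i, _mm_loadu_si128(s + i));
    _mm_sfence();                                // order the streaming stores

    std::memcpy(static_cast<std::uint8_t*>(dest) + blocks * 16,
                static_cast<const std::uint8_t*>(src) + blocks * 16,
                n % 16);                         // copy the remaining tail normally
}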
  // Inline assembly syntax for use with Visual C++

  #if defined(_MSC_VER)

-
- // Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs.
- __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
- {
- __asm
- {
- push edi
- push esi
-
- mov edi, ecx ; destination
- mov esi, edx ; source
- mov ecx, [esp+12] ; number of bytes to copy
- mov eax, ecx ; keep a copy of count
-
- cld
- cmp eax, TINY_BLOCK_COPY
- jb $memcpy_ic_3 ; tiny? skip mmx copy
-
- cmp eax, 32*1024 ; dont align between 32k-64k because
- jbe $memcpy_do_align ; it appears to be slower
- cmp eax, 64*1024
- jbe $memcpy_align_done
-
- $memcpy_do_align:
- mov eax, 8 ; a trick that s faster than rep movsb...
- sub eax, edi ; align destination to qword
- and eax, 111b ; get the low bits
- sub ecx, eax ; update copy count
- neg eax ; set up to jump into the array
- add eax, offset $memcpy_align_done
- jmp eax ; jump to array of movsb s
-
- align 4
- movsb
- movsb
- movsb
- movsb
- movsb
- movsb
- movsb
- movsb
-
- $memcpy_align_done: ; destination is dword aligned
- mov eax, ecx ; number of bytes left to copy
- shr eax, 6 ; get 64-byte block count
- jz $memcpy_ic_2 ; finish the last few bytes
-
- cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
- jae $memcpy_uc_test
-
- // This is small block copy that uses the MMX registers to copy 8 bytes
- // at a time. It uses the "unrolled loop" optimization, and also uses
- // the software prefetch instruction to get the data into the cache.
- align 16
- $memcpy_ic_1: ; 64-byte block copies, in-cache copy
-
- prefetchnta [esi + (200*64/34+192)] ; start reading ahead
-
- movq mm0, [esi+0] ; read 64 bits
- movq mm1, [esi+8]
- movq [edi+0], mm0 ; write 64 bits
- movq [edi+8], mm1 ; note: the normal movq writes the
- movq mm2, [esi+16] ; data to cache; a cache line will be
- movq mm3, [esi+24] ; allocated as needed, to store the data
- movq [edi+16], mm2
- movq [edi+24], mm3
- movq mm0, [esi+32]
- movq mm1, [esi+40]
- movq [edi+32], mm0
- movq [edi+40], mm1
- movq mm2, [esi+48]
- movq mm3, [esi+56]
- movq [edi+48], mm2
- movq [edi+56], mm3
-
- add esi, 64 ; update source pointer
- add edi, 64 ; update destination pointer
- sub eax, 1
- jnz $memcpy_ic_1 ; last 64-byte block?
-
- $memcpy_ic_2:
- mov eax, ecx ; has valid low 6 bits of the byte count
- $memcpy_ic_3:
- shr eax, 2 ; dword count
- and eax, 1111b ; only look at the "remainder" bits
- neg eax ; set up to jump into the array
- add eax, offset $memcpy_last_few
- jmp eax ; jump to array of movsd s
-
- $memcpy_uc_test:
- or eax, eax ; tail end of block prefetch will jump here
- jz $memcpy_ic_2 ; no more 64-byte blocks left
-
- // For larger blocks, which will spill beyond the cache, it's faster to
- // use the Streaming Store instruction MOVNTQ. This write instruction
- // bypasses the cache and writes straight to main memory. This code also
- // uses the software prefetch instruction to pre-read the data.
-
- align 16
- $memcpy_uc_1: ; 64-byte blocks, uncached copy
-
- prefetchnta [esi + (200*64/34+192)] ; start reading ahead
-
- movq mm0,[esi+0] ; read 64 bits
- add edi,64 ; update destination pointer
- movq mm1,[esi+8]
- add esi,64 ; update source pointer
- movq mm2,[esi-48]
- movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
- movq mm0,[esi-40] ; note: movntq also prevents the CPU
- movntq [edi-56], mm1 ; from READING the destination address
- movq mm1,[esi-32] ; into the cache, only to be over-written
- movntq [edi-48], mm2 ; so that also helps performance
- movq mm2,[esi-24]
- movntq [edi-40], mm0
- movq mm0,[esi-16]
- movntq [edi-32], mm1
- movq mm1,[esi-8]
- movntq [edi-24], mm2
- movntq [edi-16], mm0
- movntq [edi-8], mm1
-
- sub eax, 1
- jnz $memcpy_uc_1 ; last 64-byte block?
-
- jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
-
- // Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been
- // disabled to help keep the code cache footprint of memcpy_fast to a minimum.
-
- // The smallest copy uses the X86 "movsd" instruction, in an optimized
- // form which is an "unrolled loop". Then it handles the last few bytes.
- align 16
- movsd
- movsd ; perform last 1-15 dword copies
- movsd
- movsd
- movsd
- movsd
- movsd
- movsd
- movsd
- movsd ; perform last 1-7 dword copies
- movsd
- movsd
- movsd
- movsd
- movsd
- movsd
-
- $memcpy_last_few: ; dword aligned from before movsd s
- and ecx, 11b ; the last few cows must come home
- jz $memcpy_final ; no more, let s leave
- rep movsb ; the last 1, 2, or 3 bytes
-
- $memcpy_final:
- pop esi
- pop edi
-
- emms ; clean up the MMX state
- sfence ; flush the write buffer
- //mov eax, [dest] ; ret value = destination pointer
-
- ret 4
- }
- }
-
- // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
- __fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
- {
- // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
- // registers will improve copy performance, because they won't. Use of XMMs is only
- // warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
- // and even then the benefits are typically minimal (sometimes slower depending on the
- // amount of data being copied).
- //
- // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them.
- // --air
-
- // Linux Conversion note:
- // This code would benefit nicely from having inline-able GAS syntax, since it should
- // allow GCC to optimize the first 3 instructions out of existence in many scenarios.
- // And its called enough times to probably merit the extra effort to ensure proper
- // optimization. --air
-
- __asm
- {
- mov ecx, dest
- mov edx, src
- mov eax, qwc ; keep a copy of count
- shr eax, 1
- jz $memcpy_qwc_1 ; only one 16 byte block to copy?
-
- cmp eax, IN_CACHE_COPY/32
- jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air)
-
- $memcpy_qwc_loop2: ; 32-byte blocks, uncached copy
- prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
-
- movq mm0,[edx+0] ; read 64 bits
- movq mm1,[edx+8]
- movq mm2,[edx+16]
- movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache
- movntq [ecx+8], mm1
- movq mm3,[edx+24]
- movntq [ecx+16], mm2
- movntq [ecx+24], mm3
-
- add edx,32 ; update source pointer
- add ecx,32 ; update destination pointer
- sub eax,1
- jnz $memcpy_qwc_loop2 ; last 64-byte block?
- sfence ; flush the write buffer
- jmp $memcpy_qwc_1
-
- ; 32-byte blocks, cached!
- ; This *is* important. Removing this and using exclusively non-temporal stores
- ; results in noticable speed loss!
-
- $memcpy_qwc_loop1:
- prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
-
- movq mm0,[edx+0] ; read 64 bits
- movq mm1,[edx+8]
- movq mm2,[edx+16]
- movq [ecx+0], mm0 ; write 64 bits, bypassing the cache
- movq [ecx+8], mm1
- movq mm3,[edx+24]
- movq [ecx+16], mm2
- movq [ecx+24], mm3
-
- add edx,32 ; update source pointer
- add ecx,32 ; update destination pointer
- sub eax,1
- jnz $memcpy_qwc_loop1 ; last 64-byte block?
-
- $memcpy_qwc_1:
- test qwc,1
- jz $memcpy_qwc_final
- movq mm0,[edx]
- movq mm1,[edx+8]
- movq [ecx], mm0
- movq [ecx+8], mm1
-
- $memcpy_qwc_final:
- emms ; clean up the MMX state
- }
- }
-
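For reference, the contract of memcpy_amd_qwc is simply "copy qwc 128-bit quadwords, no alignment required". A portable stand-in (illustrative only, not what the emulator now calls) is just:

#include <cstring>
#include <cstddef>

// Hypothetical portable equivalent of memcpy_amd_qwc: 'qwc' counts 16-byte
// quadwords; prefetch and streaming-store details are left to libc's memcpy.
inline void memcpy_qwc_portable(void* dest, const void* src, std::size_t qwc)
{
    std::memcpy(dest, src, qwc * 16);
}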
  // mmx mem-compare implementation, size has to be a multiple of 8
  // returns 0 is equal, nonzero value if not equal
  // ~10 times faster than standard memcmp
@@ -489,112 +218,4 @@ End:
  }
  }

-
- // returns the xor of all elements, cmpsize has to be mult of 8
- void memxor_mmx(void* dst, const void* src1, int cmpsize)
- {
- pxAssert( (cmpsize&7) == 0 );
-
- __asm {
- mov ecx, cmpsize
- mov eax, src1
- mov edx, dst
-
- cmp ecx, 64
- jl Setup4
-
- movq mm0, [eax]
- movq mm1, [eax+8]
- movq mm2, [eax+16]
- movq mm3, [eax+24]
- movq mm4, [eax+32]
- movq mm5, [eax+40]
- movq mm6, [eax+48]
- movq mm7, [eax+56]
- sub ecx, 64
- add eax, 64
- cmp ecx, 64
- jl End8
-
- Cmp8:
- pxor mm0, [eax]
- pxor mm1, [eax+8]
- pxor mm2, [eax+16]
- pxor mm3, [eax+24]
- pxor mm4, [eax+32]
- pxor mm5, [eax+40]
- pxor mm6, [eax+48]
- pxor mm7, [eax+56]
-
- sub ecx, 64
- add eax, 64
- cmp ecx, 64
- jge Cmp8
-
- End8:
- pxor mm0, mm4
- pxor mm1, mm5
- pxor mm2, mm6
- pxor mm3, mm7
-
- cmp ecx, 32
- jl End4
- pxor mm0, [eax]
- pxor mm1, [eax+8]
- pxor mm2, [eax+16]
- pxor mm3, [eax+24]
- sub ecx, 32
- add eax, 32
- jmp End4
-
- Setup4:
- cmp ecx, 32
- jl Setup2
-
- movq mm0, [eax]
- movq mm1, [eax+8]
- movq mm2, [eax+16]
- movq mm3, [eax+24]
- sub ecx, 32
- add eax, 32
-
- End4:
- pxor mm0, mm2
- pxor mm1, mm3
-
- cmp ecx, 16
- jl End2
- pxor mm0, [eax]
- pxor mm1, [eax+8]
- sub ecx, 16
- add eax, 16
- jmp End2
-
- Setup2:
- cmp ecx, 16
- jl Setup1
-
- movq mm0, [eax]
- movq mm1, [eax+8]
- sub ecx, 16
- add eax, 16
-
- End2:
- pxor mm0, mm1
-
- cmp ecx, 8
- jl End1
- pxor mm0, [eax]
- End1:
- movq [edx], mm0
- jmp End
-
- Setup1:
- movq mm0, [eax]
- movq [edx], mm0
- End:
- emms
- }
- }
-
  #endif
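memxor_mmx reduces the source buffer to a single 64-bit value by XOR-ing successive 8-byte elements and writes that value to dst. A plain C++ sketch of the same reduction (hypothetical helper, no MMX, assumes cmpsize is a non-zero multiple of 8):

#include <cstdint>
#include <cstring>
#include <cassert>

// XOR-fold 'cmpsize' bytes (a non-zero multiple of 8) into one 64-bit value at *dst.
// Hypothetical portable equivalent of memxor_mmx; not part of this diff.
void memxor_portable(void* dst, const void* src, int cmpsize)
{
    assert(cmpsize > 0 && (cmpsize & 7) == 0);
    std::uint64_t acc = 0;
    const auto* p = static_cast<const std::uint8_t*>(src);
    for (int i = 0; i < cmpsize; i += 8)
    {
        std::uint64_t v;
        std::memcpy(&v, p + i, 8);   // unaligned-safe 64-bit load
        acc ^= v;
    }
    std::memcpy(dst, &acc, 8);
}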
@@ -1,250 +0,0 @@
- /* PCSX2 - PS2 Emulator for PCs
-  * Copyright (C) 2002-2010 PCSX2 Dev Team
-  *
-  * PCSX2 is free software: you can redistribute it and/or modify it under the terms
-  * of the GNU Lesser General Public License as published by the Free Software Found-
-  * ation, either version 3 of the License, or (at your option) any later version.
-  *
-  * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
-  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
-  * PURPOSE. See the GNU General Public License for more details.
-  *
-  * You should have received a copy of the GNU General Public License along with PCSX2.
-  * If not, see <http://www.gnu.org/licenses/>.
-  */
-
- #include "PrecompiledHeader.h"
- #include "x86emitter/x86emitter.h"
- #include <xmmintrin.h>
-
- using namespace x86Emitter;
-
- // Max Number of qwc supported
- #define _maxSize 0x400
-
- typedef void (__fastcall *_memCpyCall)(void*, void*);
- __aligned16 _memCpyCall _memcpy_vibes[_maxSize+1];
-
- #if 1
-
- // this version uses SSE intrinsics to perform an inline copy. MSVC disasm shows pretty
- // decent code generation on whole, but it hasn't been benchmarked at all yet --air
- __fi void memcpy_vibes(void * dest, const void * src, int size) {
-
- float (*destxmm)[4] = (float(*)[4])dest, (*srcxmm)[4] = (float(*)[4])src;
- size_t count = size & ~15, extra = size & 15;
-
- destxmm -= 8 - extra, srcxmm -= 8 - extra;
- switch (extra) {
- do {
- destxmm += 16, srcxmm += 16, count -= 16;
- _mm_store_ps(&destxmm[-8][0], _mm_load_ps(&srcxmm[-8][0]));
- case 15:
- _mm_store_ps(&destxmm[-7][0], _mm_load_ps(&srcxmm[-7][0]));
- case 14:
- _mm_store_ps(&destxmm[-6][0], _mm_load_ps(&srcxmm[-6][0]));
- case 13:
- _mm_store_ps(&destxmm[-5][0], _mm_load_ps(&srcxmm[-5][0]));
- case 12:
- _mm_store_ps(&destxmm[-4][0], _mm_load_ps(&srcxmm[-4][0]));
- case 11:
- _mm_store_ps(&destxmm[-3][0], _mm_load_ps(&srcxmm[-3][0]));
- case 10:
- _mm_store_ps(&destxmm[-2][0], _mm_load_ps(&srcxmm[-2][0]));
- case 9:
- _mm_store_ps(&destxmm[-1][0], _mm_load_ps(&srcxmm[-1][0]));
- case 8:
- _mm_store_ps(&destxmm[ 0][0], _mm_load_ps(&srcxmm[ 0][0]));
- case 7:
- _mm_store_ps(&destxmm[ 1][0], _mm_load_ps(&srcxmm[ 1][0]));
- case 6:
- _mm_store_ps(&destxmm[ 2][0], _mm_load_ps(&srcxmm[ 2][0]));
- case 5:
- _mm_store_ps(&destxmm[ 3][0], _mm_load_ps(&srcxmm[ 3][0]));
- case 4:
- _mm_store_ps(&destxmm[ 4][0], _mm_load_ps(&srcxmm[ 4][0]));
- case 3:
- _mm_store_ps(&destxmm[ 5][0], _mm_load_ps(&srcxmm[ 5][0]));
- case 2:
- _mm_store_ps(&destxmm[ 6][0], _mm_load_ps(&srcxmm[ 6][0]));
- case 1:
- _mm_store_ps(&destxmm[ 7][0], _mm_load_ps(&srcxmm[ 7][0]));
- case 0: NULL;
- } while (count);
- }
- }
-
- #else
- #if 1
- // This version creates one function with a lot of movaps
- // It jumps to the correct movaps entry-point while adding
- // the proper offset for adjustment...
-
- static __pagealigned u8 _memCpyExec[__pagesize*16];
-
- void gen_memcpy_vibes() {
- HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
- memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
- xSetPtr(_memCpyExec);
-
- int off =-(((_maxSize & 0xf) - 7) << 4);
- for (int i = _maxSize, x = 0; i > 0; i--, x=(x+1)&7, off+=16) {
-
- _memcpy_vibes[i] = (_memCpyCall)xGetPtr();
-
- if (off >= 128) {
- off = -128;
- xADD(edx, 256);
- xADD(ecx, 256);
- }
- const xRegisterSSE xmm_t(x);
- xMOVAPS (xmm_t, ptr32[edx+off]);
- xMOVNTPS(ptr32[ecx+off], xmm_t);
- }
-
- _memcpy_vibes[0] = (_memCpyCall)xGetPtr();
-
- xRET();
- pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
-
- HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
- }
-
- __fi void memcpy_vibes(void * dest, const void * src, int size) {
- int offset = ((size & 0xf) - 7) << 4;
- _memcpy_vibes[size]((void*)((uptr)dest + offset), (void*)((uptr)src + offset));
- }
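Both emitter variants in this removed file hinge on the same idea: a table of generated entry points indexed by the quadword count, so the copy size is resolved with a single indirect call instead of a size loop or branch. A small C++ sketch of that dispatch pattern (the names and the template stand-in are illustrative; the real table pointed into code emitted at runtime):

#include <cstring>
#include <cstddef>

using CopyFn = void (*)(void* dest, const void* src);

constexpr int kMaxQwc = 0x400;              // mirrors _maxSize above
static CopyFn s_copyTable[kMaxQwc + 1];     // one entry point per quadword count

// Stand-in for the emitted code: each slot performs exactly N 16-byte moves.
template <int N>
static void copy_n_qwc(void* dest, const void* src)
{
    std::memcpy(dest, src, static_cast<std::size_t>(N) * 16);
}

static void init_table_example()
{
    s_copyTable[0] = &copy_n_qwc<0>;
    s_copyTable[1] = &copy_n_qwc<1>;
    s_copyTable[2] = &copy_n_qwc<2>;
    // the real generator fills every slot up to kMaxQwc with emitted movaps/movntps code
}

// Equivalent of memcpy_vibes' dispatch: no branching on size, just an indexed call.
static void copy_vibes_like(void* dest, const void* src, int sizeQwc)
{
    s_copyTable[sizeQwc](dest, src);
}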
-
- #else
-
- // This version creates '_maxSize' number of different functions,
- // and calls the appropriate one...
-
- static __pagealigned u8 _memCpyExec[__pagesize*_maxSize*2];
-
- void gen_memcpy_vibes() {
- HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
- memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
- xSetPtr(_memCpyExec);
-
- for (int i = 0; i < _maxSize+1; i++)
- {
- int off = 0;
- _memcpy_vibes[i] = (_memCpyCall)xGetAlignedCallTarget();
-
- for (int j = 0, x = 0; j < i; j++, x=(x+1)&7, off+=16) {
- if (off >= 128) {
- off = -128;
- xADD(edx, 256);
- xADD(ecx, 256);
- }
- const xRegisterSSE xmm_t(x);
- xMOVAPS(xmm_t, ptr32[edx+off]);
- xMOVAPS(ptr32[ecx+off], xmm_t);
- }
-
- xRET();
- pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
- }
-
- HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
- }
-
- __fi void memcpy_vibes(void * dest, const void * src, int size) {
- _memcpy_vibes[size](dest, src);
- }
-
- #endif
- #endif
-
- // Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment,
- // to get around compilation issues with having it in the headers.
- #ifdef __linux__
-
- // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
- // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
- __fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
- {
- // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
- // registers will improve copy performance, because they won't. Use of XMMs is only
- // warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
- // and even then the benefits are typically minimal (sometimes slower depending on the
- // amount of data being copied).
- //
- // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them.
- // --air
-
- // Linux Conversion note:
- // This code would benefit nicely from having inline-able GAS syntax, since it should
- // allow GCC to optimize the first 3 instructions out of existence in many scenarios.
- // And its called enough times to probably merit the extra effort to ensure proper
- // optimization. --air
-
- __asm__ __volatile__
- (
- ".intel_syntax noprefix\n"
- "sub %[qwc], 1\n" // dec the counter to ease the count of 16bytes block later (optimization)
- // Note after this line, real value of the counter is %[qwc] + 1
- "jle memcpy_qwc_1_%=\n" // only one 16 byte block to copy? Or nothing.
-
- "cmp %[qwc], 127\n" // "IN_CACHE_COPY/16"
- "jb memcpy_qwc_loop1_%=\n" // small copies should be cached (definite speedup --air)
-
- "memcpy_qwc_loop2_%=:\n" // 32-byte blocks, uncached copy
- "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)
-
- "movq mm0,[%[src]+0]\n" // read 64 bits
- "movq mm1,[%[src]+8]\n"
- "movq mm2,[%[src]+16]\n"
- "movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache
- "movntq [%[dest]+8], mm1\n"
- "movq mm3,[%[src]+24]\n"
- "movntq [%[dest]+16], mm2\n"
- "movntq [%[dest]+24], mm3\n"
-
- "add %[src],32\n" // update source pointer
- "add %[dest],32\n" // update destination pointer
- "sub %[qwc],2\n"
- "jg memcpy_qwc_loop2_%=\n" // last 64-byte block?
- "sfence\n" // flush the write buffer
- "jmp memcpy_qwc_1_%=\n"
-
- // 32-byte blocks, cached!
- // This *is* important. Removing this and using exclusively non-temporal stores
- // results in noticeable speed loss!
-
- "memcpy_qwc_loop1_%=:\n"
- "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air)
-
- "movq mm0,[%[src]+0]\n" // read 64 bits
- "movq mm1,[%[src]+8]\n"
- "movq mm2,[%[src]+16]\n"
- "movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache
- "movq [%[dest]+8], mm1\n"
- "movq mm3,[%[src]+24]\n"
- "movq [%[dest]+16], mm2\n"
- "movq [%[dest]+24], mm3\n"
-
- "add %[src],32\n" // update source pointer
- "add %[dest],32\n" // update destination pointer
- "sub %[qwc],2\n"
- "jg memcpy_qwc_loop2_%=\n" // last 64-byte block?
-
- "memcpy_qwc_1_%=:\n"
- "cmp %[qwc],0\n"
- "jne memcpy_qwc_final_%=\n"
- "movq mm0,[%[src]]\n"
- "movq mm1,[%[src]+8]\n"
- "movq [%[dest]], mm0\n"
- "movq [%[dest]+8], mm1\n"
-
- "memcpy_qwc_final_%=:\n"
- "emms\n" // clean up the MMX state
- ".att_syntax\n"
- : "=&r"(dest), "=&r"(src), "=&r"(qwc)
- : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc)
- : "memory", "mm0", "mm1", "mm2", "mm3"
- );
- }
- #endif
-