/******************************************************************************

 Copyright (c) 2001 Advanced Micro Devices, Inc.

 LIMITATION OF LIABILITY:  THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
 EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
 NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
 PARTICULAR PURPOSE.  IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
 DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
 BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
 INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
 OF SUCH DAMAGES.  BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
 OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
 NOT APPLY TO YOU.

 AMD does not assume any responsibility for any errors which may appear in the
 Materials nor any responsibility to support or update the Materials.  AMD retains
 the right to make changes to its test specifications at any time, without notice.

 NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
 further information, software, technical information, know-how, or show-how
 available to you.

 So that all may benefit from your experience, please report any problems
 or suggestions about this software to 3dsdk.support@amd.com

 AMD Developer Technologies, M/S 585
 Advanced Micro Devices, Inc.
 5900 E. Ben White Blvd.
 Austin, TX 78741
 3dsdk.support@amd.com
******************************************************************************/

#include "../PrecompiledHeader.h"

#ifdef _MSC_VER
#pragma warning(disable:4414)
#endif

/*****************************************************************************
 MEMCPY_AMD.CPP
******************************************************************************/

// Very optimized memcpy() routine for AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!

#define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY 2 * 1024   // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization.  This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 4 * 1024   // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ.  This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations.  Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch.  The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
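
// Illustrative sketch (not part of the original code): taken together, the limits
// above imply a size-based dispatch along these lines.  The helper names
// (copy_movsd_tiny, copy_mmx_incache, copy_movntq_uncached, copy_block_prefetch)
// are hypothetical placeholders for the four methods described above.
//
//    static void memcpy_dispatch( void* dest, const void* src, size_t n )
//    {
//        if( n < TINY_BLOCK_COPY )       copy_movsd_tiny( dest, src, n );       // unrolled movsd
//        else if( n < IN_CACHE_COPY )    copy_mmx_incache( dest, src, n );      // movq + prefetchnta
//        else if( n < UNCACHED_COPY )    copy_movntq_uncached( dest, src, n );  // streaming stores
//        else                            copy_block_prefetch( dest, src, n );   // block prefetch + movntq
//    }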

// Inline assembly syntax for use with Visual C++

#if defined(_MSC_VER)

#ifdef PCSX2_DEBUG
extern u8 g_globalMMXSaved;
#endif

PCSX2_ALIGNED16( static u8 _xmm_backup[16*2] );
PCSX2_ALIGNED16( static u8 _mmx_backup[8*4] );

static __declspec(naked) void __fastcall _memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
{
    // MOVSRC = opcode used to read.  I use the same code for the aligned version, with a different define :)
    #define MOVSRC movdqu
    #define MOVDST movdqa

    __asm
    {
        //Reads before writes, to avoid stalls
        mov eax,[esp+4];
        //Make sure to save xmm0, it must be preserved ...
        movaps [_xmm_backup],xmm0;

        //if >=128 bytes use 128 byte unrolled loop
        //i use cmp ..,127 + jna because 127 is encodable using the simm8 form
        cmp eax,127;
        jna _loop_1;

        //since this is a common branch target it could be good to align it -- no idea if it has any effect :p
        align 16

        //128 byte unrolled loop
_loop_8:

        MOVSRC xmm0,[edx+0x00];     //read first to avoid read-after-write stalls
        MOVDST [ecx+0x00],xmm0;     //then write :p
        MOVSRC xmm0,[edx+0x10];
        MOVDST [ecx+0x10],xmm0;
        sub edx,-128;               //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
        sub ecx,-128;               //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding

        MOVSRC xmm0,[edx+0x20-128];
        MOVDST [ecx+0x20-128],xmm0;
        MOVSRC xmm0,[edx+0x30-128];
        MOVDST [ecx+0x30-128],xmm0;
        add eax,-128;               //eax won't be used for a while, so update it here. add/-128 for simm8 encoding

        MOVSRC xmm0,[edx+0x40-128];
        MOVDST [ecx+0x40-128],xmm0;
        MOVSRC xmm0,[edx+0x50-128];
        MOVDST [ecx+0x50-128],xmm0;

        MOVSRC xmm0,[edx+0x60-128];
        MOVDST [ecx+0x60-128],xmm0;
        MOVSRC xmm0,[edx+0x70-128];
        MOVDST [ecx+0x70-128],xmm0;

        //127~ja, 127 is encodable as simm8 :)
        cmp eax,127;
        ja _loop_8;

        //direct copy for 0~7 16-byte blocks
        //in order to avoid the inc/dec of all 3 registers
        //i use negative relative addressing from the top of the buffers
        //[top-current index]

_loop_1:
        //prepare the regs for 'negative relative addressing'
        add edx,eax;
        add ecx,eax;
        neg eax;
        jz cleanup;     //exit if nothing to do

_loop_1_inner:
        MOVSRC xmm0,[edx+eax];
        MOVDST [ecx+eax],xmm0;

        add eax,16;     //while the offset is still negative we have data to copy
        js _loop_1_inner;

        //done !
cleanup:
        //restore xmm and exit ~)
        movaps xmm0,[_xmm_backup];
        ret 4;
    }
    #undef MOVSRC
    #undef MOVDST
}

static __declspec(naked) void __fastcall _memcpy_raz_udst(void *dest, const void *src, size_t bytes)
{
    // MOVDST = opcode used to write.  I use the same code for the aligned version, with a different define :)
    #define MOVSRC movaps
    #define MOVDST movups
    __asm
    {
        //Reads before writes, to avoid stalls
        mov eax,[esp+4];
        //Make sure to save xmm0, it must be preserved ...
        movaps [_xmm_backup],xmm0;

        //if >=128 bytes use 128 byte unrolled loop
        //i use cmp ..,127 + jna because 127 is encodable using the simm8 form
        cmp eax,127;
        jna _loop_1;

        //since this is a common branch target it could be good to align it -- no idea if it has any effect :p
        align 16

        //128 byte unrolled loop
_loop_8:

        MOVSRC xmm0,[edx+0x00];     //read first to avoid read-after-write stalls
        MOVDST [ecx+0x00],xmm0;     //then write :p
        MOVSRC xmm0,[edx+0x10];
        MOVDST [ecx+0x10],xmm0;
        sub edx,-128;               //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
        sub ecx,-128;               //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding

        MOVSRC xmm0,[edx+0x20-128];
        MOVDST [ecx+0x20-128],xmm0;
        MOVSRC xmm0,[edx+0x30-128];
        MOVDST [ecx+0x30-128],xmm0;
        add eax,-128;               //eax won't be used for a while, so update it here. add/-128 for simm8 encoding

        MOVSRC xmm0,[edx+0x40-128];
        MOVDST [ecx+0x40-128],xmm0;
        MOVSRC xmm0,[edx+0x50-128];
        MOVDST [ecx+0x50-128],xmm0;

        MOVSRC xmm0,[edx+0x60-128];
        MOVDST [ecx+0x60-128],xmm0;
        MOVSRC xmm0,[edx+0x70-128];
        MOVDST [ecx+0x70-128],xmm0;

        //127~ja, 127 is encodable as simm8 :)
        cmp eax,127;
        ja _loop_8;

        //direct copy for 0~7 16-byte blocks
        //in order to avoid the inc/dec of all 3 registers
        //i use negative relative addressing from the top of the buffers
        //[top-current index]

_loop_1:
        //prepare the regs for 'negative relative addressing'
        add edx,eax;
        add ecx,eax;
        neg eax;
        jz cleanup;     //exit if nothing to do

_loop_1_inner:
        MOVSRC xmm0,[edx+eax];
        MOVDST [ecx+eax],xmm0;  //unaligned store here too -- the destination isn't known to be 16-byte aligned

        add eax,16;     //while the offset is still negative we have data to copy
        js _loop_1_inner;

        //done !
cleanup:
        //restore xmm and exit ~)
        movaps xmm0,[_xmm_backup];
        ret 4;
    }
    #undef MOVSRC
    #undef MOVDST
}

// Custom memcpy, only for 16 byte aligned stuff (used for mtgs)
// This function is optimized for medium-small transfer sizes (<2048, >=128).  No prefetching is
// used since the reads are linear and the cache logic can predict em :)
// *OBSOLETE* -- memcpy_amd_ has been optimized and is now faster.
__declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes)
{
    // Code Implementation Notes:
    // Uses a forward copy, in 128 byte blocks, and then does the remaining in 16 byte blocks :)

    // MOVSRC = opcode used to read.  I use the same code for the unaligned version, with a different define :)
    #define MOVSRC movaps
    #define MOVDST movaps
    __asm
    {
        //Reads before writes, to avoid stalls
        mov eax,[esp+4];
        //Make sure to save xmm0, it must be preserved ...
        movaps [_xmm_backup],xmm0;

        //if >=128 bytes use 128 byte unrolled loop
        //i use cmp ..,127 + jna because 127 is encodable using the simm8 form
        cmp eax,127;
        jna _loop_1;

        //since this is a common branch target it could be good to align it -- no idea if it has any effect :p
        align 16

        //128 byte unrolled loop
_loop_8:

        MOVSRC xmm0,[edx+0x00];     //read first to avoid read-after-write stalls
        MOVDST [ecx+0x00],xmm0;     //then write :p
        MOVSRC xmm0,[edx+0x10];
        MOVDST [ecx+0x10],xmm0;
        sub edx,-128;               //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
        sub ecx,-128;               //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding

        MOVSRC xmm0,[edx+0x20-128];
        MOVDST [ecx+0x20-128],xmm0;
        MOVSRC xmm0,[edx+0x30-128];
        MOVDST [ecx+0x30-128],xmm0;
        add eax,-128;               //eax won't be used for a while, so update it here. add/-128 for simm8 encoding

        MOVSRC xmm0,[edx+0x40-128];
        MOVDST [ecx+0x40-128],xmm0;
        MOVSRC xmm0,[edx+0x50-128];
        MOVDST [ecx+0x50-128],xmm0;

        MOVSRC xmm0,[edx+0x60-128];
        MOVDST [ecx+0x60-128],xmm0;
        MOVSRC xmm0,[edx+0x70-128];
        MOVDST [ecx+0x70-128],xmm0;

        //127~ja, 127 is encodable as simm8 :)
        cmp eax,127;
        ja _loop_8;

        //direct copy for 0~7 16-byte blocks
        //in order to avoid the inc/dec of all 3 registers
        //i use negative relative addressing from the top of the buffers
        //[top-current index]

_loop_1:
        //prepare the regs for 'negative relative addressing'
        add edx,eax;
        add ecx,eax;
        neg eax;
        jz cleanup;     //exit if nothing to do

_loop_1_inner:
        MOVSRC xmm0,[edx+eax];
        MOVDST [ecx+eax],xmm0;

        add eax,16;     //while the offset is still negative we have data to copy
        js _loop_1_inner;

        //done !
cleanup:
        //restore xmm and exit ~)
        movaps xmm0,[_xmm_backup];
        ret 4;
    }
    #undef MOVSRC
    #undef MOVDST
}

// This memcpy routine is for use in situations where the source buffer's alignment is indeterminate.
__forceinline void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
{
    if( ((uptr)src & 0xf) == 0 )
        memcpy_raz_( dest, src, bytes );
    else
        _memcpy_raz_usrc( dest, src, bytes );
}

// This memcpy routine is for use in situations where the destination buffer's alignment is indeterminate.
__forceinline void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes)
{
    if( ((uptr)dest & 0xf) == 0 )
        memcpy_raz_( dest, src, bytes );
    else
        _memcpy_raz_udst( dest, src, bytes );
}
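
// Usage sketch (illustrative, not from the original source): the wrappers above
// test the "unknown" side at run time and fall back to the fully aligned copy
// whenever possible.  The copy proceeds in 16-byte units, and the buffer names
// below are hypothetical.
//
//    PCSX2_ALIGNED16( u8 localbuf[256] );
//    memcpy_raz_usrc( localbuf, some_src_ptr, 256 );    // source alignment unknown
//    memcpy_raz_udst( some_dst_ptr, localbuf, 256 );    // destination alignment unknown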

//////////////////////////////////////////////////////////////////////////
// Fast memcpy as coded by AMD, and then improved by air.
//
// This routine preserves mmx registers!  It's the complete real deal!
__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
{
    __asm
    {
        push    edi
        push    esi

        mov     edi, ecx                ; destination
        mov     esi, edx                ; source
        mov     ecx, [esp+12]           ; number of bytes to copy
        mov     eax, ecx                ; keep a copy of count

        cld
        cmp     eax, TINY_BLOCK_COPY
        jb      $memcpy_ic_3            ; tiny? skip mmx copy

        cmp     eax, 32*1024            ; don't align between 32k-64k because
        jbe     $memcpy_do_align        ; it appears to be slower
        cmp     eax, 64*1024
        jbe     $memcpy_align_done
$memcpy_do_align:
        mov     eax, 8                  ; a trick that's faster than rep movsb...
        sub     eax, edi                ; align destination to qword
        and     eax, 111b               ; get the low bits
        sub     ecx, eax                ; update copy count
        neg     eax                     ; set up to jump into the array
        add     eax, offset $memcpy_align_done
        jmp     eax                     ; jump to array of movsb's

align 4
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb

$memcpy_align_done:                     ; destination is dword aligned
        mov     eax, ecx                ; number of bytes left to copy
        shr     eax, 6                  ; get 64-byte block count
        jz      $memcpy_ic_2            ; finish the last few bytes

        mov     edx, offset _mmx_backup ; will probably need this to save/restore mmx
        cmp     eax, IN_CACHE_COPY/64   ; too big 4 cache? use uncached copy
        jae     $memcpy_uc_test

        movq    [edx+0x00],mm0
        movq    [edx+0x08],mm1
        movq    [edx+0x10],mm2
        movq    [edx+0x18],mm3

    // This is small block copy that uses the MMX registers to copy 8 bytes
    // at a time.  It uses the "unrolled loop" optimization, and also uses
    // the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1:                           ; 64-byte block copies, in-cache copy

        prefetchnta [esi + (200*64/34+192)] ; start reading ahead

        movq    mm0, [esi+0]            ; read 64 bits
        movq    mm1, [esi+8]
        movq    [edi+0], mm0            ; write 64 bits
        movq    [edi+8], mm1            ;   note:  the normal movq writes the
        movq    mm2, [esi+16]           ;   data to cache; a cache line will be
        movq    mm3, [esi+24]           ;   allocated as needed, to store the data
        movq    [edi+16], mm2
        movq    [edi+24], mm3
        movq    mm0, [esi+32]
        movq    mm1, [esi+40]
        movq    [edi+32], mm0
        movq    [edi+40], mm1
        movq    mm2, [esi+48]
        movq    mm3, [esi+56]
        movq    [edi+48], mm2
        movq    [edi+56], mm3

        add     esi, 64                 ; update source pointer
        add     edi, 64                 ; update destination pointer
        dec     eax                     ; count down
        jnz     $memcpy_ic_1            ; last 64-byte block?

        movq    mm0,[edx+0x00]
        movq    mm1,[edx+0x08]
        movq    mm2,[edx+0x10]
        movq    mm3,[edx+0x18]

$memcpy_ic_2:
        mov     eax, ecx                ; has valid low 6 bits of the byte count
$memcpy_ic_3:
        shr     eax, 2                  ; dword count
        and     eax, 1111b              ; only look at the "remainder" bits
        neg     eax                     ; set up to jump into the array
        add     eax, offset $memcpy_last_few
        jmp     eax                     ; jump to array of movsd's

$memcpy_uc_test:
        or      eax, eax                ; tail end of block prefetch will jump here
        jz      $memcpy_ic_2            ; no more 64-byte blocks left

    // For larger blocks, which will spill beyond the cache, it's faster to
    // use the Streaming Store instruction MOVNTQ.  This write instruction
    // bypasses the cache and writes straight to main memory.  This code also
    // uses the software prefetch instruction to pre-read the data.

        movq    [edx+0x00],mm0
        movq    [edx+0x08],mm1
        movq    [edx+0x10],mm2

align 16
$memcpy_uc_1:                           ; 64-byte blocks, uncached copy

        prefetchnta [esi + (200*64/34+192)] ; start reading ahead

        movq    mm0,[esi+0]             ; read 64 bits
        add     edi,64                  ; update destination pointer
        movq    mm1,[esi+8]
        add     esi,64                  ; update source pointer
        movq    mm2,[esi-48]
        movntq  [edi-64], mm0           ; write 64 bits, bypassing the cache
        movq    mm0,[esi-40]            ;    note: movntq also prevents the CPU
        movntq  [edi-56], mm1           ;    from READING the destination address
        movq    mm1,[esi-32]            ;    into the cache, only to be over-written
        movntq  [edi-48], mm2           ;    so that also helps performance
        movq    mm2,[esi-24]
        movntq  [edi-40], mm0
        movq    mm0,[esi-16]
        movntq  [edi-32], mm1
        movq    mm1,[esi-8]
        movntq  [edi-24], mm2
        movntq  [edi-16], mm0
        dec     eax
        movntq  [edi-8], mm1
        jnz     $memcpy_uc_1            ; last 64-byte block?

        movq    mm0,[edx+0x00]
        movq    mm1,[edx+0x08]
        movq    mm2,[edx+0x10]

        jmp     $memcpy_ic_2            ; almost done (not needed because large copy below was removed)

    // For the largest size blocks, a special technique called Block Prefetch
    // can be used to accelerate the read operations.  Block Prefetch reads
    // one address per cache line, for a series of cache lines, in a short loop.
    // This is faster than using software prefetch.  The technique is great for
    // getting maximum read bandwidth, especially in DDR memory systems.

    // Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
    // help keep the code cache footprint of memcpy_fast to a minimum.
/*
$memcpy_bp_1:                           ; large blocks, block prefetch copy

        cmp     ecx, CACHEBLOCK         ; big enough to run another prefetch loop?
        jl      $memcpy_64_test         ; no, back to regular uncached copy

        mov     eax, CACHEBLOCK / 2     ; block prefetch loop, unrolled 2X
        add     esi, CACHEBLOCK * 64    ; move to the top of the block
align 16
$memcpy_bp_2:
        mov     edx, [esi-64]           ; grab one address per cache line
        mov     edx, [esi-128]          ; grab one address per cache line
        sub     esi, 128                ; go reverse order to suppress HW prefetcher
        dec     eax                     ; count down the cache lines
        jnz     $memcpy_bp_2            ; keep grabbing more lines into cache

        mov     eax, CACHEBLOCK         ; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
        movq    mm0, [esi   ]           ; read 64 bits
        movq    mm1, [esi+ 8]
        movq    mm2, [esi+16]
        movq    mm3, [esi+24]
        movq    mm4, [esi+32]
        movq    mm5, [esi+40]
        movq    mm6, [esi+48]
        movq    mm7, [esi+56]
        add     esi, 64                 ; update source pointer
        movntq  [edi   ], mm0           ; write 64 bits, bypassing cache
        movntq  [edi+ 8], mm1           ;    note: movntq also prevents the CPU
        movntq  [edi+16], mm2           ;    from READING the destination address
        movntq  [edi+24], mm3           ;    into the cache, only to be over-written,
        movntq  [edi+32], mm4           ;    so that also helps performance
        movntq  [edi+40], mm5
        movntq  [edi+48], mm6
        movntq  [edi+56], mm7
        add     edi, 64                 ; update dest pointer

        dec     eax                     ; count down

        jnz     $memcpy_bp_3            ; keep copying
        sub     ecx, CACHEBLOCK         ; update the 64-byte block count
        jmp     $memcpy_bp_1            ; keep processing chunks
*/

    // The smallest copy uses the X86 "movsd" instruction, in an optimized
    // form which is an "unrolled loop".  Then it handles the last few bytes.
align 16
        movsd
        movsd                           ; perform last 1-15 dword copies
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd                           ; perform last 1-7 dword copies
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd

$memcpy_last_few:                       ; dword aligned from before movsd's
        and     ecx, 11b                ; the last few cows must come home
        jz      $memcpy_final           ; no more, let's leave
        rep     movsb                   ; the last 1, 2, or 3 bytes

$memcpy_final:
        emms                            ; clean up the MMX state
        sfence                          ; flush the write buffer
        //mov   eax, [dest]             ; ret value = destination pointer

        pop     esi
        pop     edi

        ret 4
    }
}
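
// Usage sketch (illustrative, not from the original source): memcpy_amd_ uses the
// same __fastcall signature as the raz variants, so a call site looks just like a
// plain memcpy.  The buffers below are hypothetical.
//
//    u8 dst[4096], src[4096];
//    memcpy_amd_( dst, src, sizeof(dst) );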

// mmx mem-compare implementation, size has to be a multiple of 8
// returns 0 if equal, nonzero value if not equal
// ~10 times faster than standard memcmp
// (zerofrog)
u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
{
    assert( (cmpsize&7) == 0 );

    __asm {
        push esi
        mov ecx, cmpsize
        mov edx, src1
        mov esi, src2

        cmp ecx, 32
        jl Done4

        // custom test first 8 to make sure things are ok
        movq mm0, [esi]
        movq mm1, [esi+8]
        pcmpeqd mm0, [edx]
        pcmpeqd mm1, [edx+8]
        pand mm0, mm1
        movq mm2, [esi+16]
        pmovmskb eax, mm0
        movq mm3, [esi+24]

        // check if eq
        cmp eax, 0xff
        je NextComp
        mov eax, 1
        jmp End

NextComp:
        pcmpeqd mm2, [edx+16]
        pcmpeqd mm3, [edx+24]
        pand mm2, mm3
        pmovmskb eax, mm2

        sub ecx, 32
        add esi, 32
        add edx, 32

        // check if eq
        cmp eax, 0xff
        je ContinueTest
        mov eax, 1
        jmp End

        cmp ecx, 64
        jl Done8

Cmp8:
        movq mm0, [esi]
        movq mm1, [esi+8]
        movq mm2, [esi+16]
        movq mm3, [esi+24]
        movq mm4, [esi+32]
        movq mm5, [esi+40]
        movq mm6, [esi+48]
        movq mm7, [esi+56]
        pcmpeqd mm0, [edx]
        pcmpeqd mm1, [edx+8]
        pcmpeqd mm2, [edx+16]
        pcmpeqd mm3, [edx+24]
        pand mm0, mm1
        pcmpeqd mm4, [edx+32]
        pand mm0, mm2
        pcmpeqd mm5, [edx+40]
        pand mm0, mm3
        pcmpeqd mm6, [edx+48]
        pand mm0, mm4
        pcmpeqd mm7, [edx+56]
        pand mm0, mm5
        pand mm0, mm6
        pand mm0, mm7
        pmovmskb eax, mm0

        // check if eq
        cmp eax, 0xff
        je Continue
        mov eax, 1
        jmp End

Continue:
        sub ecx, 64
        add esi, 64
        add edx, 64
ContinueTest:
        cmp ecx, 64
        jge Cmp8

Done8:
        test ecx, 0x20
        jz Done4
        movq mm0, [esi]
        movq mm1, [esi+8]
        movq mm2, [esi+16]
        movq mm3, [esi+24]
        pcmpeqd mm0, [edx]
        pcmpeqd mm1, [edx+8]
        pcmpeqd mm2, [edx+16]
        pcmpeqd mm3, [edx+24]
        pand mm0, mm1
        pand mm0, mm2
        pand mm0, mm3
        pmovmskb eax, mm0
        sub ecx, 32
        add esi, 32
        add edx, 32

        // check if eq
        cmp eax, 0xff
        je Done4
        mov eax, 1
        jmp End

Done4:
        cmp ecx, 24
        jne Done2
        movq mm0, [esi]
        movq mm1, [esi+8]
        movq mm2, [esi+16]
        pcmpeqd mm0, [edx]
        pcmpeqd mm1, [edx+8]
        pcmpeqd mm2, [edx+16]
        pand mm0, mm1
        pand mm0, mm2
        pmovmskb eax, mm0

        // check if eq
        cmp eax, 0xff
        setne al
        jmp End

Done2:
        cmp ecx, 16
        jne Done1

        movq mm0, [esi]
        movq mm1, [esi+8]
        pcmpeqd mm0, [edx]
        pcmpeqd mm1, [edx+8]
        pand mm0, mm1
        pmovmskb eax, mm0

        // check if eq
        cmp eax, 0xff
        setne al
        jmp End

Done1:
        cmp ecx, 8
        jne Done

        mov eax, [esi]
        mov esi, [esi+4]
        cmp eax, [edx]
        je Next
        mov eax, 1
        jmp End

Next:
        cmp esi, [edx+4]
        setne al
        jmp End

Done:
        xor eax, eax

End:
        pop esi
        emms
    }
}
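
// Usage sketch (illustrative, not from the original source): memcmp_mmx only
// answers equal / not-equal, and the size must be a multiple of 8.  The buffers
// below are hypothetical.
//
//    u64 a[16], b[16];
//    if( memcmp_mmx( a, b, sizeof(a) ) == 0 )
//    {
//        // buffers hold identical contents
//    }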

// writes the 64-bit xor of all qwords in src1 to dst; cmpsize has to be a multiple of 8
void memxor_mmx(void* dst, const void* src1, int cmpsize)
{
    assert( (cmpsize&7) == 0 );

    __asm {
        mov ecx, cmpsize
        mov eax, src1
        mov edx, dst

        cmp ecx, 64
        jl Setup4

        movq mm0, [eax]
        movq mm1, [eax+8]
        movq mm2, [eax+16]
        movq mm3, [eax+24]
        movq mm4, [eax+32]
        movq mm5, [eax+40]
        movq mm6, [eax+48]
        movq mm7, [eax+56]
        sub ecx, 64
        add eax, 64
        cmp ecx, 64
        jl End8

Cmp8:
        pxor mm0, [eax]
        pxor mm1, [eax+8]
        pxor mm2, [eax+16]
        pxor mm3, [eax+24]
        pxor mm4, [eax+32]
        pxor mm5, [eax+40]
        pxor mm6, [eax+48]
        pxor mm7, [eax+56]

        sub ecx, 64
        add eax, 64
        cmp ecx, 64
        jge Cmp8

End8:
        pxor mm0, mm4
        pxor mm1, mm5
        pxor mm2, mm6
        pxor mm3, mm7

        cmp ecx, 32
        jl End4
        pxor mm0, [eax]
        pxor mm1, [eax+8]
        pxor mm2, [eax+16]
        pxor mm3, [eax+24]
        sub ecx, 32
        add eax, 32
        jmp End4

Setup4:
        cmp ecx, 32
        jl Setup2

        movq mm0, [eax]
        movq mm1, [eax+8]
        movq mm2, [eax+16]
        movq mm3, [eax+24]
        sub ecx, 32
        add eax, 32

End4:
        pxor mm0, mm2
        pxor mm1, mm3

        cmp ecx, 16
        jl End2
        pxor mm0, [eax]
        pxor mm1, [eax+8]
        sub ecx, 16
        add eax, 16
        jmp End2

Setup2:
        cmp ecx, 16
        jl Setup1

        movq mm0, [eax]
        movq mm1, [eax+8]
        sub ecx, 16
        add eax, 16

End2:
        pxor mm0, mm1

        cmp ecx, 8
        jl End1
        pxor mm0, [eax]
End1:
        movq [edx], mm0
        jmp End

Setup1:
        movq mm0, [eax]
        movq [edx], mm0
End:
        emms
    }
}
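
// Usage sketch (illustrative, not from the original source): memxor_mmx folds a
// buffer down to a single 8-byte xor value, handy as a cheap change-detection
// checksum.  The names below are hypothetical.
//
//    u64 block[32];
//    u64 xorsum;
//    memxor_mmx( &xorsum, block, sizeof(block) );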

#endif