mirror of https://github.com/PCSX2/pcsx2.git
90% of an implementation of memcpy_fast_ for Linux. And fix debug mode.
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@642 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
44d47ca891
commit
ad0705de56
4
build.sh
4
build.sh
|
@ -9,10 +9,10 @@
|
||||||
#export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --prefix `pwd`"
|
#export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --prefix `pwd`"
|
||||||
|
|
||||||
#Optimized, but a devbuild
|
#Optimized, but a devbuild
|
||||||
#export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --enable-devbuild --prefix `pwd`"
|
export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --enable-devbuild --prefix `pwd`"
|
||||||
|
|
||||||
#Debug / Devbuild version
|
#Debug / Devbuild version
|
||||||
export PCSX2OPTIONS="--enable-debug --enable-devbuild --enable-sse3 --prefix `pwd`"
|
#export PCSX2OPTIONS="--enable-debug --enable-devbuild --enable-sse3 --prefix `pwd`"
|
||||||
|
|
||||||
#ZeroGS Normal mode
|
#ZeroGS Normal mode
|
||||||
export ZEROGSOPTIONS="--enable-sse2"
|
export ZEROGSOPTIONS="--enable-sse2"
|
||||||
|
|
|
@ -19,8 +19,6 @@
|
||||||
#ifndef __MEMCPY_FAST_H__
|
#ifndef __MEMCPY_FAST_H__
|
||||||
#define __MEMCPY_FAST_H__
|
#define __MEMCPY_FAST_H__
|
||||||
|
|
||||||
//#include "Misc.h"
|
|
||||||
|
|
||||||
void _memset16_unaligned( void* dest, u16 data, size_t size );
|
void _memset16_unaligned( void* dest, u16 data, size_t size );
|
||||||
|
|
||||||
#if defined(_WIN32) && !defined(__x86_64__)
|
#if defined(_WIN32) && !defined(__x86_64__)
|
||||||
|
@ -33,6 +31,8 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
|
||||||
//extern void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes);
|
//extern void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes);
|
||||||
//extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes);
|
//extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes);
|
||||||
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
|
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
|
||||||
|
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
|
||||||
|
extern void memxor_mmx(void* dst, const void* src1, int cmpsize);
|
||||||
|
|
||||||
# include "windows/memzero.h"
|
# include "windows/memzero.h"
|
||||||
# define memcpy_fast memcpy_amd_
|
# define memcpy_fast memcpy_amd_
|
||||||
|
@ -41,20 +41,23 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
|
||||||
#else
|
#else
|
||||||
|
|
||||||
// for now linux uses the GCC memcpy/memset implementations.
|
// for now linux uses the GCC memcpy/memset implementations.
|
||||||
#define memcpy_fast memcpy
|
//#define memcpy_raz_udst memcpy
|
||||||
#define memcpy_raz_ memcpy
|
//#define memcpy_raz_usrc memcpy
|
||||||
#define memcpy_raz_u memcpy
|
//#define memcpy_raz_ memcpy
|
||||||
|
|
||||||
|
// fast_routines.S
|
||||||
|
extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
|
||||||
|
extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize);
|
||||||
|
|
||||||
#define memcpy_aligned memcpy
|
# include "Linux/memzero.h"
|
||||||
#define memcpy_raz_u memcpy
|
# define memcpy_fast memcpy
|
||||||
|
# define memcpy_aligned memcpy
|
||||||
|
|
||||||
#include "Linux/memzero.h"
|
// Currently broken.
|
||||||
|
//# define memcpy_fast memcpy_amd_
|
||||||
|
//# define memcpy_aligned memcpy_amd_
|
||||||
|
// extern "C" void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef __LINUX__
|
|
||||||
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
|
|
||||||
extern void memxor_mmx(void* dst, const void* src1, int cmpsize);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -40,6 +40,10 @@ void psxRecRecompile(u32 startpc);
|
||||||
|
|
||||||
// Linux specific
|
// Linux specific
|
||||||
#ifdef __LINUX__
|
#ifdef __LINUX__
|
||||||
|
|
||||||
|
PCSX2_ALIGNED16( u8 _xmm_backup[16*2] );
|
||||||
|
PCSX2_ALIGNED16( u8 _mmx_backup[8*4] );
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
{
|
{
|
||||||
|
|
||||||
|
@ -57,10 +61,6 @@ void psxDispatcherReg();
|
||||||
void Dispatcher();
|
void Dispatcher();
|
||||||
void DispatcherClear();
|
void DispatcherClear();
|
||||||
void DispatcherReg();
|
void DispatcherReg();
|
||||||
|
|
||||||
// fast_routines.S
|
|
||||||
u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
|
|
||||||
void memxor_mmx(void* dst, const void* src1, int cmpsize);
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
|
@ -11,7 +11,8 @@ ix86-32/iR5900Templates.cpp ix86-32/recVTLB.cpp
|
||||||
libx86recomp_a_SOURCES = \
|
libx86recomp_a_SOURCES = \
|
||||||
BaseblockEx.cpp iCOP0.cpp iCOP2.cpp iCore.cpp iFPU.cpp iGS.cpp iHw.cpp iIPU.cpp iMMI.cpp iPsxHw.cpp iPsxMem.cpp \
|
BaseblockEx.cpp iCOP0.cpp iCOP2.cpp iCore.cpp iFPU.cpp iGS.cpp iHw.cpp iIPU.cpp iMMI.cpp iPsxHw.cpp iPsxMem.cpp \
|
||||||
iR3000A.cpp iR3000Atables.cpp iR5900CoissuedLoadStore.cpp iR5900Misc.cpp iVU0micro.cpp iVU1micro.cpp iVUmicro.cpp \
|
iR3000A.cpp iR3000Atables.cpp iR5900CoissuedLoadStore.cpp iR5900Misc.cpp iVU0micro.cpp iVU1micro.cpp iVUmicro.cpp \
|
||||||
iVUmicroLower.cpp iVUmicroUpper.cpp iVUzerorec.cpp iVif.cpp ir5900tables.cpp fast_routines.S aR3000A.S aVUzerorec.S aVif.S $(archfiles)
|
iVUmicroLower.cpp iVUmicroUpper.cpp iVUzerorec.cpp iVif.cpp ir5900tables.cpp fast_routines.S aR3000A.S aVUzerorec.S \
|
||||||
|
aVif.S $(archfiles)
|
||||||
|
|
||||||
libx86recomp_a_SOURCES += \
|
libx86recomp_a_SOURCES += \
|
||||||
BaseblockEx.h iCOP0.h iCore.h iFPU.h iMMI.h iR3000A.h iR5900.h iR5900Arit.h iR5900AritImm.h iR5900Branch.h iR5900Jump.h \
|
BaseblockEx.h iCOP0.h iCore.h iFPU.h iMMI.h iR3000A.h iR5900.h iR5900Arit.h iR5900AritImm.h iR5900Branch.h iR5900Jump.h \
|
||||||
|
|
|
@ -15,12 +15,21 @@
|
||||||
* along with this program; if not, write to the Free Software
|
* along with this program; if not, write to the Free Software
|
||||||
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
// Size thresholds (in bytes) used by memcpy_amd_ to choose a copy strategy.
// Multi-token values are parenthesized so they expand safely inside larger
// expressions such as IN_CACHE_COPY/64.
#define TINY_BLOCK_COPY 64              // byte counts below this skip the MMX copy loops
#define IN_CACHE_COPY (2 * 1024)        // at or above this (in 64-byte blocks) the uncached movntq loop is used
#define UNCACHED_COPY (4 * 1024)        // only referenced by the disabled block-prefetch test
#define BLOCK_PREFETCH_COPY infinity    // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 0x80                 // was MASM-style "80h"; not referenced by the code below
|
||||||
|
|
||||||
// Fast assembly routines for x86-64
|
// Fast assembly routines for x86-64
|
||||||
// zerofrog(@gmail.com)
|
// zerofrog(@gmail.com)
|
||||||
|
// and added to by arcum42@gmail.com
|
||||||
.intel_syntax
|
.intel_syntax
|
||||||
.extern g_EEFreezeRegs
|
.extern g_EEFreezeRegs
|
||||||
.extern FreezeMMXRegs_
|
.extern FreezeMMXRegs_
|
||||||
|
.extern _mmx_backup
|
||||||
|
|
||||||
// mmx memcmp implementation, size has to be a multiple of 8
|
// mmx memcmp implementation, size has to be a multiple of 8
|
||||||
// returns 0 if equal, nonzero value if not equal
|
// returns 0 if equal, nonzero value if not equal
|
||||||
|
@ -208,9 +217,7 @@ memcmp_Done:
|
||||||
|
|
||||||
memcmp_End:
|
memcmp_End:
|
||||||
emms
|
emms
|
||||||
#ifndef __x86_64__
|
|
||||||
pop %esi
|
pop %esi
|
||||||
#endif
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
// memxor_mmx
|
// memxor_mmx
|
||||||
|
@ -329,3 +336,192 @@ memxor_End:
|
||||||
emms
|
emms
|
||||||
pop %esi
|
pop %esi
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
// void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
//
// MMX-based memory copy. As read by the code below:
//   %ecx        = dest
//   %edx        = src
//   [%esp+12]   = n (the single stack argument, addressed after the two pushes);
//                 it is popped by the callee via "ret 4".
// %edi and %esi are preserved; %eax and %ecx are clobbered and no value is
// returned. mm0-mm3 are spilled to and reloaded from _mmx_backup around the
// MMX loops.
//
// Strategy:
//   n < TINY_BLOCK_COPY             -> movsd/movsb tail only
//   blocks < IN_CACHE_COPY/64       -> cached movq loop   ($memcpy_ic_1)
//   otherwise                       -> movntq loop        ($memcpy_uc_1)
//
// The two computed jumps land inside arrays of single-byte string
// instructions: jumping to (label - k) executes exactly k of them.
.globl memcpy_amd_
memcpy_amd_:
	push %edi
	push %esi

	mov %edi, %ecx		// destination
	mov %esi, %edx		// source
	mov %ecx, [%esp+12]	// number of bytes to copy
	mov %eax, %ecx		// keep a copy of count

	cld
	cmp %eax, TINY_BLOCK_COPY
	jb $memcpy_ic_3		// tiny? skip mmx copy

	cmp %eax, 32*1024	// don't align between 32k-64k because
	jbe $memcpy_do_align	// it appears to be slower
	cmp %eax, 64*1024
	jbe $memcpy_align_done

$memcpy_do_align:
	mov %eax, 8		// a trick that's faster than rep movsb...
	sub %eax, %edi		// align destination to qword
	and %eax, 7		// get the low 3 bits (was 0x111b, which is hex, not binary 111)
	sub %ecx, %eax		// update copy count
	neg %eax		// set up to jump into the array
	add %eax, offset $memcpy_align_done
	jmp %eax		// jump to array of movsb's

	// Padding from .align is emitted before the array, so the distance from
	// each movsb to $memcpy_align_done is unaffected.
	.align 4
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb

$memcpy_align_done:		// destination is qword aligned
	mov %eax, %ecx		// number of bytes left to copy
	shr %eax, 6		// get 64-byte block count
	jz $memcpy_ic_2		// finish the last few bytes

	cmp %eax, IN_CACHE_COPY/64	// too big 4 cache? use uncached copy
	jae $memcpy_uc_test

	// save the MMX registers used by the in-cache loop
	movq [_mmx_backup+0x00],%mm0
	movq [_mmx_backup+0x08],%mm1
	movq [_mmx_backup+0x10],%mm2
	movq [_mmx_backup+0x18],%mm3

	// This is small block copy that uses the MMX registers to copy 8 bytes
	// at a time. It uses the "unrolled loop" optimization, and also uses
	// the software prefetch instruction to get the data into the cache.
	.align 16
$memcpy_ic_1:			// 64-byte block copies, in-cache copy

	prefetchnta [%esi + (200*64/34+192)]	// start reading ahead

	movq %mm0, [%esi+0]	// read 64 bits
	movq %mm1, [%esi+8]
	movq [%edi+0], %mm0	// write 64 bits
	movq [%edi+8], %mm1	// note: the normal movq writes the
	movq %mm2, [%esi+16]	// data to cache; a cache line will be
	movq %mm3, [%esi+24]	// allocated as needed, to store the data
	movq [%edi+16], %mm2
	movq [%edi+24], %mm3
	movq %mm0, [%esi+32]
	movq %mm1, [%esi+40]
	movq [%edi+32], %mm0
	movq [%edi+40], %mm1
	movq %mm2, [%esi+48]
	movq %mm3, [%esi+56]
	movq [%edi+48], %mm2
	movq [%edi+56], %mm3

	add %esi, 64		// update source pointer
	add %edi, 64		// update destination pointer
	dec %eax		// count down
	jnz $memcpy_ic_1	// last 64-byte block?

	// restore the MMX registers saved above
	movq %mm0,[_mmx_backup+0x00]
	movq %mm1,[_mmx_backup+0x08]
	movq %mm2,[_mmx_backup+0x10]
	movq %mm3,[_mmx_backup+0x18]

$memcpy_ic_2:
	mov %eax, %ecx		// has valid low 6 bits of the byte count
$memcpy_ic_3:
	shr %eax, 2		// dword count
	and %eax, 15		// only look at the "remainder" bits (was 0x1111b, which is hex)
	neg %eax		// set up to jump into the array
	add %eax, offset $memcpy_last_few
	jmp %eax		// jump to array of movsd's

$memcpy_uc_test:
	// cmp %ecx, UNCACHED_COPY/64	// big enough? use block prefetch copy
	// jae $memcpy_bp_1
//$memcpy_64_test:
	or %eax, %eax		// tail end of block prefetch will jump here
	jz $memcpy_ic_2		// no more 64-byte blocks left

	// For larger blocks, which will spill beyond the cache, it's faster to
	// use the Streaming Store instruction MOVNTQ. This write instruction
	// bypasses the cache and writes straight to main memory. This code also
	// uses the software prefetch instruction to pre-read the data.

	// only mm0-mm2 are used by the uncached loop
	movq [_mmx_backup+0x00],%mm0
	movq [_mmx_backup+0x08],%mm1
	movq [_mmx_backup+0x10],%mm2

	.align 16
$memcpy_uc_1:			// 64-byte blocks, uncached copy

	prefetchnta [%esi + (200*64/34+192)]	// start reading ahead

	movq %mm0,[%esi+0]	// read 64 bits
	add %edi,64		// update destination pointer
	movq %mm1,[%esi+8]
	add %esi,64		// update source pointer
	movq %mm2,[%esi-48]
	movntq [%edi-64], %mm0	// write 64 bits, bypassing the cache
	movq %mm0,[%esi-40]	// note: movntq also prevents the CPU
	movntq [%edi-56], %mm1	// from READING the destination address
	movq %mm1,[%esi-32]	// into the cache, only to be over-written
	movntq [%edi-48], %mm2	// so that also helps performance
	movq %mm2,[%esi-24]
	movntq [%edi-40], %mm0
	movq %mm0,[%esi-16]
	movntq [%edi-32], %mm1
	movq %mm1,[%esi-8]
	movntq [%edi-24], %mm2
	movntq [%edi-16], %mm0
	dec %eax
	movntq [%edi-8], %mm1
	jnz $memcpy_uc_1	// last 64-byte block?

	movq %mm0,[_mmx_backup+0x00]
	movq %mm1,[_mmx_backup+0x08]
	movq %mm2,[_mmx_backup+0x10]

	jmp $memcpy_ic_2	// almost done (required: the movsd array below must not be fallen into)

	// For the largest size blocks, a special technique called Block Prefetch
	// can be used to accelerate the read operations. Block Prefetch reads
	// one address per cache line, for a series of cache lines, in a short loop.
	// This is faster than using software prefetch. The technique is great for
	// getting maximum read bandwidth, especially in DDR memory systems.

	// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
	// help keep the code cache footprint of memcpy_fast to a minimum.
	// <Code removed here>

	// The smallest copy uses the X86 "movsd" instruction, in an optimized
	// form which is an "unrolled loop". Then it handles the last few bytes.
	.align 4
	movsd
	movsd			// perform last 1-15 dword copies
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd			// perform last 1-7 dword copies
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd

$memcpy_last_few:		// dword aligned from before movsd's
	// rep movsb counts in %ecx, so the mask must be applied to %ecx itself;
	// masking a copy in %eax would leave the full byte count in %ecx.
	and %ecx, 3		// the last few cows must come home
	jz $memcpy_final	// no more, let's leave
	rep movsb		// the last 1, 2, or 3 bytes

$memcpy_final:
	emms			// clean up the MMX state
	sfence			// flush the write buffer
	//mov %eax, [dest]	// ret value = destination pointer

	pop %esi
	pop %edi

	ret 4			// callee pops the stack-passed byte count
|
|
@ -194,7 +194,7 @@ static void iIopDumpBlock( int startpc, u8 * ptr )
|
||||||
system( command );
|
system( command );
|
||||||
sprintf(command, "mv tempdump %s", filename);
|
sprintf(command, "mv tempdump %s", filename);
|
||||||
system(command);
|
system(command);
|
||||||
f = fopen( filename, "a+" );
|
f = fopen( filename.c_str(), "a+" );
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -517,7 +517,8 @@ void SuperVUDumpBlock(list<VuBaseBlock*>& blocks, int vuindex)
|
||||||
u32 i;
|
u32 i;
|
||||||
|
|
||||||
Path::CreateDirectory( "dumps" );
|
Path::CreateDirectory( "dumps" );
|
||||||
ssprintf( filename, "dumps\\svu%cdump%.4X.txt", s_vu?'0':'1', s_pFnHeader->startpc );
|
ssprintf( filename, "svu%cdump%.4X.txt", s_vu?'0':'1', s_pFnHeader->startpc );
|
||||||
|
filename = Path::Combine( "dumps", filename );
|
||||||
|
|
||||||
//SysPrintf( "dump1 %x => %s\n", s_pFnHeader->startpc, filename );
|
//SysPrintf( "dump1 %x => %s\n", s_pFnHeader->startpc, filename );
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue