From 2270ba4eeeb6d71d3525475dd3586d2ea6d5289e Mon Sep 17 00:00:00 2001 From: arcum42 Date: Sun, 1 Mar 2009 08:21:14 +0000 Subject: [PATCH] Finish the Linux implementation of memcpy_fast_. I've disabled it by default until I'm sure it's working right, but it can easily be enabled in build.sh. Should be a speed boost in Linux (which Windows already had), but I haven't tested it enough to be able to tell yet. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@643 96395faa-99c1-11dd-bbfe-3dabce05a288 --- build.sh | 4 ++++ pcsx2/MemcpyFast.h | 15 ++++++++------- pcsx2/configure.ac | 10 ++++++++++ pcsx2/x86/fast_routines.S | 6 +++--- 4 files changed, 25 insertions(+), 10 deletions(-) diff --git a/build.sh b/build.sh index e409a5c185..966770bf59 100644 --- a/build.sh +++ b/build.sh @@ -11,6 +11,10 @@ #Optimized, but a devbuild export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --enable-devbuild --prefix `pwd`" + +#Optimized, but a devbuild - with memcpy_fast_ enabled. +#export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --enable-devbuild --enable-memcpyfast --prefix `pwd`" + #Debug / Devbuild version #export PCSX2OPTIONS="--enable-debug --enable-devbuild --enable-sse3 --prefix `pwd`" diff --git a/pcsx2/MemcpyFast.h b/pcsx2/MemcpyFast.h index 8e2ef8c90a..844de2ff62 100644 --- a/pcsx2/MemcpyFast.h +++ b/pcsx2/MemcpyFast.h @@ -50,14 +50,15 @@ void _memset16_unaligned( void* dest, u16 data, size_t size ); extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize); # include "Linux/memzero.h" +#if defined(LINUX_USE_FAST_MEMORY) +# define memcpy_fast memcpy_amd_ +# define memcpy_aligned memcpy_amd_ + extern "C" void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes); +#else # define memcpy_fast memcpy # define memcpy_aligned memcpy +#endif // LINUX_USE_FAST_MEMORY - // Currently broken. 
-//# define memcpy_fast memcpy_amd_ -//# define memcpy_aligned memcpy_amd_ -// extern "C" void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes); +#endif // WIN32 -#endif - -#endif +#endif //Header diff --git a/pcsx2/configure.ac b/pcsx2/configure.ac index 4e28aa7368..acd3813e59 100644 --- a/pcsx2/configure.ac +++ b/pcsx2/configure.ac @@ -63,6 +63,15 @@ AC_MSG_RESULT($debug) AC_CHECK_FUNCS([ _aligned_malloc _aligned_free ], AC_DEFINE(HAVE_ALIGNED_MALLOC)) +AC_MSG_CHECKING(turn on memcpy_fast_) +AC_ARG_ENABLE(memcpyfast, AC_HELP_STRING([--enable-memcpyfast], [Turns on memcpy_fast - EXPERIMENTAL]), + memcpyfast=$enableval,memcpyfast=no) +if test "x$memcpyfast" = xyes +then + AC_DEFINE(LINUX_USE_FAST_MEMORY,1,[LINUX_USE_FAST_MEMORY]) +fi +AC_MSG_RESULT($memcpyfast) + dnl Check for dev build AC_MSG_CHECKING(for development build) AC_ARG_ENABLE(devbuild, AC_HELP_STRING([--enable-devbuild], [Special Build for developers that simplifies testing and adds extra checks]), @@ -138,3 +147,4 @@ echo " Force sse3? $sse3" echo " nls support? $nls" echo " local plugin inis? $localinis" echo " custom cflags? $customcflags" +echo " memcpy_fast? $memcpyfast" diff --git a/pcsx2/x86/fast_routines.S b/pcsx2/x86/fast_routines.S index a06f3749ec..be2b118af8 100644 --- a/pcsx2/x86/fast_routines.S +++ b/pcsx2/x86/fast_routines.S @@ -359,7 +359,7 @@ memcpy_amd_: $memcpy_do_align: mov %eax, 8 // a trick that's faster than rep movsb... 
sub %eax, %edi // align destination to qword - and %eax, 0x111b // get the low bits + and %eax, 0b111 // get the low 3 bits sub %ecx, %eax // update copy count neg %eax // set up to jump into the array add %eax, offset $memcpy_align_done @@ -427,7 +427,7 @@ $memcpy_ic_2: mov %eax, %ecx // has valid low 6 bits of the byte count $memcpy_ic_3: shr %eax, 2 // dword count - and %eax, 0x1111b // only look at the "remainder" bits + and %eax, 0b1111 // only look at the "remainder" bits neg %eax // set up to jump into the array add %eax, offset $memcpy_last_few jmp %eax // jump to array of movsd's @@ -512,7 +512,7 @@ $memcpy_uc_1: // 64-byte blocks, uncached copy $memcpy_last_few: // dword aligned from before movsd's mov %eax, %ecx // has valid low 2 bits of the byte count - and %eax, 0x11b // the last few cows must come home + and %eax, 0b11 // the last few cows must come home jz $memcpy_final // no more, let's leave rep movsb // the last 1, 2, or 3 bytes