90% of an implementation of memcpy_fast_ for Linux. And fix debug mode.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@642 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
arcum42 2009-03-01 06:31:33 +00:00
parent 44d47ca891
commit ad0705de56
7 changed files with 225 additions and 24 deletions

View File

@ -9,10 +9,10 @@
#export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --prefix `pwd`"
#Optimized, but a devbuild
#export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --enable-devbuild --prefix `pwd`"
export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --enable-devbuild --prefix `pwd`"
#Debug / Devbuild version
export PCSX2OPTIONS="--enable-debug --enable-devbuild --enable-sse3 --prefix `pwd`"
#export PCSX2OPTIONS="--enable-debug --enable-devbuild --enable-sse3 --prefix `pwd`"
#ZeroGS Normal mode
export ZEROGSOPTIONS="--enable-sse2"

View File

@ -19,8 +19,6 @@
#ifndef __MEMCPY_FAST_H__
#define __MEMCPY_FAST_H__
//#include "Misc.h"
void _memset16_unaligned( void* dest, u16 data, size_t size );
#if defined(_WIN32) && !defined(__x86_64__)
@ -33,6 +31,8 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
//extern void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes);
//extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes);
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern void memxor_mmx(void* dst, const void* src1, int cmpsize);
# include "windows/memzero.h"
# define memcpy_fast memcpy_amd_
@ -41,20 +41,23 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
#else
// For now, Linux uses the GCC memcpy/memset implementations.
#define memcpy_fast memcpy
#define memcpy_raz_ memcpy
#define memcpy_raz_u memcpy
//#define memcpy_raz_udst memcpy
//#define memcpy_raz_usrc memcpy
//#define memcpy_raz_ memcpy
// fast_routines.S
extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize);
#define memcpy_aligned memcpy
#define memcpy_raz_u memcpy
# include "Linux/memzero.h"
# define memcpy_fast memcpy
# define memcpy_aligned memcpy
#include "Linux/memzero.h"
// Currently broken.
//# define memcpy_fast memcpy_amd_
//# define memcpy_aligned memcpy_amd_
// extern "C" void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
#endif
#ifndef __LINUX__
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern void memxor_mmx(void* dst, const void* src1, int cmpsize);
#endif
#endif
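
A quick sketch (not part of the diff) of what these Linux mappings mean at a call site — memcpy_fast and memcpy_aligned simply resolve to the C library's memcpy until memcpy_amd_ is fixed, so the same code compiles on both platforms:

#include <cstring>

// Sketch of the Linux-side mappings from memcpy_fast.h above.
#define memcpy_fast    memcpy
#define memcpy_aligned memcpy

void copyBlock(char* dst, const char* src)
{
    // On Linux this is literally memcpy for now; on Windows the same
    // call site resolves to the hand-written memcpy_amd_.
    memcpy_fast(dst, src, 64);
}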

View File

@ -40,6 +40,10 @@ void psxRecRecompile(u32 startpc);
// Linux specific
#ifdef __LINUX__
PCSX2_ALIGNED16( u8 _xmm_backup[16*2] );
PCSX2_ALIGNED16( u8 _mmx_backup[8*4] );
extern "C"
{
@ -57,10 +61,6 @@ void psxDispatcherReg();
void Dispatcher();
void DispatcherClear();
void DispatcherReg();
// fast_routines.S
u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
void memxor_mmx(void* dst, const void* src1, int cmpsize);
}
#endif
#endif

View File

@ -11,7 +11,8 @@ ix86-32/iR5900Templates.cpp ix86-32/recVTLB.cpp
libx86recomp_a_SOURCES = \
BaseblockEx.cpp iCOP0.cpp iCOP2.cpp iCore.cpp iFPU.cpp iGS.cpp iHw.cpp iIPU.cpp iMMI.cpp iPsxHw.cpp iPsxMem.cpp \
iR3000A.cpp iR3000Atables.cpp iR5900CoissuedLoadStore.cpp iR5900Misc.cpp iVU0micro.cpp iVU1micro.cpp iVUmicro.cpp \
iVUmicroLower.cpp iVUmicroUpper.cpp iVUzerorec.cpp iVif.cpp ir5900tables.cpp fast_routines.S aR3000A.S aVUzerorec.S aVif.S $(archfiles)
iVUmicroLower.cpp iVUmicroUpper.cpp iVUzerorec.cpp iVif.cpp ir5900tables.cpp fast_routines.S aR3000A.S aVUzerorec.S \
aVif.S $(archfiles)
libx86recomp_a_SOURCES += \
BaseblockEx.h iCOP0.h iCore.h iFPU.h iMMI.h iR3000A.h iR5900.h iR5900Arit.h iR5900AritImm.h iR5900Branch.h iR5900Jump.h \

View File

@ -15,12 +15,21 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define TINY_BLOCK_COPY 64 // upper limit for movsd-type copy
#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch
#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq copy w/SW prefetch
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 0x80 // # of 64-byte blocks (cache lines) per block-prefetch chunk
// Fast assembly routines for x86-64
// zerofrog(@gmail.com)
// and added to by arcum42@gmail.com
.intel_syntax
.extern g_EEFreezeRegs
.extern FreezeMMXRegs_
.extern _mmx_backup
// mmx memcmp implementation; size must be a multiple of 8
// returns 0 if equal, a nonzero value if not equal
@ -208,9 +217,7 @@ memcmp_Done:
memcmp_End:
emms
#ifndef __x86_64__
pop %esi
#endif
ret
// memxor_mmx
@ -329,3 +336,192 @@ memxor_End:
emms
pop %esi
ret
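
The memcmp_mmx contract noted in the header comment (size must be a multiple of 8, zero return means equal) differs from the C library's signed-ordering memcmp, so here is a hedged C++ wrapper sketch, assuming only what the comments state:

typedef unsigned char u8; // PCSX2's u8, repeated here for self-containment

extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);

// Returns true when the blocks are equal. Note the contract: cmpsize
// must be a multiple of 8, and the return value is only zero/nonzero,
// not a signed ordering like std::memcmp.
inline bool blocks_equal(const void* a, const void* b, int bytes)
{
    // assert((bytes & 7) == 0); // required by the MMX implementation
    return memcmp_mmx(a, b, bytes) == 0;
}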
// void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
.globl memcpy_amd_
memcpy_amd_:
push %edi
push %esi
mov %edi, %ecx // destination
mov %esi, %edx // source
mov %ecx, [%esp+12] // number of bytes to copy
mov %eax, %ecx // keep a copy of count
cld
cmp %eax, TINY_BLOCK_COPY
jb $memcpy_ic_3 // tiny? skip mmx copy
cmp %eax, 32*1024 // don't align between 32k-64k because
jbe $memcpy_do_align // it appears to be slower
cmp %eax, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov %eax, 8 // a trick that's faster than rep movsb...
sub %eax, %edi // align destination to qword
and %eax, 0b111 // get the low bits (0-7 bytes of misalignment)
sub %ecx, %eax // update copy count
neg %eax // set up to jump into the array
add %eax, offset $memcpy_align_done
jmp %eax // jump to array of movsb's
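// Each movsb below assembles to a single one-byte opcode (0xA4), so
// jumping to ($memcpy_align_done - %eax) executes exactly %eax
// single-byte copies before falling through to the aligned-copy code.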
.align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
$memcpy_align_done: // destination is dword aligned
mov %eax, %ecx // number of bytes left to copy
shr %eax, 6 // get 64-byte block count
jz $memcpy_ic_2 // finish the last few bytes
cmp %eax, IN_CACHE_COPY/64 // too big for cache? use uncached copy
jae $memcpy_uc_test
// back up the MMX registers, which the recompiler may hold live data in
movq [_mmx_backup+0x00],%mm0
movq [_mmx_backup+0x08],%mm1
movq [_mmx_backup+0x10],%mm2
movq [_mmx_backup+0x18],%mm3
// This is a small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
.align 16
$memcpy_ic_1: // 64-byte block copies, in-cache copy
prefetchnta [%esi + (200*64/34+192)] // start reading ahead
movq %mm0, [%esi+0] // read 64 bits
movq %mm1, [%esi+8]
movq [%edi+0], %mm0 // write 64 bits
movq [%edi+8], %mm1 // note: the normal movq writes the
movq %mm2, [%esi+16] // data to cache; a cache line will be
movq %mm3, [%esi+24] // allocated as needed, to store the data
movq [%edi+16], %mm2
movq [%edi+24], %mm3
movq %mm0, [%esi+32]
movq %mm1, [%esi+40]
movq [%edi+32], %mm0
movq [%edi+40], %mm1
movq %mm2, [%esi+48]
movq %mm3, [%esi+56]
movq [%edi+48], %mm2
movq [%edi+56], %mm3
add %esi, 64 // update source pointer
add %edi, 64 // update destination pointer
dec %eax // count down
jnz $memcpy_ic_1 // last 64-byte block?
// restore the MMX registers that were backed up above
movq %mm0,[_mmx_backup+0x00]
movq %mm1,[_mmx_backup+0x08]
movq %mm2,[_mmx_backup+0x10]
movq %mm3,[_mmx_backup+0x18]
$memcpy_ic_2:
mov %eax, %ecx // has valid low 6 bits of the byte count
$memcpy_ic_3:
shr %eax, 2 // dword count
and %eax, 0b1111 // only look at the "remainder" bits
neg %eax // set up to jump into the array
add %eax, offset $memcpy_last_few
jmp %eax // jump to array of movsd's
$memcpy_uc_test:
// cmp %ecx, UNCACHED_COPY/64 // big enough? use block prefetch copy
// jae $memcpy_bp_1
//$memcpy_64_test:
or %eax, %eax // tail end of block prefetch will jump here
jz $memcpy_ic_2 // no more 64-byte blocks left
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// back up the MMX registers for the uncached path
movq [_mmx_backup+0x00],%mm0
movq [_mmx_backup+0x08],%mm1
movq [_mmx_backup+0x10],%mm2
.align 16
$memcpy_uc_1: // 64-byte blocks, uncached copy
prefetchnta [%esi + (200*64/34+192)] // start reading ahead
movq %mm0,[%esi+0] // read 64 bits
add %edi,64 // update destination pointer
movq %mm1,[%esi+8]
add %esi,64 // update source pointer
movq %mm2,[%esi-48]
movntq [%edi-64], %mm0 // write 64 bits, bypassing the cache
movq %mm0,[%esi-40] // note: movntq also prevents the CPU
movntq [%edi-56], %mm1 // from READING the destination address
movq %mm1,[%esi-32] // into the cache, only to be over-written
movntq [%edi-48], %mm2 // so that also helps performance
movq %mm2,[%esi-24]
movntq [%edi-40], %mm0
movq %mm0,[%esi-16]
movntq [%edi-32], %mm1
movq %mm1,[%esi-8]
movntq [%edi-24], %mm2
movntq [%edi-16], %mm0
dec %eax
movntq [%edi-8], %mm1
jnz $memcpy_uc_1 // last 64-byte block?
// restore the saved MMX registers
movq %mm0,[_mmx_backup+0x00]
movq %mm1,[_mmx_backup+0x08]
movq %mm2,[_mmx_backup+0x10]
jmp $memcpy_ic_2 // almost done (not needed because large copy below was removed)
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
// help keep the code cache footprint of memcpy_fast to a minimum.
// <Code removed here>
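
The removed block is not reproduced here; purely as an illustration of the block-prefetch pattern the comment above describes (touch one address per cache line across a chunk before the real copy), a rough C++ sketch assuming 64-byte cache lines:

#include <cstddef>

// Illustrative only -- not the removed assembly. One read per 64-byte
// cache line streams the whole chunk into cache ahead of the copy loop.
static void block_prefetch(const unsigned char* src, std::size_t bytes)
{
    volatile unsigned char sink = 0;
    for (std::size_t off = 0; off < bytes; off += 64)
        sink = src[off];
    (void)sink;
}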
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
.align 4
movsd
movsd // perform last 1-15 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd // perform last 1-7 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
$memcpy_last_few: // dword aligned from before movsd's
mov %eax, %ecx // has valid low 2 bits of the byte count
and %eax, 0b11 // the last few cows must come home
jz $memcpy_final // no more, let's leave
rep movsb // the last 1, 2, or 3 bytes
$memcpy_final:
emms // clean up the MMX state
sfence // flush the write buffer
//mov %eax, [dest] // ret value = destination pointer
pop %esi
pop %edi
ret 4 // __fastcall: callee pops the 4-byte size argument
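
For reference, a hedged sketch of the GCC-side declaration this routine expects on 32-bit Linux — dest in %ecx, src in %edx, the size on the stack with the callee popping it, which is what "ret 4" implements. This is an assumption about how the "currently broken" hookup would eventually look, not part of this commit:

#include <cstddef>

// Hypothetical declaration matching the convention the assembly uses:
// GCC's fastcall passes the first two arguments in %ecx/%edx and leaves
// the third on the stack, with the callee popping it.
extern "C" void __attribute__((fastcall))
memcpy_amd_(void* dest, const void* src, std::size_t n);

void example(char* dst, const char* src)
{
    memcpy_amd_(dst, src, 4096); // only safe once the routine is fixed
}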

View File

@ -194,7 +194,7 @@ static void iIopDumpBlock( int startpc, u8 * ptr )
system( command );
sprintf(command, "mv tempdump %s", filename);
system(command);
f = fopen( filename, "a+" );
f = fopen( filename.c_str(), "a+" );
#endif
}
#endif

View File

@ -517,7 +517,8 @@ void SuperVUDumpBlock(list<VuBaseBlock*>& blocks, int vuindex)
u32 i;
Path::CreateDirectory( "dumps" );
ssprintf( filename, "dumps\\svu%cdump%.4X.txt", s_vu?'0':'1', s_pFnHeader->startpc );
ssprintf( filename, "svu%cdump%.4X.txt", s_vu?'0':'1', s_pFnHeader->startpc );
filename = Path::Combine( "dumps", filename );
//SysPrintf( "dump1 %x => %s\n", s_pFnHeader->startpc, filename );