ReorderingMTGS: Added a new optimized memcpy_amd_qwc, for use by GIFpath copies. After much study, we determined this is about as efficient as memcpy will ever get for what we're doing with it.

DevNote: Win32-only at the moment -- needs a GAS port (but that shouldn't be hard). I made some notes in the code about it.

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3472 96395faa-99c1-11dd-bbfe-3dabce05a288
Jake.Stine 2010-07-12 19:40:30 +00:00
parent a6b3acb5d0
commit 934578c8fe
9 changed files with 133 additions and 85 deletions

View File

@@ -28,6 +28,7 @@
# include "win_memzero.h"
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
+extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes);
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern void memxor_mmx(void* dst, const void* src1, int cmpsize);
@@ -37,6 +38,8 @@
void _memset16_unaligned( void* dest, u16 data, size_t size );
#define memcpy_fast memcpy_amd_ // Fast memcpy
-#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c*16) // Memcpy with 16-byte Aligned addresses
+#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c) // Memcpy with 16-byte Aligned addresses
#define memcpy_const memcpy_amd_ // Memcpy with constant size
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
+#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c)
+//#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16)
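
A quick usage sketch of the unit conventions after this hunk (illustrative only, not part of the commit; the buffers, function name, and include path are assumptions, and u128 is the project's 16-byte type): memcpy_aligned now takes its count in bytes, while the new memcpy_qwc counts in 128-bit quadwords.

    #include "Utilities/MemcpyFast.h"   // assumed include path for this header

    static u128 dstRing[512];           // hypothetical 512-quadword (8 KB) buffers
    static u128 srcRing[512];

    void copy_unit_examples()
    {
        // memcpy_aligned now takes a BYTE count (callers used to pass qwc; compare the
        // sizeof(PS2MEM_GS)/16 -> sizeof(PS2MEM_GS) change in MTGS.cpp below).
        memcpy_aligned(dstRing, srcRing, sizeof(srcRing));

        // memcpy_qwc takes a quadword count: 512 qwc == 8192 bytes here.
        memcpy_qwc(dstRing, srcRing, 512);
    }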

View File

@@ -129,6 +129,10 @@ namespace Threading
// For use in spin/wait loops.
extern void SpinWait();
+// Use prior to committing data to another thread (internal memcpy_qwc does not use fencing,
+// so that many memcpys can be issued in a row more efficiently)
+extern void StoreFence();
// Optional implementation to enable hires thread/process scheduler for the operating system.
// Needed by Windows, but might not be relevant to other platforms.

View File

@@ -36,6 +36,11 @@ __forceinline void Threading::SpinWait()
__asm pause;
}
+__forceinline void Threading::StoreFence()
+{
+__asm sfence;
+}
__forceinline void Threading::EnableHiresScheduler()
{
// This improves accuracy of Sleep() by some amount, and only adds a negligible amount of
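
The comment added to Threading.h above hints at the intended pattern; here is a hedged sketch (illustrative names, not code from this commit): because memcpy_qwc omits its own sfence, a writer issues a batch of copies back-to-back and calls Threading::StoreFence() once before publishing the new write position to the consumer thread -- which is where SendDataPacket() leaves a commented-out call further down.

    // Sketch only: ringBuffer/writePos and the two-packet batch are hypothetical.
    // Assumes u128, uint, memcpy_qwc and Threading::StoreFence from the headers above.
    void PublishTwoPackets(u128* ringBuffer, volatile uint& writePos,
                           const u128* pktA, uint qwcA,
                           const u128* pktB, uint qwcB)
    {
        uint pos = writePos;

        // Several unfenced copies in a row -- cheaper than fencing each one.
        memcpy_qwc(&ringBuffer[pos],        pktA, qwcA);
        memcpy_qwc(&ringBuffer[pos + qwcA], pktB, qwcB);

        // One fence so the movntq stores are globally visible before the consumer
        // can observe the advanced write position.
        Threading::StoreFence();
        writePos = pos + qwcA + qwcB;
    }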

View File

@@ -146,7 +146,7 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
-dec eax ; count down
+sub eax, 1
jnz $memcpy_ic_1 ; last 64-byte block?
$memcpy_ic_2:
@@ -189,64 +189,15 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
-dec eax
movntq [edi-8], mm1
+sub eax, 1
jnz $memcpy_uc_1 ; last 64-byte block?
jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
-// For the largest size blocks, a special technique called Block Prefetch
-// can be used to accelerate the read operations. Block Prefetch reads
-// one address per cache line, for a series of cache lines, in a short loop.
-// This is faster than using software prefetch. The technique is great for
-// getting maximum read bandwidth, especially in DDR memory systems.
-// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
-// help keep the code cache footprint of memcpy_fast to a minimum.
-/*
-$memcpy_bp_1: ; large blocks, block prefetch copy
-cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
-jl $memcpy_64_test ; no, back to regular uncached copy
-mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
-add esi, CACHEBLOCK * 64 ; move to the top of the block
-align 16
-$memcpy_bp_2:
-mov edx, [esi-64] ; grab one address per cache line
-mov edx, [esi-128] ; grab one address per cache line
-sub esi, 128 ; go reverse order to suppress HW prefetcher
-dec eax ; count down the cache lines
-jnz $memcpy_bp_2 ; keep grabbing more lines into cache
-mov eax, CACHEBLOCK ; now that it's in cache, do the copy
-align 16
-$memcpy_bp_3:
-movq mm0, [esi ] ; read 64 bits
-movq mm1, [esi+ 8]
-movq mm2, [esi+16]
-movq mm3, [esi+24]
-movq mm4, [esi+32]
-movq mm5, [esi+40]
-movq mm6, [esi+48]
-movq mm7, [esi+56]
-add esi, 64 ; update source pointer
-movntq [edi ], mm0 ; write 64 bits, bypassing cache
-movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
-movntq [edi+16], mm2 ; from READING the destination address
-movntq [edi+24], mm3 ; into the cache, only to be over-written,
-movntq [edi+32], mm4 ; so that also helps performance
-movntq [edi+40], mm5
-movntq [edi+48], mm6
-movntq [edi+56], mm7
-add edi, 64 ; update dest pointer
-dec eax ; count down
-jnz $memcpy_bp_3 ; keep copying
-sub ecx, CACHEBLOCK ; update the 64-byte block count
-jmp $memcpy_bp_1 ; keep processing chunks
-*/
+// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been
+// disabled to help keep the code cache footprint of memcpy_fast to a minimum.
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
@@ -274,17 +225,99 @@ $memcpy_last_few: ; dword aligned from before movsd's
rep movsb ; the last 1, 2, or 3 bytes
$memcpy_final:
-pop esi
-pop edi
emms ; clean up the MMX state
sfence ; flush the write buffer
//mov eax, [dest] ; ret value = destination pointer
+pop esi
+pop edi
ret 4
}
}
+// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
+__forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
+{
+// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
+// registers will improve copy performance, because they won't. Use of XMMs is only
+// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
+// and even then the benefits are typically minimal (sometimes slower depending on the
+// amount of data being copied).
+//
+// Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them.
+// --air
+// Linux Conversion note:
+// This code would benefit nicely from having inline-able GAS syntax, since it should
+// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
+// And its called enough times to probably merit the extra effort to ensure proper
+// optimization. --air
+__asm
+{
+mov ecx, [dest]
+mov edx, [src]
+mov eax, [qwc] ; keep a copy of count
+shr eax, 1
+jz $memcpy_qwc_1 ; only one 16 byte block to copy?
+cmp eax, IN_CACHE_COPY/32
+jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air)
+$memcpy_qwc_loop2: ; 32-byte blocks, uncached copy
+prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
+movq mm0,[edx+0] ; read 64 bits
+movq mm1,[edx+8]
+movq mm2,[edx+16]
+movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache
+movntq [ecx+8], mm1
+movq mm3,[edx+24]
+movntq [ecx+16], mm2
+movntq [ecx+24], mm3
+add edx,32 ; update source pointer
+add ecx,32 ; update destination pointer
+sub eax,1
+jnz $memcpy_qwc_loop2 ; last 64-byte block?
+sfence ; flush the write buffer
+jmp $memcpy_qwc_1
+; 32-byte blocks, cached!
+; This *is* important. Removing this and using exclusively non-temporal stores
+; results in noticable speed loss!
+$memcpy_qwc_loop1:
+prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
+movq mm0,[edx+0] ; read 64 bits
+movq mm1,[edx+8]
+movq mm2,[edx+16]
+movq [ecx+0], mm0 ; write 64 bits, bypassing the cache
+movq [ecx+8], mm1
+movq mm3,[edx+24]
+movq [ecx+16], mm2
+movq [ecx+24], mm3
+add edx,32 ; update source pointer
+add ecx,32 ; update destination pointer
+sub eax,1
+jnz $memcpy_qwc_loop1 ; last 64-byte block?
+$memcpy_qwc_1:
+test [qwc],1
+jz $memcpy_qwc_final
+movq mm0,[edx]
+movq mm1,[edx+8]
+movq [ecx], mm0
+movq [ecx+8], mm1
+$memcpy_qwc_final:
+emms ; clean up the MMX state
+}
+}
// mmx mem-compare implementation, size has to be a multiple of 8
// returns 0 is equal, nonzero value if not equal
// ~10 times faster than standard memcmp
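
On the DevNote about a GAS port: the new function above is MSVC-only __asm. As a rough portability sketch (an assumption-laden illustration, not the committed code and not a literal GAS translation), the same structure can be written with MMX/SSE intrinsics that GCC also accepts; the 64 KB IN_CACHE_COPY threshold below is assumed, since only IN_CACHE_COPY/32 appears in the asm.

    #include <cstddef>       // size_t
    #include <mmintrin.h>    // __m64, _mm_empty
    #include <xmmintrin.h>   // _mm_prefetch, _mm_stream_pi, _mm_sfence

    static const size_t IN_CACHE_COPY = 64 * 1024;   // assumed threshold, in bytes

    static void memcpy_qwc_sketch(void* dest, const void* src, size_t qwc)
    {
        __m64* d = (__m64*)dest;
        const __m64* s = (const __m64*)src;
        size_t blocks = qwc >> 1;                 // number of 32-byte blocks

        if (blocks >= IN_CACHE_COPY / 32)
        {
            // Large copy: non-temporal stores (movntq), then a single sfence.
            for (; blocks != 0; --blocks)
            {
                _mm_prefetch((const char*)s + 568, _MM_HINT_NTA);
                __m64 m0 = s[0], m1 = s[1], m2 = s[2], m3 = s[3];
                _mm_stream_pi(d + 0, m0);
                _mm_stream_pi(d + 1, m1);
                _mm_stream_pi(d + 2, m2);
                _mm_stream_pi(d + 3, m3);
                s += 4; d += 4;
            }
            _mm_sfence();
        }
        else
        {
            // Small copy: ordinary cached stores (the measured win noted in the asm).
            for (; blocks != 0; --blocks)
            {
                _mm_prefetch((const char*)s + 568, _MM_HINT_NTA);
                __m64 m0 = s[0], m1 = s[1], m2 = s[2], m3 = s[3];
                d[0] = m0; d[1] = m1; d[2] = m2; d[3] = m3;
                s += 4; d += 4;
            }
        }

        if (qwc & 1)                              // odd quadword: trailing 16 bytes
        {
            d[0] = s[0];
            d[1] = s[1];
        }
        _mm_empty();                              // emms
    }

An intrinsics version would also let GCC inline the call and fold away the pointer setup, which is what the Linux Conversion note above is after.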

View File

@@ -395,6 +395,7 @@ struct Pcsx2Config
// style. Useful for debugging potential bugs in the MTGS pipeline.
bool SynchronousMTGS;
bool DisableOutput;
+int VsyncQueueSize;
bool FrameLimitEnable;
bool FrameSkipEnable;
@@ -420,6 +421,8 @@ struct Pcsx2Config
return
OpEqu( SynchronousMTGS ) &&
OpEqu( DisableOutput ) &&
+OpEqu( VsyncQueueSize ) &&
OpEqu( FrameSkipEnable ) &&
OpEqu( FrameLimitEnable ) &&
OpEqu( VsyncEnable ) &&

View File

@@ -142,14 +142,11 @@ void SysMtgsThread::PostVsyncEnd()
SendDataPacket();
-// Alter-frame flushing! Restarts the ringbuffer (wraps) on every other frame. This is a
-// mandatory feature that prevents the MTGS from queuing more than 2 frames at any time.
-// (queued frames cause input lag and desynced audio -- bad!). Ring restarts work for this
-// because they act as sync points where the EE must stall to wait for the GS to catch-up,
-// and they also allow us to reuse the front of the ringbuffer more often, which should improve
-// L2 cache performance.
+// If the MTGS is allowed to queue a lot of frames in advance, it creates input lag.
+// Use the Queued FrameCount to stall the EE if another vsync is already queued in
+// the ringbuffer.
-if( AtomicIncrement(m_QueuedFrameCount) < 2 ) return;
+if( AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize ) return;
uint readpos = volatize(m_RingPos);
uint freeroom;
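
A self-contained model of the throttle this hunk describes (a sketch under assumptions: the GS-side decrement is not shown in this diff, and the code below the early return appears to stall by waiting for ring-buffer room, whereas a condition variable is used here only to keep the sketch complete):

    #include <atomic>
    #include <condition_variable>
    #include <mutex>

    // Illustrative model of the VsyncQueueSize throttle -- not PCSX2 code.
    struct VsyncThrottle
    {
        std::atomic<int>        queued {0};
        int                     maxQueued {2};     // mirrors the VsyncQueueSize default below
        std::mutex              mtx;
        std::condition_variable drained;

        // Producer (EE side): called after queueing a vsync packet.
        void OnVsyncQueued()
        {
            if (++queued < maxQueued)
                return;                            // headroom left, keep running
            std::unique_lock<std::mutex> lock(mtx);
            drained.wait(lock, [&]{ return queued.load() < maxQueued; });
        }

        // Consumer (GS side, assumed): called after a queued vsync is retired.
        void OnVsyncRetired()
        {
            {
                std::lock_guard<std::mutex> lock(mtx);
                --queued;
            }
            drained.notify_one();
        }
    };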
@@ -190,7 +187,7 @@ void SysMtgsThread::OpenPlugin()
{
if( m_PluginOpened ) return;
-memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS)/16 );
+memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
GSsetBaseMem( RingBuffer.Regs );
GSirqCallback( dummyIrqCallback );
@@ -624,6 +621,7 @@ void SysMtgsThread::SendDataPacket()
PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos];
tag.data[0] = actualSize;
+//Threading::StoreFence();
m_WritePos = m_packet_ringpos;
if( EmuConfig.GS.SynchronousMTGS )

View File

@@ -217,6 +217,7 @@ Pcsx2Config::GSOptions::GSOptions()
SynchronousMTGS = false;
DisableOutput = false;
+VsyncQueueSize = 2;
DefaultRegionMode = Region_NTSC;
FramesToDraw = 2;
@@ -234,6 +235,7 @@ void Pcsx2Config::GSOptions::LoadSave( IniInterface& ini )
IniEntry( SynchronousMTGS );
IniEntry( DisableOutput );
+IniEntry( VsyncQueueSize );
IniEntry( FrameLimitEnable );
IniEntry( FrameSkipEnable );

View File

@@ -526,36 +526,36 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s
void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len )
{
uint endpos = destStart + len;
-if( endpos >= destSize )
+if( endpos < destSize )
{
-uint firstcopylen = destSize - destStart;
-memcpy_aligned(&destBase[destStart], src, firstcopylen );
-destStart = endpos % destSize;
-memcpy_aligned(destBase, src+firstcopylen, destStart );
+memcpy_qwc(&destBase[destStart], src, len );
+destStart += len;
}
else
{
-memcpy_aligned(&destBase[destStart], src, len );
-destStart += len;
+uint firstcopylen = destSize - destStart;
+memcpy_qwc(&destBase[destStart], src, firstcopylen );
+destStart = endpos % destSize;
+memcpy_qwc(destBase, src+firstcopylen, destStart );
}
}
void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len )
{
uint endpos = srcStart + len;
-if( endpos >= srcSize )
+if( endpos < srcSize )
{
-uint firstcopylen = srcSize - srcStart;
-memcpy_aligned(dest, &srcBase[srcStart], firstcopylen );
-srcStart = endpos & srcSize;
-memcpy_aligned(dest+firstcopylen, srcBase, srcStart );
+memcpy_qwc(dest, &srcBase[srcStart], len );
+srcStart += len;
}
else
{
-memcpy_aligned(dest, &srcBase[srcStart], len );
-srcStart += len;
+uint firstcopylen = srcSize - srcStart;
+memcpy_qwc(dest, &srcBase[srcStart], firstcopylen );
+srcStart = endpos & srcSize;
+memcpy_qwc(dest+firstcopylen, srcBase, srcStart );
}
}
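
For readers following the wrap arithmetic, here is a plain-C++ restatement of the destination-wrapping case (a reference sketch with stand-in types, not project code): the copy is split at the end of the ring, and the wrapped write position ends up at (destStart + len) % destSize.

    #include <cstring>

    // Assumed stand-ins for the project's types, to keep the sketch self-contained.
    typedef unsigned int uint;
    struct u128 { unsigned char _u8[16]; };

    // Reference version of MemCopy_WrappedDest using plain memcpy (16 bytes per quadword).
    static void WrappedDest_Reference(const u128* src, u128* destBase,
                                      uint& destStart, uint destSize, uint len)
    {
        uint endpos = destStart + len;
        if (endpos < destSize)
        {
            std::memcpy(&destBase[destStart], src, len * 16u);
            destStart += len;
        }
        else
        {
            uint firstcopylen = destSize - destStart;              // quadwords up to the wrap point
            std::memcpy(&destBase[destStart], src, firstcopylen * 16u);
            destStart = endpos % destSize;                         // wrapped write position
            std::memcpy(destBase, src + firstcopylen, destStart * 16u);
        }
    }

The source-side variant keeps its original `endpos & srcSize` update (versus `% destSize` on the destination side), so only the destination case is restated here.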

View File

@@ -1129,14 +1129,14 @@ void __fastcall mVU_XGKICK_(u32 addr) {
// fixme: one of these days the following *16's will get cleaned up when we introduce
// a special qwc/simd16 optimized version of memcpy_aligned. :)
//DevCon.Status("XGkick Wrap!");
-memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff);
+memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff);
Path1WritePos += size;
size -= diff;
pDest += diff*16;
-memcpy_aligned(pDest, microVU1.regs->Mem, size);
+memcpy_qwc(pDest, microVU1.regs->Mem, size);
}
else {
-memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size);
+memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size);
Path1WritePos += size;
}
//if(!gifRegs->stat.P1Q) CPU_INT(28, 128);