ReorderingMTGS: Added a new optimized memcpy_amd_qwc, for use by GIFpath copies. After much study, we determined this is about as efficient as memcpy will ever get for what we're doing with it.

DevNote: Win32-only at the moment -- needs a GAS port (but that shouldn't be hard). I made some notes in the code about it.
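
For reference, a minimal usage sketch (the buffer names below are placeholders, not code from this commit): the new routine takes its count in QWCs (128-bit quadwords) rather than bytes, and neither pointer needs to be 16-byte aligned.

    // Copy 'len' bytes of GIF data; memcpy_amd_qwc takes a QWC count, not a byte count.
    memcpy_amd_qwc( dstRing, srcMem, len / 16 );
    // Roughly equivalent to the older byte-count call:
    // memcpy_amd_( dstRing, srcMem, len );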

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3472 96395faa-99c1-11dd-bbfe-3dabce05a288
Jake.Stine 2010-07-12 19:40:30 +00:00
parent a6b3acb5d0
commit 934578c8fe
9 changed files with 133 additions and 85 deletions

View File

@@ -28,6 +28,7 @@
# include "win_memzero.h"
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
extern void memcpy_amd_qwc(void *dest, const void *src, size_t qwc);
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern void memxor_mmx(void* dst, const void* src1, int cmpsize);
@@ -37,6 +38,8 @@
void _memset16_unaligned( void* dest, u16 data, size_t size );
#define memcpy_fast memcpy_amd_ // Fast memcpy
#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c*16) // Memcpy with 16-byte Aligned addresses
#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c) // Memcpy with 16-byte Aligned addresses
#define memcpy_const memcpy_amd_ // Memcpy with constant size
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c)
//#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16)

View File

@@ -129,6 +129,10 @@ namespace Threading
// For use in spin/wait loops.
extern void SpinWait();
// Use prior to committing data to another thread (internal memcpy_qwc does not use fencing,
// so that many memcpys can be issued in a row more efficiently)
extern void StoreFence();
// Optional implementation to enable hires thread/process scheduler for the operating system.
// Needed by Windows, but might not be relevant to other platforms.
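
A rough sketch of the pattern StoreFence is meant for (illustrative names only; the SendDataPacket hunk later in this commit shows the real call site, currently commented out):

    // Issue any number of unfenced copies into the ringbuffer...
    memcpy_qwc( &RingBuffer[writepos], srcA, qwcA );
    memcpy_qwc( &RingBuffer[writepos + qwcA], srcB, qwcB );
    // ...then fence once (sfence) before publishing the new write position,
    // so the GS thread never sees the pointer ahead of the data.
    Threading::StoreFence();
    m_WritePos = newWritePos;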

View File

@@ -36,6 +36,11 @@ __forceinline void Threading::SpinWait()
__asm pause;
}
__forceinline void Threading::StoreFence()
{
__asm sfence;
}
__forceinline void Threading::EnableHiresScheduler()
{
// This improves accuracy of Sleep() by some amount, and only adds a negligible amount of

View File

@@ -146,7 +146,7 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec eax ; count down
sub eax, 1
jnz $memcpy_ic_1 ; last 64-byte block?
$memcpy_ic_2:
@@ -189,64 +189,15 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec eax
movntq [edi-8], mm1
sub eax, 1
jnz $memcpy_uc_1 ; last 64-byte block?
jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
// help keep the code cache footprint of memcpy_fast to a minimum.
/*
$memcpy_bp_1: ; large blocks, block prefetch copy
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
jl $memcpy_64_test ; no, back to regular uncached copy
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
$memcpy_bp_2:
mov edx, [esi-64] ; grab one address per cache line
mov edx, [esi-128] ; grab one address per cache line
sub esi, 128 ; go reverse order to suppress HW prefetcher
dec eax ; count down the cache lines
jnz $memcpy_bp_2 ; keep grabbing more lines into cache
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
movq mm0, [esi ] ; read 64 bits
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64 ; update source pointer
movntq [edi ], mm0 ; write 64 bits, bypassing cache
movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
movntq [edi+16], mm2 ; from READING the destination address
movntq [edi+24], mm3 ; into the cache, only to be over-written,
movntq [edi+32], mm4 ; so that also helps performance
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, 64 ; update dest pointer
dec eax ; count down
jnz $memcpy_bp_3 ; keep copying
sub ecx, CACHEBLOCK ; update the 64-byte block count
jmp $memcpy_bp_1 ; keep processing chunks
*/
// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been
// disabled to help keep the code cache footprint of memcpy_fast to a minimum.
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
@@ -274,17 +225,99 @@ $memcpy_last_few: ; dword aligned from before movsd's
rep movsb ; the last 1, 2, or 3 bytes
$memcpy_final:
pop esi
pop edi
emms ; clean up the MMX state
sfence ; flush the write buffer
//mov eax, [dest] ; ret value = destination pointer
pop esi
pop edi
ret 4
}
}
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
__forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
// registers will improve copy performance, because they won't. Use of XMMs is only
// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
// and even then the benefits are typically minimal (sometimes slower depending on the
// amount of data being copied).
//
// Thus: MMX registers are alignment-safe, fast, and widely available. Let's just stick with them.
// --air
// Linux Conversion note:
// This code would benefit nicely from having inline-able GAS syntax, since it should
// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
// And it's called enough times to probably merit the extra effort to ensure proper
// optimization. --air
__asm
{
mov ecx, [dest]
mov edx, [src]
mov eax, [qwc] ; keep a copy of count
shr eax, 1
jz $memcpy_qwc_1 ; only one 16 byte block to copy?
cmp eax, IN_CACHE_COPY/32
jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air)
$memcpy_qwc_loop2: ; 32-byte blocks, uncached copy
prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
movq mm0,[edx+0] ; read 64 bits
movq mm1,[edx+8]
movq mm2,[edx+16]
movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache
movntq [ecx+8], mm1
movq mm3,[edx+24]
movntq [ecx+16], mm2
movntq [ecx+24], mm3
add edx,32 ; update source pointer
add ecx,32 ; update destination pointer
sub eax,1
jnz $memcpy_qwc_loop2 ; last 32-byte block?
sfence ; flush the write buffer
jmp $memcpy_qwc_1
; 32-byte blocks, cached!
; This *is* important. Removing this and using exclusively non-temporal stores
; results in noticeable speed loss!
$memcpy_qwc_loop1:
prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
movq mm0,[edx+0] ; read 64 bits
movq mm1,[edx+8]
movq mm2,[edx+16]
movq [ecx+0], mm0 ; write 64 bits (cached store)
movq [ecx+8], mm1
movq mm3,[edx+24]
movq [ecx+16], mm2
movq [ecx+24], mm3
add edx,32 ; update source pointer
add ecx,32 ; update destination pointer
sub eax,1
jnz $memcpy_qwc_loop1 ; last 32-byte block?
$memcpy_qwc_1:
test [qwc],1
jz $memcpy_qwc_final
movq mm0,[edx]
movq mm1,[edx+8]
movq [ecx], mm0
movq [ecx+8], mm1
$memcpy_qwc_final:
emms ; clean up the MMX state
}
}
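
Regarding the Linux conversion note above: the following is only a rough sketch of what a GAS-syntax (GCC inline asm) port of the cached loop might look like. It is not the actual port, it omits the prefetchnta hints and the movntq uncached path entirely, and the function name is made up for illustration.

    #include <stddef.h>
    #include <string.h>

    // Hypothetical sketch only: cached MMX copy, qwc = count in 128-bit quadwords.
    static inline void memcpy_qwc_gas_sketch( void *dest, const void *src, size_t qwc )
    {
        size_t pairs = qwc >> 1;                // number of 32-byte (2-QWC) blocks
        if( pairs )
        {
            __asm__ __volatile__ (
                "1:                    \n\t"
                "movq    (%1), %%mm0   \n\t"    // read 32 bytes through MMX regs
                "movq   8(%1), %%mm1   \n\t"
                "movq  16(%1), %%mm2   \n\t"
                "movq  24(%1), %%mm3   \n\t"
                "movq  %%mm0,   (%0)   \n\t"    // cached stores (movq, not movntq)
                "movq  %%mm1,  8(%0)   \n\t"
                "movq  %%mm2, 16(%0)   \n\t"
                "movq  %%mm3, 24(%0)   \n\t"
                "add   $32, %1         \n\t"    // advance source
                "add   $32, %0         \n\t"    // advance destination
                "dec   %2              \n\t"
                "jnz   1b              \n\t"
                "emms                  \n\t"    // leave the MMX/FPU state clean
                : "+r"(dest), "+r"(src), "+r"(pairs)
                :
                : "memory", "mm0", "mm1", "mm2", "mm3"
            );
        }
        if( qwc & 1 )                           // odd QWC count: copy the trailing 16 bytes
            memcpy( dest, src, 16 );            // dest/src already point at the remaining quadword
    }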
// mmx mem-compare implementation, size has to be a multiple of 8
// returns 0 if equal, a nonzero value if not
// ~10 times faster than standard memcmp
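
A tiny usage sketch (variable names are placeholders; per the comment above, the compare size must be a multiple of 8):

    // memcmp_mmx returns 0 when the two blocks match.
    if( memcmp_mmx( &curTag, &prevTag, 16 ) == 0 )
    {
        // contents identical -- skip the redundant update
    }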

View File

@@ -395,6 +395,7 @@ struct Pcsx2Config
// style. Useful for debugging potential bugs in the MTGS pipeline.
bool SynchronousMTGS;
bool DisableOutput;
int VsyncQueueSize;
bool FrameLimitEnable;
bool FrameSkipEnable;
@@ -420,6 +421,8 @@ struct Pcsx2Config
return
OpEqu( SynchronousMTGS ) &&
OpEqu( DisableOutput ) &&
OpEqu( VsyncQueueSize ) &&
OpEqu( FrameSkipEnable ) &&
OpEqu( FrameLimitEnable ) &&
OpEqu( VsyncEnable ) &&

View File

@@ -142,14 +142,11 @@ void SysMtgsThread::PostVsyncEnd()
SendDataPacket();
// Alter-frame flushing! Restarts the ringbuffer (wraps) on every other frame. This is a
// mandatory feature that prevents the MTGS from queuing more than 2 frames at any time.
// (queued frames cause input lag and desynced audio -- bad!). Ring restarts work for this
// because they act as sync points where the EE must stall to wait for the GS to catch-up,
// and they also allow us to reuse the front of the ringbuffer more often, which should improve
// L2 cache performance.
// If the MTGS is allowed to queue a lot of frames in advance, it creates input lag.
// Use the Queued FrameCount to stall the EE if another vsync is already queued in
// the ringbuffer.
if( AtomicIncrement(m_QueuedFrameCount) < 2 ) return;
if( AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize ) return;
uint readpos = volatize(m_RingPos);
uint freeroom;
@@ -190,7 +187,7 @@ void SysMtgsThread::OpenPlugin()
{
if( m_PluginOpened ) return;
memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS)/16 );
memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
GSsetBaseMem( RingBuffer.Regs );
GSirqCallback( dummyIrqCallback );
@@ -624,6 +621,7 @@ void SysMtgsThread::SendDataPacket()
PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos];
tag.data[0] = actualSize;
//Threading::StoreFence();
m_WritePos = m_packet_ringpos;
if( EmuConfig.GS.SynchronousMTGS )

View File

@@ -217,6 +217,7 @@ Pcsx2Config::GSOptions::GSOptions()
SynchronousMTGS = false;
DisableOutput = false;
VsyncQueueSize = 2;
DefaultRegionMode = Region_NTSC;
FramesToDraw = 2;
@@ -234,6 +235,7 @@ void Pcsx2Config::GSOptions::LoadSave( IniInterface& ini )
IniEntry( SynchronousMTGS );
IniEntry( DisableOutput );
IniEntry( VsyncQueueSize );
IniEntry( FrameLimitEnable );
IniEntry( FrameSkipEnable );

View File

@@ -526,36 +526,36 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s
void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len )
{
uint endpos = destStart + len;
if( endpos >= destSize )
if( endpos < destSize )
{
uint firstcopylen = destSize - destStart;
memcpy_aligned(&destBase[destStart], src, firstcopylen );
destStart = endpos % destSize;
memcpy_aligned(destBase, src+firstcopylen, destStart );
memcpy_qwc(&destBase[destStart], src, len );
destStart += len;
}
else
{
memcpy_aligned(&destBase[destStart], src, len );
destStart += len;
uint firstcopylen = destSize - destStart;
memcpy_qwc(&destBase[destStart], src, firstcopylen );
destStart = endpos % destSize;
memcpy_qwc(destBase, src+firstcopylen, destStart );
}
}
void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len )
{
uint endpos = srcStart + len;
if( endpos >= srcSize )
if( endpos < srcSize )
{
uint firstcopylen = srcSize - srcStart;
memcpy_aligned(dest, &srcBase[srcStart], firstcopylen );
srcStart = endpos & srcSize;
memcpy_aligned(dest+firstcopylen, srcBase, srcStart );
memcpy_qwc(dest, &srcBase[srcStart], len );
srcStart += len;
}
else
{
memcpy_aligned(dest, &srcBase[srcStart], len );
srcStart += len;
uint firstcopylen = srcSize - srcStart;
memcpy_qwc(dest, &srcBase[srcStart], firstcopylen );
srcStart = endpos & srcSize;
memcpy_qwc(dest+firstcopylen, srcBase, srcStart );
}
}

View File

@@ -1129,14 +1129,14 @@ void __fastcall mVU_XGKICK_(u32 addr) {
// fixme: one of these days the following *16's will get cleaned up when we introduce
// a special qwc/simd16 optimized version of memcpy_aligned. :)
//DevCon.Status("XGkick Wrap!");
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff);
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff);
Path1WritePos += size;
size -= diff;
pDest += diff*16;
memcpy_aligned(pDest, microVU1.regs->Mem, size);
memcpy_qwc(pDest, microVU1.regs->Mem, size);
}
else {
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size);
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size);
Path1WritePos += size;
}
//if(!gifRegs->stat.P1Q) CPU_INT(28, 128);