ReorderingMTGS: Added a new optimized memcpy_amd_qwc, for use by GIFpath copies. After much study, we determined this is about as efficient as memcpy will ever get for what we're doing with it.

DevNote: Win32-only at the moment -- needs a GAS port (but that shouldn't be hard). I made some notes in the code about it.

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3472 96395faa-99c1-11dd-bbfe-3dabce05a288
Jake.Stine 2010-07-12 19:40:30 +00:00
parent a6b3acb5d0
commit 934578c8fe
9 changed files with 133 additions and 85 deletions

View File

@@ -28,6 +28,7 @@
# include "win_memzero.h"
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
+extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes);
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern void memxor_mmx(void* dst, const void* src1, int cmpsize);
@@ -37,6 +38,8 @@
void _memset16_unaligned( void* dest, u16 data, size_t size );
#define memcpy_fast memcpy_amd_ // Fast memcpy
-#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c*16) // Memcpy with 16-byte Aligned addresses
+#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c) // Memcpy with 16-byte Aligned addresses
#define memcpy_const memcpy_amd_ // Memcpy with constant size
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
+#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c)
+//#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16)
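
A quick usage sketch of the unit conventions after this hunk (illustrative only, not part of the commit; the buffers, function name, and include path are assumptions, and u128 is the project's 16-byte type): memcpy_aligned now takes its count in bytes, while the new memcpy_qwc counts in 128-bit quadwords.

    #include "Utilities/MemcpyFast.h"   // assumed include path for this header

    static u128 dstRing[512];           // hypothetical 512-quadword (8 KB) buffers
    static u128 srcRing[512];

    void copy_unit_examples()
    {
        // memcpy_aligned now takes a BYTE count (callers used to pass qwc; compare the
        // sizeof(PS2MEM_GS)/16 -> sizeof(PS2MEM_GS) change in MTGS.cpp below).
        memcpy_aligned(dstRing, srcRing, sizeof(srcRing));

        // memcpy_qwc takes a quadword count: 512 qwc == 8192 bytes here.
        memcpy_qwc(dstRing, srcRing, 512);
    }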

View File

@@ -129,6 +129,10 @@ namespace Threading
// For use in spin/wait loops.
extern void SpinWait();
+// Use prior to committing data to another thread (internal memcpy_qwc does not use fencing,
+// so that many memcpys can be issued in a row more efficiently)
+extern void StoreFence();
// Optional implementation to enable hires thread/process scheduler for the operating system.
// Needed by Windows, but might not be relevant to other platforms.

View File

@@ -36,6 +36,11 @@ __forceinline void Threading::SpinWait()
__asm pause;
}
+__forceinline void Threading::StoreFence()
+{
+__asm sfence;
+}
__forceinline void Threading::EnableHiresScheduler()
{
// This improves accuracy of Sleep() by some amount, and only adds a negligible amount of
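
The comment added to Threading.h above hints at the intended pattern; here is a hedged sketch (illustrative names, not code from this commit): because memcpy_qwc omits its own sfence, a writer issues a batch of copies back-to-back and calls Threading::StoreFence() once before publishing the new write position to the consumer thread -- which is where SendDataPacket() leaves a commented-out call further down.

    // Sketch only: ringBuffer/writePos and the two-packet batch are hypothetical.
    // Assumes u128, uint, memcpy_qwc and Threading::StoreFence from the headers above.
    void PublishTwoPackets(u128* ringBuffer, volatile uint& writePos,
                           const u128* pktA, uint qwcA,
                           const u128* pktB, uint qwcB)
    {
        uint pos = writePos;

        // Several unfenced copies in a row -- cheaper than fencing each one.
        memcpy_qwc(&ringBuffer[pos],        pktA, qwcA);
        memcpy_qwc(&ringBuffer[pos + qwcA], pktB, qwcB);

        // One fence so the movntq stores are globally visible before the consumer
        // can observe the advanced write position.
        Threading::StoreFence();
        writePos = pos + qwcA + qwcB;
    }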

View File

@@ -146,7 +146,7 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
-dec eax ; count down
+sub eax, 1
jnz $memcpy_ic_1 ; last 64-byte block?
$memcpy_ic_2:
@@ -189,64 +189,15 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
-dec eax
movntq [edi-8], mm1
+sub eax, 1
jnz $memcpy_uc_1 ; last 64-byte block?
jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
-// For the largest size blocks, a special technique called Block Prefetch
-// can be used to accelerate the read operations. Block Prefetch reads
-// one address per cache line, for a series of cache lines, in a short loop.
-// This is faster than using software prefetch. The technique is great for
-// getting maximum read bandwidth, especially in DDR memory systems.
-// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
-// help keep the code cache footprint of memcpy_fast to a minimum.
-/*
-$memcpy_bp_1: ; large blocks, block prefetch copy
-cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
-jl $memcpy_64_test ; no, back to regular uncached copy
-mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
-add esi, CACHEBLOCK * 64 ; move to the top of the block
-align 16
-$memcpy_bp_2:
-mov edx, [esi-64] ; grab one address per cache line
-mov edx, [esi-128] ; grab one address per cache line
-sub esi, 128 ; go reverse order to suppress HW prefetcher
-dec eax ; count down the cache lines
-jnz $memcpy_bp_2 ; keep grabbing more lines into cache
-mov eax, CACHEBLOCK ; now that it's in cache, do the copy
-align 16
-$memcpy_bp_3:
-movq mm0, [esi ] ; read 64 bits
-movq mm1, [esi+ 8]
-movq mm2, [esi+16]
-movq mm3, [esi+24]
-movq mm4, [esi+32]
-movq mm5, [esi+40]
-movq mm6, [esi+48]
-movq mm7, [esi+56]
-add esi, 64 ; update source pointer
-movntq [edi ], mm0 ; write 64 bits, bypassing cache
-movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
-movntq [edi+16], mm2 ; from READING the destination address
-movntq [edi+24], mm3 ; into the cache, only to be over-written,
-movntq [edi+32], mm4 ; so that also helps performance
-movntq [edi+40], mm5
-movntq [edi+48], mm6
-movntq [edi+56], mm7
-add edi, 64 ; update dest pointer
-dec eax ; count down
-jnz $memcpy_bp_3 ; keep copying
-sub ecx, CACHEBLOCK ; update the 64-byte block count
-jmp $memcpy_bp_1 ; keep processing chunks
-*/
+// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been
+// disabled to help keep the code cache footprint of memcpy_fast to a minimum.
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
@@ -274,17 +225,99 @@ $memcpy_last_few: ; dword aligned from before movsd's
rep movsb ; the last 1, 2, or 3 bytes
$memcpy_final:
-pop esi
-pop edi
emms ; clean up the MMX state
sfence ; flush the write buffer
//mov eax, [dest] ; ret value = destination pointer
+pop esi
+pop edi
ret 4
}
}
+// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
+__forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
+{
+// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
+// registers will improve copy performance, because they won't. Use of XMMs is only
+// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
+// and even then the benefits are typically minimal (sometimes slower depending on the
+// amount of data being copied).
+//
+// Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them.
+// --air
+// Linux Conversion note:
+// This code would benefit nicely from having inline-able GAS syntax, since it should
+// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
+// And its called enough times to probably merit the extra effort to ensure proper
+// optimization. --air
+__asm
+{
+mov ecx, [dest]
+mov edx, [src]
+mov eax, [qwc] ; keep a copy of count
+shr eax, 1
+jz $memcpy_qwc_1 ; only one 16 byte block to copy?
+cmp eax, IN_CACHE_COPY/32
+jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air)
+$memcpy_qwc_loop2: ; 32-byte blocks, uncached copy
+prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
+movq mm0,[edx+0] ; read 64 bits
+movq mm1,[edx+8]
+movq mm2,[edx+16]
+movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache
+movntq [ecx+8], mm1
+movq mm3,[edx+24]
+movntq [ecx+16], mm2
+movntq [ecx+24], mm3
+add edx,32 ; update source pointer
+add ecx,32 ; update destination pointer
+sub eax,1
+jnz $memcpy_qwc_loop2 ; last 64-byte block?
+sfence ; flush the write buffer
+jmp $memcpy_qwc_1
+; 32-byte blocks, cached!
+; This *is* important. Removing this and using exclusively non-temporal stores
+; results in noticable speed loss!
+$memcpy_qwc_loop1:
+prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
+movq mm0,[edx+0] ; read 64 bits
+movq mm1,[edx+8]
+movq mm2,[edx+16]
+movq [ecx+0], mm0 ; write 64 bits, bypassing the cache
+movq [ecx+8], mm1
+movq mm3,[edx+24]
+movq [ecx+16], mm2
+movq [ecx+24], mm3
+add edx,32 ; update source pointer
+add ecx,32 ; update destination pointer
+sub eax,1
+jnz $memcpy_qwc_loop1 ; last 64-byte block?
+$memcpy_qwc_1:
+test [qwc],1
+jz $memcpy_qwc_final
+movq mm0,[edx]
+movq mm1,[edx+8]
+movq [ecx], mm0
+movq [ecx+8], mm1
+$memcpy_qwc_final:
+emms ; clean up the MMX state
+}
+}
// mmx mem-compare implementation, size has to be a multiple of 8
// returns 0 is equal, nonzero value if not equal
// ~10 times faster than standard memcmp
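
On the DevNote about a GAS port: the new function above is MSVC-only __asm. As a rough portability sketch (an assumption-laden illustration, not the committed code and not a literal GAS translation), the same structure can be written with MMX/SSE intrinsics that GCC also accepts; the 64 KB IN_CACHE_COPY threshold below is assumed, since only IN_CACHE_COPY/32 appears in the asm.

    #include <cstddef>       // size_t
    #include <mmintrin.h>    // __m64, _mm_empty
    #include <xmmintrin.h>   // _mm_prefetch, _mm_stream_pi, _mm_sfence

    static const size_t IN_CACHE_COPY = 64 * 1024;   // assumed threshold, in bytes

    static void memcpy_qwc_sketch(void* dest, const void* src, size_t qwc)
    {
        __m64* d = (__m64*)dest;
        const __m64* s = (const __m64*)src;
        size_t blocks = qwc >> 1;                 // number of 32-byte blocks

        if (blocks >= IN_CACHE_COPY / 32)
        {
            // Large copy: non-temporal stores (movntq), then a single sfence.
            for (; blocks != 0; --blocks)
            {
                _mm_prefetch((const char*)s + 568, _MM_HINT_NTA);
                __m64 m0 = s[0], m1 = s[1], m2 = s[2], m3 = s[3];
                _mm_stream_pi(d + 0, m0);
                _mm_stream_pi(d + 1, m1);
                _mm_stream_pi(d + 2, m2);
                _mm_stream_pi(d + 3, m3);
                s += 4; d += 4;
            }
            _mm_sfence();
        }
        else
        {
            // Small copy: ordinary cached stores (the measured win noted in the asm).
            for (; blocks != 0; --blocks)
            {
                _mm_prefetch((const char*)s + 568, _MM_HINT_NTA);
                __m64 m0 = s[0], m1 = s[1], m2 = s[2], m3 = s[3];
                d[0] = m0; d[1] = m1; d[2] = m2; d[3] = m3;
                s += 4; d += 4;
            }
        }

        if (qwc & 1)                              // odd quadword: trailing 16 bytes
        {
            d[0] = s[0];
            d[1] = s[1];
        }
        _mm_empty();                              // emms
    }

An intrinsics version would also let GCC inline the call and fold away the pointer setup, which is what the Linux Conversion note above is after.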

View File

@@ -395,6 +395,7 @@ struct Pcsx2Config
// style. Useful for debugging potential bugs in the MTGS pipeline.
bool SynchronousMTGS;
bool DisableOutput;
+int VsyncQueueSize;
bool FrameLimitEnable;
bool FrameSkipEnable;
@@ -420,6 +421,8 @@ struct Pcsx2Config
return
OpEqu( SynchronousMTGS ) &&
OpEqu( DisableOutput ) &&
+OpEqu( VsyncQueueSize ) &&
OpEqu( FrameSkipEnable ) &&
OpEqu( FrameLimitEnable ) &&
OpEqu( VsyncEnable ) &&

View File

@@ -142,14 +142,11 @@ void SysMtgsThread::PostVsyncEnd()
SendDataPacket();
-// Alter-frame flushing! Restarts the ringbuffer (wraps) on every other frame. This is a
-// mandatory feature that prevents the MTGS from queuing more than 2 frames at any time.
-// (queued frames cause input lag and desynced audio -- bad!). Ring restarts work for this
-// because they act as sync points where the EE must stall to wait for the GS to catch-up,
-// and they also allow us to reuse the front of the ringbuffer more often, which should improve
-// L2 cache performance.
+// If the MTGS is allowed to queue a lot of frames in advance, it creates input lag.
+// Use the Queued FrameCount to stall the EE if another vsync is already queued in
+// the ringbuffer.
-if( AtomicIncrement(m_QueuedFrameCount) < 2 ) return;
+if( AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize ) return;
uint readpos = volatize(m_RingPos);
uint freeroom;
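
A self-contained model of the throttle this hunk describes (a sketch under assumptions: the GS-side decrement is not shown in this diff, and the code below the early return appears to stall by waiting for ring-buffer room, whereas a condition variable is used here only to keep the sketch complete):

    #include <atomic>
    #include <condition_variable>
    #include <mutex>

    // Illustrative model of the VsyncQueueSize throttle -- not PCSX2 code.
    struct VsyncThrottle
    {
        std::atomic<int>        queued {0};
        int                     maxQueued {2};     // mirrors the VsyncQueueSize default below
        std::mutex              mtx;
        std::condition_variable drained;

        // Producer (EE side): called after queueing a vsync packet.
        void OnVsyncQueued()
        {
            if (++queued < maxQueued)
                return;                            // headroom left, keep running
            std::unique_lock<std::mutex> lock(mtx);
            drained.wait(lock, [&]{ return queued.load() < maxQueued; });
        }

        // Consumer (GS side, assumed): called after a queued vsync is retired.
        void OnVsyncRetired()
        {
            {
                std::lock_guard<std::mutex> lock(mtx);
                --queued;
            }
            drained.notify_one();
        }
    };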
@@ -190,7 +187,7 @@ void SysMtgsThread::OpenPlugin()
{
if( m_PluginOpened ) return;
-memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS)/16 );
+memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
GSsetBaseMem( RingBuffer.Regs );
GSirqCallback( dummyIrqCallback );
@@ -624,6 +621,7 @@ void SysMtgsThread::SendDataPacket()
PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos];
tag.data[0] = actualSize;
+//Threading::StoreFence();
m_WritePos = m_packet_ringpos;
if( EmuConfig.GS.SynchronousMTGS )

View File

@@ -217,6 +217,7 @@ Pcsx2Config::GSOptions::GSOptions()
SynchronousMTGS = false;
DisableOutput = false;
+VsyncQueueSize = 2;
DefaultRegionMode = Region_NTSC;
FramesToDraw = 2;
@@ -234,6 +235,7 @@ void Pcsx2Config::GSOptions::LoadSave( IniInterface& ini )
IniEntry( SynchronousMTGS );
IniEntry( DisableOutput );
+IniEntry( VsyncQueueSize );
IniEntry( FrameLimitEnable );
IniEntry( FrameSkipEnable );

View File

@@ -526,36 +526,36 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s
void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len )
{
uint endpos = destStart + len;
-if( endpos >= destSize )
+if( endpos < destSize )
{
-uint firstcopylen = destSize - destStart;
-memcpy_aligned(&destBase[destStart], src, firstcopylen );
-destStart = endpos % destSize;
-memcpy_aligned(destBase, src+firstcopylen, destStart );
+memcpy_qwc(&destBase[destStart], src, len );
+destStart += len;
}
else
{
-memcpy_aligned(&destBase[destStart], src, len );
-destStart += len;
+uint firstcopylen = destSize - destStart;
+memcpy_qwc(&destBase[destStart], src, firstcopylen );
+destStart = endpos % destSize;
+memcpy_qwc(destBase, src+firstcopylen, destStart );
}
}
void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len )
{
uint endpos = srcStart + len;
-if( endpos >= srcSize )
+if( endpos < srcSize )
{
-uint firstcopylen = srcSize - srcStart;
-memcpy_aligned(dest, &srcBase[srcStart], firstcopylen );
-srcStart = endpos & srcSize;
-memcpy_aligned(dest+firstcopylen, srcBase, srcStart );
+memcpy_qwc(dest, &srcBase[srcStart], len );
+srcStart += len;
}
else
{
-memcpy_aligned(dest, &srcBase[srcStart], len );
-srcStart += len;
+uint firstcopylen = srcSize - srcStart;
+memcpy_qwc(dest, &srcBase[srcStart], firstcopylen );
+srcStart = endpos & srcSize;
+memcpy_qwc(dest+firstcopylen, srcBase, srcStart );
}
}
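
For readers following the wrap arithmetic, here is a plain-C++ restatement of the destination-wrapping case (a reference sketch with stand-in types, not project code): the copy is split at the end of the ring, and the wrapped write position ends up at (destStart + len) % destSize.

    #include <cstring>

    // Assumed stand-ins for the project's types, to keep the sketch self-contained.
    typedef unsigned int uint;
    struct u128 { unsigned char _u8[16]; };

    // Reference version of MemCopy_WrappedDest using plain memcpy (16 bytes per quadword).
    static void WrappedDest_Reference(const u128* src, u128* destBase,
                                      uint& destStart, uint destSize, uint len)
    {
        uint endpos = destStart + len;
        if (endpos < destSize)
        {
            std::memcpy(&destBase[destStart], src, len * 16u);
            destStart += len;
        }
        else
        {
            uint firstcopylen = destSize - destStart;              // quadwords up to the wrap point
            std::memcpy(&destBase[destStart], src, firstcopylen * 16u);
            destStart = endpos % destSize;                         // wrapped write position
            std::memcpy(destBase, src + firstcopylen, destStart * 16u);
        }
    }

The source-side variant keeps its original `endpos & srcSize` update (versus `% destSize` on the destination side), so only the destination case is restated here.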

View File

@@ -1129,14 +1129,14 @@ void __fastcall mVU_XGKICK_(u32 addr) {
// fixme: one of these days the following *16's will get cleaned up when we introduce
// a special qwc/simd16 optimized version of memcpy_aligned. :)
//DevCon.Status("XGkick Wrap!");
-memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff);
+memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff);
Path1WritePos += size;
size -= diff;
pDest += diff*16;
-memcpy_aligned(pDest, microVU1.regs->Mem, size);
+memcpy_qwc(pDest, microVU1.regs->Mem, size);
}
else {
-memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size);
+memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size);
Path1WritePos += size;
}
//if(!gifRegs->stat.P1Q) CPU_INT(28, 128);