ReorderingMTGS: Added a new optimized memcpy_amd_qwc, for use by GIFpath copies. After much study, we determined this is about as efficient as memcpy will ever get for what we're doing with it.

DevNote: Win32-only at the moment -- needs a GAS port (but that shouldn't be hard). I made some notes in the code about it.
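
For reference, a minimal usage sketch (the buffer names below are placeholders, not code from this commit): the new routine takes its count in QWCs (128-bit quadwords) rather than bytes, and neither pointer needs to be 16-byte aligned.

    // Copy 'len' bytes of GIF data; memcpy_amd_qwc takes a QWC count, not a byte count.
    memcpy_amd_qwc( dstRing, srcMem, len / 16 );
    // Roughly equivalent to the older byte-count call:
    // memcpy_amd_( dstRing, srcMem, len );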

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3472 96395faa-99c1-11dd-bbfe-3dabce05a288
Jake.Stine 2010-07-12 19:40:30 +00:00
parent a6b3acb5d0
commit 934578c8fe
9 changed files with 133 additions and 85 deletions

View File

@@ -28,6 +28,7 @@
# include "win_memzero.h"
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
extern void memcpy_amd_qwc(void *dest, const void *src, size_t qwc);
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern void memxor_mmx(void* dst, const void* src1, int cmpsize);
@@ -37,6 +38,8 @@
void _memset16_unaligned( void* dest, u16 data, size_t size );
#define memcpy_fast memcpy_amd_ // Fast memcpy
#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c*16) // Memcpy with 16-byte Aligned addresses
#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c) // Memcpy with 16-byte Aligned addresses
#define memcpy_const memcpy_amd_ // Memcpy with constant size
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c)
//#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16)

View File

@@ -129,6 +129,10 @@ namespace Threading
// For use in spin/wait loops.
extern void SpinWait();
// Use prior to committing data to another thread (internal memcpy_qwc does not use fencing,
// so that many memcpys can be issued in a row more efficiently)
extern void StoreFence();
// Optional implementation to enable hires thread/process scheduler for the operating system.
// Needed by Windows, but might not be relevant to other platforms.
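
A rough sketch of the pattern StoreFence is meant for (illustrative names only; the SendDataPacket hunk later in this commit shows the real call site, currently commented out):

    // Issue any number of unfenced copies into the ringbuffer...
    memcpy_qwc( &RingBuffer[writepos], srcA, qwcA );
    memcpy_qwc( &RingBuffer[writepos + qwcA], srcB, qwcB );
    // ...then fence once (sfence) before publishing the new write position,
    // so the GS thread never sees the pointer ahead of the data.
    Threading::StoreFence();
    m_WritePos = newWritePos;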

View File

@@ -36,6 +36,11 @@ __forceinline void Threading::SpinWait()
__asm pause;
}
__forceinline void Threading::StoreFence()
{
__asm sfence;
}
__forceinline void Threading::EnableHiresScheduler()
{
// This improves accuracy of Sleep() by some amount, and only adds a negligible amount of

View File

@@ -146,7 +146,7 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec eax ; count down
sub eax, 1
jnz $memcpy_ic_1 ; last 64-byte block?
$memcpy_ic_2:
@@ -189,64 +189,15 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec eax
movntq [edi-8], mm1
sub eax, 1
jnz $memcpy_uc_1 ; last 64-byte block?
jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
// help keep the code cache footprint of memcpy_fast to a minimum.
/*
$memcpy_bp_1: ; large blocks, block prefetch copy
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
jl $memcpy_64_test ; no, back to regular uncached copy
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
$memcpy_bp_2:
mov edx, [esi-64] ; grab one address per cache line
mov edx, [esi-128] ; grab one address per cache line
sub esi, 128 ; go reverse order to suppress HW prefetcher
dec eax ; count down the cache lines
jnz $memcpy_bp_2 ; keep grabbing more lines into cache
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
movq mm0, [esi ] ; read 64 bits
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64 ; update source pointer
movntq [edi ], mm0 ; write 64 bits, bypassing cache
movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
movntq [edi+16], mm2 ; from READING the destination address
movntq [edi+24], mm3 ; into the cache, only to be over-written,
movntq [edi+32], mm4 ; so that also helps performance
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, 64 ; update dest pointer
dec eax ; count down
jnz $memcpy_bp_3 ; keep copying
sub ecx, CACHEBLOCK ; update the 64-byte block count
jmp $memcpy_bp_1 ; keep processing chunks
*/
// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been
// disabled to help keep the code cache footprint of memcpy_fast to a minimum.
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
@@ -274,17 +225,99 @@ $memcpy_last_few: ; dword aligned from before movsd's
rep movsb ; the last 1, 2, or 3 bytes
$memcpy_final:
pop esi
pop edi
emms ; clean up the MMX state
sfence ; flush the write buffer
//mov eax, [dest] ; ret value = destination pointer
pop esi
pop edi
ret 4
}
}
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
__forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
// registers will improve copy performance, because they won't. Use of XMMs is only
// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
// and even then the benefits are typically minimal (sometimes slower depending on the
// amount of data being copied).
//
// Thus: MMX registers are alignment-safe, fast, and widely available. Let's just stick with them.
// --air
// Linux Conversion note:
// This code would benefit nicely from having inline-able GAS syntax, since it should
// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
// And it's called enough times to probably merit the extra effort to ensure proper
// optimization. --air
__asm
{
mov ecx, [dest]
mov edx, [src]
mov eax, [qwc] ; keep a copy of count
shr eax, 1
jz $memcpy_qwc_1 ; only one 16 byte block to copy?
cmp eax, IN_CACHE_COPY/32
jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air)
$memcpy_qwc_loop2: ; 32-byte blocks, uncached copy
prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
movq mm0,[edx+0] ; read 64 bits
movq mm1,[edx+8]
movq mm2,[edx+16]
movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache
movntq [ecx+8], mm1
movq mm3,[edx+24]
movntq [ecx+16], mm2
movntq [ecx+24], mm3
add edx,32 ; update source pointer
add ecx,32 ; update destination pointer
sub eax,1
jnz $memcpy_qwc_loop2 ; last 32-byte block?
sfence ; flush the write buffer
jmp $memcpy_qwc_1
; 32-byte blocks, cached!
; This *is* important. Removing this and using exclusively non-temporal stores
; results in noticeable speed loss!
$memcpy_qwc_loop1:
prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
movq mm0,[edx+0] ; read 64 bits
movq mm1,[edx+8]
movq mm2,[edx+16]
movq [ecx+0], mm0 ; write 64 bits (cached store)
movq [ecx+8], mm1
movq mm3,[edx+24]
movq [ecx+16], mm2
movq [ecx+24], mm3
add edx,32 ; update source pointer
add ecx,32 ; update destination pointer
sub eax,1
jnz $memcpy_qwc_loop1 ; last 32-byte block?
$memcpy_qwc_1:
test [qwc],1
jz $memcpy_qwc_final
movq mm0,[edx]
movq mm1,[edx+8]
movq [ecx], mm0
movq [ecx+8], mm1
$memcpy_qwc_final:
emms ; clean up the MMX state
}
}
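
Regarding the Linux conversion note above: the following is only a rough sketch of what a GAS-syntax (GCC inline asm) port of the cached loop might look like. It is not the actual port, it omits the prefetchnta hints and the movntq uncached path entirely, and the function name is made up for illustration.

    #include <stddef.h>
    #include <string.h>

    // Hypothetical sketch only: cached MMX copy, qwc = count in 128-bit quadwords.
    static inline void memcpy_qwc_gas_sketch( void *dest, const void *src, size_t qwc )
    {
        size_t pairs = qwc >> 1;                // number of 32-byte (2-QWC) blocks
        if( pairs )
        {
            __asm__ __volatile__ (
                "1:                    \n\t"
                "movq    (%1), %%mm0   \n\t"    // read 32 bytes through MMX regs
                "movq   8(%1), %%mm1   \n\t"
                "movq  16(%1), %%mm2   \n\t"
                "movq  24(%1), %%mm3   \n\t"
                "movq  %%mm0,   (%0)   \n\t"    // cached stores (movq, not movntq)
                "movq  %%mm1,  8(%0)   \n\t"
                "movq  %%mm2, 16(%0)   \n\t"
                "movq  %%mm3, 24(%0)   \n\t"
                "add   $32, %1         \n\t"    // advance source
                "add   $32, %0         \n\t"    // advance destination
                "dec   %2              \n\t"
                "jnz   1b              \n\t"
                "emms                  \n\t"    // leave the MMX/FPU state clean
                : "+r"(dest), "+r"(src), "+r"(pairs)
                :
                : "memory", "mm0", "mm1", "mm2", "mm3"
            );
        }
        if( qwc & 1 )                           // odd QWC count: copy the trailing 16 bytes
            memcpy( dest, src, 16 );            // dest/src already point at the remaining quadword
    }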
// mmx mem-compare implementation, size has to be a multiple of 8
// returns 0 if equal, a nonzero value if not
// ~10 times faster than standard memcmp
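
A tiny usage sketch (variable names are placeholders; per the comment above, the compare size must be a multiple of 8):

    // memcmp_mmx returns 0 when the two blocks match.
    if( memcmp_mmx( &curTag, &prevTag, 16 ) == 0 )
    {
        // contents identical -- skip the redundant update
    }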

View File

@@ -395,6 +395,7 @@ struct Pcsx2Config
// style. Useful for debugging potential bugs in the MTGS pipeline.
bool SynchronousMTGS;
bool DisableOutput;
int VsyncQueueSize;
bool FrameLimitEnable;
bool FrameSkipEnable;
@@ -420,6 +421,8 @@ struct Pcsx2Config
return
OpEqu( SynchronousMTGS ) &&
OpEqu( DisableOutput ) &&
OpEqu( VsyncQueueSize ) &&
OpEqu( FrameSkipEnable ) &&
OpEqu( FrameLimitEnable ) &&
OpEqu( VsyncEnable ) &&

View File

@@ -142,14 +142,11 @@ void SysMtgsThread::PostVsyncEnd()
SendDataPacket();
// Alter-frame flushing! Restarts the ringbuffer (wraps) on every other frame. This is a
// mandatory feature that prevents the MTGS from queuing more than 2 frames at any time.
// (queued frames cause input lag and desynced audio -- bad!). Ring restarts work for this
// because they act as sync points where the EE must stall to wait for the GS to catch-up,
// and they also allow us to reuse the front of the ringbuffer more often, which should improve
// L2 cache performance.
// If the MTGS is allowed to queue a lot of frames in advance, it creates input lag.
// Use the Queued FrameCount to stall the EE if another vsync is already queued in
// the ringbuffer.
if( AtomicIncrement(m_QueuedFrameCount) < 2 ) return;
if( AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize ) return;
uint readpos = volatize(m_RingPos);
uint freeroom;
@@ -190,7 +187,7 @@ void SysMtgsThread::OpenPlugin()
{
if( m_PluginOpened ) return;
memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS)/16 );
memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
GSsetBaseMem( RingBuffer.Regs );
GSirqCallback( dummyIrqCallback );
@@ -624,6 +621,7 @@ void SysMtgsThread::SendDataPacket()
PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos];
tag.data[0] = actualSize;
//Threading::StoreFence();
m_WritePos = m_packet_ringpos;
if( EmuConfig.GS.SynchronousMTGS )

View File

@@ -217,6 +217,7 @@ Pcsx2Config::GSOptions::GSOptions()
SynchronousMTGS = false;
DisableOutput = false;
VsyncQueueSize = 2;
DefaultRegionMode = Region_NTSC;
FramesToDraw = 2;
@@ -234,6 +235,7 @@ void Pcsx2Config::GSOptions::LoadSave( IniInterface& ini )
IniEntry( SynchronousMTGS );
IniEntry( DisableOutput );
IniEntry( VsyncQueueSize );
IniEntry( FrameLimitEnable );
IniEntry( FrameSkipEnable );

View File

@@ -526,36 +526,36 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s
void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len )
{
uint endpos = destStart + len;
if( endpos >= destSize )
if( endpos < destSize )
{
uint firstcopylen = destSize - destStart;
memcpy_aligned(&destBase[destStart], src, firstcopylen );
destStart = endpos % destSize;
memcpy_aligned(destBase, src+firstcopylen, destStart );
memcpy_qwc(&destBase[destStart], src, len );
destStart += len;
}
else
{
memcpy_aligned(&destBase[destStart], src, len );
destStart += len;
uint firstcopylen = destSize - destStart;
memcpy_qwc(&destBase[destStart], src, firstcopylen );
destStart = endpos % destSize;
memcpy_qwc(destBase, src+firstcopylen, destStart );
}
}
void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len )
{
uint endpos = srcStart + len;
if( endpos >= srcSize )
if( endpos < srcSize )
{
uint firstcopylen = srcSize - srcStart;
memcpy_aligned(dest, &srcBase[srcStart], firstcopylen );
srcStart = endpos & srcSize;
memcpy_aligned(dest+firstcopylen, srcBase, srcStart );
memcpy_qwc(dest, &srcBase[srcStart], len );
srcStart += len;
}
else
{
memcpy_aligned(dest, &srcBase[srcStart], len );
srcStart += len;
uint firstcopylen = srcSize - srcStart;
memcpy_qwc(dest, &srcBase[srcStart], firstcopylen );
srcStart = endpos & srcSize;
memcpy_qwc(dest+firstcopylen, srcBase, srcStart );
}
}

View File

@@ -1129,14 +1129,14 @@ void __fastcall mVU_XGKICK_(u32 addr) {
// fixme: one of these days the following *16's will get cleaned up when we introduce
// a special qwc/simd16 optimized version of memcpy_aligned. :)
//DevCon.Status("XGkick Wrap!");
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff);
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff);
Path1WritePos += size;
size -= diff;
pDest += diff*16;
memcpy_aligned(pDest, microVU1.regs->Mem, size);
memcpy_qwc(pDest, microVU1.regs->Mem, size);
}
else {
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size);
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size);
Path1WritePos += size;
}
//if(!gifRegs->stat.P1Q) CPU_INT(28, 128);