mirror of https://github.com/PCSX2/pcsx2.git
ReorderingMTGS: Added a new optimized memcpy_amd_qwc, for use by GIFpath copies. After much studying, we determined this is about as efficient as memcpy will ever get, for what we're doing with it.
DevNote: Win32-only at the moment -- needs a GAS port (but that shouldn't be hard). I made some notes in the code about it.

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3472 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent a6b3acb5d0
commit 934578c8fe
@@ -28,6 +28,7 @@
# include "win_memzero.h"

extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes);
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern void memxor_mmx(void* dst, const void* src1, int cmpsize);
@@ -37,6 +38,8 @@
void _memset16_unaligned( void* dest, u16 data, size_t size );

#define memcpy_fast memcpy_amd_ // Fast memcpy
#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c*16) // Memcpy with 16-byte Aligned addresses
#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c) // Memcpy with 16-byte Aligned addresses
#define memcpy_const memcpy_amd_ // Memcpy with constant size
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c)
//#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16)
@@ -129,6 +129,10 @@ namespace Threading

    // For use in spin/wait loops.
    extern void SpinWait();

    // Use prior to committing data to another thread (internal memcpy_qwc does not use fencing,
    // so that many memcpys can be issued in a row more efficiently)
    extern void StoreFence();

    // Optional implementation to enable hires thread/process scheduler for the operating system.
    // Needed by Windows, but might not be relevant to other platforms.
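The comment above spells out the intended pattern: issue several unfenced memcpy_qwc calls back to back, then fence once before publishing. A small editor's sketch of that pattern (not part of this commit; the ring buffer, lengths, and write-position variable are placeholders):

    // Illustrative only: batch unfenced copies, fence once, then publish.
    static void ExamplePublishPackets( u128* ringbuf, uint& writePos,
        const u128* pktA, uint lenA, const u128* pktB, uint lenB )
    {
        const uint pos = writePos;
        memcpy_qwc( &ringbuf[pos],        pktA, lenA );   // no internal sfence...
        memcpy_qwc( &ringbuf[pos + lenA], pktB, lenB );   // ...so the copies pipeline freely

        Threading::StoreFence();        // one sfence orders any non-temporal stores ahead of the publish
        writePos = pos + lenA + lenB;   // consumer may now safely read the copied data
    }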

@@ -36,6 +36,11 @@ __forceinline void Threading::SpinWait()
    __asm pause;
}

__forceinline void Threading::StoreFence()
{
    __asm sfence;
}

__forceinline void Threading::EnableHiresScheduler()
{
    // This improves accuracy of Sleep() by some amount, and only adds a negligible amount of
@@ -146,7 +146,7 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy

    add esi, 64 ; update source pointer
    add edi, 64 ; update destination pointer
    dec eax ; count down
    sub eax, 1
    jnz $memcpy_ic_1 ; last 64-byte block?

$memcpy_ic_2:
@@ -189,64 +189,15 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy

    movq mm1,[esi-8]
    movntq [edi-24], mm2
    movntq [edi-16], mm0
    dec eax
    movntq [edi-8], mm1

    sub eax, 1
    jnz $memcpy_uc_1 ; last 64-byte block?

    jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.

// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
// help keep the code cache footprint of memcpy_fast to a minimum.
/*
$memcpy_bp_1: ; large blocks, block prefetch copy

    cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
    jl $memcpy_64_test ; no, back to regular uncached copy

    mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
    add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
$memcpy_bp_2:
    mov edx, [esi-64] ; grab one address per cache line
    mov edx, [esi-128] ; grab one address per cache line
    sub esi, 128 ; go reverse order to suppress HW prefetcher
    dec eax ; count down the cache lines
    jnz $memcpy_bp_2 ; keep grabbing more lines into cache

    mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
    movq mm0, [esi ] ; read 64 bits
    movq mm1, [esi+ 8]
    movq mm2, [esi+16]
    movq mm3, [esi+24]
    movq mm4, [esi+32]
    movq mm5, [esi+40]
    movq mm6, [esi+48]
    movq mm7, [esi+56]
    add esi, 64 ; update source pointer
    movntq [edi ], mm0 ; write 64 bits, bypassing cache
    movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
    movntq [edi+16], mm2 ; from READING the destination address
    movntq [edi+24], mm3 ; into the cache, only to be over-written,
    movntq [edi+32], mm4 ; so that also helps performance
    movntq [edi+40], mm5
    movntq [edi+48], mm6
    movntq [edi+56], mm7
    add edi, 64 ; update dest pointer

    dec eax ; count down

    jnz $memcpy_bp_3 ; keep copying
    sub ecx, CACHEBLOCK ; update the 64-byte block count
    jmp $memcpy_bp_1 ; keep processing chunks
*/

// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been
// disabled to help keep the code cache footprint of memcpy_fast to a minimum.

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
@@ -274,17 +225,99 @@ $memcpy_last_few: ; dword aligned from before movsd's

    rep movsb ; the last 1, 2, or 3 bytes

$memcpy_final:
    pop esi
    pop edi

    emms ; clean up the MMX state
    sfence ; flush the write buffer
    //mov eax, [dest] ; ret value = destination pointer

    pop esi
    pop edi

    ret 4
}
}

// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
__forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
    // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
    // registers will improve copy performance, because they won't. Use of XMMs is only
    // warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
    // and even then the benefits are typically minimal (sometimes slower depending on the
    // amount of data being copied).
    //
    // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them.
    // --air

    // Linux Conversion note:
    // This code would benefit nicely from having inline-able GAS syntax, since it should
    // allow GCC to optimize the first 3 instructions out of existence in many scenarios.
    // And its called enough times to probably merit the extra effort to ensure proper
    // optimization. --air

    __asm
    {
        mov ecx, [dest]
        mov edx, [src]
        mov eax, [qwc] ; keep a copy of count
        shr eax, 1
        jz $memcpy_qwc_1 ; only one 16 byte block to copy?

        cmp eax, IN_CACHE_COPY/32
        jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air)

    $memcpy_qwc_loop2: ; 32-byte blocks, uncached copy
        prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)

        movq mm0,[edx+0] ; read 64 bits
        movq mm1,[edx+8]
        movq mm2,[edx+16]
        movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache
        movntq [ecx+8], mm1
        movq mm3,[edx+24]
        movntq [ecx+16], mm2
        movntq [ecx+24], mm3

        add edx,32 ; update source pointer
        add ecx,32 ; update destination pointer
        sub eax,1
        jnz $memcpy_qwc_loop2 ; last 64-byte block?
        sfence ; flush the write buffer
        jmp $memcpy_qwc_1

        ; 32-byte blocks, cached!
        ; This *is* important. Removing this and using exclusively non-temporal stores
        ; results in noticable speed loss!

    $memcpy_qwc_loop1:
        prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)

        movq mm0,[edx+0] ; read 64 bits
        movq mm1,[edx+8]
        movq mm2,[edx+16]
        movq [ecx+0], mm0 ; write 64 bits, bypassing the cache
        movq [ecx+8], mm1
        movq mm3,[edx+24]
        movq [ecx+16], mm2
        movq [ecx+24], mm3

        add edx,32 ; update source pointer
        add ecx,32 ; update destination pointer
        sub eax,1
        jnz $memcpy_qwc_loop1 ; last 64-byte block?

    $memcpy_qwc_1:
        test [qwc],1
        jz $memcpy_qwc_final
        movq mm0,[edx]
        movq mm1,[edx+8]
        movq [ecx], mm0
        movq [ecx+8], mm1

    $memcpy_qwc_final:
        emms ; clean up the MMX state
    }
}
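The commit message and the Linux Conversion note above ask for a GAS port. Below is a rough editor's illustration of what an inline GAS version might look like -- cached path only, 32-bit x86, AT&T syntax; the non-temporal loop, the prefetchnta hints, and the IN_CACHE_COPY check are omitted, and the constraints/clobbers would need review before real use. It is not code from this commit.

    // Hypothetical GAS translation sketch (32-bit x86, cached path only).
    static inline void memcpy_amd_qwc_gas(void *dest, const void *src, size_t qwc)
    {
        __asm__ __volatile__
        (
            "movl %2, %%eax       \n\t"   // eax = qwc
            "shrl $1, %%eax       \n\t"   // convert to 32-byte block count
            "jz   1f              \n\t"   // zero or one QWC? skip the block loop
            "0:                   \n\t"
            "movq   (%1), %%mm0   \n\t"   // read 32 bytes via four MMX regs
            "movq  8(%1), %%mm1   \n\t"
            "movq 16(%1), %%mm2   \n\t"
            "movq 24(%1), %%mm3   \n\t"
            "movq %%mm0,   (%0)   \n\t"   // write them back out (cached stores)
            "movq %%mm1,  8(%0)   \n\t"
            "movq %%mm2, 16(%0)   \n\t"
            "movq %%mm3, 24(%0)   \n\t"
            "addl $32, %1         \n\t"   // update source pointer
            "addl $32, %0         \n\t"   // update destination pointer
            "subl $1, %%eax       \n\t"
            "jnz  0b              \n\t"
            "1:                   \n\t"
            "testl $1, %2         \n\t"   // odd QWC count? copy the trailing 16 bytes
            "jz   2f              \n\t"
            "movq  (%1), %%mm0    \n\t"
            "movq 8(%1), %%mm1    \n\t"
            "movq %%mm0,  (%0)    \n\t"
            "movq %%mm1, 8(%0)    \n\t"
            "2:                   \n\t"
            "emms                 \n\t"   // clean up the MMX state
            : "+r"(dest), "+r"(src)
            : "r"(qwc)
            : "eax", "memory", "mm0", "mm1", "mm2", "mm3"
        );
    }

Because the operands arrive in registers through the constraints, the three leading mov-from-memory instructions of the Win32 version disappear, which is exactly the optimization the Linux Conversion note is hoping GCC can perform.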

// mmx mem-compare implementation, size has to be a multiple of 8
// returns 0 is equal, nonzero value if not equal
// ~10 times faster than standard memcmp
@@ -395,6 +395,7 @@ struct Pcsx2Config

    // style. Useful for debugging potential bugs in the MTGS pipeline.
    bool SynchronousMTGS;
    bool DisableOutput;
    int VsyncQueueSize;

    bool FrameLimitEnable;
    bool FrameSkipEnable;

@@ -420,6 +421,8 @@
    return
        OpEqu( SynchronousMTGS ) &&
        OpEqu( DisableOutput ) &&
        OpEqu( VsyncQueueSize ) &&

        OpEqu( FrameSkipEnable ) &&
        OpEqu( FrameLimitEnable ) &&
        OpEqu( VsyncEnable ) &&
@@ -142,14 +142,11 @@ void SysMtgsThread::PostVsyncEnd()

    SendDataPacket();

    // Alter-frame flushing! Restarts the ringbuffer (wraps) on every other frame. This is a
    // mandatory feature that prevents the MTGS from queuing more than 2 frames at any time.
    // (queued frames cause input lag and desynced audio -- bad!). Ring restarts work for this
    // because they act as sync points where the EE must stall to wait for the GS to catch-up,
    // and they also allow us to reuse the front of the ringbuffer more often, which should improve
    // L2 cache performance.
    // If the MTGS is allowed to queue a lot of frames in advance, it creates input lag.
    // Use the Queued FrameCount to stall the EE if another vsync is already queued in
    // the ringbuffer.

    if( AtomicIncrement(m_QueuedFrameCount) < 2 ) return;
    if( AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize ) return;

    uint readpos = volatize(m_RingPos);
    uint freeroom;
@@ -190,7 +187,7 @@ void SysMtgsThread::OpenPlugin()
{
    if( m_PluginOpened ) return;

    memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS)/16 );
    memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
    GSsetBaseMem( RingBuffer.Regs );
    GSirqCallback( dummyIrqCallback );
@@ -624,6 +621,7 @@ void SysMtgsThread::SendDataPacket()
    PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos];
    tag.data[0] = actualSize;

    //Threading::StoreFence();
    m_WritePos = m_packet_ringpos;

    if( EmuConfig.GS.SynchronousMTGS )
@@ -217,6 +217,7 @@ Pcsx2Config::GSOptions::GSOptions()

    SynchronousMTGS = false;
    DisableOutput = false;
    VsyncQueueSize = 2;

    DefaultRegionMode = Region_NTSC;
    FramesToDraw = 2;

@@ -234,6 +235,7 @@ void Pcsx2Config::GSOptions::LoadSave( IniInterface& ini )

    IniEntry( SynchronousMTGS );
    IniEntry( DisableOutput );
    IniEntry( VsyncQueueSize );

    IniEntry( FrameLimitEnable );
    IniEntry( FrameSkipEnable );
@@ -526,36 +526,36 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s

void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len )
{
    uint endpos = destStart + len;
    if( endpos >= destSize )
    if( endpos < destSize )
    {
        uint firstcopylen = destSize - destStart;
        memcpy_aligned(&destBase[destStart], src, firstcopylen );

        destStart = endpos % destSize;
        memcpy_aligned(destBase, src+firstcopylen, destStart );
        memcpy_qwc(&destBase[destStart], src, len );
        destStart += len;
    }
    else
    {
        memcpy_aligned(&destBase[destStart], src, len );
        destStart += len;
        uint firstcopylen = destSize - destStart;
        memcpy_qwc(&destBase[destStart], src, firstcopylen );

        destStart = endpos % destSize;
        memcpy_qwc(destBase, src+firstcopylen, destStart );
    }
}

void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len )
{
    uint endpos = srcStart + len;
    if( endpos >= srcSize )
    if( endpos < srcSize )
    {
        uint firstcopylen = srcSize - srcStart;
        memcpy_aligned(dest, &srcBase[srcStart], firstcopylen );

        srcStart = endpos & srcSize;
        memcpy_aligned(dest+firstcopylen, srcBase, srcStart );
        memcpy_qwc(dest, &srcBase[srcStart], len );
        srcStart += len;
    }
    else
    {
        memcpy_aligned(dest, &srcBase[srcStart], len );
        srcStart += len;
        uint firstcopylen = srcSize - srcStart;
        memcpy_qwc(dest, &srcBase[srcStart], firstcopylen );

        srcStart = endpos & srcSize;
        memcpy_qwc(dest+firstcopylen, srcBase, srcStart );
    }
}
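A short usage sketch for the rewritten destination-wrapping helper (editor's illustration, not part of the commit; the ring size, alignment macro, and names are placeholders) -- the caller never has to split a packet across the wrap point itself:

    static const uint RingQwc = 0x4000;          // ring capacity, in 128-bit units (placeholder)
    static __aligned16 u128 s_ring[RingQwc];
    static uint s_writePos = 0;

    void ExampleAppendPacket( const u128* packet, uint lenQwc )
    {
        // Copies lenQwc quadwords starting at s_writePos, wrapping to the start
        // of s_ring automatically; s_writePos is advanced (and wrapped) for us.
        MemCopy_WrappedDest( packet, s_ring, s_writePos, RingQwc, lenQwc );
    }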

@@ -1129,14 +1129,14 @@ void __fastcall mVU_XGKICK_(u32 addr) {

        // fixme: one of these days the following *16's will get cleaned up when we introduce
        // a special qwc/simd16 optimized version of memcpy_aligned. :)
        //DevCon.Status("XGkick Wrap!");
        memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff);
        memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff);
        Path1WritePos += size;
        size -= diff;
        pDest += diff*16;
        memcpy_aligned(pDest, microVU1.regs->Mem, size);
        memcpy_qwc(pDest, microVU1.regs->Mem, size);
    }
    else {
        memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size);
        memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size);
        Path1WritePos += size;
    }
    //if(!gifRegs->stat.P1Q) CPU_INT(28, 128);
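The replacements above also highlight the two count conventions in play after this change: memcpy_qwc takes a quadword (128-bit) count, while memcpy_aligned -- as redefined earlier in this commit -- appears to pass its count straight through to memcpy_amd_ as bytes. A tiny editor's illustration of that reading (not from the commit):

    void ExampleCountUnits()
    {
        __aligned16 u128 src[4], dst[4];

        memcpy_qwc    ( dst, src, 4 );            // count in quadwords: copies 4 QWC (64 bytes)
        memcpy_aligned( dst, src, sizeof(src) );  // count in bytes: the same 64-byte copy
    }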