MTGS Optimization! Implemented EEcore thread sleeping and signaling. [as always: MTGS changes are experimental, and need testing to isolate potential thread sync bugs, which are usually quite random in nature]

Important Notes: I designed the new MTGS to largely favor speed on GS intensive scenes, at the possible cost of some speed loss on scenes that do very little GS work (simple boring menus, mostly).  The idea is that losing 5-10% on a menu screen that already runs *really* fast is a valid trade-off for possibly gaining a few FPS for in-game scenes (especially slow ones that need it most).  So don't benchmark this thing on game menus and expect it to be faster.

The new MTGS also has several other benefits that do not currently reflect well in benchmarking:
 * It renders only two frames ahead instead of 8.  This is great for fixing laggy input problems, but bad for benchmarking.  If the new MTGS manages the same speed despite queuing far fewer frames ahead, it's a sizable achievement.

 * It works a lot nicer with the GSdx software rasterizer in general.
 * Its new design will work nicer with future DX11 multithreaded features, when supported.

git-svn-id: 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2009-11-24 08:43:36 +00:00
parent 48ca426be4
commit 2f99d8c514
2 changed files with 222 additions and 153 deletions

View File

@ -91,29 +91,31 @@ class SysMtgsThread : public SysThreadBase
typedef SysThreadBase _parent;
// note: when g_pGSRingPos == g_pGSWritePos, the fifo is empty
// note: when m_RingPos == m_WritePos, the fifo is empty
uint m_RingPos; // cur pos gs is reading from
uint m_WritePos; // cur pos ee thread is writing to
Semaphore m_sem_OpenDone;
volatile bool m_RingBufferIsBusy;
volatile u32 m_SignalRingEnable;
volatile s32 m_SignalRingPosition;
int m_alterFrameFlush;
u32 m_RingWrapSpot;
Mutex m_lock_RingBufferBusy;
Mutex m_lock_RingRestart;
Semaphore m_sem_OnRingReset;
// used to keep multiple threads from sending packets to the ringbuffer concurrently.
MutexLockRecursive m_PacketLocker;
// (currently not used or implemented -- is a planned feature for a future threaded VU1)
//MutexLockRecursive m_PacketLocker;
// Used to delay the sending of events. Performance is better if the ringbuffer
// has more than one command in it when the thread is kicked.
int m_CopyCommandTally;
int m_CopyDataTally;
//volatile bool m_RingBufferIsBusy;
volatile bool m_PluginOpened;
// Counts the number of vsync frames queued in the MTGS ringbuffer. This is used to
// throttle the number of frames allowed to be rendered ahead of time for games that
// run very fast and have little or no ringbuffer overhead (typically opening menus)
//volatile s32 m_QueuedFrames;
Semaphore m_sem_OpenDone;
volatile bool m_PluginOpened;
// These vars maintain instance data for sending Data Packets.
// Only one data packet can be constructed and uploaded at a time.
@ -143,7 +145,7 @@ public:
void WaitForOpen();
void Freeze( int mode, MTGS_FreezeData& data );
void RestartRingbuffer();
void RestartRingbuffer( uint packsize=1 );
void SendSimplePacket( MTGS_RingCommand type, int data0, int data1, int data2 );
void SendPointerPacket( MTGS_RingCommand type, u32 data0, void* data1 );
@ -163,11 +165,6 @@ protected:
void OnResumeInThread( bool IsSuspended );
void OnCleanupInThread();
// Sets the Event flag and issues a timeslice on the EEcore thread (ie, an efficient
// method of kicking the MTGS thread into action once there's a sizable chunk of work
// accumulated).
void PrepEventWait();
// Used internally by SendSimplePacket type functions
uint _PrepForSimplePacket();
void _FinishSimplePacket( uint future_writepos );

View File

@ -107,14 +107,15 @@ void SysMtgsThread::OnStart()
m_RingPos = 0;
m_WritePos = 0;
//m_RingBufferIsBusy = false;
//m_QueuedFrames = 0;
m_RingBufferIsBusy = false;
m_packet_size = 0;
m_packet_ringpos = 0;
m_CopyCommandTally = 0;
m_alterFrameFlush = 0;
m_SignalRingEnable = 0;
m_SignalRingPosition= 0;
m_RingWrapSpot = 0;
m_CopyDataTally = 0;
@ -147,17 +148,19 @@ void SysMtgsThread::ResetGS()
static int alterFrameFlush = 0;
void SysMtgsThread::PostVsyncEnd( bool updategs )
SendSimplePacket( GS_RINGTYPE_VSYNC, (*(u32*)(PS2MEM_GS+0x1000)&0x2000), updategs, 0 );
if( alterFrameFlush || (m_WritePos > (RingBufferSize/3)) )
// Alter-frame flush! Restarts the ringbuffer (wraps) on every other frame. This is a
// mandatory feature that prevents the MTGS from queuing more than 2 frames at any time.
// (queued frames cause input lag and desynced audio -- bad!).
m_alterFrameFlush ^= 1;
if( m_alterFrameFlush )
alterFrameFlush ^= 1;
struct PacketTagType
@ -203,6 +206,25 @@ void SysMtgsThread::OpenPlugin()
GSsetGameCRC( ElfCRC, 0 );
class RingBufferLock : public ScopedLock
SysMtgsThread& m_mtgs;
RingBufferLock( SysMtgsThread& mtgs )
: ScopedLock( mtgs.m_lock_RingBufferBusy )
, m_mtgs( mtgs )
m_mtgs.m_RingBufferIsBusy = true;
virtual ~RingBufferLock() throw()
m_mtgs.m_RingBufferIsBusy = false;
void SysMtgsThread::ExecuteTaskInThread()
tls_mtgsThread = this;
@ -220,7 +242,7 @@ void SysMtgsThread::ExecuteTaskInThread()
ScopedLock busy( m_lock_RingBufferBusy );
RingBufferLock busy( *this );
// note: m_RingPos is intentionally not volatile, because it should only
// ever be modified by this thread.
@ -298,8 +320,13 @@ void SysMtgsThread::ExecuteTaskInThread()
MTGS_LOG( "(MTGS Packet Read) ringtype=Restart" );
m_RingPos = 0;
// It's the EEcore's job to make sure the writepos is set to 0 only AFTER the
// readpos has moved past. Otherwise the ringbuffer execution will stop on
// the readpos==writepos condition. >_<
pxAssertDev( m_RingPos != m_WritePos, "MTGS Synchronization Error -- Premature stoppage detected on ringbutter restart." );
@ -396,8 +423,29 @@ void SysMtgsThread::ExecuteTaskInThread()
uint newringpos = m_RingPos + ringposinc;
pxAssert( newringpos <= RingBufferSize );
newringpos &= RingBufferMask;
m_RingPos = newringpos & RingBufferMask;
if( m_SignalRingEnable != 0 )
// The EEcore has requested a signal after some amount of processed data.
m_SignalRingPosition -= ringposinc;
if( m_SignalRingPosition <= 0 )
// Make sure to post the signal after the m_RingPos has been updated...
m_RingPos = newringpos;
AtomicExchange( m_SignalRingEnable, 0 );
// Safety valve in case standard signals fail for some reason -- this ensures the EEcore
// won't sleep for eternity, even if SignalRingPosition didn't reach 0 for some reason.
if( AtomicExchange( m_SignalRingEnable, 0 ) != 0 )
AtomicExchange( m_SignalRingPosition, 0 );
@ -442,7 +490,7 @@ void SysMtgsThread::WaitGS()
if( volatize(m_RingPos) != m_WritePos )
do {
} while( volatize(m_RingPos) != m_WritePos );
@ -453,16 +501,10 @@ void SysMtgsThread::WaitGS()
// For use in loops that wait on the GS thread to do certain things.
void SysMtgsThread::SetEvent()
if( !m_RingBufferIsBusy )
m_CopyCommandTally = 0;
m_CopyDataTally = 0;
void SysMtgsThread::PrepEventWait()
//Console.Warning( "MTGS Stall! EE waits for nothing! ... except your GPU sometimes." );
m_CopyDataTally = 0;
u8* SysMtgsThread::GetDataPacketPtr() const
@ -502,12 +544,17 @@ void SysMtgsThread::SendDataPacket()
m_WritePos = temp;
m_packet_size = 0;
if( EmuConfig.Video.SynchronousMTGS )
else if( m_RingBufferIsBusy )
m_CopyDataTally += m_packet_size;
if( m_CopyDataTally > 0x2000 ) SetEvent();
m_packet_size = 0;
@ -580,11 +627,9 @@ int SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size
// Note on volatiles: g_pGSWritePos is not modified by the GS thread,
// so there's no need to use volatile reads here. We still have to use
// interlocked exchanges when we modify it, however, since the GS thread
// is reading it.
// Note on volatiles: m_WritePos is not modified by the GS thread, so there's no need
// to use volatile reads here. We do cache it though, since we know it never changes,
// except for calls to RestartRingbuffer() -- handled below.
uint writepos = m_WritePos;
// Checks if a previous copy was started without an accompanying call to GSRINGBUF_DONECOPY
@ -604,19 +649,50 @@ int SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size
// But if not then we need to make sure the readpos is outside the scope of
// the block about to be written (writepos + size)
if( writepos < volatize(m_RingPos) )
// writepos is behind the readpos, so we need to wait until
// readpos is out past the end of the future write pos, or until it wraps
// around (in which case writepos will be >= readpos)
while( true )
uint readpos = volatize(m_RingPos);
if( writepos >= readpos ) break;
if( writepos+size < readpos ) break;
if( (writepos < readpos) && (writepos+size >= readpos) )
pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" );
// writepos is behind the readpos and will overlap it if we commit the data,
// so we need to wait until readpos is out past the end of the future write pos,
// or until it wraps around (in which case writepos will be >= readpos).
// Ideally though we want to wait longer, because if we just toss in this packet
// the next packet will likely stall up too. So lets set a condition for the MTGS
// thread to wake up the EE once there's a sizable chunk of the ringbuffer emptied.
uint totalAccum = (m_RingWrapSpot - readpos) + writepos;
uint somedone = (totalAccum / 4);
if( somedone < size+1 ) somedone = size + 1;
// FMV Optimization: FMVs typically send *very* little data to the GS, in some cases
// every other frame is nothing more than a page swap. Sleeping the EEcore is a
// waste of time, and we get better results using a spinwait.
if( somedone > 0x80 )
m_SignalRingPosition = somedone;
//Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Sleep!\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition );
do {
AtomicExchange( m_SignalRingEnable, 1 );
readpos = volatize(m_RingPos);
//Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Post-sleep Report!\tringpos=0x%06x", readpos );
} while( (writepos < readpos) && (writepos+size >= readpos) );
pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" );
do {
readpos = volatize(m_RingPos);
} while( (writepos < readpos) && (writepos+size >= readpos) );
@ -624,44 +700,56 @@ int SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size
pxAssert( writepos != 0 );
// If the incoming packet doesn't fit, then start over from
// the start of the ring buffer (it's a lot easier than trying
// to wrap the packet around the end of the buffer).
// If the incoming packet doesn't fit, then start over from the start of the ring
// buffer (it's a lot easier than trying to wrap the packet around the end of the
// buffer).
RestartRingbuffer( size );
writepos = m_WritePos;
// stall until the read position is past the end of our incoming block,
// or until it reaches the current write position (signals an empty buffer).
while( true )
uint readpos = volatize(m_RingPos);
if( readpos == m_WritePos ) break;
if( m_WritePos+size < readpos ) break;
else // always true - if( writepos + size == MTGS_RINGBUFFEREND )
// Yay. Perfect fit. What are the odds?
// Copy is ready so long as readpos is less than writepos and *not* equal to the
// base of the ringbuffer (otherwise the buffer will stop when the writepos is
// wrapped around to zero later-on in SendDataPacket).
//Console.WriteLn( "MTGS > Perfect Fit!");
while( true )
uint readpos = volatize(m_RingPos);
if( readpos > writepos )
uint totalAccum = (m_RingWrapSpot - readpos) + writepos;
uint somedone = totalAccum / 4;
if( somedone < size+1 ) somedone = size + 1;
// stop waiting if the buffer is empty!
if( writepos == readpos ) break;
// FMV Optimization: (see above) This condition of a perfect fit is so rare that optimizing
// for it is pointless -- but it was also mindlessly simple copy-paste. So there. :p
// Copy is ready so long as readpos is less than writepos and *not*
// equal to the base of the ringbuffer (otherwise the buffer will stop
// when the writepos is wrapped around to zero later-on in SendDataPacket)
if( readpos < writepos && readpos != 0 ) break;
if( somedone > 0x80 )
m_SignalRingPosition = somedone;
//Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Sleep!\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition );
do {
AtomicExchange( m_SignalRingEnable, 1 );
readpos = volatize(m_RingPos);
//Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Post-sleep Report!\tringpos=0x%06x", readpos );
} while( (writepos < readpos) || (readpos==0) );
pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" );
do {
readpos = volatize(m_RingPos);
} while( (writepos < readpos) || (readpos==0) );
@ -682,6 +770,43 @@ int SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size
return m_packet_size;
void SysMtgsThread::RestartRingbuffer( uint packsize )
if( m_WritePos == 0 ) return;
const uint thefuture = 0;
// Always kick the MTGS into action for a ringbuffer restart.
uint readpos = volatize(m_RingPos);
if( readpos > m_WritePos )
// We have to be careful not to leapfrog our read-position, which would happen if
// it's greater than the current write position (since wrapping writepos to 0 would
// be the act of skipping PAST readpos). Stall until it loops around to the
// beginning of the buffer.
m_SignalRingPosition = (readpos - m_WritePos) + packsize;
do {
AtomicExchange( m_SignalRingEnable, 1 );
} while( volatize(m_RingPos) > m_WritePos );
PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos];
tag.command = GS_RINGTYPE_RESTART;
m_RingWrapSpot = m_WritePos;
m_WritePos = 0;
if( EmuConfig.Video.SynchronousMTGS )
__forceinline uint SysMtgsThread::_PrepForSimplePacket()
@ -694,12 +819,15 @@ __forceinline uint SysMtgsThread::_PrepForSimplePacket()
pxAssert( future_writepos <= RingBufferSize );
future_writepos &= RingBufferMask;
if( future_writepos == 0 )
m_RingWrapSpot = m_WritePos;
// FIXME: Optimize this using m_SignalRingEnable and friends!
// The ringbuffer read pos is blocking the future write position, so stall out
// until the read position has moved.
if( future_writepos == volatize(m_RingPos) )
do {
} while( future_writepos == volatize(m_RingPos) );
@ -715,64 +843,8 @@ __forceinline void SysMtgsThread::_FinishSimplePacket( uint future_writepos )
if( EmuConfig.Video.SynchronousMTGS )
// TODO : These will be moved to the mtgs class once I solidify the new synch method.
Semaphore m_sem_OnRingReset;
u32 m_SignalRingReset;
void SysMtgsThread::RestartRingbuffer()
if( m_WritePos == 0 ) return;
const uint thefuture = 0;
// We have to be careful not to leapfrog our read-position, which would happen if
// it's greater than the current write position (since wrapping writepos to 0 would
// be the act of skipping PAST readpos). Stall until it loops around to the
// beginning of the buffer
// TODO : Implement this using a mutex/semaphore signal for when the ring buffer has
// wrapped around from 0. ...which should end up looking something like this:
// note: the boolean for signalling ring resets is to prevent both frivolous posting
// to the semaphore in the MTGS thread, and to avoid having accumulations of large
// numbers of signals in the semaphore that would have to be unwound here.
/*AtomicExchange( m_SignalRingReset, true );
uint readpos = volatize(m_RingPos);
while( readpos >= m_WritePos || readpos == thefuture )
AtomicExchange( m_SignalRingReset, false );*/
while( true )
uint readpos = volatize(m_RingPos);
// is the buffer empty?
if( readpos == m_WritePos ) break;
// Also: Wait for the readpos to go past the start of the buffer (which is our
// 'future' write position), otherwise it'll stop dead in its tracks when we set
// the new write position below. (readpos == writepos is a "stop" condition).
if( (readpos < m_WritePos) && (readpos != thefuture) ) break;
PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos];
tag.command = GS_RINGTYPE_RESTART;
//[0] = data0;
//[1] = data1;
//[2] = data2;
m_WritePos = 0;
if( EmuConfig.Video.SynchronousMTGS )
void SysMtgsThread::SendSimplePacket( MTGS_RingCommand type, int data0, int data1, int data2 )