pcsx2/pcsx2/MTGS.cpp

/*  Pcsx2 - Pc Ps2 Emulator
 *  Copyright (C) 2002-2009  Pcsx2 Team
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *  
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *  
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
 */

#include "PrecompiledHeader.h"

#include <list>

#include <wx/datetime.h>

#include "Common.h"
#include "VU.h"
#include "GS.h"
#include "iR5900.h"
#include "VifDma.h"

#include "SamplProf.h"

// Uncomment this to enable profiling of the GS RingBufferCopy function.
//#define PCSX2_GSRING_SAMPLING_STATS

using namespace Threading;
using namespace std;

// MTGS_LOG: debug-only logging helper.  When DEBUG is defined it forwards to
// Console::WriteLn.  Otherwise it expands to `0&&`, which turns the parenthesized
// argument list that follows into the right-hand side of a short-circuited &&,
// so the arguments are never evaluated.
#ifdef DEBUG
#define MTGS_LOG Console::WriteLn
#else
#define MTGS_LOG 0&&
#endif

// forces the compiler to treat a non-volatile value as volatile.
// This allows us to declare the vars as non-volatile and only use
// them as volatile when appropriate (more optimized).
// Note: the access is always made through a 'volatile uint*', whatever the
// declared type of x is.

#define volatize(x) (*reinterpret_cast<volatile uint*>(&(x)))

/////////////////////////////////////////////////////////////////////////////
//   BEGIN  --  MTGS GIFtag Parse Implementation
//
// The MTGS needs a dummy "GS plugin" for processing SIGNAL, FINISH, and LABEL
// commands.  These commands trigger gsIRQs, which need to be handled accurately
// in synch with the EE (which can be running several frames ahead of the MTGS)
//
// Yeah, it's a lot of work, but the performance gains are huge, even on HT cpus.

// Unpacks the register descriptors of the current GIFtag.
// The descriptors are stored as a sequence of 4 bit values in the upper 64 bits
// of the GIFTAG: tag.regs[0] carries descriptors 0-7 and tag.regs[1] carries
// descriptors 8-15.  That sucks for us, so we unpack them into the 8 bit
// 'regs' array.
//
// An nreg of 0 means all 16 descriptors are used; otherwise nreg is the count.
__forceinline void GIFPath::PrepRegs()
{
	const u32 numregs = (tag.nreg == 0) ? 16 : tag.nreg;

	u32 tempreg = tag.regs[0];
	for(u32 i=0; i<numregs; ++i, tempreg >>= 4)
	{
		// Each 32 bit word only holds 8 descriptors, so move on to the second
		// word for descriptors 8 and up.  This used to be done only for nreg == 0,
		// which left descriptors 8+ unpacked as zero for tags with nreg of 9-15.
		if( i == 8 ) tempreg = tag.regs[1];

		regs[i] = tempreg & 0xf;
	}
}

void GIFPath::SetTag(const void* mem)
{
	tag = *((GIFTAG*)mem);
	curreg = 0;

	PrepRegs();
}

// Returns the unpacked register descriptor at the current register index.
u32 GIFPath::GetReg() 
{
	const u32 descriptor = regs[curreg];
	return descriptor;
}

// Freezes the state of the three GIF paths through the given SaveState.
// The order is significant: the tag/curreg pair of every path comes first,
// followed by the regs array of every path.
static void _mtgsFreezeGIF( SaveState& state, GIFPath (&paths)[3] )
{
	for( int pathidx = 0; pathidx != 3; ++pathidx )
	{
		GIFPath& path = paths[pathidx];

		state.Freeze( path.tag );
		state.Freeze( path.curreg );
	}

	for( int pathidx = 0; pathidx != 3; ++pathidx )
		state.Freeze( paths[pathidx].regs );
}

void SaveState::mtgsFreeze()
{
	FreezeTag( "mtgs" );
	mtgsThread->Freeze( *this );
}


static void RegHandlerSIGNAL(const u32* data)
{
	MTGS_LOG("MTGS SIGNAL data %x_%x CSRw %x IMR %x CSRr\n",data[0], data[1], CSRw, GSIMR, GSCSRr);

	GSSIGLBLID->SIGID = (GSSIGLBLID->SIGID&~data[1])|(data[0]&data[1]);
	
	if ((CSRw & 0x1))
	{
		if (!(GSIMR&0x100) )  
		{
			gsIrq();
		}

		GSCSRr |= 1; // signal
	}
}

// Dummy handler for the GS FINISH register.
// When bit 1 of CSRw is set, flags the finish event in GSCSRr, raising a gsIrq
// first unless bit 0x200 of GSIMR masks it.  'data' is only used for logging.
static void RegHandlerFINISH(const u32* data)
{
	MTGS_LOG("MTGS FINISH data %x_%x CSRw %x\n", params data[0], data[1], CSRw);

	const bool finishEnabled = (CSRw & 0x2) != 0;
	if (!finishEnabled) return;

	const bool irqMasked = (GSIMR & 0x200) != 0;
	if (!irqMasked)
		gsIrq();

	GSCSRr |= 2; // finish
}

static void RegHandlerLABEL(const u32* data)
{
	GSSIGLBLID->LBLID = (GSSIGLBLID->LBLID&~data[1])|(data[0]&data[1]);
}

//  END  --  MTGS GIFtag Parse Implementation
/////////////////////////////////////////////////////////////////////////////

/////////////////////////////////////////////////////////////////////////////
//  MTGS Threaded Class Implementation

// The one and only MTGS thread instance.  NULL when no thread is running
// (mtgsOpen creates it, and resets it to NULL if the thread fails to start).
mtgsThreadObject* mtgsThread = NULL;

#ifdef RINGBUF_DEBUG_STACK
#include <list>
// Debug-only record of the ring positions at which packets were written; the
// GS thread checks each packet it reads against it.
std::list<uint> ringposStack;
#endif

#ifdef PCSX2_DEBUG
// debug variable used to check for bad code bits where copies are started
// but never closed, or closed without having been started.  (GSRingBufCopy calls
// should always be followed by a call to GSRINGBUF_DONECOPY)
// And it's not even used in the debug code.
//static int copyLock = 0;
#endif

// Signature of the dummy register handlers, and the table of them.  The table is
// indexed with (register & 0x3) for registers 0x60..0x62, so entry 0 is SIGNAL
// (0x60), entry 1 is FINISH (0x61) and entry 2 is LABEL (0x62).
typedef void (*GIFRegHandler)(const u32* data);
static GIFRegHandler s_GSHandlers[3] = { RegHandlerSIGNAL, RegHandlerFINISH, RegHandlerLABEL };

// Constructs the thread object with an empty ring buffer and cleared GIF paths.
// The ring buffer is allocated with extra room at its end (Ps2MemSize::GSregs
// bytes worth of u128's), and m_gsMem is pointed at that extra area.
mtgsThreadObject::mtgsThreadObject() :
	PersistentThread(),
	m_RingPos( 0 ),
	m_WritePos( 0 ),

	m_sem_InitDone(),
	m_lock_RingRestart(),
	m_PacketLocker( true ),		// true - makes it a recursive lock

	m_CopyCommandTally( 0 ),
	m_CopyDataTally( 0 ),
	m_RingBufferIsBusy( 0 ),
	m_QueuedFrames( 0 ),
	m_lock_FrameQueueCounter(),
	m_packet_size( 0 ),
	m_packet_ringpos( 0 ),

#ifdef RINGBUF_DEBUG_STACK
	m_lock_Stack(),
#endif
	m_RingBuffer( m_RingBufferSize + (Ps2MemSize::GSregs/sizeof(u128)) ),
	m_gsMem( (u8*)m_RingBuffer.GetPtr( m_RingBufferSize ) )
{
	memzero_obj( m_path );
}

// Starts the GS thread and blocks until it has finished initializing.
// Exceptions:
//   PluginOpenError - the thread reported a non-zero return code after init.
void mtgsThreadObject::Start()
{
	m_sem_InitDone.Reset();
	PersistentThread::Start();

	// The thread runs GSopen, which can take a while (it creates a new window and
	// all), so wait for it to signal completion before checking for errors.
	m_sem_InitDone.Wait();

	// A non-zero return code means the thread failed to init the GS plugin.
	const bool initFailed = ( m_returncode != 0 );
	if( initFailed )
		throw Exception::PluginOpenError( PluginId_GS );
}

// Shuts the GS thread down (see Cancel) when the object is destroyed.
mtgsThreadObject::~mtgsThreadObject()
{
	this->Cancel();
}

void mtgsThreadObject::Cancel()
{
	Console::WriteLn( "MTGS > Closing GS thread..." );
	SendSimplePacket( GS_RINGTYPE_QUIT, 0, 0, 0 );
	SetEvent();
	m_sem_Quitter.Wait( wxTimeSpan( 0, 0, 5, 0 ) );
	Sleep( 2 );
	PersistentThread::Cancel( true );
}

// MTGS Reset process, in order:
//  1. clear the ringbuffer (the read position is moved up to the write position)
//  2. queue a RESET packet followed by a FRAMESKIP packet
//  3. clear the GIF path structs used by the dummy GIFtag parser
void mtgsThreadObject::Reset()
{
	AtomicExchange( m_RingPos, m_WritePos );

	MTGS_LOG( "MTGS > Sending Reset...\n" );

	SendSimplePacket( GS_RINGTYPE_RESET,     0, 0, 0 );
	SendSimplePacket( GS_RINGTYPE_FRAMESKIP, 0, 0, 0 );

	memzero_obj( m_path );
}

// Processes a GIFtag & packet, and throws out some gsIRQs as needed.
// Used to keep interrupts in sync with the EE, while the GS itself
// runs potentially several frames behind.
//
// Parameters:
//  pathidx - index of the GIF path (into m_path) the data belongs to.
//  pMem    - start of the packet data.
//  size    - size of the packet in simd128's
//
// Returns the number of simd128's left unprocessed when parsing stopped.
//
// The parse state (current tag, remaining nloop, current register) is kept in
// m_path[pathidx], so a tag may span several calls.
__forceinline int mtgsThreadObject::_gifTransferDummy( GIF_PATH pathidx, const u8* pMem, u32 size )
{
	GIFPath& path = m_path[pathidx];
  /*	bool path1loop = false;
	int startval = size;*/
#ifdef PCSX2_GSRING_SAMPLING_STATS
	// Registers the address range of this function with the sampling profiler
	// (done once, on the first call).
	static uptr profStartPtr = 0;
	static uptr profEndPtr = 0;
	if( profStartPtr == 0 )
	{
		__asm 
		{ 
	__beginfunc:
			mov profStartPtr, offset __beginfunc;
			mov profEndPtr, offset __endfunc;
		}
		ProfilerRegisterSource( "GSRingBufCopy", (void*)profStartPtr, profEndPtr - profStartPtr );
	}
#endif

	while(size > 0)
	{
		if (path.tag.nloop == 0)
		{
			// The previous tag is finished: the next qword is a new GIFtag.
			path.SetTag( pMem );

			pMem += sizeof(GIFTAG);
			--size;

			if (pathidx == 2)
			{			
				if (path.tag.flg != GIF_FLG_IMAGE) 
					Path3progress = TRANSFER_MODE; //Other mode (but not stopped, I guess?)
				else  
					Path3progress = IMAGE_MODE; //IMAGE mode
				//if(pathidx == 2) GIF_LOG("Set Giftag NLoop %d EOP %x Mode %d Path3msk %x Path3progress %x ", path.tag.nloop, path.tag.eop, path.tag.flg, vif1Regs->mskpath3, Path3progress);
			}

			if (pathidx == 0) 
			{                       
			//	int transize = 0;
				// hack: if too much data for VU1, just ignore.

				// The GIF is evil : if nreg is 0, it's really 16.  Otherwise it's the value in nreg.
				/*const int numregs = path.tag.nreg ? path.tag.nreg : 16;
				if(path.tag.flg < 2)
				{
					transize = (path.tag.nloop * numregs);
				}
				else transize = path.tag.nloop;

				if(transize > (path.tag.flg == 1 ? 0x800 : 0x400))
				{
					//DevCon::Notice("Too much data");
					path.tag.nloop = 0;
					if(path1loop == true)return ++size - 0x400;
					else return ++size;
				}*/
				// Maps an nreg of 0 to 16, and leaves 1..15 as they are.
				const int numregs = ((path.tag.nreg-1)&15)+1;

				// The tag claims more register data than the rest of the packet holds
				// (the remaining size counts double when flg == 1): drop the tag and
				// report it as unprocessed too (hence the ++size).
				if((path.tag.nloop * numregs) > (size * ((path.tag.flg == 1) ? 2 : 1)))
				{
					path.tag.nloop = 0;
					return ++size;
				}
			}
		}
		else
		{
			// NOTE: size > 0 => do {} while(size > 0); should be faster than while(size > 0) {}
		
			//if(pathidx == 2) GIF_LOG("PATH3 NLoop %d EOP %x Mode %d Path3msk %x Path3progress %x ", path.tag.nloop, path.tag.eop, path.tag.flg, vif1Regs->mskpath3, Path3progress);
			switch(path.tag.flg)
			{
			case GIF_FLG_PACKED:

				// One 128 bit entry per register.  Only entries whose descriptor is
				// 0xe are inspected: byte 8 of such an entry selects the register,
				// and registers 0x60..0x62 are dispatched to the dummy handlers.
				do
				{
					if( path.GetReg() == 0xe )
					{
						const int handler = pMem[8];
						if(handler >= 0x60 && handler < 0x63)
							s_GSHandlers[handler&0x3]((const u32*)pMem);
					}

					size--;
					pMem += 16; // 128 bits! //sizeof(GIFPackedReg);
				}
				while(path.StepReg() && size > 0);

			break;

			case GIF_FLG_REGLIST:

				// Entries are 64 bits each here, so count in 64 bit units while looping.
				size *= 2;

				do
				{
					const int handler = path.GetReg();
					if(handler >= 0x60 && handler < 0x63)
						s_GSHandlers[handler&0x3]((const u32*)pMem);

					size--;
					pMem += 8; //sizeof(GIFReg); -- 64 bits!
				}
				while(path.StepReg() && size > 0);
			
				// An odd count of 64 bit units left: skip one to get back onto a
				// 128 bit boundary before converting the size back.
				if(size & 1) pMem += 8; //sizeof(GIFReg);

				size /= 2;

			break;

			case GIF_FLG_IMAGE2: // hmmm
				assert(0);
				path.tag.nloop = 0;

			break;

			case GIF_FLG_IMAGE:
			{
				// Image data needs no parsing: skip as much of it as is available.
				int len = (int)min(size, path.tag.nloop);

				pMem += len * 16;
				path.tag.nloop -= len;
				size -= len;
			}
			break;

			jNO_DEFAULT;

			}
		}
		
		if(path.tag.nloop == 0)
		{
			if(path.tag.eop)
			{
				// End of packet: stop parsing, except on the path with index 1
				// which keeps going through the remaining data.
				if(pathidx != 1)
				{				
					break;
				}
				/*if((path.tag.nloop > 0 || (!path.tag.eop && path.tag.nloop == 0)) && size == 0)
				{
					if(path1loop == true) return size - 0x400;
					//DevCon::Notice("Looping Nloop %x, Eop %x, FLG %x", params path.tag.nloop, path.tag.eop, path.tag.flg);
					size = 0x400;
					pMem -= 0x4000;
					path1loop = true;
				}*/
			} 
			/*else if(size == 0 && pathidx == 0)
			{
				if(path1loop == true) return size - 0x400;
				//DevCon::Notice("Looping Nloop %x, Eop %x, FLG %x", params path.tag.nloop, path.tag.eop, path.tag.flg);
				size = 0x400;
				pMem -= 0x4000;
				path1loop = true;
			}*/
		} 
		/*else if(size == 0 && pathidx == 0)
		{
			if(path1loop == true) return size - 0x400;
			//DevCon::Notice("Looping Nloop %x, Eop %x, FLG %x", params path.tag.nloop, path.tag.eop, path.tag.flg);
			size = 0x400;
			pMem -= 0x4000;
			path1loop = true;
		}*/
	}

	if(pathidx == 0)
	{
		//If the XGKick has spun around the VU memory end address, we need to INCREASE the size sent.
		/*if(path1loop == true)
		{
			return (size - 0x400); //This will cause a negative making eg. size(20) - retval(-30) = 50;
		}*/
		if(size == 0 && path.tag.nloop > 0)
		{
			// Data ran out in the middle of a tag: discard the rest of the tag so
			// the next call starts with a fresh GIFtag.
			path.tag.nloop = 0;
			DevCon::Write( "path1 hack! " );

			// This means that the giftag data got screwly somewhere
			// along the way (often means curreg was in a bad state or something)
		}
	}

	
	if(pathidx == 2)
		{
			if(path.tag.nloop == 0 )
			{
				//DevCon::Notice("Finishing Giftag NLoop %d EOP %x Mode %d nregs %d Path3progress %d Vifstat VGW %x", 
					//params path.tag.nloop, path.tag.eop, path.tag.flg, path.tag.nreg, Path3progress, vif1Regs->stat & VIF1_STAT_VGW);
				if(path.tag.eop)
				{
					Path3progress = STOPPED_MODE;	
					//GIF_LOG("Set progress NLoop %d EOP %x Mode %d Path3msk %x Path3progress %x ", path.tag.nloop, path.tag.eop, path.tag.flg, vif1Regs->mskpath3, Path3progress);
				}
				
			}
		
		}
#ifdef PCSX2_GSRING_SAMPLING_STATS
	__asm
	{
		__endfunc:
				nop;
	}
#endif
	return size;
}

void mtgsThreadObject::PostVsyncEnd( bool updategs )
{
	while( m_QueuedFrames > 8 )
	{
		if( m_WritePos == volatize( m_RingPos ) )
		{
			// MTGS ringbuffer is empty, but we still have queued frames in the counter?  Ouch!
			Console::Error( "MTGS > Queued framecount mismatch = %d", params m_QueuedFrames );
			m_QueuedFrames = 0;
			break;
		}
		Threading::Sleep( 2 );		// Sleep off quite a bit of time, since we're obviously *waaay* ahead.
		SpinWait();
	}

	m_lock_FrameQueueCounter.Lock();
	m_QueuedFrames++;
	//Console::Status( " >> Frame Added!" );
	m_lock_FrameQueueCounter.Unlock();

	SendSimplePacket( GS_RINGTYPE_VSYNC,
		(*(u32*)(PS2MEM_GS+0x1000)&0x2000), updategs, 0);

	// No need to freeze MMX/XMM registers here since this
	// code is always called from the context of a BranchTest.
	SetEvent();
}

// Layout of a command entry in the ring buffer.  Ring buffer slots are
// reinterpreted as this struct when packets are written and read.
struct PacketTagType
{
	u32 command;	// packet type; the GS thread switches on it against the GS_RINGTYPE_* values
	u32 data[3];	// command-specific parameters (data[0] is the payload size in qwords for P1/P2/P3 packets)
};

// Body of the GS thread.
// Opens the GS plugin, signals m_sem_InitDone, and then loops: wait for an
// event, and process ring buffer packets until the read position (m_RingPos)
// catches up with the write position (m_WritePos).
//
// Returns m_returncode if it is non-zero after the plugin open, or 0 once a
// GS_RINGTYPE_QUIT packet has been processed.
sptr mtgsThreadObject::ExecuteTask()
{
	Console::WriteLn("MTGS > Thread Started, Opening GS Plugin...");

	// The plugin works on the thread's private copy of the GS registers (m_gsMem).
	memcpy_aligned( m_gsMem, PS2MEM_GS, sizeof(PS2MEM_GS) );
	GSsetBaseMem( m_gsMem );
	GSirqCallback( NULL );

	GetPluginManager().Open( PluginId_GS );
	
	Console::WriteLn( "MTGS > GSopen Finished, return code: 0x%x", params m_returncode );

	GSCSRr = 0x551B4000; // 0x55190000
	// Lets Start() continue; it checks m_returncode itself.
	m_sem_InitDone.Post();
	if (m_returncode != 0) { return m_returncode; }		// error msg will be issued to the user by Plugins.c

#ifdef RINGBUF_DEBUG_STACK
	PacketTagType prevCmd;
#endif

	while( true )
	{
		m_sem_event.Wait();

		// Flagged while packets are being processed; SendDataPacket only considers
		// kicking the thread when this is clear.
		AtomicExchange( m_RingBufferIsBusy, 1 );

		// note: m_RingPos is intentionally not volatile, because it should only
		// ever be modified by this thread.
		while( m_RingPos != volatize(m_WritePos))
		{
			assert( m_RingPos < m_RingBufferSize );

			const PacketTagType& tag = (PacketTagType&)m_RingBuffer[m_RingPos];
			// Number of ring slots this packet occupies: the command qword, plus
			// the payload size for data packets.
			u32 ringposinc = 1;

#ifdef RINGBUF_DEBUG_STACK
			// pop a ringpos off the stack.  It should match this one!

			m_lock_Stack.Lock();
			uptr stackpos = ringposStack.back();
			if( stackpos != m_RingPos )
			{
				Console::Error( "MTGS Ringbuffer Critical Failure ---> %x to %x (prevCmd: %x)\n", params stackpos, m_RingPos, prevCmd.command );
			}
			assert( stackpos == m_RingPos );
			prevCmd = tag;
			ringposStack.pop_back();
			m_lock_Stack.Unlock();
#endif

			switch( tag.command )
			{
				case GS_RINGTYPE_RESTART:
					AtomicExchange(m_RingPos, 0);
					
					// stall for a bit to let the MainThread have time to update m_WritePos
					// (it holds m_lock_RingRestart while it queues the restart and resets
					// the write position).
					m_lock_RingRestart.Lock();
					m_lock_RingRestart.Unlock();
				continue;

				case GS_RINGTYPE_P1:
				{
					const int qsize = tag.data[0];
					const u128* data = m_RingBuffer.GetPtr( m_RingPos+1 );

					// make sure that tag>>16 is the MAX size readable
					// The base pointer and byte offset are picked so that base + offset
					// lands on 'data', and base + 0x4000 lands on the end of the payload.
					GSgifTransfer1((u32*)(data - 0x400 + qsize), 0x4000-qsize*16);
					//GSgifTransfer1((u32*)data, qsize);
					ringposinc += qsize;
				}
				break;

				case GS_RINGTYPE_P2:
				{
					const int qsize = tag.data[0];
					const u128* data = m_RingBuffer.GetPtr( m_RingPos+1 );
					GSgifTransfer2((u32*)data, qsize);
					ringposinc += qsize;
				}
				break;

				case GS_RINGTYPE_P3:
				{
					const int qsize = tag.data[0];
					const u128* data = m_RingBuffer.GetPtr( m_RingPos+1 );
					GSgifTransfer3((u32*)data, qsize);
					ringposinc += qsize;
				}
				break;

				case GS_RINGTYPE_VSYNC:
				{
					GSvsync(tag.data[0]);
					gsFrameSkip( !tag.data[1] );

					// This frame is done: take it off the queued frame counter
					// (incremented by PostVsyncEnd).
					m_lock_FrameQueueCounter.Lock();
					AtomicDecrement( m_QueuedFrames );
					jASSUME( m_QueuedFrames >= 0 );
					//Console::Status( " << Frame Removed!" );
					m_lock_FrameQueueCounter.Unlock();

					if( PADupdate != NULL )
					{
						PADupdate(0);
						PADupdate(1);
					}
				}
				break;

				case GS_RINGTYPE_FRAMESKIP:
					_gs_ResetFrameskip();
				break;

				// Register writes: data[0] is the byte offset into m_gsMem, data[1]
				// (and data[2] for 64 bit writes) carries the value.
				case GS_RINGTYPE_MEMWRITE8:
					m_gsMem[tag.data[0]] = (u8)tag.data[1];
				break;
				case GS_RINGTYPE_MEMWRITE16:
					*(u16*)(m_gsMem+tag.data[0]) = (u16)tag.data[1];
				break;
				case GS_RINGTYPE_MEMWRITE32:
					*(u32*)(m_gsMem+tag.data[0]) = tag.data[1];
				break;
				case GS_RINGTYPE_MEMWRITE64:
					*(u64*)(m_gsMem+tag.data[0]) = *(u64*)&tag.data[1];
				break;

				case GS_RINGTYPE_FREEZE:
				{
					// The pointer was stored into data[1] onwards by SendPointerPacket.
					freezeData* data = (freezeData*)(*(uptr*)&tag.data[1]);
					int mode = tag.data[0];
					GetPluginManager().Freeze( PluginId_GS, mode, data );
					break;
				}

				case GS_RINGTYPE_RECORD:
				{
					int record = tag.data[0];
					if( GSsetupRecording != NULL ) GSsetupRecording(record, NULL);
					if( SPU2setupRecording != NULL ) SPU2setupRecording(record, NULL);
					break;
				}

				case GS_RINGTYPE_RESET:
					MTGS_LOG( "MTGS > Receiving Reset...\n" );
					if( GSreset != NULL ) GSreset();
					break;

				case GS_RINGTYPE_SOFTRESET:
				{
					int mask = tag.data[0];
					MTGS_LOG( "MTGS > Receiving GIF Soft Reset (mask: %d)\n", mask );
					GSgifSoftReset( mask );
					break;
				}

				case GS_RINGTYPE_WRITECSR:
					GSwriteCSR( tag.data[0] );
				break;

				case GS_RINGTYPE_MODECHANGE:
					_gs_ChangeTimings( tag.data[0], tag.data[1] );
				break;

				case GS_RINGTYPE_STARTTIME:
					m_iSlowStart += tag.data[0];
				break;

				case GS_RINGTYPE_QUIT:
					// Close the plugin and let Cancel() know that we're done.
					GetPluginManager().Close( PluginId_GS );
					m_sem_Quitter.Post();
				return 0;

#ifdef PCSX2_DEVBUILD
				default:
					Console::Error("GSThreadProc, bad packet (%x) at m_RingPos: %x, m_WritePos: %x", params tag.command, m_RingPos, m_WritePos);
					assert(0);
					// Drop everything that's queued by jumping to the write position.
					m_RingPos = m_WritePos;
					continue;
#else
				// Optimized performance in non-Dev builds.
				jNO_DEFAULT;
#endif
			}

			// Advance past the packet.  A packet may end exactly at the end of the
			// buffer, in which case the mask wraps the position back to 0.
			uint newringpos = m_RingPos + ringposinc;
			assert( newringpos <= m_RingBufferSize );
			newringpos &= m_RingBufferMask;
			AtomicExchange( m_RingPos, newringpos );
		}
		AtomicExchange( m_RingBufferIsBusy, 0 );
	}
}

// Waits for the GS to empty out the entire ring buffer contents.
// Used primarily for plugin startup/shutdown.
// The GS thread is woken up first, and then timeslices are given up until the
// read position has caught up with the write position.
void mtgsThreadObject::WaitGS()
{
	SetEvent();

	for( ;; )
	{
		if( volatize(m_RingPos) == volatize(m_WritePos) ) break;
		Timeslice();
	}
}

// Wakes up the GS thread by posting its event semaphore, and restarts the
// command / data tallies which SendDataPacket uses to decide when to kick
// the thread.
void mtgsThreadObject::SetEvent()
{
	m_sem_event.Post();

	m_CopyDataTally = 0;
	m_CopyCommandTally = 0;
}

void mtgsThreadObject::PrepEventWait()
{
	//Console::Notice( "MTGS Stall!  EE waits for nothing! ... except your GPU sometimes." );
	SetEvent();
	Timeslice();
}

// Counterpart of PrepEventWait, called once a stall on the GS thread has ended.
// Currently there is nothing to do here.
void mtgsThreadObject::PostEventWait() const
{
}

// Returns a byte pointer to the ring buffer slot at m_packet_ringpos, which
// PrepDataPacket sets to the slot following the packet's command qword.
u8* mtgsThreadObject::GetDataPacketPtr() const
{
	const u128* const payload = m_RingBuffer.GetPtr( m_packet_ringpos );
	return (u8*)payload;
}

// Closes the data packet send command, and initiates the gs thread (if needed).
void mtgsThreadObject::SendDataPacket()
{
	// make sure a previous copy block has been started somewhere.
	jASSUME( m_packet_size != 0 );

	uint temp = m_packet_ringpos + m_packet_size;
	jASSUME( temp <= m_RingBufferSize );
	temp &= m_RingBufferMask;

	if( IsDebugBuild )
	{
		if( m_packet_ringpos + m_packet_size < m_RingBufferSize )
		{
			uint readpos = volatize(m_RingPos);
			if( readpos != m_WritePos )
			{
				// The writepos should never leapfrog the readpos
				// since that indicates a bad write.
				if( m_packet_ringpos < readpos )
					assert( temp < readpos );
			}

			// Updating the writepos should never make it equal the readpos, since
			// that would stop the buffer prematurely (and indicates bad code in the
			// ringbuffer manager)
			assert( readpos != temp );
		}
	}

	AtomicExchange( m_WritePos, temp );

	m_packet_size = 0;

	if( !m_RingBufferIsBusy )
	{
		// The ringbuffer is current in a resting state, so if enough copies have
		// queued up then go ahead and initiate the GS thread..
		
		// Optimization notes:  What we're doing here is initiating a "burst" mode on
		// the thread, which improves its cache hit performance and makes it more friendly
		// to other threads in Pcsx2 and such.  Primary is the Command Tally, and then a 
		// secondary data size threshold for games that do lots of texture swizzling.
		
		// 16 was the best value I found so far.
		// tested values:
		//  24 - very slow on HT machines (+5% drop in fps)
		//  8 - roughly 2% slower on HT machines.

		m_CopyDataTally += m_packet_size;
		if( ( m_CopyDataTally > 0x8000 ) || ( ++m_CopyCommandTally > 16 ) )
		{
			//Console::Status( "MTGS Kick! DataSize : 0x%5.8x, CommandTally : %d", m_CopyDataTally, m_CopyCommandTally );
			SetEvent();
		}
	}
	//m_PacketLocker.Unlock();
}

// Convenience overload for 64 bit source pointers; forwards to the byte
// pointer version.  'size' is passed through unchanged.
int mtgsThreadObject::PrepDataPacket( GIF_PATH pathidx, const u64* srcdata, u32 size )
{
	const u8* const bytes = (const u8*)srcdata;
	return PrepDataPacket( pathidx, bytes, size );
}

// Convenience overload for 32 bit source pointers; forwards to the byte
// pointer version.  'size' is passed through unchanged.
int mtgsThreadObject::PrepDataPacket( GIF_PATH pathidx, const u32* srcdata, u32 size )
{
	const u8* const bytes = (const u8*)srcdata;
	return PrepDataPacket( pathidx, bytes, size );
}

// Counters for the optional transfer statistics gathered at the top of
// PrepDataPacket (only compiled in when PCSX2_GSRING_TX_STATS is defined).
#ifdef PCSX2_GSRING_TX_STATS
static u32 ringtx_s=0;				// sum of all packet sizes
static u32 ringtx_s_ulg=0;			// sum of (size & 0x7F) of all packets
static u32 ringtx_s_min=0xFFFFFFFF;	// smallest packet size seen
static u32 ringtx_s_max=0;			// largest packet size seen
static u32 ringtx_c=0;				// number of packets
static u32 ringtx_inf[32][32];		// packet counts, indexed by [highest set bit][lowest set bit] of the size
static u32 ringtx_inf_s[32];		// sum of packet sizes, indexed by highest set bit of the size
#endif

#ifdef PCSX2_GSRING_SAMPLING_STATS
static u32 GSRingBufCopySz = 0;
#endif

// Reserves room in the ring buffer for a GIF data packet, and writes the
// packet's command qword.  The caller then copies the payload to the address
// given by GetDataPacketPtr and closes the packet with SendDataPacket.
//
// The data is run through _gifTransferDummy first, which reports how much of
// it was left unprocessed; only the processed part is reserved.
//
// Parameters:
//  pathidx - GIF path the data belongs to.
//  srcdata - the packet data.
//  size - size of the packet data, in smd128's
//
// Returns the size of the reserved payload in simd128 values (that is, size
// minus the amount which was left unprocessed).  The value is used by VU1 XGKICK
// to hack-fix data packets which are too large for VU1 memory.
int mtgsThreadObject::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size )
{
	//m_PacketLocker.Lock();

#ifdef PCSX2_GSRING_TX_STATS
	ringtx_s += size;
	ringtx_s_ulg += size&0x7F;
	ringtx_s_min = min(ringtx_s_min,size);
	ringtx_s_max = max(ringtx_s_max,size);
	ringtx_c++;
	u32 tx_sz;
	
	if (_BitScanReverse(&tx_sz,size))
	{
		u32 tx_algn;
		_BitScanForward(&tx_algn,size);
		ringtx_inf[tx_sz][tx_algn]++;
		ringtx_inf_s[tx_sz]+=size;
	}
	// Dump and reset the statistics every time the size sum reaches 128*1024*1024.
	if (ringtx_s>=128*1024*1024)
	{
		Console::Status("GSRingBufCopy:128MB in %d tx -> b/tx: AVG = %.2f , max = %d, min = %d",ringtx_c,ringtx_s/(float)ringtx_c,ringtx_s_max,ringtx_s_min);
		for (int i=0;i<32;i++)
		{
			u32 total_bucket=0;
			u32 bucket_subitems=0;
			for (int j=0;j<32;j++)
			{
				if (ringtx_inf[i][j])
				{
					total_bucket+=ringtx_inf[i][j];
					bucket_subitems++;
					Console::Notice("GSRingBufCopy :tx [%d,%d] algn %d : count= %d [%.2f%%]",1<<i,(1<<(i+1))-16,1<<j,ringtx_inf[i][j],ringtx_inf[i][j]/(float)ringtx_c*100);
					ringtx_inf[i][j]=0;
				}
			}
			if (total_bucket)
				Console::Notice("GSRingBufCopy :tx [%d,%d] total : count= %d [%.2f%%] [%.2f%%]",1<<i,(1<<(i+1))-16,total_bucket,total_bucket/(float)ringtx_c*100,ringtx_inf_s[i]/(float)ringtx_s*100);
			ringtx_inf_s[i]=0;
		}
		Console::Notice("GSRingBufCopy :tx ulg count =%d [%.2f%%]",ringtx_s_ulg,ringtx_s_ulg/(float)ringtx_s*100);
		ringtx_s_ulg=0;
		ringtx_c=0;
		ringtx_s=0;
		ringtx_s_min=0xFFFFFFFF;
		ringtx_s_max=0;
	}
#endif
	// Note on volatiles: m_WritePos is not modified by the GS thread,
	// so there's no need to use volatile reads here.  We still have to use
	// interlocked exchanges when we modify it, however, since the GS thread
	// is reading it.

	uint writepos = m_WritePos;
	
	// Checks if a previous copy was started without an accompanying call to SendDataPacket
	jASSUME( m_packet_size == 0 );

	// Sanity checks! (within the confines of our ringbuffer please!)
	jASSUME( size < m_RingBufferSize );
	jASSUME( writepos < m_RingBufferSize );

	//fixme: Vif sometimes screws up and size is unaligned, try this then (rama)
	//Is this still a problem?  It should be fixed on the specific VIF command now. (air)
	//It seems to be fixed in Fatal Frame, leaving the code here still in case we get that again (rama)
	/*if( (size&15) != 0){
		Console::Error( "MTGS problem, size unaligned"); 
		size = (size+15)&(~15);
	}*/

	// retval has the amount of data *not* processed, so we only need to reserve
	// enough room for size - retval:
	int retval = _gifTransferDummy( pathidx, srcdata, size );

	// For the path with index 2, the gif's madr and qwc are advanced by the
	// amount of data that was processed.
	if(pathidx == 2)
	{
		gif->madr += (size - retval) * 16;
		gif->qwc -= size - retval;
	}
	//if(retval < 0) DevCon::Notice("Increasing size from %x to %x path %x", params size, size-retval, pathidx+1);
	size = size - retval;
	m_packet_size = size;
	size++;			// takes into account our command qword.

	if( writepos + size < m_RingBufferSize )
	{
		// generic gs wait/stall.
		// if the writepos is past the readpos then we're safe.
		// But if not then we need to make sure the readpos is outside the scope of
		// the block about to be written (writepos + size)

		if( writepos < volatize(m_RingPos) )
		{
			// writepos is behind the readpos, so we need to wait until
			// readpos is out past the end of the future write pos, or until it wraps
			// around (in which case writepos will be >= readpos)

			PrepEventWait(); 
			while( true )
			{
				uint readpos = volatize(m_RingPos);
				if( writepos >= readpos ) break;
				if( writepos+size < readpos ) break;
				SpinWait();
			}
			PostEventWait();
		}
	}
	else if( writepos + size > m_RingBufferSize )
	{
		// If the incoming packet doesn't fit, then start over from
		// the start of the ring buffer (it's a lot easier than trying
		// to wrap the packet around the end of the buffer).

		// We have to be careful not to leapfrog our read-position.  If it's 
		// greater than the current write position then we need to stall
		// until it loops around to the beginning of the buffer

		PrepEventWait();
		while( true )
		{
			uint readpos = volatize(m_RingPos);

			// is the buffer empty?
			if( readpos == writepos ) break;

			// Also: Wait for the readpos to go past the start of the buffer
			// Otherwise it'll stop dead in its tracks when we set the new write
			// position below (bad!)
			if( readpos < writepos && readpos != 0 ) break;

			SpinWait();
		}

		// The lock keeps the GS thread (which takes it while handling the RESTART
		// packet) from going on until the write position has been reset to zero.
		m_lock_RingRestart.Lock();
		SendSimplePacket( GS_RINGTYPE_RESTART, 0, 0, 0 );
		writepos = 0;
		AtomicExchange( m_WritePos, writepos );
		m_lock_RingRestart.Unlock();
		SetEvent();

		// stall until the read position is past the end of our incoming block,
		// or until it reaches the current write position (signals an empty buffer).
		while( true )
		{
			uint readpos = volatize(m_RingPos);

			if( readpos == m_WritePos ) break;
			if( writepos+size < readpos ) break;

			SpinWait();
		}
		PostEventWait();
	}
    else	// always true - if( writepos + size == MTGS_RINGBUFFEREND )
	{
		// Yay.  Perfect fit.  What are the odds?
		//Console::WriteLn( "MTGS > Perfect Fit!");

		PrepEventWait();
		while( true )
		{
			uint readpos = volatize(m_RingPos);

			// stop waiting if the buffer is empty!
			if( writepos == readpos ) break;

			// Copy is ready so long as readpos is less than writepos and *not*
			// equal to the base of the ringbuffer (otherwise the buffer will stop
			// when the writepos is wrapped around to zero later-on in SendDataPacket)
			if( readpos < writepos && readpos != 0 ) break;

			SpinWait();
		}
		PostEventWait();
    }

#ifdef RINGBUF_DEBUG_STACK
	m_lock_Stack.Lock();
	ringposStack.push_front( writepos );
	m_lock_Stack.Unlock();
#endif

	// Command qword: the command is the path index plus one, and data[0] is the
	// packet length in SIMDs (128 bits).  The payload starts at the next slot.

	PacketTagType& tag = (PacketTagType&)m_RingBuffer[m_WritePos];
	tag.command = pathidx+1;
	tag.data[0] = m_packet_size;
	m_packet_ringpos = m_WritePos + 1;

	return m_packet_size;
}

// Makes room for a single-qword packet at the current write position.
// Returns the write position which is to be published (via _FinishSimplePacket)
// after the packet has been filled in.  If that position equals the GS thread's
// read position, the thread is woken up and waited on until it has moved along.
__forceinline uint mtgsThreadObject::_PrepForSimplePacket()
{
#ifdef RINGBUF_DEBUG_STACK
	m_lock_Stack.Lock();
	ringposStack.push_front( m_WritePos );
	m_lock_Stack.Unlock();
#endif

	const uint unwrapped = m_WritePos+1;
	jASSUME( unwrapped <= m_RingBufferSize );

	const uint future_writepos = unwrapped & m_RingBufferMask;

	if( future_writepos != volatize(m_RingPos) )
		return future_writepos;

	PrepEventWait();
	do
	{
		SpinWait();
	} while( future_writepos == volatize(m_RingPos) );
	PostEventWait();

	return future_writepos;
}

// Publishes a simple packet by moving the write position to future_writepos
// (the value obtained from _PrepForSimplePacket).
__forceinline void mtgsThreadObject::_FinishSimplePacket( uint future_writepos )
{
	// The new write position must never land on the read position.
	assert( future_writepos != volatize(m_RingPos) );

	AtomicExchange( m_WritePos, future_writepos );
}

// Queues a single-qword packet consisting of a command and three data words.
// The GS thread is not explicitly woken up by this call.
void mtgsThreadObject::SendSimplePacket( GS_RINGTYPE type, int data0, int data1, int data2 )
{
	//ScopedLock locker( m_PacketLocker );

	const uint thefuture = _PrepForSimplePacket();

	PacketTagType* const slot = &(PacketTagType&)m_RingBuffer[m_WritePos];
	slot->command = type;
	slot->data[0] = data0;
	slot->data[1] = data1;
	slot->data[2] = data2;

	_FinishSimplePacket( thefuture );
}

// Queues a single-qword packet consisting of a command, one data word, and a
// pointer.  The pointer is stored as a uptr starting at the second data word.
void mtgsThreadObject::SendPointerPacket( GS_RINGTYPE type, u32 data0, void* data1 )
{
	//ScopedLock locker( m_PacketLocker );

	const uint thefuture = _PrepForSimplePacket();

	PacketTagType* const slot = &(PacketTagType&)m_RingBuffer[m_WritePos];
	slot->command = type;
	slot->data[0] = data0;
	*(uptr*)&slot->data[1] = (uptr)data1;

	_FinishSimplePacket( thefuture );	
}

// Waits for the GS to empty out the entire ring buffer contents.
// Used primarily for plugin startup/shutdown.
// Does nothing when no MTGS thread exists.
void mtgsWaitGS()
{
	if( mtgsThread != NULL )
		mtgsThread->WaitGS();
}

// Creates and starts the MTGS thread, unless one exists already.
//
// Exceptions:
//   ThreadCreationError - Thread could not be created (indicates OS resource limitations)
//   PluginFailure - GS plugin's "GSopen" call failed.
//
void mtgsOpen()
{
	// better not be a thread already running, yo!
	const bool alreadyOpen = ( mtgsThread != NULL );
	if( alreadyOpen ) return;

	mtgsThread = new mtgsThreadObject();

	try
	{
		mtgsThread->Start();
	}
	catch( ... )
	{
		// The handle doubles as the "is the thread running" test (NULL means it
		// isn't), so it has to be cleared again if the start fails for any reason
		// before the exception is passed on.
		safe_delete( mtgsThread );
		throw;
	}
}


// Soft-resets the GIF paths selected by 'mask' (bit 0, 1 and 2 select path
// index 0, 1 and 2): the local parse state of each selected path is cleared,
// and a SOFTRESET packet is queued if the GS plugin provides GSgifSoftReset.
void mtgsThreadObject::GIFSoftReset( int mask )
{
	for( int pathidx = 0; pathidx < 3; ++pathidx )
	{
		if( mask & (1 << pathidx) )
			memzero_obj( m_path[pathidx] );
	}

	if( GSgifSoftReset != NULL )
	{
		MTGS_LOG( "MTGS > Sending GIF Soft Reset (mask: %d)\n", mask );

		// NOTE(review): the packet goes through the global mtgsThread rather than
		// 'this' -- presumably always the same object; confirm.
		mtgsThread->SendSimplePacket( GS_RINGTYPE_SOFTRESET, mask, 0, 0 );
	}
}

// Freezes the state of this object's GIF paths through the given SaveState.
void mtgsThreadObject::Freeze( SaveState& state )
{
	_mtgsFreezeGIF( state, m_path );
}

// this function is needed because of recompiled calls from iGS.cpp
// (currently used in GCC only)
//void mtgsRingBufSimplePacket( s32 command, u32 data0, u32 data1, u32 data2 )
//{
//	mtgsThread->SendSimplePacket( (GS_RINGTYPE)command, data0, data1, data2 );
//}