From a9084741bc1dc9a427a9125ac927e5d487726a8a Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Sun, 11 Jul 2010 04:53:50 +0000 Subject: [PATCH] ReorderingMTGS: * Implemented GIFPath_CopyTag, which performs a "copy-in-place" while parsing tags (big speedup over the old parse-then-copy strategy, especially with the SSE intrinsics I've included for kicks). * Removed the old ringbuffer 'restart' mechanism and replaced it with a truly free-flowing wrapping mechanism. Utilizes the ringbuffer more efficiently, and removes quite a bit of overhead from the MTGS's PrepDataPacket call. git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3458 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/include/PS2Edefs.h | 2 + common/include/Utilities/MemcpyFast.h | 8 +- pcsx2/FiFo.cpp | 5 +- pcsx2/GS.h | 37 ++- pcsx2/Gif.cpp | 23 +- pcsx2/MTGS.cpp | 329 +++++++++++++------------- pcsx2/PluginManager.cpp | 4 +- pcsx2/VUops.cpp | 17 +- pcsx2/Vif_Codes.cpp | 13 +- pcsx2/ps2/GIFpath.cpp | 118 +++++---- pcsx2/x86/microVU_Lower.inl | 24 +- pcsx2/x86/sVU_Lower.cpp | 25 +- 12 files changed, 304 insertions(+), 301 deletions(-) diff --git a/common/include/PS2Edefs.h b/common/include/PS2Edefs.h index 5496df0587..f394cf5025 100644 --- a/common/include/PS2Edefs.h +++ b/common/include/PS2Edefs.h @@ -564,6 +564,7 @@ typedef void (CALLBACK* _PS2EsetEmuVersion)(const char* emuId, u32 version); // typedef s32 (CALLBACK* _GSopen)(void *pDsp, char *Title, int multithread); typedef s32 (CALLBACK* _GSopen2)( void *pDsp, u32 flags ); typedef void (CALLBACK* _GSvsync)(int field); +typedef void (CALLBACK* _GSgifTransfer)(u32 *pMem, u32 size); typedef void (CALLBACK* _GSgifTransfer1)(u32 *pMem, u32 addr); typedef void (CALLBACK* _GSgifTransfer2)(u32 *pMem, u32 size); typedef void (CALLBACK* _GSgifTransfer3)(u32 *pMem, u32 size); @@ -723,6 +724,7 @@ typedef void (CALLBACK* _FWirqCallback)(void (*callback)()); extern _GSopen GSopen; extern _GSopen2 GSopen2; extern _GSvsync GSvsync; +extern _GSgifTransfer GSgifTransfer; extern _GSgifTransfer1 GSgifTransfer1; extern _GSgifTransfer2 GSgifTransfer2; extern _GSgifTransfer3 GSgifTransfer3; diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h index 5c74d55087..76526a5eed 100644 --- a/common/include/Utilities/MemcpyFast.h +++ b/common/include/Utilities/MemcpyFast.h @@ -36,7 +36,7 @@ // Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason. void _memset16_unaligned( void* dest, u16 data, size_t size ); -#define memcpy_fast memcpy_amd_ // Fast memcpy -#define memcpy_aligned memcpy_amd_ // Memcpy with 16-byte Aligned addresses -#define memcpy_const memcpy_amd_ // Memcpy with constant size -#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned +#define memcpy_fast memcpy_amd_ // Fast memcpy +#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c*16) // Memcpy with 16-byte Aligned addresses +#define memcpy_const memcpy_amd_ // Memcpy with constant size +#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned diff --git a/pcsx2/FiFo.cpp b/pcsx2/FiFo.cpp index ed87881d03..f93b80242b 100644 --- a/pcsx2/FiFo.cpp +++ b/pcsx2/FiFo.cpp @@ -195,10 +195,9 @@ void __fastcall WriteFIFO_page_6(u32 mem, const mem128_t *value) nloop0_packet[1] = psHu32(GIF_FIFO + 4); nloop0_packet[2] = psHu32(GIF_FIFO + 8); nloop0_packet[3] = psHu32(GIF_FIFO + 12); - GetMTGS().PrepDataPacket(GIF_PATH_3, (u8*)nloop0_packet, 1); + GetMTGS().PrepDataPacket(GIF_PATH_3, 1); u64* data = (u64*)GetMTGS().GetDataPacketPtr(); - data[0] = value[0]; - data[1] = value[1]; + GIFPath_CopyTag( GIF_PATH_3, (u128*)nloop0_packet, 1 ); GetMTGS().SendDataPacket(); if(GSTransferStatus.PTH3 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH3 ) { diff --git a/pcsx2/GS.h b/pcsx2/GS.h index 3d1dc74d78..d3232ef2aa 100644 --- a/pcsx2/GS.h +++ b/pcsx2/GS.h @@ -229,7 +229,7 @@ enum GIF_PATH GIF_PATH_3, }; -extern int GIFPath_ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size); +extern int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size); extern int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size); extern void GIFPath_Reset(); extern void GIFPath_Clear( GIF_PATH pathidx ); @@ -282,7 +282,6 @@ public: volatile s32 m_SignalRingPosition; int m_QueuedFrameCount; - u32 m_RingWrapSpot; Mutex m_lock_RingBufferBusy; Semaphore m_sem_OnRingReset; @@ -301,6 +300,7 @@ public: // These vars maintain instance data for sending Data Packets. // Only one data packet can be constructed and uploaded at a time. + uint m_packet_startpos; // size of the packet (data only, ie. not including the 16 byte command!) uint m_packet_size; // size of the packet (data only, ie. not including the 16 byte command!) uint m_packet_ringpos; // index of the data location in the ringbuffer. @@ -317,14 +317,13 @@ public: void WaitGS(); void ResetGS(); - int PrepDataPacket( MTGS_RingCommand cmd, u32 size ); - int PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size ); + void PrepDataPacket( MTGS_RingCommand cmd, u32 size ); + void PrepDataPacket( GIF_PATH pathidx, u32 size ); void SendDataPacket(); void SendGameCRC( u32 crc ); void WaitForOpen(); void Freeze( int mode, MTGS_FreezeData& data ); - void RestartRingbuffer( uint packsize=0 ); void SendSimplePacket( MTGS_RingCommand type, int data0, int data1, int data2 ); void SendPointerPacket( MTGS_RingCommand type, u32 data0, void* data1 ); @@ -416,3 +415,31 @@ extern int g_nLeftGSFrames; #endif +// Size of the ringbuffer as a power of 2 -- size is a multiple of simd128s. +// (actual size is 1<stat.P1Q = false; while(Path1WritePos > 0) { - u32 size = GetMTGS().PrepDataPacket(GIF_PATH_1, Path1Buffer + (Path1ReadPos * 16), (Path1WritePos - Path1ReadPos)); - u8* pDest = GetMTGS().GetDataPacketPtr(); + uint size = (Path1WritePos - Path1ReadPos); + GetMTGS().PrepDataPacket(GIF_PATH_1, size); //DevCon.Warning("Flush Size = %x", size); - - memcpy_aligned(pDest, Path1Buffer + (Path1ReadPos * 16), size * 16); - GetMTGS().SendDataPacket(); - - Path1ReadPos += size; - + uint count = GIFPath_CopyTag(GIF_PATH_1, ((u128*)Path1Buffer) + Path1ReadPos, size); + GetMTGS().SendDataPacket(); + pxAssume( count == size ); + Path1ReadPos += count; + if(GSTransferStatus.PTH1 == STOPPED_MODE) { gifRegs->stat.OPH = false; @@ -150,11 +149,9 @@ __forceinline void gsInterrupt() static u32 WRITERING_DMA(u32 *pMem, u32 qwc) { - int size = GetMTGS().PrepDataPacket(GIF_PATH_3, (u8*)pMem, qwc); - u8* pgsmem = GetMTGS().GetDataPacketPtr(); - - memcpy_aligned(pgsmem, pMem, size<<4); - + GetMTGS().PrepDataPacket(GIF_PATH_3, qwc); + //uint len1 = GIFPath_ParseTag(GIF_PATH_3, (u8*)pMem, qwc ); + uint size = GIFPath_CopyTag(GIF_PATH_3, (u128*)pMem, qwc ); GetMTGS().SendDataPacket(); return size; } diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index a6905b9788..04a3a42db1 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -29,7 +29,7 @@ using namespace Threading; -#if 0 // PCSX2_DEBUG +#if 0 //PCSX2_DEBUG # define MTGS_LOG Console.WriteLn #else # define MTGS_LOG 0&& @@ -46,34 +46,7 @@ using namespace Threading; // MTGS Threaded Class Implementation // ===================================================================================================== -// Size of the ringbuffer as a power of 2 -- size is a multiple of simd128s. -// (actual size is 1< 0 ) - RestartRingbuffer(); + if( AtomicIncrement(m_QueuedFrameCount) == 0 ) return; + + uint readpos = volatize(m_RingPos); + uint freeroom; + + if (m_WritePos < readpos) + freeroom = readpos - m_WritePos; else - { - m_QueuedFrameCount++; - SetEvent(); - } + freeroom = RingBufferSize - (m_WritePos - readpos); + + uint totalAccum = RingBufferSize - freeroom; + uint somedone = totalAccum / 4; + + m_SignalRingPosition = totalAccum; + + //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Vsync Sleep!\t\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition ); + + AtomicExchange( m_SignalRingEnable, 1 ); + SetEvent(); + m_sem_OnRingReset.WaitWithoutYield(); + readpos = volatize(m_RingPos); + + pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); } struct PacketTagType @@ -197,7 +190,7 @@ void SysMtgsThread::OpenPlugin() { if( m_PluginOpened ) return; - memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) ); + memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS)/16 ); GSsetBaseMem( RingBuffer.Regs ); GSirqCallback( dummyIrqCallback ); @@ -330,38 +323,75 @@ void SysMtgsThread::ExecuteTaskInThread() { case GS_RINGTYPE_P1: { + uint datapos = (m_RingPos+1) & RingBufferMask; const int qsize = tag.data[0]; - const u128* data = &RingBuffer[m_RingPos+1]; + const u128* data = &RingBuffer[datapos]; MTGS_LOG( "(MTGS Packet Read) ringtype=P1, qwc=%u", qsize ); - // make sure that tag>>16 is the MAX size readable - GSgifTransfer1((u32*)(data - 0x400 + qsize), 0x4000-qsize*16); - //GSgifTransfer1((u32*)data, qsize); + uint endpos = datapos + qsize; + if( endpos >= RingBufferSize ) + { + uint firstcopylen = RingBufferSize - datapos; + GSgifTransfer( (u32*)data, firstcopylen ); + datapos = endpos & RingBufferMask; + GSgifTransfer( (u32*)RingBuffer.m_Ring, datapos ); + } + else + { + GSgifTransfer( (u32*)data, qsize ); + } + ringposinc += qsize; } break; case GS_RINGTYPE_P2: { + uint datapos = (m_RingPos+1) & RingBufferMask; const int qsize = tag.data[0]; - const u128* data = &RingBuffer[m_RingPos+1]; + const u128* data = &RingBuffer[datapos]; MTGS_LOG( "(MTGS Packet Read) ringtype=P2, qwc=%u", qsize ); - GSgifTransfer2((u32*)data, qsize); + uint endpos = datapos + qsize; + if( endpos >= RingBufferSize ) + { + uint firstcopylen = RingBufferSize - datapos; + GSgifTransfer2( (u32*)data, firstcopylen ); + datapos = endpos & RingBufferMask; + GSgifTransfer2( (u32*)RingBuffer.m_Ring, datapos ); + } + else + { + GSgifTransfer2( (u32*)data, qsize ); + } + ringposinc += qsize; } break; case GS_RINGTYPE_P3: { + uint datapos = (m_RingPos+1) & RingBufferMask; const int qsize = tag.data[0]; - const u128* data = &RingBuffer[m_RingPos+1]; + const u128* data = &RingBuffer[datapos]; MTGS_LOG( "(MTGS Packet Read) ringtype=P3, qwc=%u", qsize ); - GSgifTransfer3((u32*)data, qsize); + uint endpos = datapos + qsize; + if( endpos >= RingBufferSize ) + { + uint firstcopylen = RingBufferSize - datapos; + GSgifTransfer3( (u32*)data, firstcopylen ); + datapos = endpos & RingBufferMask; + GSgifTransfer3( (u32*)RingBuffer.m_Ring, datapos ); + } + else + { + GSgifTransfer3( (u32*)data, qsize ); + } + ringposinc += qsize; } break; @@ -380,7 +410,7 @@ void SysMtgsThread::ExecuteTaskInThread() const int qsize = tag.data[0]; ringposinc += qsize; - MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", tag.data[0], tag.data[1] ? "true" : "false" ); + MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", !!(((u32&)RingBuffer.Regs[0x1000]) & 0x2000) ? 0 : 1, tag.data[1] ? "true" : "false" ); // Mail in the important GS registers. RingCmdPacket_Vsync& local((RingCmdPacket_Vsync&)RingBuffer[m_RingPos+1]); @@ -398,6 +428,7 @@ void SysMtgsThread::ExecuteTaskInThread() if( (GSopen2 == NULL) && (PADupdate != NULL) ) PADupdate(0); + AtomicDecrement( m_QueuedFrameCount ); StateCheckInThread(); } break; @@ -450,9 +481,14 @@ void SysMtgsThread::ExecuteTaskInThread() } } - uint newringpos = m_RingPos + ringposinc; - pxAssert( newringpos <= RingBufferSize ); - m_RingPos = newringpos & RingBufferMask; + uint newringpos = (m_RingPos + ringposinc) & RingBufferMask; + + if( EmuConfig.GS.SynchronousMTGS ) + { + pxAssert( m_WritePos == newringpos ); + } + + m_RingPos = newringpos; if( m_SignalRingEnable != 0 ) { @@ -546,7 +582,7 @@ void SysMtgsThread::SetEvent() u8* SysMtgsThread::GetDataPacketPtr() const { - return (u8*)&RingBuffer[m_packet_ringpos]; + return (u8*)&RingBuffer[m_packet_ringpos & RingBufferMask]; } // Closes the data packet send command, and initiates the gs thread (if needed). @@ -555,6 +591,7 @@ void SysMtgsThread::SendDataPacket() // make sure a previous copy block has been started somewhere. pxAssert( m_packet_size != 0 ); + #if 0 uint temp = m_packet_ringpos + m_packet_size; pxAssert( temp <= RingBufferSize ); temp &= RingBufferMask; @@ -578,8 +615,16 @@ void SysMtgsThread::SendDataPacket() pxAssert( readpos != temp ); } } + #endif - m_WritePos = temp; + uint actualSize = ((m_packet_ringpos - m_packet_startpos) & RingBufferMask)-1; + pxAssert( actualSize <= m_packet_size ); + pxAssert( m_packet_ringpos < RingBufferSize ); + + PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos]; + tag.data[0] = actualSize; + + m_WritePos = m_packet_ringpos; if( EmuConfig.GS.SynchronousMTGS ) { @@ -596,7 +641,7 @@ void SysMtgsThread::SendDataPacket() //m_PacketLocker.Release(); } -int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) +void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) { // Note on volatiles: m_WritePos is not modified by the GS thread, so there's no need // to use volatile reads here. We do cache it though, since we know it never changes, @@ -613,119 +658,63 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) m_packet_size = size; ++size; // takes into account our RingCommand QWC. - if( writepos + size < RingBufferSize ) + // generic gs wait/stall. + // if the writepos is past the readpos then we're safe. + // But if not then we need to make sure the readpos is outside the scope of + // the block about to be written (writepos + size) + + uint readpos = volatize(m_RingPos); + uint endpos = writepos+size; + uint freeroom; + + if (writepos < readpos) + freeroom = readpos - writepos; + else + freeroom = RingBufferSize - (writepos - readpos); + + if (freeroom < size) { - // generic gs wait/stall. - // if the writepos is past the readpos then we're safe. - // But if not then we need to make sure the readpos is outside the scope of - // the block about to be written (writepos + size) + // writepos will overlap readpos if we commit the data, so we need to wait until + // readpos is out past the end of the future write pos, or until it wraps around + // (in which case writepos will be >= readpos). - uint readpos = volatize(m_RingPos); - if( (writepos < readpos) && (writepos+size >= readpos) ) + // Ideally though we want to wait longer, because if we just toss in this packet + // the next packet will likely stall up too. So lets set a condition for the MTGS + // thread to wake up the EE once there's a sizable chunk of the ringbuffer emptied. + + uint somedone = (RingBufferSize - freeroom) / 4; + if( somedone < size+1 ) somedone = size + 1; + + // FMV Optimization: FMVs typically send *very* little data to the GS, in some cases + // every other frame is nothing more than a page swap. Sleeping the EEcore is a + // waste of time, and we get better results using a spinwait. + + if( somedone > 0x80 ) { - // writepos is behind the readpos and will overlap it if we commit the data, - // so we need to wait until readpos is out past the end of the future write pos, - // or until it wraps around (in which case writepos will be >= readpos). + pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" ); + m_SignalRingPosition = somedone; - // Ideally though we want to wait longer, because if we just toss in this packet - // the next packet will likely stall up too. So lets set a condition for the MTGS - // thread to wake up the EE once there's a sizable chunk of the ringbuffer emptied. + //Console.WriteLn( Color_Blue, "(EEcore Sleep) GenStall \tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", readpos, writepos, m_RingWrapSpot, m_SignalRingPosition ); - uint totalAccum = (m_RingWrapSpot - readpos) + writepos; - uint somedone = totalAccum / 4; - if( somedone < size+1 ) somedone = size + 1; - - // FMV Optimization: FMVs typically send *very* little data to the GS, in some cases - // every other frame is nothing more than a page swap. Sleeping the EEcore is a - // waste of time, and we get better results using a spinwait. - - if( somedone > 0x80 ) - { - pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" ); - m_SignalRingPosition = somedone; - - //Console.WriteLn( Color_Blue, "(EEcore Sleep) GenStall \tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", readpos, writepos, m_RingWrapSpot, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); - } while( (writepos < readpos) && (writepos+size >= readpos) ); - - pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); - } - else - { + do { + AtomicExchange( m_SignalRingEnable, 1 ); SetEvent(); - do { - SpinWait(); - readpos = volatize(m_RingPos); - } while( (writepos < readpos) && (writepos+size >= readpos) ); - } + m_sem_OnRingReset.WaitWithoutYield(); + readpos = volatize(m_RingPos); + //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); + } while( (writepos < readpos) && (writepos+size >= readpos) ); + + pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); + } + else + { + SetEvent(); + do { + SpinWait(); + readpos = volatize(m_RingPos); + } while( (writepos < readpos) && (writepos+size >= readpos) ); } } - else if( writepos + size > RingBufferSize ) - { - pxAssert( writepos != 0 ); - - // If the incoming packet doesn't fit, then start over from the start of the ring - // buffer (it's a lot easier than trying to wrap the packet around the end of the - // buffer). - - //Console.WriteLn( "MTGS > Ringbuffer Got Filled!"); - RestartRingbuffer( size ); - writepos = m_WritePos; - } - else // always true - if( writepos + size == MTGS_RINGBUFFEREND ) - { - // Yay. Perfect fit. What are the odds? - // Copy is ready so long as readpos is less than writepos and *not* equal to the - // base of the ringbuffer (otherwise the buffer will stop when the writepos is - // wrapped around to zero later-on in SendDataPacket). - - uint readpos = volatize(m_RingPos); - //Console.WriteLn( "MTGS > Perfect Fit!\tringpos=0x%06x, writepos=0x%06x", readpos, writepos ); - if( readpos > writepos || readpos == 0 ) - { - uint totalAccum = (readpos == 0) ? RingBufferSize : ((m_RingWrapSpot - readpos) + writepos); - uint somedone = totalAccum / 4; - if( somedone < size+1 ) somedone = size + 1; - - // FMV Optimization: (see above) This condition of a perfect fit is so rare that optimizing - // for it is pointless -- but it was also mindlessly simple copy-paste. So there. :p - - if( somedone > 0x80 ) - { - m_SignalRingPosition = somedone; - - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Sleep!\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Post-sleep Report!\tringpos=0x%06x", readpos ); - } while( (writepos < readpos) || (readpos==0) ); - - pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); - } - else - { - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Spin!" ); - SetEvent(); - do { - SpinWait(); - readpos = volatize(m_RingPos); - } while( (writepos < readpos) || (readpos==0) ); - } - } - - m_QueuedFrameCount = 0; - m_RingWrapSpot = RingBufferSize; - } #ifdef RINGBUF_DEBUG_STACK m_lock_Stack.Lock(); @@ -739,9 +728,8 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; tag.command = cmd; tag.data[0] = m_packet_size; - m_packet_ringpos = m_WritePos + 1; - - return m_packet_size; + m_packet_startpos = m_WritePos; + m_packet_ringpos = (m_WritePos + 1) & RingBufferMask; } // Returns the amount of giftag data processed (in simd128 values). @@ -749,13 +737,14 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) // around VU memory instead of having buffer overflow... // Parameters: // size - size of the packet data, in smd128's -int SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size ) +void SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, u32 size ) { //m_PacketLocker.Acquire(); - return PrepDataPacket( (MTGS_RingCommand)pathidx, GIFPath_ParseTag(pathidx, srcdata, size) ); + PrepDataPacket( (MTGS_RingCommand)pathidx, size ); } +#if 0 void SysMtgsThread::RestartRingbuffer( uint packsize ) { if( m_WritePos == 0 ) return; @@ -816,6 +805,7 @@ void SysMtgsThread::RestartRingbuffer( uint packsize ) if( EmuConfig.GS.SynchronousMTGS ) WaitGS(); } +#endif __forceinline uint SysMtgsThread::_PrepForSimplePacket() { @@ -830,10 +820,7 @@ __forceinline uint SysMtgsThread::_PrepForSimplePacket() future_writepos &= RingBufferMask; if( future_writepos == 0 ) - { m_QueuedFrameCount = 0; - m_RingWrapSpot = RingBufferSize; - } uint readpos = volatize(m_RingPos); if( future_writepos == readpos ) @@ -841,7 +828,15 @@ __forceinline uint SysMtgsThread::_PrepForSimplePacket() // The ringbuffer read pos is blocking the future write position, so stall out // until the read position has moved. - uint totalAccum = (m_RingWrapSpot - readpos) + future_writepos; + uint freeroom; + + if (future_writepos < readpos) + freeroom = readpos - future_writepos; + else + freeroom = RingBufferSize - (future_writepos - readpos); + + uint totalAccum = RingBufferSize - freeroom; + uint somedone = totalAccum / 4; if( somedone > 0x80 ) diff --git a/pcsx2/PluginManager.cpp b/pcsx2/PluginManager.cpp index 080f1f5e9d..558a12180f 100644 --- a/pcsx2/PluginManager.cpp +++ b/pcsx2/PluginManager.cpp @@ -144,6 +144,7 @@ static s32 CALLBACK fallback_test() { return 0; } _GSvsync GSvsync; _GSopen GSopen; _GSopen2 GSopen2; +_GSgifTransfer GSgifTransfer; _GSgifTransfer1 GSgifTransfer1; _GSgifTransfer2 GSgifTransfer2; _GSgifTransfer3 GSgifTransfer3; @@ -309,7 +310,8 @@ static const LegacyApi_ReqMethod s_MethMessReq_GS[] = { { "GSopen", (vMeth**)&GSopen, NULL }, { "GSvsync", (vMeth**)&GSvsync, NULL }, - { "GSgifTransfer1", (vMeth**)&GSgifTransfer1, NULL }, + { "GSgifTransfer", (vMeth**)&GSgifTransfer, NULL }, + //{ "GSgifTransfer1", (vMeth**)&GSgifTransfer1, NULL }, { "GSgifTransfer2", (vMeth**)&GSgifTransfer2, NULL }, { "GSgifTransfer3", (vMeth**)&GSgifTransfer3, NULL }, { "GSreadFIFO2", (vMeth**)&GSreadFIFO2, NULL }, diff --git a/pcsx2/VUops.cpp b/pcsx2/VUops.cpp index 6baaacebad..0172fd1d9f 100644 --- a/pcsx2/VUops.cpp +++ b/pcsx2/VUops.cpp @@ -2057,21 +2057,8 @@ void _vuXGKICK(VURegs * VU) u8* data = ((u8*)VU->Mem + ((VU->VI[_Is_].US[0]*16) & 0x3fff)); u32 size; - size = GetMTGS().PrepDataPacket( GIF_PATH_1, data, (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4); - u8* pmem = GetMTGS().GetDataPacketPtr(); - - if((size << 4) > (u32)(0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff))) - { - //DevCon.Warning("addr + Size = 0x%x, transferring %x then doing %x", ((VU->VI[_Is_].US[0]*16) & 0x3fff) + (size << 4), (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4, size - (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff) >> 4)); - memcpy_aligned(pmem, (u8*)VU->Mem+((VU->VI[_Is_].US[0]*16) & 0x3fff), 0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)); - size -= (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4; - //DevCon.Warning("Size left %x", size); - pmem += 0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff); - memcpy_aligned(pmem, (u8*)VU->Mem, size<<4); - } - else { - memcpy_aligned(pmem, (u8*)VU->Mem+((VU->VI[_Is_].US[0]*16) & 0x3fff), size<<4); - } + GetMTGS().PrepDataPacket( GIF_PATH_1, 0x400 ); + size = GIFPath_CopyTag( GIF_PATH_1, (u128*)data, (0x400-(VU->VI[_Is_].US[0] & 0x3ff)) ); GetMTGS().SendDataPacket(); } diff --git a/pcsx2/Vif_Codes.cpp b/pcsx2/Vif_Codes.cpp index fb2fb3a9f9..f7af604502 100644 --- a/pcsx2/Vif_Codes.cpp +++ b/pcsx2/Vif_Codes.cpp @@ -213,8 +213,8 @@ template _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { v.bSize = 0; v.bPtr = 0; } - const uint count = GetMTGS().PrepDataPacket(GIF_PATH_2, v.buffer, 1); - memcpy_fast(GetMTGS().GetDataPacketPtr(), v.buffer, count << 4); + GetMTGS().PrepDataPacket(GIF_PATH_2, 1); + GIFPath_CopyTag(GIF_PATH_2, (u128*)v.buffer, 1); GetMTGS().SendDataPacket(); if(vif1.tag.size == 0) @@ -226,16 +226,17 @@ template _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { } else { - const uint count = GetMTGS().PrepDataPacket(GIF_PATH_2, data, size >> 4); - memcpy_fast(GetMTGS().GetDataPacketPtr(), data, count << 4); + GetMTGS().PrepDataPacket(GIF_PATH_2, size/16); + uint count = GIFPath_CopyTag(GIF_PATH_2, (u128*)data, size/16) * 4; GetMTGS().SendDataPacket(); - vif1.tag.size -= count << 2; + + vif1.tag.size -= count; if(vif1.tag.size == 0) { vif1.cmd = 0; } vif1.vifstalled = true; - return count << 2; + return count; } } diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp index 02b6551e4f..3b5f477e0e 100644 --- a/pcsx2/ps2/GIFpath.cpp +++ b/pcsx2/ps2/GIFpath.cpp @@ -97,7 +97,7 @@ struct GIFPath u8 GetReg(); bool IsActive() const; - int ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size); + int CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size); int ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size); }; @@ -287,7 +287,8 @@ __forceinline void GIFPath::PrepPackedRegs() __forceinline void GIFPath::SetTag(const void* mem) { - const_cast(tag) = *((GIFTAG*)mem); + _mm_store_ps( (float*)&tag, _mm_loadu_ps((float*)mem) ); + //const_cast(tag) = *((GIFTAG*)mem); nloop = tag.NLOOP; curreg = 0; @@ -521,15 +522,50 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s return size; } -__forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) +void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len ) { + uint endpos = destStart + len; + if( endpos >= destSize ) + { + uint firstcopylen = RingBufferSize - destStart; + memcpy_aligned(&destBase[destStart], src, firstcopylen ); + + destStart = endpos & RingBufferMask; + memcpy_aligned(destBase, src+firstcopylen, destStart ); + } + else + { + memcpy_aligned(&destBase[destStart], src, len ); + destStart += len; + } +} + +// [TODO] optimization: If later templated, we can have Paths 1 and 3 use aligned SSE movs, +// since only PATH2 can feed us unaligned source data. +#define copyTag() do { \ + /*RingBuffer.m_Ring[ringpos] = *pMem128;*/ \ + _mm_store_ps( (float*)&RingBuffer.m_Ring[ringpos], _mm_loadu_ps((float*)pMem128)); \ + ++pMem128; --size; \ + ringpos = (ringpos+1)&RingBufferMask; \ +} while(false) + +__forceinline int GIFPath::CopyTag(GIF_PATH pathidx, const u128* pMem128, u32 size) +{ + uint& ringpos = GetMTGS().m_packet_ringpos; + const uint original_ringpos = ringpos; + u32 startSize = size; // Start Size while (size > 0) { if (!nloop) { - SetTag(pMem); - incTag(1); + // [TODO] Optimization: Use MMX intrinsics for SetTag and CopyTag, which both currently + // produce a series of mov eax,[src]; mov [dest],eax instructions to copy these + // individual qwcs. Warning: Path2 transfers are not always QWC-aligned, but they are + // always aligned on an 8 byte boundary; so its probably best to use MMX here. + + SetTag((u8*)pMem128); + copyTag(); if(nloop > 0) { @@ -599,9 +635,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) { do { if (GetReg() == 0xe) { - gsHandler(pMem); + gsHandler((u8*)pMem128); } - incTag(1); + copyTag(); } while(StepReg() && size > 0 && SIGNAL_IMR_Pending == false); } else @@ -644,11 +680,14 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) curreg = 0; nloop = 0; } - incTag(len); + + MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len ); + pMem128 += len; + size -= len; } break; case GIF_FLG_REGLIST: - { + { GIF_LOG("Reglist Mode EOP %x", tag.EOP); // In reglist mode, the GIF packs 2 registers into each QWC. The nloop however @@ -687,8 +726,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) nloop = 0; } - incTag(len); - + MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len ); + pMem128 += len; + size -= len; } break; case GIF_FLG_IMAGE: @@ -696,13 +736,15 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) { GIF_LOG("IMAGE Mode EOP %x", tag.EOP); int len = aMin(size, nloop); - incTag(len); + + MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len ); + + pMem128 += len; + size -= len; nloop -= len; } break; } - - } if(pathidx == GIF_PATH_1) @@ -713,11 +755,11 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) { size = 0x3ff - startSize; startSize = 0x3ff; - pMem -= 0x4000; + pMem128 -= 0x400; } else { - // Note: The BIOS does an XGKICK on the VU1 and lets yt DMA to the GS without an EOP + // Note: The BIOS does an XGKICK on the VU1 and lets it DMA to the GS without an EOP // (seemingly to loop forever), only to write an EOP later on. No other game is known to // do anything of the sort. // So lets just cap the DMA at 16k, and force it to "look" like it's terminated for now. @@ -727,6 +769,11 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) Console.Warning("GIFTAG error, size exceeded VU memory size %x", startSize); nloop = 0; + + // Don't send the packet to the GS -- its incomplete and might cause the GS plugin + // to get confused and die. >_< + + ringpos = original_ringpos; } } } @@ -793,47 +840,18 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) gif->qwc -= size; } } - - return size; } -// Processes a GIFtag & packet, and throws out some gsIRQs as needed. -// Used to keep interrupts in sync with the EE, while the GS itself -// runs potentially several frames behind. -// Parameters: -// size - max size of incoming data stream, in qwc (simd128) -__forceinline int GIFPath_ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) +__forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size) { -#ifdef PCSX2_GSRING_SAMPLING_STATS - static uptr profStartPtr = 0; - static uptr profEndPtr = 0; - if (profStartPtr == 0) { - __asm - { - __beginfunc: - mov profStartPtr, offset __beginfunc; - mov profEndPtr, offset __endfunc; - } - ProfilerRegisterSource( "GSRingBufCopy", (void*)profStartPtr, profEndPtr - profStartPtr ); - } -#endif - - int retSize = s_gifPath[pathidx].ParseTag(pathidx, pMem, size); - -#ifdef PCSX2_GSRING_SAMPLING_STATS - __asm - { - __endfunc: - nop; - } -#endif - return retSize; + return s_gifPath[pathidx].CopyTag(pathidx, pMem, size); } -//Quick version for queueing PATH1 data - +// Quick version for queueing PATH1 data. +// This version calculates the real length of the packet data only. It does not process +// IRQs or DMA status updates. __forceinline int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size) { int retSize = s_gifPath[pathidx].ParseTagQuick(pathidx, pMem, size); diff --git a/pcsx2/x86/microVU_Lower.inl b/pcsx2/x86/microVU_Lower.inl index b28427a46f..de20032e9e 100644 --- a/pcsx2/x86/microVU_Lower.inl +++ b/pcsx2/x86/microVU_Lower.inl @@ -1101,27 +1101,15 @@ void __fastcall mVU_XGKICK_(u32 addr) { if(gifRegs->stat.APATH <= GIF_APATH1 || (gifRegs->stat.APATH == GIF_APATH3 && gifRegs->stat.IP3 == true) && SIGNAL_IMR_Pending == false) { - if(Path1WritePos != 0) { //Flush any pending transfers so things dont go up in the wrong order while(gifRegs->stat.P1Q == true) gsPath1Interrupt(); } - size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff); - pDest = GetMTGS().GetDataPacketPtr(); - if (size > diff) { - // fixme: one of these days the following *16's will get cleaned up when we introduce - // a special qwc/simd16 optimized version of memcpy_aligned. :) - //DevCon.Status("XGkick Wrap!"); - memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16); - size -= diff; - pDest += diff*16; - memcpy_aligned(pDest, microVU1.regs->Mem, size*16); - } - else { - memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16); - } + GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400); + size = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, diff); GetMTGS().SendDataPacket(); + if(GSTransferStatus.PTH1 == STOPPED_MODE) { gifRegs->stat.OPH = false; @@ -1141,14 +1129,14 @@ void __fastcall mVU_XGKICK_(u32 addr) { // fixme: one of these days the following *16's will get cleaned up when we introduce // a special qwc/simd16 optimized version of memcpy_aligned. :) //DevCon.Status("XGkick Wrap!"); - memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16); + memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff); Path1WritePos += size; size -= diff; pDest += diff*16; - memcpy_aligned(pDest, microVU1.regs->Mem, size*16); + memcpy_aligned(pDest, microVU1.regs->Mem, size); } else { - memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16); + memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size); Path1WritePos += size; } //if(!gifRegs->stat.P1Q) CPU_INT(28, 128); diff --git a/pcsx2/x86/sVU_Lower.cpp b/pcsx2/x86/sVU_Lower.cpp index c8d103477b..30ec18fd91 100644 --- a/pcsx2/x86/sVU_Lower.cpp +++ b/pcsx2/x86/sVU_Lower.cpp @@ -1988,21 +1988,10 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) //Flush any pending transfers so things dont go up in the wrong order while(gifRegs->stat.P1Q == true) gsPath1Interrupt(); } - size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff); - pDest = GetMTGS().GetDataPacketPtr(); - if (size > diff) { - // fixme: one of these days the following *16's will get cleaned up when we introduce - // a special qwc/simd16 optimized version of memcpy_aligned. :) - - memcpy_aligned(pDest, VU1.Mem + addr, diff*16); - size -= diff; - pDest += diff*16; - memcpy_aligned(pDest, VU1.Mem, size*16); - } - else { - memcpy_aligned(pDest, VU1.Mem + addr, size*16); - } + GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400); + size = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, diff); GetMTGS().SendDataPacket(); + if(GSTransferStatus.PTH1 == STOPPED_MODE ) { gifRegs->stat.OPH = false; @@ -2015,8 +2004,6 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) size = GIFPath_ParseTagQuick(GIF_PATH_1, data, diff); pDest = &Path1Buffer[Path1WritePos*16]; - - pxAssumeMsg((Path1WritePos+size < sizeof(Path1Buffer)), "XGKick Buffer Overflow detected on Path1Buffer!"); //DevCon.Warning("Storing size %x PATH 1", size); @@ -2024,14 +2011,14 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) // fixme: one of these days the following *16's will get cleaned up when we introduce // a special qwc/simd16 optimized version of memcpy_aligned. :) //DevCon.Status("XGkick Wrap!"); - memcpy_aligned(pDest, VU1.Mem + addr, diff*16); + memcpy_aligned(pDest, VU1.Mem + addr, diff); Path1WritePos += size; size -= diff; pDest += diff*16; - memcpy_aligned(pDest, VU1.Mem, size*16); + memcpy_aligned(pDest, VU1.Mem, size); } else { - memcpy_aligned(pDest, VU1.Mem + addr, size*16); + memcpy_aligned(pDest, VU1.Mem + addr, size); Path1WritePos += size; } //if(!gifRegs->stat.P1Q) CPU_INT(28, 128);