diff --git a/common/include/PS2Edefs.h b/common/include/PS2Edefs.h index f394cf5025..64a98e1016 100644 --- a/common/include/PS2Edefs.h +++ b/common/include/PS2Edefs.h @@ -248,6 +248,7 @@ void CALLBACK GSsetSettingsDir( const char* dir ); void CALLBACK GSsetLogDir( const char* dir ); void CALLBACK GSvsync(int field); +void CALLBACK GSgifTransfer(u32 *pMem, u32 addr); void CALLBACK GSgifTransfer1(u32 *pMem, u32 addr); void CALLBACK GSgifTransfer2(u32 *pMem, u32 size); void CALLBACK GSgifTransfer3(u32 *pMem, u32 size); diff --git a/common/include/Utilities/MemcpyFast.h b/common/include/Utilities/MemcpyFast.h index 2ff39cbe84..800c1071b6 100644 --- a/common/include/Utilities/MemcpyFast.h +++ b/common/include/Utilities/MemcpyFast.h @@ -22,12 +22,14 @@ extern "C" void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes); extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize); extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize); + extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes); #else # include "win_memzero.h" extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes); + extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes); extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize); extern void memxor_mmx(void* dst, const void* src1, int cmpsize); @@ -40,9 +42,12 @@ void _memset16_unaligned( void* dest, u16 data, size_t size ); extern void memcpy_vibes(void * dest, const void * src, int size); extern void gen_memcpy_vibes(); -#define memcpy_fast memcpy_amd_ // Fast memcpy -#define memcpy_aligned memcpy_amd_ // Memcpy with 16-byte Aligned addresses -#define memcpy_const memcpy_amd_ // Memcpy with constant size -#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned -#define memcpy_qwc_ memcpy_vibes // Memcpy in aligned qwc increments, with 0x400 qwc or less -#define memcpy_qwc(x,y,z) memcpy_amd_(x, y, z*16) // Memcpy in aligned qwc increments +#define memcpy_fast memcpy_amd_ // Fast memcpy +#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c) // Memcpy with 16-byte Aligned addresses +#define memcpy_const memcpy_amd_ // Memcpy with constant size +#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned +#define memcpy_qwc_ memcpy_vibes // Memcpy in aligned qwc increments, with 0x400 qwc or less +#define memcpy_qwc(d,s,c) memcpy_amd_qwc(d,s,c) + +// Useful alternative if we think memcpy_amd_qwc is buggy +//#define memcpy_qwc(d,s,c) memcpy_amd_(d,s,c*16) diff --git a/common/include/Utilities/Threading.h b/common/include/Utilities/Threading.h index 5df1b80621..a3fa5261fa 100644 --- a/common/include/Utilities/Threading.h +++ b/common/include/Utilities/Threading.h @@ -129,6 +129,10 @@ namespace Threading // For use in spin/wait loops. extern void SpinWait(); + + // Use prior to committing data to another thread (internal memcpy_qwc does not use fencing, + // so that many memcpys can be issued in a row more efficiently) + extern void StoreFence(); // Optional implementation to enable hires thread/process scheduler for the operating system. // Needed by Windows, but might not be relevant to other platforms. 
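The memcpy_qwc/StoreFence pairing above implies a two-step protocol on the producer side. The following is a minimal illustrative sketch only (CommitToRing and its parameters are hypothetical, not part of this patch): batch one or more quadword copies, then issue a single fence before the data is handed to the consuming thread.

	// Hypothetical producer-side helper, sketched against the declarations above.
	void CommitToRing( u128* dest, const u128* src, uint qwc )
	{
		memcpy_qwc( dest, src, qwc );	// copies in 128-bit units; no fence is guaranteed internally
		Threading::StoreFence();		// one sfence commits any pending streaming stores
		// ...now safe to publish the new write position / signal the consumer thread...
	}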
diff --git a/common/src/Utilities/Exceptions.cpp b/common/src/Utilities/Exceptions.cpp index 7ff4440f7f..6e4869fc5e 100644 --- a/common/src/Utilities/Exceptions.cpp +++ b/common/src/Utilities/Exceptions.cpp @@ -71,7 +71,7 @@ wxString DiagnosticOrigin::ToString( const wxChar* msg ) const bool pxAssertImpl_LogIt( const DiagnosticOrigin& origin, const wxChar *msg ) { - wxLogError( origin.ToString( msg ) ); + wxLogError( L"%s", origin.ToString( msg ) ); return false; } diff --git a/common/src/Utilities/Windows/WinThreads.cpp b/common/src/Utilities/Windows/WinThreads.cpp index 0133f89e38..22cdfb21d6 100644 --- a/common/src/Utilities/Windows/WinThreads.cpp +++ b/common/src/Utilities/Windows/WinThreads.cpp @@ -36,6 +36,11 @@ __forceinline void Threading::SpinWait() __asm pause; } +__forceinline void Threading::StoreFence() +{ + __asm sfence; +} + __forceinline void Threading::EnableHiresScheduler() { // This improves accuracy of Sleep() by some amount, and only adds a negligible amount of diff --git a/common/src/Utilities/x86/MemcpyFast.cpp b/common/src/Utilities/x86/MemcpyFast.cpp index 40caf98308..0c8af9e63e 100644 --- a/common/src/Utilities/x86/MemcpyFast.cpp +++ b/common/src/Utilities/x86/MemcpyFast.cpp @@ -41,12 +41,10 @@ MEMCPY_AMD.CPP ******************************************************************************/ -// Very optimized memcpy() routine for AMD Athlon and Duron family. -// This code uses any of FOUR different basic copy methods, depending -// on the transfer size. // NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or // "Streaming Store"), and also uses the software prefetch instructions, -// be sure you're running on Athlon/Duron or other recent CPU before calling! +// be sure you're running on P4/Core2/i7, Athlon/Phenom or newer CPUs before +// calling! #define TINY_BLOCK_COPY 64 // upper limit for movsd type copy // The smallest copy uses the X86 "movsd" instruction, in an optimized @@ -68,10 +66,8 @@ MEMCPY_AMD.CPP #if defined(_MSC_VER) -// -------------------------------------------------------------------------------------- -// Fast memcpy as coded by AMD, and then improved by air. -// -------------------------------------------------------------------------------------- +// Fast memcpy as coded by AMD, and then improved by air for PCSX2 needs. __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n) { __asm @@ -92,6 +88,7 @@ __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_ jbe $memcpy_do_align ; it appears to be slower cmp eax, 64*1024 jbe $memcpy_align_done + $memcpy_do_align: mov eax, 8 ; a trick that's faster than rep movsb... sub eax, edi ; align destination to qword @@ -146,7 +143,7 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy add esi, 64 ; update source pointer add edi, 64 ; update destination pointer - dec eax ; count down + sub eax, 1 jnz $memcpy_ic_1 ; last 64-byte block? $memcpy_ic_2: @@ -189,64 +186,15 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy movq mm1,[esi-8] movntq [edi-24], mm2 movntq [edi-16], mm0 - dec eax movntq [edi-8], mm1 + + sub eax, 1 jnz $memcpy_uc_1 ; last 64-byte block? jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed) -// For the largest size blocks, a special technique called Block Prefetch -// can be used to accelerate the read operations. Block Prefetch reads -// one address per cache line, for a series of cache lines, in a short loop. -// This is faster than using software prefetch. 
The technique is great for -// getting maximum read bandwidth, especially in DDR memory systems. - -// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to -// help keep the code cache footprint of memcpy_fast to a minimum. -/* -$memcpy_bp_1: ; large blocks, block prefetch copy - - cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop? - jl $memcpy_64_test ; no, back to regular uncached copy - - mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X - add esi, CACHEBLOCK * 64 ; move to the top of the block -align 16 -$memcpy_bp_2: - mov edx, [esi-64] ; grab one address per cache line - mov edx, [esi-128] ; grab one address per cache line - sub esi, 128 ; go reverse order to suppress HW prefetcher - dec eax ; count down the cache lines - jnz $memcpy_bp_2 ; keep grabbing more lines into cache - - mov eax, CACHEBLOCK ; now that it's in cache, do the copy -align 16 -$memcpy_bp_3: - movq mm0, [esi ] ; read 64 bits - movq mm1, [esi+ 8] - movq mm2, [esi+16] - movq mm3, [esi+24] - movq mm4, [esi+32] - movq mm5, [esi+40] - movq mm6, [esi+48] - movq mm7, [esi+56] - add esi, 64 ; update source pointer - movntq [edi ], mm0 ; write 64 bits, bypassing cache - movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU - movntq [edi+16], mm2 ; from READING the destination address - movntq [edi+24], mm3 ; into the cache, only to be over-written, - movntq [edi+32], mm4 ; so that also helps performance - movntq [edi+40], mm5 - movntq [edi+48], mm6 - movntq [edi+56], mm7 - add edi, 64 ; update dest pointer - - dec eax ; count down - - jnz $memcpy_bp_3 ; keep copying - sub ecx, CACHEBLOCK ; update the 64-byte block count - jmp $memcpy_bp_1 ; keep processing chunks -*/ +// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been +// disabled to help keep the code cache footprint of memcpy_fast to a minimum. // The smallest copy uses the X86 "movsd" instruction, in an optimized // form which is an "unrolled loop". Then it handles the last few bytes. @@ -274,17 +222,99 @@ $memcpy_last_few: ; dword aligned from before movsd's rep movsb ; the last 1, 2, or 3 bytes $memcpy_final: + pop esi + pop edi + emms ; clean up the MMX state sfence ; flush the write buffer //mov eax, [dest] ; ret value = destination pointer - pop esi - pop edi - ret 4 } } +// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest needs to be aligned. +__forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) +{ + // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM + // registers will improve copy performance, because they won't. Use of XMMs is only + // warranted in situations where both source and dest are guaranteed aligned to 16 bytes, + // and even then the benefits are typically minimal (sometimes slower depending on the + // amount of data being copied). + // + // Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them. + // --air + + // Linux Conversion note: + // This code would benefit nicely from having inline-able GAS syntax, since it should + // allow GCC to optimize the first 3 instructions out of existence in many scenarios. + // And it's called enough times to probably merit the extra effort to ensure proper + // optimization. --air + + __asm + { + mov ecx, dest + mov edx, src + mov eax, qwc ; keep a copy of count + shr eax, 1 + jz $memcpy_qwc_1 ; only one 16 byte block to copy?
+ + cmp eax, IN_CACHE_COPY/32 + jb $memcpy_qwc_loop1 ; small copies should be cached (definite speedup --air) + +$memcpy_qwc_loop2: ; 32-byte blocks, uncached copy + prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air) + + movq mm0,[edx+0] ; read 64 bits + movq mm1,[edx+8] + movq mm2,[edx+16] + movntq [ecx+0], mm0 ; write 64 bits, bypassing the cache + movntq [ecx+8], mm1 + movq mm3,[edx+24] + movntq [ecx+16], mm2 + movntq [ecx+24], mm3 + + add edx,32 ; update source pointer + add ecx,32 ; update destination pointer + sub eax,1 + jnz $memcpy_qwc_loop2 ; last 32-byte block? + sfence ; flush the write buffer + jmp $memcpy_qwc_1 + +; 32-byte blocks, cached! +; This *is* important. Removing this and using exclusively non-temporal stores +; results in noticeable speed loss! + +$memcpy_qwc_loop1: + prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air) + + movq mm0,[edx+0] ; read 64 bits + movq mm1,[edx+8] + movq mm2,[edx+16] + movq [ecx+0], mm0 ; write 64 bits, cached + movq [ecx+8], mm1 + movq mm3,[edx+24] + movq [ecx+16], mm2 + movq [ecx+24], mm3 + + add edx,32 ; update source pointer + add ecx,32 ; update destination pointer + sub eax,1 + jnz $memcpy_qwc_loop1 ; last 32-byte block? + +$memcpy_qwc_1: + test qwc,1 + jz $memcpy_qwc_final + movq mm0,[edx] + movq mm1,[edx+8] + movq [ecx], mm0 + movq [ecx+8], mm1 + +$memcpy_qwc_final: + emms ; clean up the MMX state + } +} + // mmx mem-compare implementation, size has to be a multiple of 8 // returns 0 if equal, nonzero value if not equal // ~10 times faster than standard memcmp diff --git a/common/src/Utilities/x86/MemcpyVibes.cpp b/common/src/Utilities/x86/MemcpyVibes.cpp index 7efcd83f39..b154cd2847 100644 --- a/common/src/Utilities/x86/MemcpyVibes.cpp +++ b/common/src/Utilities/x86/MemcpyVibes.cpp @@ -156,3 +156,95 @@ __forceinline void memcpy_vibes(void * dest, const void * src, int size) { #endif #endif + +// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here for the moment, +// to get around compilation issues with having it in the headers. +#ifdef __LINUX__ + + // This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now. + // Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest needs to be aligned. + __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc) + { + // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM + // registers will improve copy performance, because they won't. Use of XMMs is only + // warranted in situations where both source and dest are guaranteed aligned to 16 bytes, + // and even then the benefits are typically minimal (sometimes slower depending on the + // amount of data being copied). + // + // Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them. + // --air + + // Linux Conversion note: + // This code would benefit nicely from having inline-able GAS syntax, since it should + // allow GCC to optimize the first 3 instructions out of existence in many scenarios. + // And it's called enough times to probably merit the extra effort to ensure proper + // optimization. --air + + __asm__ + ( + ".intel_syntax noprefix\n" + "mov eax, %[qwc]\n" // keep a copy of count for looping + "shr eax, 1\n" + "jz memcpy_qwc_1\n" // only one 16 byte block to copy?
+ + "cmp eax, 64\n" // "IN_CACHE_COPY/32" + "jb memcpy_qwc_loop1\n" // small copies should be cached (definite speedup --air) + + "memcpy_qwc_loop2:\n" // 32-byte blocks, uncached copy + "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) + + "movq mm0,[%[src]+0]\n" // read 64 bits + "movq mm1,[%[src]+8]\n" + "movq mm2,[%[src]+16]\n" + "movntq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache + "movntq [%[dest]+8], mm1\n" + "movq mm3,[%[src]+24]\n" + "movntq [%[dest]+16], mm2\n" + "movntq [%[dest]+24], mm3\n" + + "add %[src],32\n" // update source pointer + "add %[dest],32\n" // update destination pointer + "sub eax,1\n" + "jnz memcpy_qwc_loop2\n" // last 64-byte block? + "sfence\n" // flush the write buffer + "jmp memcpy_qwc_1\n" + + // 32-byte blocks, cached! + // This *is* important. Removing this and using exclusively non-temporal stores + // results in noticeable speed loss! + + "memcpy_qwc_loop1:\n" + "prefetchnta [%[src] + 568]\n" // start reading ahead (tested: it helps! --air) + + "movq mm0,[%[src]+0]\n" // read 64 bits + "movq mm1,[%[src]+8]\n" + "movq mm2,[%[src]+16]\n" + "movq [%[dest]+0], mm0\n" // write 64 bits, bypassing the cache + "movq [%[dest]+8], mm1\n" + "movq mm3,[%[src]+24]\n" + "movq [%[dest]+16], mm2\n" + "movq [%[dest]+24], mm3\n" + + "add %[src],32\n" // update source pointer + "add %[dest],32\n" // update destination pointer + "sub eax,1\n" + "jnz memcpy_qwc_loop1\n" // last 64-byte block? + + "memcpy_qwc_1:\n" + "test %[qwc],1\n" + "jz memcpy_qwc_final\n" + "movq mm0,[%[src]]\n" + "movq mm1,[%[src]+8]\n" + "movq [%[dest]], mm0\n" + "movq [%[dest]+8], mm1\n" + + "memcpy_qwc_final:\n" + "emms\n" // clean up the MMX state + ".att_syntax\n" + : "=&r"(dest), "=&r"(src), "=&r"(qwc) + : [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc) + : "memory", "eax", "mm0", "mm1", "mm2", "mm3" + ); + } +#endif + diff --git a/pcsx2/Config.h b/pcsx2/Config.h index a81f51d8e8..cc83918e92 100644 --- a/pcsx2/Config.h +++ b/pcsx2/Config.h @@ -395,6 +395,7 @@ struct Pcsx2Config // style. Useful for debugging potential bugs in the MTGS pipeline. 
bool SynchronousMTGS; bool DisableOutput; + int VsyncQueueSize; bool FrameLimitEnable; bool FrameSkipEnable; @@ -420,6 +421,8 @@ struct Pcsx2Config return OpEqu( SynchronousMTGS ) && OpEqu( DisableOutput ) && + OpEqu( VsyncQueueSize ) && + OpEqu( FrameSkipEnable ) && OpEqu( FrameLimitEnable ) && OpEqu( VsyncEnable ) && diff --git a/pcsx2/FiFo.cpp b/pcsx2/FiFo.cpp index ed87881d03..282eb68aaa 100644 --- a/pcsx2/FiFo.cpp +++ b/pcsx2/FiFo.cpp @@ -164,7 +164,6 @@ void __fastcall WriteFIFO_page_5(u32 mem, const mem128_t *value) if(GSTransferStatus.PTH2 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH2) { - if(gifRegs->stat.DIR == 0)gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; if(gifRegs->stat.P1Q) gsPath1Interrupt(); } @@ -195,14 +194,12 @@ void __fastcall WriteFIFO_page_6(u32 mem, const mem128_t *value) nloop0_packet[1] = psHu32(GIF_FIFO + 4); nloop0_packet[2] = psHu32(GIF_FIFO + 8); nloop0_packet[3] = psHu32(GIF_FIFO + 12); - GetMTGS().PrepDataPacket(GIF_PATH_3, (u8*)nloop0_packet, 1); - u64* data = (u64*)GetMTGS().GetDataPacketPtr(); - data[0] = value[0]; - data[1] = value[1]; + GetMTGS().PrepDataPacket(GIF_PATH_3, 1); + //u64* data = (u64*)GetMTGS().GetDataPacketPtr(); + GIFPath_CopyTag( GIF_PATH_3, (u128*)nloop0_packet, 1 ); GetMTGS().SendDataPacket(); if(GSTransferStatus.PTH3 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH3 ) { - if(gifRegs->stat.DIR == 0)gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; if(gifRegs->stat.P1Q) gsPath1Interrupt(); } diff --git a/pcsx2/GS.cpp b/pcsx2/GS.cpp index b9028de2ee..4dc0251393 100644 --- a/pcsx2/GS.cpp +++ b/pcsx2/GS.cpp @@ -282,14 +282,19 @@ void __fastcall gsWrite64_page_01( u32 mem, const mem64_t* value ) { case 0x12001040: //busdir - //This is probably a complete hack, however writing to BUSDIR "should" start a transfer (Bleach Blade Battlers) - //Only problem is it kills killzone :( leaving it commented out for now. + //This is probably a complete hack, however writing to BUSDIR "should" start a transfer + //(Bleach Blade Battlers, Growlanser 2 and 3, Wizardry) + //Only problem is it kills killzone :(. // (yes it *is* a complete hack; both lines here in fact --air) //========================================================================= - //gifRegs->stat.OPH = true; + //Console.Warning("BUSDIR write! Setting OPH and DIR to = %x",(u32)value[0]); + if ((u32)value[0] == 1) + gifRegs->stat.OPH = true; + else + gifRegs->stat.OPH = false; + + gifRegs->stat.DIR = (u32)value[0]; //========================================================================= - gifRegs->stat.DIR = (u32)value; - // BUSDIR INSANITY !! MTGS FLUSH NEEDED // // Yup folks. BUSDIR is evil. 
The only safe way to handle it is to flush the whole MTGS diff --git a/pcsx2/GS.h b/pcsx2/GS.h index 3d1dc74d78..8162149218 100644 --- a/pcsx2/GS.h +++ b/pcsx2/GS.h @@ -229,7 +229,8 @@ enum GIF_PATH GIF_PATH_3, }; -extern int GIFPath_ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size); +extern void GIFPath_Initialize(); +extern int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size); extern int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size); extern void GIFPath_Reset(); extern void GIFPath_Clear( GIF_PATH pathidx ); @@ -248,7 +249,6 @@ enum MTGS_RingCommand GS_RINGTYPE_P1 , GS_RINGTYPE_P2 , GS_RINGTYPE_P3 -, GS_RINGTYPE_RESTART , GS_RINGTYPE_VSYNC , GS_RINGTYPE_FRAMESKIP , GS_RINGTYPE_FREEZE @@ -273,19 +273,20 @@ class SysMtgsThread : public SysThreadBase typedef SysThreadBase _parent; public: - // note: when m_RingPos == m_WritePos, the fifo is empty - uint m_RingPos; // cur pos gs is reading from + // note: when m_ReadPos == m_WritePos, the fifo is empty + uint m_ReadPos; // cur pos gs is reading from uint m_WritePos; // cur pos ee thread is writing to volatile bool m_RingBufferIsBusy; volatile u32 m_SignalRingEnable; volatile s32 m_SignalRingPosition; - int m_QueuedFrameCount; - u32 m_RingWrapSpot; + volatile s32 m_QueuedFrameCount; + volatile u32 m_VsyncSignalListener; - Mutex m_lock_RingBufferBusy; + Mutex m_mtx_RingBufferBusy; Semaphore m_sem_OnRingReset; + Semaphore m_sem_Vsync; // used to keep multiple threads from sending packets to the ringbuffer concurrently. // (currently not used or implemented -- is a planned feature for a future threaded VU1) @@ -301,8 +302,9 @@ public: // These vars maintain instance data for sending Data Packets. // Only one data packet can be constructed and uploaded at a time. + uint m_packet_startpos; // ringbuffer index of the current packet's command tag (filled in when the packet is sent) uint m_packet_size; // size of the packet (data only, ie. not including the 16 byte command!) - uint m_packet_ringpos; // index of the data location in the ringbuffer. + uint m_packet_writepos; // index of the data location in the ringbuffer. #ifdef RINGBUF_DEBUG_STACK Threading::Mutex m_lock_Stack; #endif @@ -317,14 +319,13 @@ public: void WaitGS(); void ResetGS(); - int PrepDataPacket( MTGS_RingCommand cmd, u32 size ); - int PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size ); + void PrepDataPacket( MTGS_RingCommand cmd, u32 size ); + void PrepDataPacket( GIF_PATH pathidx, u32 size ); void SendDataPacket(); void SendGameCRC( u32 crc ); void WaitForOpen(); void Freeze( int mode, MTGS_FreezeData& data ); - void RestartRingbuffer( uint packsize=0 ); void SendSimplePacket( MTGS_RingCommand type, int data0, int data1, int data2 ); void SendPointerPacket( MTGS_RingCommand type, u32 data0, void* data1 ); @@ -346,9 +347,10 @@ protected: void OnResumeInThread( bool IsSuspended ); void OnCleanupInThread(); + void GenericStall( uint size ); + // Used internally by SendSimplePacket type functions - uint _PrepForSimplePacket(); - void _FinishSimplePacket( uint future_writepos ); + void _FinishSimplePacket(); void ExecuteTaskInThread(); }; @@ -416,3 +418,36 @@ extern int g_nLeftGSFrames; #endif +// Size of the ringbuffer as a power of 2 -- size is a multiple of simd128s.
+// (actual size is 1<<RingBufferSizeFactor simd vectors [128-bit values]) +static const uint RingBufferSizeFactor = 19; + +// size of the ringbuffer in simd128's. +static const uint RingBufferSize = 1<<RingBufferSizeFactor; + +// Mask to apply to ring buffer indices to wrap the pointer from end to +// start (the wrapping is what makes it a ringbuffer, yo!) +static const uint RingBufferMask = RingBufferSize - 1; + +struct MTGS_BufferedData +{ + u128 m_Ring[RingBufferSize]; + u8 Regs[Ps2MemSize::GSregs]; + + MTGS_BufferedData() {} + + u128& operator[]( uint idx ) + { + pxAssert( idx < RingBufferSize ); + return m_Ring[idx]; + } +}; + +extern __aligned(32) MTGS_BufferedData RingBuffer; diff --git a/pcsx2/Gif.cpp b/pcsx2/Gif.cpp --- a/pcsx2/Gif.cpp +++ b/pcsx2/Gif.cpp @@ ... @@ void gsPath1Interrupt() if((gifRegs->stat.APATH <= GIF_APATH1 || (gifRegs->stat.IP3 == true && gifRegs->stat.APATH == GIF_APATH3)) && Path1WritePos > 0 && !gifRegs->stat.PSE) { gifRegs->stat.P1Q = false; - while(Path1WritePos > 0) - { - u32 size = GetMTGS().PrepDataPacket(GIF_PATH_1, Path1Buffer + (Path1ReadPos * 16), (Path1WritePos - Path1ReadPos)); - u8* pDest = GetMTGS().GetDataPacketPtr(); - //DevCon.Warning("Flush Size = %x", size); - - memcpy_aligned(pDest, Path1Buffer + (Path1ReadPos * 16), size * 16); - GetMTGS().SendDataPacket(); - - Path1ReadPos += size; - - if(GSTransferStatus.PTH1 == STOPPED_MODE) + if (uint size = (Path1WritePos - Path1ReadPos)) + { + GetMTGS().PrepDataPacket(GIF_PATH_1, size); + //DevCon.Warning("Flush Size = %x", size); + while(size > 0) { - gifRegs->stat.OPH = false; - gifRegs->stat.APATH = GIF_APATH_IDLE; + uint count = GIFPath_CopyTag(GIF_PATH_1, ((u128*)Path1Buffer) + Path1ReadPos, size); + Path1ReadPos += count; + size -= count; + + if(GSTransferStatus.PTH1 == STOPPED_MODE) + { + gifRegs->stat.APATH = GIF_APATH_IDLE; + } } + GetMTGS().SendDataPacket(); if(Path1ReadPos == Path1WritePos) { @@ -105,7 +105,6 @@ __forceinline void gsInterrupt() if(GSTransferStatus.PTH3 >= PENDINGSTOP_MODE && gifRegs->stat.APATH == GIF_APATH3 ) { - gifRegs->stat.OPH = false; GSTransferStatus.PTH3 = STOPPED_MODE; gifRegs->stat.APATH = GIF_APATH_IDLE; if(gifRegs->stat.P1Q) gsPath1Interrupt(); @@ -150,11 +149,8 @@ __forceinline void gsInterrupt() static u32 WRITERING_DMA(u32 *pMem, u32 qwc) { - int size = GetMTGS().PrepDataPacket(GIF_PATH_3, (u8*)pMem, qwc); - u8* pgsmem = GetMTGS().GetDataPacketPtr(); - - memcpy_aligned(pgsmem, pMem, size<<4); - + GetMTGS().PrepDataPacket(GIF_PATH_3, qwc); + uint size = GIFPath_CopyTag(GIF_PATH_3, (u128*)pMem, qwc ); GetMTGS().SendDataPacket(); return size; } @@ -167,7 +163,6 @@ static u32 WRITERING_DMA(tDMA_TAG *pMem, u32 qwc) int _GIFchain() { tDMA_TAG *pMem; - int qwc = 0; pMem = dmaGetAddr(gif->madr, false); if (pMem == NULL) { return -1; } - //in Intermittent Mode it enabled, IMAGE_MODE transfers are sliced. - ///(gifRegs->stat.IMT && GSTransferStatus.PTH3 <= IMAGE_MODE) qwc = min((int)gif->qwc, 8); - /*else qwc = gif->qwc;*/ - return WRITERING_DMA(pMem, gif->qwc); } @@ -327,7 +317,7 @@ void GIFdma() - //gifRegs->stat.OPH = true; + //gifRegs->stat.OPH = true; // why set the GS output path flag here? (rama) gifRegs->stat.FQC = min((u16)0x10, gif->qwc);// FQC=31, hack ;) (for values of 31 that equal 16) [ used to be 0xE00; // APATH=3] //Check with Path3 masking games if (gif->qwc > 0) { @@ -346,7 +336,7 @@ } - //gifRegs->stat.OPH = true; + //gifRegs->stat.OPH = true; // why set the GS output path flag here? (rama) // Transfer Dn_QWC from Dn_MADR to GIF if ((gif->chcr.MOD == NORMAL_MODE) || (gif->qwc > 0)) // Normal Mode { @@ -450,42 +440,44 @@ static __forceinline bool mfifoGIFrbTransfer() u32 mfifoqwc = min(gifqwc, (u32)gif->qwc); u32 *src; + GetMTGS().PrepDataPacket(GIF_PATH_3, mfifoqwc); + + // TODO (minor optimization): The new GIFpath parser can do rather efficient wrapping of + // its own internally now. We just need to groom a version of it that can wrap around MFIFO + // memory similarly to how it wraps VU1 memory on PATH1.
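+	// Sketch of the wrap math used below, for reference ('ringEnd' is illustrative shorthand,
+	// not a real variable): the MFIFO ring occupies [rbor.ADDR, rbor.ADDR + rbsr.RMSK + 16),
+	// so a transfer of mfifoqwc qwords starting at gif->madr splits into:
+	//    uint ringEnd = dmacRegs->rbor.ADDR + dmacRegs->rbsr.RMSK + 16;  // one past the last ring byte
+	//    uint s1 = (ringEnd - gif->madr) >> 4;   // qwords that fit before the wrap point
+	//    uint s2 = mfifoqwc - s1;                // qwords copied from rbor.ADDR after wrapping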
+ /* Check if the transfer should wrap around the ring buffer */ if ((gif->madr + mfifoqwc * 16) > (dmacRegs->rbor.ADDR + dmacRegs->rbsr.RMSK + 16)) { uint s1 = ((dmacRegs->rbor.ADDR + dmacRegs->rbsr.RMSK + 16) - gif->madr) >> 4; uint s2 = (mfifoqwc - s1); - // fixme - I don't think these should use WRITERING_DMA, since our source - // isn't the DmaGetAddr(gif->madr) address that WRITERING_DMA expects. /* it does (wrap around), so first copy 's1' bytes from 'addr' to 'data' */ + /* and second copy 's2' bytes from 'maddr' to '&data[s1]' */ + src = (u32*)PSM(gif->madr); if (src == NULL) return false; - s1 = WRITERING_DMA(src, s1); + uint copied = GIFPath_CopyTag(GIF_PATH_3, (u128*)src, s1); - if (s1 == (mfifoqwc - s2)) + if (copied == s1) // but only copy second if first didn't abort prematurely for some reason. { - /* and second copy 's2' bytes from 'maddr' to '&data[s1]' */ src = (u32*)PSM(dmacRegs->rbor.ADDR); if (src == NULL) return false; - s2 = WRITERING_DMA(src, s2); - } - else - { - s2 = 0; + copied += GIFPath_CopyTag(GIF_PATH_3, (u128*)src, s2); } - mfifoqwc = s1 + s2; + mfifoqwc = copied; } else { /* it doesn't, so just transfer 'qwc*16' words from 'gif->madr' to GS */ src = (u32*)PSM(gif->madr); if (src == NULL) return false; - mfifoqwc = WRITERING_DMA(src, mfifoqwc); + mfifoqwc = GIFPath_CopyTag(GIF_PATH_3, (u128*)src, mfifoqwc); gif->madr = dmacRegs->rbor.ADDR + (gif->madr & dmacRegs->rbsr.RMSK); } + GetMTGS().SendDataPacket(); gifqwc -= mfifoqwc; return true; @@ -571,36 +563,36 @@ void mfifoGIFtransfer(int qwc) switch (ptag->ID) { - case TAG_REFE: // Refe - Transfer Packet According to ADDR field + case TAG_REFE: // Refe - Transfer Packet According to ADDR field gif->tadr = qwctag(gif->tadr + 16); gifstate = GIF_STATE_DONE; //End Transfer break; - case TAG_CNT: // CNT - Transfer QWC following the tag. + case TAG_CNT: // CNT - Transfer QWC following the tag. gif->madr = qwctag(gif->tadr + 16); //Set MADR to QW after Tag - gif->tadr = qwctag(gif->madr + (gif->qwc << 4)); //Set TADR to QW following the data + gif->tadr = qwctag(gif->madr + (gif->qwc << 4)); //Set TADR to QW following the data gifstate = GIF_STATE_READY; break; - case TAG_NEXT: // Next - Transfer QWC following tag. TADR = ADDR + case TAG_NEXT: // Next - Transfer QWC following tag. 
TADR = ADDR { - u32 temp = gif->madr; //Temporarily Store ADDR - gif->madr = qwctag(gif->tadr + 16); //Set MADR to QW following the tag - gif->tadr = temp; //Copy temporarily stored ADDR to Tag + u32 temp = gif->madr; //Temporarily Store ADDR + gif->madr = qwctag(gif->tadr + 16); //Set MADR to QW following the tag + gif->tadr = temp; //Copy temporarily stored ADDR to Tag gifstate = GIF_STATE_READY; break; } - case TAG_REF: // Ref - Transfer QWC from ADDR field - case TAG_REFS: // Refs - Transfer QWC from ADDR field (Stall Control) + case TAG_REF: // Ref - Transfer QWC from ADDR field + case TAG_REFS: // Refs - Transfer QWC from ADDR field (Stall Control) gif->tadr = qwctag(gif->tadr + 16); //Set TADR to next tag gifstate = GIF_STATE_READY; break; - case TAG_END: // End - Transfer QWC following the tag - gif->madr = qwctag(gif->tadr + 16); //Set MADR to data following the tag - gif->tadr = qwctag(gif->madr + (gif->qwc << 4)); //Set TADR to QW following the data - gifstate = GIF_STATE_DONE; //End Transfer + case TAG_END: // End - Transfer QWC following the tag + gif->madr = qwctag(gif->tadr + 16); //Set MADR to data following the tag + gif->tadr = qwctag(gif->madr + (gif->qwc << 4)); //Set TADR to QW following the data + gifstate = GIF_STATE_DONE; //End Transfer break; } @@ -638,7 +630,6 @@ void gifMFIFOInterrupt() if(GSTransferStatus.PTH3 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH3 ) { - gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; if(gifRegs->stat.P1Q) gsPath1Interrupt(); } diff --git a/pcsx2/Gif.h b/pcsx2/Gif.h index 8533cdc98d..097cb03532 100644 --- a/pcsx2/Gif.h +++ b/pcsx2/Gif.h @@ -290,7 +290,7 @@ extern void gifMFIFOInterrupt(); //Just some temporary bits to store Path1 transfers if another is in progress. extern void gsPath1Interrupt(); -extern u8 Path1Buffer[0x1000000]; +extern __aligned16 u8 Path1Buffer[0x1000000]; extern u32 Path1WritePos; extern u32 Path1ReadPos; #endif diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index a6905b9788..115e08c3c5 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -29,7 +29,7 @@ using namespace Threading; -#if 0 // PCSX2_DEBUG +#if 0 //PCSX2_DEBUG # define MTGS_LOG Console.WriteLn #else # define MTGS_LOG 0&& @@ -46,34 +46,7 @@ using namespace Threading; // MTGS Threaded Class Implementation // ===================================================================================================== -// Size of the ringbuffer as a power of 2 -- size is a multiple of simd128s. -// (actual size is 1<<RingBufferSizeFactor simd vectors [128-bit values]) -static const uint RingBufferSizeFactor = 19; - -// size of the ringbuffer in simd128's. -static const uint RingBufferSize = 1<<RingBufferSizeFactor; - -// Mask to apply to ring buffer indices to wrap the pointer from end to -// start (the wrapping is what makes it a ringbuffer, yo!) -static const uint RingBufferMask = RingBufferSize - 1; - -struct MTGS_BufferedData -{ - u128 m_Ring[RingBufferSize]; - u8 Regs[Ps2MemSize::GSregs]; - - MTGS_BufferedData() {} - - u128& operator[]( uint idx ) - { - pxAssert( idx < RingBufferSize ); - return m_Ring[idx]; - } -}; - __aligned(32) MTGS_BufferedData RingBuffer; @@ ... @@ void SysMtgsThread::PostVsyncEnd() - if( m_QueuedFrameCount > 0 ) - RestartRingbuffer(); - else - { - m_QueuedFrameCount++; - SetEvent(); - } + // If the MTGS is allowed to queue a lot of frames in advance, it creates input lag. + // Use the Queued FrameCount to stall the EE if another vsync (or two) are already queued + // in the ringbuffer. The queue limit is disabled when both FrameLimiting and Vsync are + // disabled, since the queue can have perverse effects on framerate benchmarking.
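+	// For reference, the consumer half of this handshake is in ExecuteTaskInThread's VSYNC
+	// case below: the MTGS thread calls AtomicDecrement(m_QueuedFrameCount) and, if
+	// m_VsyncSignalListener is set, posts m_sem_Vsync to wake the sleeping EE core.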
+ + if ((AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize) || (!EmuConfig.GS.VsyncEnable && !EmuConfig.GS.FrameLimitEnable)) return; + + m_VsyncSignalListener = true; + //Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\t\tringpos=0x%06x, writepos=0x%06x", volatize(m_ReadPos), m_WritePos ); + m_sem_Vsync.WaitNoCancel(); } struct PacketTagType @@ -261,12 +239,14 @@ void SysMtgsThread::OpenPlugin() class RingBufferLock : public ScopedLock { + typedef ScopedLock _parent; + protected: SysMtgsThread& m_mtgs; public: RingBufferLock( SysMtgsThread& mtgs ) - : ScopedLock( mtgs.m_lock_RingBufferBusy ) + : ScopedLock( mtgs.m_mtx_RingBufferBusy ) , m_mtgs( mtgs ) { m_mtgs.m_RingBufferIsBusy = true; @@ -276,6 +256,18 @@ public: { m_mtgs.m_RingBufferIsBusy = false; } + + void Acquire() + { + _parent::Acquire(); + m_mtgs.m_RingBufferIsBusy = true; + } + + void Release() + { + m_mtgs.m_RingBufferIsBusy = false; + _parent::Release(); + } }; void SysMtgsThread::ExecuteTaskInThread() @@ -284,31 +276,33 @@ void SysMtgsThread::ExecuteTaskInThread() PacketTagType prevCmd; #endif + RingBufferLock busy( *this ); + while( true ) { + busy.Release(); + // Performance note: Both of these perform cancellation tests, but pthread_testcancel // is very optimized (only 1 instruction test in most cases), so no point in trying // to avoid it. m_sem_event.WaitWithoutYield(); StateCheckInThread(); + busy.Acquire(); - { - RingBufferLock busy( *this ); - - // note: m_RingPos is intentionally not volatile, because it should only + // note: m_ReadPos is intentionally not volatile, because it should only // ever be modified by this thread. - while( m_RingPos != volatize(m_WritePos)) + while( m_ReadPos != volatize(m_WritePos)) { if( EmuConfig.GS.DisableOutput ) { - m_RingPos = m_WritePos; + m_ReadPos = m_WritePos; continue; } - pxAssert( m_RingPos < RingBufferSize ); + pxAssert( m_ReadPos < RingBufferSize ); - const PacketTagType& tag = (PacketTagType&)RingBuffer[m_RingPos]; + const PacketTagType& tag = (PacketTagType&)RingBuffer[m_ReadPos]; u32 ringposinc = 1; #ifdef RINGBUF_DEBUG_STACK @@ -316,11 +310,11 @@ void SysMtgsThread::ExecuteTaskInThread() m_lock_Stack.Lock(); uptr stackpos = ringposStack.back(); - if( stackpos != m_RingPos ) + if( stackpos != m_ReadPos ) { - Console.Error( "MTGS Ringbuffer Critical Failure ---> %x to %x (prevCmd: %x)\n", stackpos, m_RingPos, prevCmd.command ); + Console.Error( "MTGS Ringbuffer Critical Failure ---> %x to %x (prevCmd: %x)\n", stackpos, m_ReadPos, prevCmd.command ); } - pxAssert( stackpos == m_RingPos ); + pxAssert( stackpos == m_ReadPos ); prevCmd = tag; ringposStack.pop_back(); m_lock_Stack.Release(); @@ -330,38 +324,75 @@ void SysMtgsThread::ExecuteTaskInThread() { case GS_RINGTYPE_P1: { + uint datapos = (m_ReadPos+1) & RingBufferMask; const int qsize = tag.data[0]; - const u128* data = &RingBuffer[m_RingPos+1]; + const u128* data = &RingBuffer[datapos]; MTGS_LOG( "(MTGS Packet Read) ringtype=P1, qwc=%u", qsize ); - // make sure that tag>>16 is the MAX size readable - GSgifTransfer1((u32*)(data - 0x400 + qsize), 0x4000-qsize*16); - //GSgifTransfer1((u32*)data, qsize); + uint endpos = datapos + qsize; + if( endpos >= RingBufferSize ) + { + uint firstcopylen = RingBufferSize - datapos; + GSgifTransfer( (u32*)data, firstcopylen ); + datapos = endpos & RingBufferMask; + GSgifTransfer( (u32*)RingBuffer.m_Ring, datapos ); + } + else + { + GSgifTransfer( (u32*)data, qsize ); + } + ringposinc += qsize; } break; case GS_RINGTYPE_P2: { + uint datapos = (m_ReadPos+1) & 
RingBufferMask; const int qsize = tag.data[0]; - const u128* data = &RingBuffer[m_RingPos+1]; + const u128* data = &RingBuffer[datapos]; MTGS_LOG( "(MTGS Packet Read) ringtype=P2, qwc=%u", qsize ); - GSgifTransfer2((u32*)data, qsize); + uint endpos = datapos + qsize; + if( endpos >= RingBufferSize ) + { + uint firstcopylen = RingBufferSize - datapos; + GSgifTransfer2( (u32*)data, firstcopylen ); + datapos = endpos & RingBufferMask; + GSgifTransfer2( (u32*)RingBuffer.m_Ring, datapos ); + } + else + { + GSgifTransfer2( (u32*)data, qsize ); + } + ringposinc += qsize; } break; case GS_RINGTYPE_P3: { + uint datapos = (m_ReadPos+1) & RingBufferMask; const int qsize = tag.data[0]; - const u128* data = &RingBuffer[m_RingPos+1]; + const u128* data = &RingBuffer[datapos]; MTGS_LOG( "(MTGS Packet Read) ringtype=P3, qwc=%u", qsize ); - GSgifTransfer3((u32*)data, qsize); + uint endpos = datapos + qsize; + if( endpos >= RingBufferSize ) + { + uint firstcopylen = RingBufferSize - datapos; + GSgifTransfer3( (u32*)data, firstcopylen ); + datapos = endpos & RingBufferMask; + GSgifTransfer3( (u32*)RingBuffer.m_Ring, datapos ); + } + else + { + GSgifTransfer3( (u32*)data, qsize ); + } + ringposinc += qsize; } break; @@ -370,25 +401,25 @@ void SysMtgsThread::ExecuteTaskInThread() { switch( tag.command ) { - case GS_RINGTYPE_RESTART: - //MTGS_LOG( "(MTGS Packet Read) ringtype=Restart" ); - m_RingPos = 0; - continue; - case GS_RINGTYPE_VSYNC: { const int qsize = tag.data[0]; ringposinc += qsize; - MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", tag.data[0], tag.data[1] ? "true" : "false" ); - + MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", !!(((u32&)RingBuffer.Regs[0x1000]) & 0x2000) ? 0 : 1, tag.data[1] ? "true" : "false" ); + // Mail in the important GS registers. - RingCmdPacket_Vsync& local((RingCmdPacket_Vsync&)RingBuffer[m_RingPos+1]); - memcpy_fast( RingBuffer.Regs, local.regset1, sizeof(local.regset1)); - ((u32&)RingBuffer.Regs[0x1000]) = local.csr; - ((u32&)RingBuffer.Regs[0x1010]) = local.imr; - ((GSRegSIGBLID&)RingBuffer.Regs[0x1080]) = local.siglblid; - + // This seemingly obtuse system is needed in order to handle cases where the vsync data wraps + // around the edge of the ringbuffer. If not for that I'd just use a struct. >_< + + uint datapos = (m_ReadPos+1) & RingBufferMask; + MemCopy_WrappedSrc( RingBuffer.m_Ring, datapos, RingBufferSize, (u128*)RingBuffer.Regs, 0xf ); + + u32* remainder = (u32*)&RingBuffer[datapos]; + ((u32&)RingBuffer.Regs[0x1000]) = remainder[0]; + ((u32&)RingBuffer.Regs[0x1010]) = remainder[1]; + ((GSRegSIGBLID&)RingBuffer.Regs[0x1080]) = (GSRegSIGBLID&)remainder[2]; + // CSR & 0x2000; is the pageflip id. GSvsync(((u32&)RingBuffer.Regs[0x1000]) & 0x2000); gsFrameSkip(); @@ -398,7 +429,13 @@ void SysMtgsThread::ExecuteTaskInThread() if( (GSopen2 == NULL) && (PADupdate != NULL) ) PADupdate(0); + AtomicDecrement( m_QueuedFrameCount ); + if (!!AtomicExchange(m_VsyncSignalListener, false)) + m_sem_Vsync.Post(); + + busy.Release(); StateCheckInThread(); + busy.Acquire(); } break; @@ -438,9 +475,9 @@ void SysMtgsThread::ExecuteTaskInThread() #ifdef PCSX2_DEVBUILD default: - Console.Error("GSThreadProc, bad packet (%x) at m_RingPos: %x, m_WritePos: %x", tag.command, m_RingPos, m_WritePos); + Console.Error("GSThreadProc, bad packet (%x) at m_ReadPos: %x, m_WritePos: %x", tag.command, m_ReadPos, m_WritePos); pxFail( "Bad packet encountered in the MTGS Ringbuffer." 
); - m_RingPos = m_WritePos; + m_ReadPos = m_WritePos; continue; #else // Optimized performance in non-Dev builds. @@ -450,23 +487,29 @@ void SysMtgsThread::ExecuteTaskInThread() } } - uint newringpos = m_RingPos + ringposinc; - pxAssert( newringpos <= RingBufferSize ); - m_RingPos = newringpos & RingBufferMask; + uint newringpos = (m_ReadPos + ringposinc) & RingBufferMask; + + if( EmuConfig.GS.SynchronousMTGS ) + { + pxAssert( m_WritePos == newringpos ); + } + + m_ReadPos = newringpos; if( m_SignalRingEnable != 0 ) { // The EEcore has requested a signal after some amount of processed data. if( AtomicExchangeSub( m_SignalRingPosition, ringposinc ) <= 0 ) { - // Make sure to post the signal after the m_RingPos has been updated... + // Make sure to post the signal after the m_ReadPos has been updated... AtomicExchange( m_SignalRingEnable, 0 ); m_sem_OnRingReset.Post(); continue; } } } - } + + busy.Release(); // Safety valve in case standard signals fail for some reason -- this ensures the EEcore // won't sleep the eternity, even if SignalRingPosition didn't reach 0 for some reason. @@ -479,7 +522,10 @@ void SysMtgsThread::ExecuteTaskInThread() m_sem_OnRingReset.Post(); } - //Console.Warning( "(MTGS Thread) Nothing to do! ringpos=0x%06x", m_RingPos ); + if (!!AtomicExchange(m_VsyncSignalListener, false)) + m_sem_Vsync.Post(); + + //Console.Warning( "(MTGS Thread) Nothing to do! ringpos=0x%06x", m_ReadPos ); } } @@ -519,15 +565,15 @@ void SysMtgsThread::WaitGS() if( m_ExecMode == ExecMode_NoThreadYet || !IsRunning() ) return; if( !pxAssertDev( IsOpen(), "MTGS Warning! WaitGS issued on a closed thread." ) ) return; - if( volatize(m_RingPos) != m_WritePos ) + if( volatize(m_ReadPos) != m_WritePos ) { SetEvent(); RethrowException(); do { - m_lock_RingBufferBusy.Wait(); + m_mtx_RingBufferBusy.Wait(); RethrowException(); - } while( volatize(m_RingPos) != m_WritePos ); + } while( volatize(m_ReadPos) != m_WritePos ); } // Completely synchronize GS and MTGS register states. @@ -546,7 +592,7 @@ void SysMtgsThread::SetEvent() u8* SysMtgsThread::GetDataPacketPtr() const { - return (u8*)&RingBuffer[m_packet_ringpos]; + return (u8*)&RingBuffer[m_packet_writepos & RingBufferMask]; } // Closes the data packet send command, and initiates the gs thread (if needed). @@ -555,31 +601,14 @@ void SysMtgsThread::SendDataPacket() // make sure a previous copy block has been started somewhere. pxAssert( m_packet_size != 0 ); - uint temp = m_packet_ringpos + m_packet_size; - pxAssert( temp <= RingBufferSize ); - temp &= RingBufferMask; + uint actualSize = ((m_packet_writepos - m_packet_startpos) & RingBufferMask)-1; + pxAssert( actualSize <= m_packet_size ); + pxAssert( m_packet_writepos < RingBufferSize ); - if( IsDebugBuild ) - { - if( m_packet_ringpos + m_packet_size < RingBufferSize ) - { - uint readpos = volatize(m_RingPos); - if( readpos != m_WritePos ) - { - // The writepos should never leapfrog the readpos - // since that indicates a bad write. 
- if( m_packet_ringpos < readpos ) - pxAssert( temp < readpos ); + PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos]; + tag.data[0] = actualSize; - // Updating the writepos should never make it equal the readpos, since - // that would stop the buffer prematurely (and indicates bad code in the - // ringbuffer manager) - pxAssert( readpos != temp ); - } - } - - m_WritePos = temp; + m_WritePos = m_packet_writepos; if( EmuConfig.GS.SynchronousMTGS ) { @@ -596,142 +625,95 @@ void SysMtgsThread::SendDataPacket() //m_PacketLocker.Release(); } -int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) +void SysMtgsThread::GenericStall( uint size ) { // Note on volatiles: m_WritePos is not modified by the GS thread, so there's no need // to use volatile reads here. We do cache it though, since we know it never changes // for the duration of this call. - uint writepos = m_WritePos; - - // Checks if a previous copy was started without an accompanying call to GSRINGBUF_DONECOPY - pxAssert( m_packet_size == 0 ); + const uint writepos = m_WritePos; // Sanity checks! (within the confines of our ringbuffer please!) pxAssert( size < RingBufferSize ); pxAssert( writepos < RingBufferSize ); + // generic gs wait/stall. + // if the writepos is past the readpos then we're safe. + // But if not then we need to make sure the readpos is outside the scope of + // the block about to be written (writepos + size) + + uint readpos = volatize(m_ReadPos); + uint freeroom; + + if (writepos < readpos) + freeroom = readpos - writepos; + else + freeroom = RingBufferSize - (writepos - readpos); + + if (freeroom <= size) + { + // writepos will overlap readpos if we commit the data, so we need to wait until + // readpos is out past the end of the future write pos, or until it wraps around + // (in which case writepos will be >= readpos). + + // Ideally though we want to wait longer, because if we just toss in this packet + // the next packet will likely stall up too. So let's set a condition for the MTGS + // thread to wake up the EE once there's a sizable chunk of the ringbuffer emptied. + + uint somedone = (RingBufferSize - freeroom) / 4; + if( somedone < size+1 ) somedone = size + 1; + + // FMV Optimization: FMVs typically send *very* little data to the GS, in some cases + // every other frame is nothing more than a page swap. Sleeping the EEcore is a + // waste of time, and we get better results using a spinwait. + + if( somedone > 0x80 ) + { + pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" ); + m_SignalRingPosition = somedone; + + //Console.WriteLn( Color_Blue, "(EEcore Sleep) PrepDataPacket \tringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", readpos, writepos, m_SignalRingPosition ); + + while(true) { + AtomicExchange( m_SignalRingEnable, 1 ); + SetEvent(); + m_sem_OnRingReset.WaitWithoutYield(); + readpos = volatize(m_ReadPos); + //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); + + if (writepos < readpos) + freeroom = readpos - writepos; + else + freeroom = RingBufferSize - (writepos - readpos); + + if (freeroom > size) break; + } + + pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); + } + else + { + //Console.WriteLn( Color_StrongGray, "(EEcore Spin) PrepDataPacket!"
); + SetEvent(); + while(true) { + SpinWait(); + readpos = volatize(m_ReadPos); + + if (writepos < readpos) + freeroom = readpos - writepos; + else + freeroom = RingBufferSize - (writepos - readpos); + + if (freeroom > size) break; + } + } + } +} + +void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) +{ m_packet_size = size; ++size; // takes into account our RingCommand QWC. - - if( writepos + size < RingBufferSize ) - { - // generic gs wait/stall. - // if the writepos is past the readpos then we're safe. - // But if not then we need to make sure the readpos is outside the scope of - // the block about to be written (writepos + size) - - uint readpos = volatize(m_RingPos); - if( (writepos < readpos) && (writepos+size >= readpos) ) - { - // writepos is behind the readpos and will overlap it if we commit the data, - // so we need to wait until readpos is out past the end of the future write pos, - // or until it wraps around (in which case writepos will be >= readpos). - - // Ideally though we want to wait longer, because if we just toss in this packet - // the next packet will likely stall up too. So lets set a condition for the MTGS - // thread to wake up the EE once there's a sizable chunk of the ringbuffer emptied. - - uint totalAccum = (m_RingWrapSpot - readpos) + writepos; - uint somedone = totalAccum / 4; - if( somedone < size+1 ) somedone = size + 1; - - // FMV Optimization: FMVs typically send *very* little data to the GS, in some cases - // every other frame is nothing more than a page swap. Sleeping the EEcore is a - // waste of time, and we get better results using a spinwait. - - if( somedone > 0x80 ) - { - pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" ); - m_SignalRingPosition = somedone; - - //Console.WriteLn( Color_Blue, "(EEcore Sleep) GenStall \tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", readpos, writepos, m_RingWrapSpot, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); - } while( (writepos < readpos) && (writepos+size >= readpos) ); - - pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); - } - else - { - SetEvent(); - do { - SpinWait(); - readpos = volatize(m_RingPos); - } while( (writepos < readpos) && (writepos+size >= readpos) ); - } - } - } - else if( writepos + size > RingBufferSize ) - { - pxAssert( writepos != 0 ); - - // If the incoming packet doesn't fit, then start over from the start of the ring - // buffer (it's a lot easier than trying to wrap the packet around the end of the - // buffer). - - //Console.WriteLn( "MTGS > Ringbuffer Got Filled!"); - RestartRingbuffer( size ); - writepos = m_WritePos; - } - else // always true - if( writepos + size == MTGS_RINGBUFFEREND ) - { - // Yay. Perfect fit. What are the odds? - // Copy is ready so long as readpos is less than writepos and *not* equal to the - // base of the ringbuffer (otherwise the buffer will stop when the writepos is - // wrapped around to zero later-on in SendDataPacket). - - uint readpos = volatize(m_RingPos); - //Console.WriteLn( "MTGS > Perfect Fit!\tringpos=0x%06x, writepos=0x%06x", readpos, writepos ); - if( readpos > writepos || readpos == 0 ) - { - uint totalAccum = (readpos == 0) ? 
RingBufferSize : ((m_RingWrapSpot - readpos) + writepos); - uint somedone = totalAccum / 4; - if( somedone < size+1 ) somedone = size + 1; - - // FMV Optimization: (see above) This condition of a perfect fit is so rare that optimizing - // for it is pointless -- but it was also mindlessly simple copy-paste. So there. :p - - if( somedone > 0x80 ) - { - m_SignalRingPosition = somedone; - - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Sleep!\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Post-sleep Report!\tringpos=0x%06x", readpos ); - } while( (writepos < readpos) || (readpos==0) ); - - pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); - } - else - { - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Spin!" ); - SetEvent(); - do { - SpinWait(); - readpos = volatize(m_RingPos); - } while( (writepos < readpos) || (readpos==0) ); - } - - m_QueuedFrameCount = 0; - m_RingWrapSpot = RingBufferSize; - } - -#ifdef RINGBUF_DEBUG_STACK - m_lock_Stack.Lock(); - ringposStack.push_front( writepos ); - m_lock_Stack.Release(); -#endif + GenericStall(size); // Command qword: Low word is the command, and the high word is the packet // length in SIMDs (128 bits). @@ -739,9 +721,8 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; tag.command = cmd; tag.data[0] = m_packet_size; - m_packet_ringpos = m_WritePos + 1; - - return m_packet_size; + m_packet_startpos = m_WritePos; + m_packet_writepos = (m_WritePos + 1) & RingBufferMask; } // Returns the amount of giftag data processed (in simd128 values). // Return value is used by VU1's XGKICK instruction to wrap the data // around VU memory instead of having buffer overflow... // Parameters: // size - size of the packet data, in simd128's -int SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size ) +void SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, u32 size ) { //m_PacketLocker.Acquire(); - return PrepDataPacket( (MTGS_RingCommand)pathidx, GIFPath_ParseTag(pathidx, srcdata, size) ); + PrepDataPacket( (MTGS_RingCommand)pathidx, size ); } -void SysMtgsThread::RestartRingbuffer( uint packsize ) +__forceinline void SysMtgsThread::_FinishSimplePacket() { - if( m_WritePos == 0 ) return; - const uint thefuture = packsize; - - //Console.WriteLn( Color_Magenta, "**** Ringbuffer Restart!!" ); - // Always kick the MTGS into action for a ringbuffer restart. - SetEvent(); - - uint readpos = volatize(m_RingPos); - - if( (readpos > m_WritePos) || (readpos <= thefuture) ) - { - // We have to be careful not to leapfrog our read-position, which would happen if - // it's greater than the current write position (since wrapping writepos to 0 would - // be the act of skipping PAST readpos). Stall until it loops around to the - // beginning of the buffer, and past the size of our packet allocation.
- - uint somedone; - - if( readpos > m_WritePos ) - somedone = (m_RingWrapSpot - readpos) + packsize + 1; - else - somedone = (packsize + 1) - readpos; - - if( somedone > 0x80 ) - { - m_SignalRingPosition = somedone; - //Console.WriteLn( Color_Blue, "(EEcore Sleep) Restart!\tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", - // readpos, m_WritePos, m_RingWrapSpot, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); - } while( (readpos > m_WritePos) || (readpos <= thefuture) ); - } - else - { - SetEvent(); - do { - SpinWait(); - readpos = volatize(m_RingPos); - } while( (readpos > m_WritePos) || (readpos <= thefuture) ); - } - } - - PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; - - tag.command = GS_RINGTYPE_RESTART; - - m_RingWrapSpot = m_WritePos; - m_WritePos = 0; - m_QueuedFrameCount = 0; - - if( EmuConfig.GS.SynchronousMTGS ) - WaitGS(); -} - -__forceinline uint SysMtgsThread::_PrepForSimplePacket() -{ -#ifdef RINGBUF_DEBUG_STACK - m_lock_Stack.Lock(); - ringposStack.push_front( m_WritePos ); - m_lock_Stack.Release(); -#endif - - uint future_writepos = m_WritePos+1; - pxAssert( future_writepos <= RingBufferSize ); - - future_writepos &= RingBufferMask; - if( future_writepos == 0 ) - { - m_QueuedFrameCount = 0; - m_RingWrapSpot = RingBufferSize; - } - - uint readpos = volatize(m_RingPos); - if( future_writepos == readpos ) - { - // The ringbuffer read pos is blocking the future write position, so stall out - // until the read position has moved. - - uint totalAccum = (m_RingWrapSpot - readpos) + future_writepos; - uint somedone = totalAccum / 4; - - if( somedone > 0x80 ) - { - m_SignalRingPosition = somedone; - - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Simple Sleep!\t\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition ); - - do { - AtomicExchange( m_SignalRingEnable, 1 ); - SetEvent(); - m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_RingPos); - //Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Simple Post-sleep Report!\tringpos=0x%06x", readpos ); - } while( future_writepos == readpos ); - - pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" ); - } - else - { - SetEvent(); - do { - SpinWait(); - } while( future_writepos == volatize(m_RingPos) ); - } - } - - return future_writepos; -} - -__forceinline void SysMtgsThread::_FinishSimplePacket( uint future_writepos ) -{ - pxAssert( future_writepos != volatize(m_RingPos) ); + uint future_writepos = (m_WritePos+1) & RingBufferMask; + pxAssert( future_writepos != volatize(m_ReadPos) ); m_WritePos = future_writepos; if( EmuConfig.GS.SynchronousMTGS ) @@ -887,7 +753,7 @@ void SysMtgsThread::SendSimplePacket( MTGS_RingCommand type, int data0, int data { //ScopedLock locker( m_PacketLocker ); - const uint thefuture = _PrepForSimplePacket(); + GenericStall(1); PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; tag.command = type; @@ -895,21 +761,21 @@ void SysMtgsThread::SendSimplePacket( MTGS_RingCommand type, int data0, int data tag.data[1] = data1; tag.data[2] = data2; - _FinishSimplePacket( thefuture ); + _FinishSimplePacket(); } void SysMtgsThread::SendPointerPacket( MTGS_RingCommand type, u32 data0, void* data1 ) { //ScopedLock locker( m_PacketLocker ); - const uint thefuture = 
_PrepForSimplePacket(); + GenericStall(1); PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; tag.command = type; tag.data[0] = data0; *(uptr*)&tag.data[1] = (uptr)data1; - _FinishSimplePacket( thefuture ); + _FinishSimplePacket(); } void SysMtgsThread::SendGameCRC( u32 crc ) diff --git a/pcsx2/Pcsx2Config.cpp b/pcsx2/Pcsx2Config.cpp index efa01f4c59..a1aa88f307 100644 --- a/pcsx2/Pcsx2Config.cpp +++ b/pcsx2/Pcsx2Config.cpp @@ -217,6 +217,7 @@ Pcsx2Config::GSOptions::GSOptions() SynchronousMTGS = false; DisableOutput = false; + VsyncQueueSize = 2; DefaultRegionMode = Region_NTSC; FramesToDraw = 2; @@ -234,6 +235,7 @@ void Pcsx2Config::GSOptions::LoadSave( IniInterface& ini ) IniEntry( SynchronousMTGS ); IniEntry( DisableOutput ); + IniEntry( VsyncQueueSize ); IniEntry( FrameLimitEnable ); IniEntry( FrameSkipEnable ); diff --git a/pcsx2/PluginManager.cpp b/pcsx2/PluginManager.cpp index 080f1f5e9d..558a12180f 100644 --- a/pcsx2/PluginManager.cpp +++ b/pcsx2/PluginManager.cpp @@ -144,6 +144,7 @@ static s32 CALLBACK fallback_test() { return 0; } _GSvsync GSvsync; _GSopen GSopen; _GSopen2 GSopen2; +_GSgifTransfer GSgifTransfer; _GSgifTransfer1 GSgifTransfer1; _GSgifTransfer2 GSgifTransfer2; _GSgifTransfer3 GSgifTransfer3; @@ -309,7 +310,8 @@ static const LegacyApi_ReqMethod s_MethMessReq_GS[] = { { "GSopen", (vMeth**)&GSopen, NULL }, { "GSvsync", (vMeth**)&GSvsync, NULL }, - { "GSgifTransfer1", (vMeth**)&GSgifTransfer1, NULL }, + { "GSgifTransfer", (vMeth**)&GSgifTransfer, NULL }, + //{ "GSgifTransfer1", (vMeth**)&GSgifTransfer1, NULL }, { "GSgifTransfer2", (vMeth**)&GSgifTransfer2, NULL }, { "GSgifTransfer3", (vMeth**)&GSgifTransfer3, NULL }, { "GSreadFIFO2", (vMeth**)&GSreadFIFO2, NULL }, diff --git a/pcsx2/VUops.cpp b/pcsx2/VUops.cpp index 6baaacebad..0172fd1d9f 100644 --- a/pcsx2/VUops.cpp +++ b/pcsx2/VUops.cpp @@ -2057,21 +2057,8 @@ void _vuXGKICK(VURegs * VU) u8* data = ((u8*)VU->Mem + ((VU->VI[_Is_].US[0]*16) & 0x3fff)); u32 size; - size = GetMTGS().PrepDataPacket( GIF_PATH_1, data, (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4); - u8* pmem = GetMTGS().GetDataPacketPtr(); - - if((size << 4) > (u32)(0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff))) - { - //DevCon.Warning("addr + Size = 0x%x, transferring %x then doing %x", ((VU->VI[_Is_].US[0]*16) & 0x3fff) + (size << 4), (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4, size - (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff) >> 4)); - memcpy_aligned(pmem, (u8*)VU->Mem+((VU->VI[_Is_].US[0]*16) & 0x3fff), 0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)); - size -= (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4; - //DevCon.Warning("Size left %x", size); - pmem += 0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff); - memcpy_aligned(pmem, (u8*)VU->Mem, size<<4); - } - else { - memcpy_aligned(pmem, (u8*)VU->Mem+((VU->VI[_Is_].US[0]*16) & 0x3fff), size<<4); - } + GetMTGS().PrepDataPacket( GIF_PATH_1, 0x400 ); + size = GIFPath_CopyTag( GIF_PATH_1, (u128*)data, (0x400-(VU->VI[_Is_].US[0] & 0x3ff)) ); GetMTGS().SendDataPacket(); } diff --git a/pcsx2/Vif1_Dma.cpp b/pcsx2/Vif1_Dma.cpp index c7e42ad814..8f738a590e 100644 --- a/pcsx2/Vif1_Dma.cpp +++ b/pcsx2/Vif1_Dma.cpp @@ -345,7 +345,6 @@ __forceinline void vif1Interrupt() if(GSTransferStatus.PTH2 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH2) { - gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; if(gifRegs->stat.P1Q) gsPath1Interrupt(); } @@ -440,11 +439,6 @@ __forceinline void vif1Interrupt() if (vif1.cmd != 0) Console.WriteLn("vif1.cmd still set %x tag size %x", vif1.cmd, 
vif1.tag.size); #endif - - if((vif1ch->chcr.DIR == VIF_NORMAL_TO_MEM_MODE) && vif1.GSLastDownloadSize <= 16) - { //Reverse fifo has finished and nothing is left, so lets clear the outputting flag - gifRegs->stat.OPH = false; - } vif1ch->chcr.STR = false; vif1.vifstalled = false; g_vifCycles = 0; diff --git a/pcsx2/Vif1_MFIFO.cpp b/pcsx2/Vif1_MFIFO.cpp index 64ff291b90..86f8008e6f 100644 --- a/pcsx2/Vif1_MFIFO.cpp +++ b/pcsx2/Vif1_MFIFO.cpp @@ -239,7 +239,6 @@ void vifMFIFOInterrupt() if(GSTransferStatus.PTH2 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH2) { GSTransferStatus.PTH2 = STOPPED_MODE; - if(gifRegs->stat.DIR == 0)gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; if(gifRegs->stat.P1Q) gsPath1Interrupt(); /*gifRegs->stat.APATH = GIF_APATH_IDLE; diff --git a/pcsx2/Vif_Codes.cpp b/pcsx2/Vif_Codes.cpp index fb2fb3a9f9..d7208d1976 100644 --- a/pcsx2/Vif_Codes.cpp +++ b/pcsx2/Vif_Codes.cpp @@ -167,10 +167,16 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { return 0; } - - + // HACK ATTACK! + // we shouldn't be clearing the queue flag here at all. Ideally, the queue statuses + // should be checked, handled, and cleared from the EOP check in GIFPath only. --air gifRegs->stat.clear_flags(GIF_STAT_P2Q); + // the tag size should ALWAYS be 128 bits (qwc). If it isn't, it means there's a serious bug + // somewhere in the VIF (likely relating to +/-'ing the tag.size during processing). + // NOTE: ICO [PAL] exploits this during bootup. Needs investigation. --air + //pxAssumeMsg( (vif1.tag.size & 3) == 0, "Invalid Vif1 DIRECT packet size detected!" ); + nVifStruct& v = nVif[1]; const int ret = aMin(vif1.vifpacketsize, vif1.tag.size); u32 size = ret << 2; @@ -184,8 +190,6 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { if(vif1.vifpacketsize < 4 && v.bSize < 16) { - nVifStruct& v = nVif[idx]; - memcpy(&v.buffer[v.bPtr], data, vif1.vifpacketsize << 2); v.bSize += vif1.vifpacketsize << 2; v.bPtr += vif1.vifpacketsize << 2; @@ -199,7 +203,6 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { } else { - nVifStruct& v = nVif[idx]; if(v.bSize) { int ret = 0; @@ -213,8 +216,8 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { v.bSize = 0; v.bPtr = 0; } - const uint count = GetMTGS().PrepDataPacket(GIF_PATH_2, v.buffer, 1); - memcpy_fast(GetMTGS().GetDataPacketPtr(), v.buffer, count << 4); + GetMTGS().PrepDataPacket(GIF_PATH_2, 1); + GIFPath_CopyTag(GIF_PATH_2, (u128*)v.buffer, 1); GetMTGS().SendDataPacket(); if(vif1.tag.size == 0) @@ -226,16 +229,17 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) { } else { - const uint count = GetMTGS().PrepDataPacket(GIF_PATH_2, data, size >> 4); - memcpy_fast(GetMTGS().GetDataPacketPtr(), data, count << 4); + GetMTGS().PrepDataPacket(GIF_PATH_2, size/16); + uint count = GIFPath_CopyTag(GIF_PATH_2, (u128*)data, size/16) * 4; GetMTGS().SendDataPacket(); - vif1.tag.size -= count << 2; + + vif1.tag.size -= count; if(vif1.tag.size == 0) { vif1.cmd = 0; } vif1.vifstalled = true; - return count << 2; + return count; } } diff --git a/pcsx2/Vif_Transfer.cpp b/pcsx2/Vif_Transfer.cpp index a5f80db686..21aafa5529 100644 --- a/pcsx2/Vif_Transfer.cpp +++ b/pcsx2/Vif_Transfer.cpp @@ -36,16 +36,8 @@ _vifT bool analyzeIbit(u32* &data, int iBit) { if (iBit && !vifX.cmd && !vifXRegs->err.MII) { //DevCon.WriteLn("Vif I-Bit IRQ"); vifX.irq++; - // On i-bit, the command is run, vif stalls etc, - // however if the vifcode is MARK, you do NOT stall, just 
send IRQ. - Max Payne shows this up. - //if(((vifXRegs->code >> 24) & 0x7f) == 0x7) return 0; - // If we have a vifcode with i-bit, the following instruction - // should stall unless its MARK?.. we test that case here... - // Not 100% sure if this is the correct behavior, so printing - // a console message to see games that use this. (cottonvibes) - - // Okay did some testing with Max Payne, it does this + // Okay did some testing with Max Payne, it does this: // VifMark value = 0x666 (i know, evil!) // NOP with I Bit // VifMark value = 0 @@ -53,6 +45,23 @@ _vifT bool analyzeIbit(u32* &data, int iBit) { // If you break after the 2nd Mark has run, the game reports invalid mark 0 and the game dies. // So it has to occur here, testing a theory that it only doesn't stall if the command with // the iBit IS mark, but still sends the IRQ to let the cpu know the mark is there. (Refraction) + // + // -------------------------- + // + // This is how it probably works: i-bit sets the IRQ flag, and VIF keeps running until it encounters + // a non-MARK instruction. This includes the *current* instruction. ie, execution only continues + // unimpeded if MARK[i] is specified, and keeps executing unimpeded until any non-MARK command. + // Any other command with an I bit should stall immediately. + // Example: + // + // VifMark[i] value = 0x321 (with I bit) + // VifMark value = 0 + // VifMark value = 0x333 + // NOP + // + // ... the VIF should not stall and raise the interrupt until after the NOP is processed. + // So the final value for MARK as the game sees it will be 0x333. --air + return runMark(data); } return 0; diff --git a/pcsx2/gui/AppAssert.cpp b/pcsx2/gui/AppAssert.cpp index 6d33082872..87f55d2449 100644 --- a/pcsx2/gui/AppAssert.cpp +++ b/pcsx2/gui/AppAssert.cpp @@ -134,10 +134,10 @@ bool AppDoAssert( const DiagnosticOrigin& origin, const wxChar *msg ) wxString trace( pxGetStackTrace(origin.function) ); wxString dbgmsg( origin.ToString( msg ) ); - wxMessageOutputDebug().Printf( dbgmsg ); + wxMessageOutputDebug().Printf( L"%s", dbgmsg ); - Console.Error( dbgmsg ); - Console.WriteLn( trace ); + Console.Error( L"%s", dbgmsg ); + Console.WriteLn( L"%s", trace ); wxString windowmsg( L"Assertion failed: " ); if( msg != NULL ) diff --git a/pcsx2/gui/AppInit.cpp b/pcsx2/gui/AppInit.cpp index 70985ad576..7d36249b39 100644 --- a/pcsx2/gui/AppInit.cpp +++ b/pcsx2/gui/AppInit.cpp @@ -189,13 +189,13 @@ void Pcsx2App::DetectCpuAndUserMode() x86caps.CountCores(); x86caps.SIMD_EstablishMXCSRmask(); - if( !x86caps.hasMultimediaExtensions ) + if( !x86caps.hasMultimediaExtensions || !x86caps.hasStreamingSIMDExtensions ) { - // Note: due to memcpy_fast, we need minimum MMX even for interpreters. This will - // hopefully change later once we have a dynamically recompiled memcpy. + // Note: Due to optimizations to GIFpath parsers, memcpy, and possibly other things, we need + // a bare minimum of SSE supported by the CPU. throw Exception::HardwareDeficiency() - .SetDiagMsg(L"Critical Failure: MMX Extensions not available.") - .SetUserMsg(_("MMX extensions are not available. PCSX2 requires cpu with MMX extension support to run.")); + .SetDiagMsg(L"Critical Failure: SSE Extensions not available.") + .SetUserMsg(_("SSE extensions are not available. 
PCSX2 requires a cpu that supports the SSE instruction set.")); } ReadUserModeSettings(); diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp index b311361c84..1c9366ec21 100644 --- a/pcsx2/ps2/GIFpath.cpp +++ b/pcsx2/ps2/GIFpath.cpp @@ -19,6 +19,7 @@ #include "Gif.h" #include "Vif_Dma.h" #include "Vif.h" +#include <xmmintrin.h> // -------------------------------------------------------------------------------------- // GIFpath -- the GIFtag Parser // -------------------------------------------------------------------------------------- @@ -92,12 +93,16 @@ struct GIFPath void Reset(); void PrepPackedRegs(); - void SetTag(const void* mem); bool StepReg(); u8 GetReg(); bool IsActive() const; - int ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size); + template< bool Aligned > + void SetTag(const void* mem); + + template< GIF_PATH pathidx, bool Aligned > + int CopyTag(const u128* pMem, u32 size); + int ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size); }; @@ -285,9 +290,11 @@ __forceinline void GIFPath::PrepPackedRegs() } } + +template< bool Aligned > __forceinline void GIFPath::SetTag(const void* mem) { - const_cast<GIFTAG&>(tag) = *((GIFTAG*)mem); + _mm_store_ps( (float*)&tag, Aligned ? _mm_load_ps((const float*)mem) : _mm_loadu_ps((const float*)mem) ); nloop = tag.NLOOP; curreg = 0; @@ -350,7 +357,8 @@ static __forceinline void gsHandler(const u8* pMem) // qwords, rounded down; any extra bits are lost // games must take care to ensure transfer rectangles are exact multiples of a qword vif1.GSLastDownloadSize = vif1.TRXREG.RRW * vif1.TRXREG.RRH * bpp >> 7; - gifRegs->stat.OPH = true; + //DevCon.Warning("GS download in progress. OPH = %x", gifRegs->stat.OPH); + //gifRegs->stat.OPH = true; // Too early to set it here. It should be done on a BUSDIR call (rama) } } if (reg >= 0x60) @@ -371,10 +379,9 @@ static __forceinline void gsHandler(const u8* pMem) #define aMin(x, y) std::min(x, y) // Parameters: -// size (path1) - difference between the end of VU memory and pMem. -// size (path2/3) - max size of incoming data stream, in qwc (simd128) - - +// size - max size of incoming data stream, in qwc (simd128). If the path is PATH1, and the +// path does not terminate (EOP) within the specified size, it is assumed that the path must +// loop around to the start of VU memory and continue processing. 
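For reference, the new templated SetTag above reduces to a single 128-bit SSE copy of the GIFtag, with source alignment decided at compile time. A minimal stand-alone sketch of the same idiom (the function name here is illustrative, not part of the patch):

#include <xmmintrin.h>

// Copy one 16-byte GIFtag via SSE. 'Aligned' is a compile-time promise about
// the source pointer: _mm_load_ps requires 16-byte alignment, while
// _mm_loadu_ps tolerates any address. The destination (the tag member) is
// always 16-byte aligned, so a plain _mm_store_ps suffices.
template< bool Aligned >
void CopyGifTag128(void* dest, const void* src)
{
	__m128 v = Aligned ? _mm_load_ps((const float*)src)
	                   : _mm_loadu_ps((const float*)src);
	_mm_store_ps((float*)dest, v);
}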
__forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size) { u32 startSize = size; // Start Size @@ -382,7 +389,7 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s while (size > 0) { if (!nloop) { - SetTag(pMem); + SetTag<false>(pMem); incTag(1); } else @@ -509,6 +516,7 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s Console.Warning("GIFTAG error, size exceeded VU memory size %x", startSize); nloop = 0; + const_cast<GIFTAG&>(tag).EOP = 1; } } } @@ -521,15 +529,65 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s return size; } -__forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) +__forceinline void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len ) { + uint endpos = destStart + len; + if( endpos < destSize ) + { + memcpy_qwc(&destBase[destStart], src, len ); + destStart += len; + } + else + { + uint firstcopylen = destSize - destStart; + memcpy_qwc(&destBase[destStart], src, firstcopylen ); + + destStart = endpos % destSize; + memcpy_qwc(destBase, src+firstcopylen, destStart ); + } +} + +__forceinline void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len ) +{ + uint endpos = srcStart + len; + if( endpos < srcSize ) + { + memcpy_qwc(dest, &srcBase[srcStart], len ); + srcStart += len; + } + else + { + uint firstcopylen = srcSize - srcStart; + memcpy_qwc(dest, &srcBase[srcStart], firstcopylen ); + + srcStart = endpos % srcSize; + memcpy_qwc(dest+firstcopylen, srcBase, srcStart ); + } +} + +#define copyTag() do { \ + _mm_store_ps( (float*)&RingBuffer.m_Ring[ringpos], Aligned ? _mm_load_ps((float*)pMem128) : _mm_loadu_ps((float*)pMem128)); \ + ++pMem128; --size; \ + ringpos = (ringpos+1)&RingBufferMask; \ +} while(false) + +// Parameters: +// size - max size of incoming data stream, in qwc (simd128). If the path is PATH1, and the +// path does not terminate (EOP) within the specified size, it is assumed that the path must +// loop around to the start of VU memory and continue processing. +template< GIF_PATH pathidx, bool Aligned > +__forceinline int GIFPath::CopyTag(const u128* pMem128, u32 size) +{ + uint& ringpos = GetMTGS().m_packet_writepos; + const uint original_ringpos = ringpos; + u32 startSize = size; // Start Size while (size > 0) { if (!nloop) { - SetTag(pMem); - incTag(1); + SetTag<Aligned>((u8*)pMem128); + copyTag(); if(nloop > 0) { @@ -560,7 +618,7 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) } if(GSTransferStatus.PTH3 < PENDINGSTOP_MODE || pathidx != 2) { - gifRegs->stat.OPH = true; + //gifRegs->stat.OPH = true; // why set the GS output path flag here? (rama) gifRegs->stat.APATH = pathidx + 1; } @@ -588,7 +646,7 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) break; } gifRegs->stat.APATH = pathidx + 1; - gifRegs->stat.OPH = true; + //gifRegs->stat.OPH = true; // why set the GS output path flag here? 
(rama) switch(tag.FLG) { case GIF_FLG_PACKED: @@ -599,9 +657,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) { do { if (GetReg() == 0xe) { - gsHandler(pMem); + gsHandler((u8*)pMem128); } - incTag(1); + copyTag(); } while(StepReg() && size > 0 && SIGNAL_IMR_Pending == false); } else @@ -644,11 +702,14 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) curreg = 0; nloop = 0; } - incTag(len); + + MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len ); + pMem128 += len; + size -= len; } break; case GIF_FLG_REGLIST: - { + { GIF_LOG("Reglist Mode EOP %x", tag.EOP); // In reglist mode, the GIF packs 2 registers into each QWC. The nloop however @@ -687,8 +748,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) nloop = 0; } - incTag(len); - + MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len ); + pMem128 += len; + size -= len; } break; case GIF_FLG_IMAGE: @@ -696,13 +758,15 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) { GIF_LOG("IMAGE Mode EOP %x", tag.EOP); int len = aMin(size, nloop); - incTag(len); + + MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len ); + + pMem128 += len; + size -= len; nloop -= len; } break; } - - } if(pathidx == GIF_PATH_1) @@ -713,11 +777,11 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) { size = 0x3ff - startSize; startSize = 0x3ff; - pMem -= 0x4000; + pMem128 -= 0x400; } else { - // Note: The BIOS does an XGKICK on the VU1 and lets yt DMA to the GS without an EOP + // Note: The BIOS does an XGKICK on the VU1 and lets it DMA to the GS without an EOP + // (seemingly to loop forever), only to write an EOP later on. No other game is known to // do anything of the sort. // So lets just cap the DMA at 16k, and force it to "look" like it's terminated for now. @@ -727,6 +791,12 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) Console.Warning("GIFTAG error, size exceeded VU memory size %x", startSize); nloop = 0; + const_cast<GIFTAG&>(tag).EOP = 1; + + // Don't send the packet to the GS -- it's incomplete and might cause the GS plugin + // to get confused and die. >_< + + ringpos = original_ringpos; } } } @@ -749,6 +819,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) gsIrq(); } } + + // [TODO] : DMAC Arbitration rights should select the next queued GIF transfer here. + break; } if(SIGNAL_IMR_Pending == true) @@ -793,47 +866,40 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) gif->qwc -= size; } } - - return size; } -// Processes a GIFtag & packet, and throws out some gsIRQs as needed. -// Used to keep interrupts in sync with the EE, while the GS itself -// runs potentially several frames behind. // Parameters: -// size - max size of incoming data stream, in qwc (simd128) -__forceinline int GIFPath_ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size) +// size - max size of incoming data stream, in qwc (simd128). If the path is PATH1, and the +// path does not terminate (EOP) within the specified size, it is assumed that the path must +// loop around to the start of VU memory and continue processing. 
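The MemCopy_WrappedDest/MemCopy_WrappedSrc helpers introduced above split a qwc copy into at most two memcpy_qwc calls whenever the transfer crosses the end of the ring. A small usage sketch, with hypothetical sizes:

// Copy 8 qwords into a 16-qword ring starting at slot 12: the first 4 qwords
// fill slots 12..15, the remaining 4 wrap around into slots 0..3, and the
// write position is left at 4 afterwards.
u128 ring[16];
u128 src[8];
uint writepos = 12;
MemCopy_WrappedDest( src, ring, writepos, 16, 8 );
// writepos == (12 + 8) % 16 == 4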
+__forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size) { -#ifdef PCSX2_GSRING_SAMPLING_STATS - static uptr profStartPtr = 0; - static uptr profEndPtr = 0; - if (profStartPtr == 0) { - __asm - { - __beginfunc: - mov profStartPtr, offset __beginfunc; - mov profEndPtr, offset __endfunc; - } - ProfilerRegisterSource( "GSRingBufCopy", (void*)profStartPtr, profEndPtr - profStartPtr ); - } -#endif - - int retSize = s_gifPath[pathidx].ParseTag(pathidx, pMem, size); - -#ifdef PCSX2_GSRING_SAMPLING_STATS - __asm + switch( pathidx ) { - __endfunc: - nop; + case GIF_PATH_1: + pxAssertMsg(!s_gifPath[GIF_PATH_2].IsActive(), "GIFpath conflict: Attempted to start PATH1 while PATH2 is already active."); + pxAssertMsg(!s_gifPath[GIF_PATH_3].IsActive() || (GSTransferStatus.PTH3 == IMAGE_MODE), "GIFpath conflict: Attempted to start PATH1 while PATH3 is already active."); + return s_gifPath[GIF_PATH_1].CopyTag<GIF_PATH_1,true>(pMem, size); + case GIF_PATH_2: + pxAssertMsg(!s_gifPath[GIF_PATH_1].IsActive(), "GIFpath conflict: Attempted to start PATH2 while PATH1 is already active."); + pxAssertMsg(!s_gifPath[GIF_PATH_3].IsActive() || (GSTransferStatus.PTH3 == IMAGE_MODE), "GIFpath conflict: Attempted to start PATH2 while PATH3 is already active."); + return s_gifPath[GIF_PATH_2].CopyTag<GIF_PATH_2,false>(pMem, size); + case GIF_PATH_3: + pxAssertMsg(!s_gifPath[GIF_PATH_1].IsActive(), "GIFpath conflict: Attempted to start PATH3 while PATH1 is already active."); + pxAssertMsg(!s_gifPath[GIF_PATH_2].IsActive(), "GIFpath conflict: Attempted to start PATH3 while PATH2 is already active."); + return s_gifPath[GIF_PATH_3].CopyTag<GIF_PATH_3,true>(pMem, size); + + jNO_DEFAULT; } -#endif - return retSize; + + return 0; // unreachable } -//Quick version for queueing PATH1 data - +// Quick version for queuing PATH1 data. +// This version calculates the real length of the packet data only. It does not process +// IRQs or DMA status updates. 
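With the parser now writing directly into the MTGS ring, the call sites touched by this patch (VUops.cpp, Vif_Codes.cpp, microVU_Lower.inl, sVU_Lower.cpp) converge on one three-step pattern, roughly sketched as:

// Reserve worst-case ring space, let the GIFtag parser copy (and wrap) the
// data itself, then submit the packet to the MTGS thread. Sizes are in qwc.
// 'data' and 'qwcToVuMemEnd' stand in for the caller-computed source pointer
// and the qwc count remaining before the end of VU memory.
GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400);
uint copied = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, qwcToVuMemEnd);
GetMTGS().SendDataPacket();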
__forceinline int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size) { int retSize = s_gifPath[pathidx].ParseTagQuick(pathidx, pMem, size); diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp index fe4510b4a5..87dbeb055c 100644 --- a/pcsx2/x86/ix86-32/iR5900-32.cpp +++ b/pcsx2/x86/ix86-32/iR5900-32.cpp @@ -1258,11 +1258,11 @@ void recompileNextInstruction(int delayslot) // Calling of this function can be enabled or disabled through the use of EmuConfig.Recompiler.PreBlockChecks static void __fastcall PreBlockCheck( u32 blockpc ) { - static int lastrec = 0; + /*static int lastrec = 0; static int curcount = 0; const int skip = 0; - /*if( blockpc != 0x81fc0 ) {//&& lastrec != g_lastpc ) { + if( blockpc != 0x81fc0 ) {//&& lastrec != g_lastpc ) { curcount++; if( curcount > skip ) { diff --git a/pcsx2/x86/microVU_Lower.inl b/pcsx2/x86/microVU_Lower.inl index 7b1cb1edbb..41a38861fd 100644 --- a/pcsx2/x86/microVU_Lower.inl +++ b/pcsx2/x86/microVU_Lower.inl @@ -1097,7 +1097,6 @@ void __fastcall mVU_XGKICK_(u32 addr) { u8* data = microVU1.regs->Mem + (addr*16); u32 diff = 0x400 - addr; u32 size; - u8* pDest; if(gifRegs->stat.APATH <= GIF_APATH1 || (gifRegs->stat.APATH == GIF_APATH3 && gifRegs->stat.IP3 == true) && SIGNAL_IMR_Pending == false) { @@ -1106,23 +1105,12 @@ void __fastcall mVU_XGKICK_(u32 addr) { //Flush any pending transfers so things dont go up in the wrong order while(gifRegs->stat.P1Q == true) gsPath1Interrupt(); } - size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff); - pDest = GetMTGS().GetDataPacketPtr(); - - if (size > diff) { - //DevCon.WriteLn("XGkick Wrap!"); - memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff); - size -= diff; - pDest += diff*16; - memcpy_qwc(pDest, microVU1.regs->Mem, size); - } - else { - memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size); - } + GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400); + size = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, diff); GetMTGS().SendDataPacket(); + if(GSTransferStatus.PTH1 == STOPPED_MODE) { - gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; } } @@ -1130,17 +1118,16 @@ void __fastcall mVU_XGKICK_(u32 addr) { { //DevCon.Warning("GIF APATH busy %x Holding for later W %x, R %x", gifRegs->stat.APATH, Path1WritePos, Path1ReadPos); size = GIFPath_ParseTagQuick(GIF_PATH_1, data, diff); - pDest = &Path1Buffer[Path1WritePos*16]; + u8* pDest = &Path1Buffer[Path1WritePos*16]; - pxAssumeMsg((Path1WritePos+size < sizeof(Path1Buffer)), "XGKick Buffer Overflow detected on Path1Buffer!"); + Path1WritePos += size; + + pxAssumeMsg((Path1WritePos < sizeof(Path1Buffer)), "XGKick Buffer Overflow detected on Path1Buffer!"); //DevCon.Warning("Storing size %x PATH 1", size); if (size > diff) { - // fixme: one of these days the following *16's will get cleaned up when we introduce - // a special qwc/simd16 optimized version of memcpy_aligned. 
:) //DevCon.Status("XGkick Wrap!"); memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff); - Path1WritePos += size; size -= diff; pDest += diff*16; memcpy_qwc(pDest, microVU1.regs->Mem, size); diff --git a/pcsx2/x86/sVU_Lower.cpp b/pcsx2/x86/sVU_Lower.cpp index c8d103477b..5191748f78 100644 --- a/pcsx2/x86/sVU_Lower.cpp +++ b/pcsx2/x86/sVU_Lower.cpp @@ -1988,24 +1988,12 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) //Flush any pending transfers so things dont go up in the wrong order while(gifRegs->stat.P1Q == true) gsPath1Interrupt(); } - size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff); - pDest = GetMTGS().GetDataPacketPtr(); - if (size > diff) { - // fixme: one of these days the following *16's will get cleaned up when we introduce - // a special qwc/simd16 optimized version of memcpy_aligned. :) - - memcpy_aligned(pDest, VU1.Mem + addr, diff*16); - size -= diff; - pDest += diff*16; - memcpy_aligned(pDest, VU1.Mem, size*16); - } - else { - memcpy_aligned(pDest, VU1.Mem + addr, size*16); - } + GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400); + size = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, diff); GetMTGS().SendDataPacket(); + if(GSTransferStatus.PTH1 == STOPPED_MODE ) { - gifRegs->stat.OPH = false; gifRegs->stat.APATH = GIF_APATH_IDLE; } } @@ -2015,8 +2003,6 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) size = GIFPath_ParseTagQuick(GIF_PATH_1, data, diff); pDest = &Path1Buffer[Path1WritePos*16]; - - pxAssumeMsg((Path1WritePos+size < sizeof(Path1Buffer)), "XGKick Buffer Overflow detected on Path1Buffer!"); //DevCon.Warning("Storing size %x PATH 1", size); @@ -2024,14 +2010,14 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) // fixme: one of these days the following *16's will get cleaned up when we introduce // a special qwc/simd16 optimized version of memcpy_aligned. 
:) //DevCon.Status("XGkick Wrap!"); - memcpy_aligned(pDest, VU1.Mem + addr, diff*16); + memcpy_aligned(pDest, VU1.Mem + addr, diff); Path1WritePos += size; size -= diff; pDest += diff*16; - memcpy_aligned(pDest, VU1.Mem, size*16); + memcpy_aligned(pDest, VU1.Mem, size); } else { - memcpy_aligned(pDest, VU1.Mem + addr, size*16); + memcpy_aligned(pDest, VU1.Mem + addr, size); Path1WritePos += size; } //if(!gifRegs->stat.P1Q) CPU_INT(28, 128); diff --git a/plugins/spu2-x/src/Linux/SPU2-X.cbp b/plugins/spu2-x/src/Linux/SPU2-X.cbp index c262b9f674..fa27c73506 100644 --- a/plugins/spu2-x/src/Linux/SPU2-X.cbp +++ b/plugins/spu2-x/src/Linux/SPU2-X.cbp @@ -195,8 +195,6 @@ - - diff --git a/plugins/zzogl-pg/opengl/GS.h b/plugins/zzogl-pg/opengl/GS.h index 3ac73bde9b..8ef2d0175b 100644 --- a/plugins/zzogl-pg/opengl/GS.h +++ b/plugins/zzogl-pg/opengl/GS.h @@ -635,7 +635,7 @@ typedef struct int imageTransfer; int imageWnew, imageHnew, imageX, imageY, imageEndX, imageEndY; - pathInfo path[3]; + pathInfo path[4]; GIFRegDIMX dimx; void setRGBA(u32 r, u32 g, u32 b, u32 a) { diff --git a/plugins/zzogl-pg/opengl/GifTransfer.cpp b/plugins/zzogl-pg/opengl/GifTransfer.cpp index d8776eff13..4939f53dd9 100644 --- a/plugins/zzogl-pg/opengl/GifTransfer.cpp +++ b/plugins/zzogl-pg/opengl/GifTransfer.cpp @@ -265,8 +265,17 @@ void CALLBACK GSgifTransfer3(u32 *pMem, u32 size) _GSgifTransfer<2>(pMem, size); } -void InitPath() +void CALLBACK GSgifTransfer(u32 *pMem, u32 size) { - gs.path[0].mode = gs.path[1].mode = gs.path[2].mode = 0; + FUNCLOG + + //ZZLog::GS_Log("GSgifTransfer3 size = %lx (mode %d, gs.path3.tag.nloop = %d).", size, gs.path[2].mode, gs.path[2].tag.nloop); + + _GSgifTransfer<3>(pMem, size); +} + +void InitPath() +{ + gs.path[0].mode = gs.path[1].mode = gs.path[2].mode = gs.path[3].mode = 0; }
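Note that PluginManager above now lists the unified GSgifTransfer as a required method in place of GSgifTransfer1, which is why zzogl grows a fourth path state. Other GS plugins presumably need the same entry point; a minimal sketch mirroring zzogl's approach, assuming a plugin with an equivalent per-path transfer template:

// Route the unified callback through a dedicated path-state slot (zzogl's new
// gs.path[3]) so the legacy GSgifTransfer2/GSgifTransfer3 entry points, which
// keep using slots 1 and 2, are unaffected.
void CALLBACK GSgifTransfer(u32 *pMem, u32 size)
{
	_GSgifTransfer<3>(pMem, size);
}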