mirror of https://github.com/PCSX2/pcsx2.git
ReorderingMTGS: Added a new optimized memcpy_amd_qwc, for use by GIFpath copies. After much study, we determined this is about as efficient as memcpy will ever get for what we're doing with it.

DevNote: Win32-only at the moment -- needs a GAS port (but that shouldn't be hard); I made some notes in the code about it.

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3472 96395faa-99c1-11dd-bbfe-3dabce05a288
commit 934578c8fe (parent a6b3acb5d0)
@@ -28,6 +28,7 @@
 # include "win_memzero.h"
 
 extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
+extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes);
 extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
 extern void memxor_mmx(void* dst, const void* src1, int cmpsize);
 
@@ -37,6 +38,8 @@
 void _memset16_unaligned( void* dest, u16 data, size_t size );
 
 #define memcpy_fast             memcpy_amd_             // Fast memcpy
-#define memcpy_aligned(d,s,c)   memcpy_amd_(d,s,c*16)   // Memcpy with 16-byte Aligned addresses
+#define memcpy_aligned(d,s,c)   memcpy_amd_(d,s,c)      // Memcpy with 16-byte Aligned addresses
 #define memcpy_const            memcpy_amd_             // Memcpy with constant size
 #define memcpy_constA           memcpy_amd_             // Memcpy with constant size and 16-byte aligned
+#define memcpy_qwc(d,s,c)       memcpy_amd_qwc(d,s,c)
+//#define memcpy_qwc(d,s,c)     memcpy_amd_(d,s,c*16)
 
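One subtlety in the hunk above: memcpy_aligned now takes a plain byte count (the old macro multiplied by 16 internally), while the new memcpy_qwc counts 128-bit quadwords. An illustration only, using names that appear later in this diff plus hypothetical dest128/src128 pointers:

    // RingBuffer.Regs and PS2MEM_GS come from the OpenPlugin hunk further down;
    // dest128/src128 are hypothetical u128 pointers.
    memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );   // size in bytes now
    memcpy_qwc( dest128, src128, sizeof(PS2MEM_GS) / 16 );             // size in 16-byte QWCs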
@@ -129,6 +129,10 @@ namespace Threading
 
 // For use in spin/wait loops.
 extern void SpinWait();
 
+// Use prior to committing data to another thread (internal memcpy_qwc does not use fencing,
+// so that many memcpys can be issued in a row more efficiently)
+extern void StoreFence();
+
 // Optional implementation to enable hires thread/process scheduler for the operating system.
 // Needed by Windows, but might not be relevant to other platforms.
 
@@ -36,6 +36,11 @@ __forceinline void Threading::SpinWait()
     __asm pause;
 }
 
+__forceinline void Threading::StoreFence()
+{
+    __asm sfence;
+}
+
 __forceinline void Threading::EnableHiresScheduler()
 {
     // This improves accuracy of Sleep() by some amount, and only adds a negligible amount of
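The point of an unfenced memcpy_qwc plus a separate StoreFence is that a producer can batch several non-temporal copies and pay for a single sfence before publishing the data to the consumer thread. A minimal sketch of that pattern, assuming PCSX2's u128/uint typedefs and the memcpy_qwc macro above; the function and variable names are illustrative, not from the commit:

    void PublishToRing( u128* ringBase, uint writePos, const u128* src, uint qwc )
    {
        memcpy_qwc( &ringBase[writePos], src, qwc );  // movntq stores, intentionally unfenced
        Threading::StoreFence();                      // one sfence covers the whole batch
        // ...only after the fence is the shared write position advanced for the MTGS thread.
    }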
@@ -146,7 +146,7 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy
 
     add esi, 64 ; update source pointer
     add edi, 64 ; update destination pointer
-    dec eax ; count down
+    sub eax, 1
     jnz $memcpy_ic_1 ; last 64-byte block?
 
 $memcpy_ic_2:
@@ -189,64 +189,15 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy
     movq mm1,[esi-8]
     movntq [edi-24], mm2
     movntq [edi-16], mm0
-    dec eax
     movntq [edi-8], mm1
 
+    sub eax, 1
     jnz $memcpy_uc_1 ; last 64-byte block?
 
     jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
 
-// For the largest size blocks, a special technique called Block Prefetch
-// can be used to accelerate the read operations. Block Prefetch reads
-// one address per cache line, for a series of cache lines, in a short loop.
-// This is faster than using software prefetch. The technique is great for
-// getting maximum read bandwidth, especially in DDR memory systems.
-
-// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
-// help keep the code cache footprint of memcpy_fast to a minimum.
-/*
-$memcpy_bp_1: ; large blocks, block prefetch copy
-
-    cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
-    jl $memcpy_64_test ; no, back to regular uncached copy
-
-    mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
-    add esi, CACHEBLOCK * 64 ; move to the top of the block
-    align 16
-$memcpy_bp_2:
-    mov edx, [esi-64] ; grab one address per cache line
-    mov edx, [esi-128] ; grab one address per cache line
-    sub esi, 128 ; go reverse order to suppress HW prefetcher
-    dec eax ; count down the cache lines
-    jnz $memcpy_bp_2 ; keep grabbing more lines into cache
-
-    mov eax, CACHEBLOCK ; now that it's in cache, do the copy
-    align 16
-$memcpy_bp_3:
-    movq mm0, [esi   ] ; read 64 bits
-    movq mm1, [esi+ 8]
-    movq mm2, [esi+16]
-    movq mm3, [esi+24]
-    movq mm4, [esi+32]
-    movq mm5, [esi+40]
-    movq mm6, [esi+48]
-    movq mm7, [esi+56]
-    add esi, 64 ; update source pointer
-    movntq [edi   ], mm0 ; write 64 bits, bypassing cache
-    movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
-    movntq [edi+16], mm2 ; from READING the destination address
-    movntq [edi+24], mm3 ; into the cache, only to be over-written,
-    movntq [edi+32], mm4 ; so that also helps performance
-    movntq [edi+40], mm5
-    movntq [edi+48], mm6
-    movntq [edi+56], mm7
-    add edi, 64 ; update dest pointer
-
-    dec eax ; count down
-
-    jnz $memcpy_bp_3 ; keep copying
-    sub ecx, CACHEBLOCK ; update the 64-byte block count
-    jmp $memcpy_bp_1 ; keep processing chunks
-*/
+// Note: Pcsx2 rarely invokes large copies, so the large copy "block prefetch" mode has been
+// disabled to help keep the code cache footprint of memcpy_fast to a minimum.
 
 // The smallest copy uses the X86 "movsd" instruction, in an optimized
 // form which is an "unrolled loop". Then it handles the last few bytes.
@@ -274,17 +225,99 @@ $memcpy_last_few: ; dword aligned from before movsd's
     rep movsb ; the last 1, 2, or 3 bytes
 
 $memcpy_final:
+    pop esi
+    pop edi
+
     emms ; clean up the MMX state
     sfence ; flush the write buffer
     //mov eax, [dest] ; ret value = destination pointer
 
-    pop esi
-    pop edi
-
     ret 4
 }
 }
 
+// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
+__forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
+{
+    // Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
+    // registers will improve copy performance, because they won't. Use of XMMs is only
+    // warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
+    // and even then the benefits are typically minimal (sometimes slower depending on the
+    // amount of data being copied).
+    //
+    // Thus: MMX are alignment safe, fast, and widely available. Lets just stick with them.
+    //  --air
+
+    // Linux Conversion note:
+    //  This code would benefit nicely from having inline-able GAS syntax, since it should
+    //  allow GCC to optimize the first 3 instructions out of existence in many scenarios.
+    //  And its called enough times to probably merit the extra effort to ensure proper
+    //  optimization. --air
+
+    __asm
+    {
+        mov ecx, [dest]
+        mov edx, [src]
+        mov eax, [qwc]          ; keep a copy of count
+        shr eax, 1
+        jz $memcpy_qwc_1        ; only one 16 byte block to copy?
+
+        cmp eax, IN_CACHE_COPY/32
+        jb $memcpy_qwc_loop1    ; small copies should be cached (definite speedup --air)
+
+    $memcpy_qwc_loop2:          ; 32-byte blocks, uncached copy
+        prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
+
+        movq mm0,[edx+0]        ; read 64 bits
+        movq mm1,[edx+8]
+        movq mm2,[edx+16]
+        movntq [ecx+0], mm0     ; write 64 bits, bypassing the cache
+        movntq [ecx+8], mm1
+        movq mm3,[edx+24]
+        movntq [ecx+16], mm2
+        movntq [ecx+24], mm3
+
+        add edx,32              ; update source pointer
+        add ecx,32              ; update destination pointer
+        sub eax,1
+        jnz $memcpy_qwc_loop2   ; last 64-byte block?
+        sfence                  ; flush the write buffer
+        jmp $memcpy_qwc_1
+
+    ; 32-byte blocks, cached!
+    ; This *is* important.  Removing this and using exclusively non-temporal stores
+    ; results in noticable speed loss!
+
+    $memcpy_qwc_loop1:
+        prefetchnta [edx + 568] ; start reading ahead (tested: it helps! --air)
+
+        movq mm0,[edx+0]        ; read 64 bits
+        movq mm1,[edx+8]
+        movq mm2,[edx+16]
+        movq [ecx+0], mm0       ; write 64 bits, bypassing the cache
+        movq [ecx+8], mm1
+        movq mm3,[edx+24]
+        movq [ecx+16], mm2
+        movq [ecx+24], mm3
+
+        add edx,32              ; update source pointer
+        add ecx,32              ; update destination pointer
+        sub eax,1
+        jnz $memcpy_qwc_loop1   ; last 64-byte block?
+
+    $memcpy_qwc_1:
+        test [qwc],1
+        jz $memcpy_qwc_final
+        movq mm0,[edx]
+        movq mm1,[edx+8]
+        movq [ecx], mm0
+        movq [ecx+8], mm1
+
+    $memcpy_qwc_final:
+        emms                    ; clean up the MMX state
+    }
+}
+
 // mmx mem-compare implementation, size has to be a multiple of 8
 // returns 0 is equal, nonzero value if not equal
 // ~10 times faster than standard memcmp
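The "Linux Conversion note" above asks for an inline-able GAS version so GCC can keep dest/src/qwc in registers instead of reloading them from the stack. Purely as an illustration (not part of this commit, and untested), a GCC extended-asm take on just the cached loop might look roughly like this, assuming PCSX2's cross-platform __forceinline define and qwc > 0:

    // Hypothetical sketch of a GAS/GCC port of the cached path; not from this commit.
    // GCC picks the registers, so the three "mov reg, [param]" loads of the
    // Win32 version disappear, which is what the note above is hoping for.
    static __forceinline void memcpy_amd_qwc_gas(void *dest, const void *src, size_t qwc)
    {
        __asm__ __volatile__
        (
            "1:\n\t"
            "movq    (%[src]), %%mm0\n\t"   // read one 128-bit quadword via two MMX regs
            "movq   8(%[src]), %%mm1\n\t"
            "movq   %%mm0,  (%[dest])\n\t"
            "movq   %%mm1, 8(%[dest])\n\t"
            "add    $16, %[src]\n\t"
            "add    $16, %[dest]\n\t"
            "sub    $1, %[qwc]\n\t"
            "jnz    1b\n\t"
            "emms"                          // leave the MMX/FPU state clean
            : [dest] "+r" (dest), [src] "+r" (src), [qwc] "+r" (qwc)
            :
            : "memory", "mm0", "mm1"
        );
    }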
@@ -395,6 +395,7 @@ struct Pcsx2Config
 // style. Useful for debugging potential bugs in the MTGS pipeline.
 bool SynchronousMTGS;
 bool DisableOutput;
+int  VsyncQueueSize;
 
 bool FrameLimitEnable;
 bool FrameSkipEnable;
@@ -420,6 +421,8 @@ struct Pcsx2Config
 return
     OpEqu( SynchronousMTGS ) &&
     OpEqu( DisableOutput ) &&
+    OpEqu( VsyncQueueSize ) &&
+
     OpEqu( FrameSkipEnable ) &&
     OpEqu( FrameLimitEnable ) &&
     OpEqu( VsyncEnable ) &&
@@ -142,14 +142,11 @@ void SysMtgsThread::PostVsyncEnd()
 
     SendDataPacket();
 
-    // Alter-frame flushing! Restarts the ringbuffer (wraps) on every other frame. This is a
-    // mandatory feature that prevents the MTGS from queuing more than 2 frames at any time.
-    // (queued frames cause input lag and desynced audio -- bad!). Ring restarts work for this
-    // because they act as sync points where the EE must stall to wait for the GS to catch-up,
-    // and they also allow us to reuse the front of the ringbuffer more often, which should improve
-    // L2 cache performance.
+    // If the MTGS is allowed to queue a lot of frames in advance, it creates input lag.
+    // Use the Queued FrameCount to stall the EE if another vsync is already queued in
+    // the ringbuffer.
 
-    if( AtomicIncrement(m_QueuedFrameCount) < 2 ) return;
+    if( AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize ) return;
 
     uint readpos = volatize(m_RingPos);
     uint freeroom;
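The hunk above replaces the hard-coded limit of 2 queued frames with the new VsyncQueueSize setting. The counter logic, roughly (a sketch rather than the literal PCSX2 code; it assumes the GS thread decrements m_QueuedFrameCount when it consumes a queued vsync packet):

    // EE side, at vsync: bail out early while fewer than VsyncQueueSize frames are queued.
    if( AtomicIncrement(m_QueuedFrameCount) < EmuConfig.GS.VsyncQueueSize ) return;
    // ...otherwise fall through to the ringbuffer-restart/stall path below...

    // GS side (assumed), when the queued vsync packet has been processed:
    AtomicDecrement(m_QueuedFrameCount);   // lets the EE queue another frame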
@@ -190,7 +187,7 @@ void SysMtgsThread::OpenPlugin()
 {
     if( m_PluginOpened ) return;
 
-    memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS)/16 );
+    memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
     GSsetBaseMem( RingBuffer.Regs );
     GSirqCallback( dummyIrqCallback );
 
@@ -624,6 +621,7 @@ void SysMtgsThread::SendDataPacket()
     PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos];
     tag.data[0] = actualSize;
 
+    //Threading::StoreFence();
     m_WritePos = m_packet_ringpos;
 
     if( EmuConfig.GS.SynchronousMTGS )
@@ -217,6 +217,7 @@ Pcsx2Config::GSOptions::GSOptions()
 
     SynchronousMTGS   = false;
     DisableOutput     = false;
+    VsyncQueueSize    = 2;
 
     DefaultRegionMode = Region_NTSC;
     FramesToDraw      = 2;
@@ -234,6 +235,7 @@ void Pcsx2Config::GSOptions::LoadSave( IniInterface& ini )
 
     IniEntry( SynchronousMTGS );
     IniEntry( DisableOutput );
+    IniEntry( VsyncQueueSize );
 
     IniEntry( FrameLimitEnable );
     IniEntry( FrameSkipEnable );
@@ -526,36 +526,36 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s
 void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len )
 {
     uint endpos = destStart + len;
-    if( endpos >= destSize )
+    if( endpos < destSize )
     {
-        uint firstcopylen = destSize - destStart;
-        memcpy_aligned(&destBase[destStart], src, firstcopylen );
-
-        destStart = endpos % destSize;
-        memcpy_aligned(destBase, src+firstcopylen, destStart );
+        memcpy_qwc(&destBase[destStart], src, len );
+        destStart += len;
     }
     else
     {
-        memcpy_aligned(&destBase[destStart], src, len );
-        destStart += len;
+        uint firstcopylen = destSize - destStart;
+        memcpy_qwc(&destBase[destStart], src, firstcopylen );
+
+        destStart = endpos % destSize;
+        memcpy_qwc(destBase, src+firstcopylen, destStart );
     }
 }
 
 void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len )
 {
     uint endpos = srcStart + len;
-    if( endpos >= srcSize )
+    if( endpos < srcSize )
     {
-        uint firstcopylen = srcSize - srcStart;
-        memcpy_aligned(dest, &srcBase[srcStart], firstcopylen );
-
-        srcStart = endpos & srcSize;
-        memcpy_aligned(dest+firstcopylen, srcBase, srcStart );
+        memcpy_qwc(dest, &srcBase[srcStart], len );
+        srcStart += len;
     }
     else
    {
-        memcpy_aligned(dest, &srcBase[srcStart], len );
-        srcStart += len;
+        uint firstcopylen = srcSize - srcStart;
+        memcpy_qwc(dest, &srcBase[srcStart], firstcopylen );
+
+        srcStart = endpos & srcSize;
+        memcpy_qwc(dest+firstcopylen, srcBase, srcStart );
     }
 }
 
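To make the wrap path above concrete, here is the else branch of MemCopy_WrappedDest traced with hypothetical numbers (a 16-QWC ring, destStart of 14, len of 5, so endpos is 19 and the copy must wrap):

    // destSize == 16, destStart == 14, len == 5  =>  endpos == 19 >= destSize
    uint firstcopylen = destSize - destStart;                  // 2 QWCs left before the wrap point
    memcpy_qwc( &destBase[destStart], src, firstcopylen );     // fills ring slots 14..15
    destStart = endpos % destSize;                             // 19 % 16 == 3
    memcpy_qwc( destBase, src + firstcopylen, destStart );     // remaining 3 QWCs land in slots 0..2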
@@ -1129,14 +1129,14 @@ void __fastcall mVU_XGKICK_(u32 addr) {
     // fixme: one of these days the following *16's will get cleaned up when we introduce
     // a special qwc/simd16 optimized version of memcpy_aligned. :)
     //DevCon.Status("XGkick Wrap!");
-    memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff);
+    memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff);
     Path1WritePos += size;
     size -= diff;
     pDest += diff*16;
-    memcpy_aligned(pDest, microVU1.regs->Mem, size);
+    memcpy_qwc(pDest, microVU1.regs->Mem, size);
 }
 else {
-    memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size);
+    memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size);
     Path1WritePos += size;
 }
 //if(!gifRegs->stat.P1Q) CPU_INT(28, 128);