ReorderingMTGS: Templated the GIFPath parsers, to allow for SSE optimizations.

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3474 96395faa-99c1-11dd-bbfe-3dabce05a288
Jake.Stine 2010-07-13 05:20:42 +00:00
parent 934578c8fe
commit 43cd559801
3 changed files with 62 additions and 23 deletions
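The change follows a common pattern: compile a separate copy of the hot parser for each CPU capability level and each GIF path, then select the best instantiation once at startup through a function-pointer table. Below is a minimal, self-contained sketch of that pattern with simplified, hypothetical names; the real code is in the GIFpath hunks that follow.

#include <cstdio>

enum CpuExtType { CpuExt_Base, CpuExt_SSE };

// One instantiation per (extension, path) pair. CpuExt is a
// compile-time constant, so each instantiation keeps only one
// side of the branch below.
template< CpuExtType CpuExt, int pathidx >
static int CopyTag_tmpl(const void* mem, unsigned size)
{
	if( CpuExt >= CpuExt_SSE )
		std::printf("path %d: SSE copy, %u qwc\n", pathidx, size);
	else
		std::printf("path %d: scalar copy, %u qwc\n", pathidx, size);
	return 0;
}

typedef int FnType_CopyTag(const void* mem, unsigned size);
static FnType_CopyTag* tbl_CopyTag[3];

// Run once at startup (the diff does this from gsInit() via
// GIFPath_Initialize) so per-transfer dispatch is a single
// indirect call with no capability checks.
static void InitCopyTagTable(bool hasSSE)
{
	tbl_CopyTag[0] = hasSSE ? CopyTag_tmpl<CpuExt_SSE,0> : CopyTag_tmpl<CpuExt_Base,0>;
	tbl_CopyTag[1] = hasSSE ? CopyTag_tmpl<CpuExt_SSE,1> : CopyTag_tmpl<CpuExt_Base,1>;
	tbl_CopyTag[2] = hasSSE ? CopyTag_tmpl<CpuExt_SSE,2> : CopyTag_tmpl<CpuExt_Base,2>;
}

int main()
{
	InitCopyTagTable(true);
	return tbl_CopyTag[2](0, 8);	// dispatches to the SSE PATH3 instantiation
}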

View File

@@ -49,6 +49,7 @@ void gsSetRegionMode( GS_RegionMode region )
 void gsInit()
 {
 	memzero(g_RealGSMem);
+	GIFPath_Initialize();
 }

 extern bool SIGNAL_IMR_Pending;

View File

@@ -18,6 +18,15 @@
 #include "Common.h"
 #include "System/SysThreads.h"

+enum CpuExtType
+{
+	CpuExt_Base,
+	CpuExt_MMX,
+	CpuExt_SSE,
+	CpuExt_SSE2,
+	CpuExt_SSE41,
+};
+
 extern __aligned16 u8 g_RealGSMem[Ps2MemSize::GSregs];

 enum CSR_FifoState
@@ -229,6 +238,7 @@ enum GIF_PATH
 	GIF_PATH_3,
 };

+extern void GIFPath_Initialize();
 extern int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size);
 extern int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size);
 extern void GIFPath_Reset();
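A note on the CpuExtType enum added above: the tiers are ordered weakest to strongest, so templated code can gate vectorized paths with a simple `if( CpuExt >= CpuExt_SSE )`. Because CpuExt is a non-type template parameter, the test is a compile-time constant and the optimizer drops the untaken branch entirely. A small standalone illustration of the idea (not PCSX2 code):

#include <xmmintrin.h>	// SSE intrinsics
#include <cstring>

enum CpuExtType { CpuExt_Base, CpuExt_MMX, CpuExt_SSE, CpuExt_SSE2, CpuExt_SSE41 };

// Copy 16 bytes into a 16-byte-aligned destination. Each instantiation
// keeps exactly one branch: the comparison against the template
// parameter folds to a constant, so the other branch is dead code
// that the compiler removes.
template< CpuExtType CpuExt >
static void Copy16(void* dst, const void* src)
{
	if( CpuExt >= CpuExt_SSE )
		_mm_store_ps( (float*)dst, _mm_loadu_ps((const float*)src) );
	else
		std::memcpy(dst, src, 16);
}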

View File

@@ -93,12 +93,16 @@ struct GIFPath
 	void Reset();
 	void PrepPackedRegs();
-	void SetTag(const void* mem);
 	bool StepReg();
 	u8 GetReg();
 	bool IsActive() const;
-	int CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size);
+	template< CpuExtType CpuExt, bool Aligned >
+	void SetTag(const void* mem);
+
+	template< CpuExtType CpuExt, int pathidx >
+	int CopyTag(const u128* pMem, u32 size);
+
 	int ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size);
 };
@@ -286,10 +290,14 @@ __forceinline void GIFPath::PrepPackedRegs()
 	}
 }

+template< CpuExtType CpuExt, bool Aligned >
 __forceinline void GIFPath::SetTag(const void* mem)
 {
-	_mm_store_ps( (float*)&tag, _mm_loadu_ps((float*)mem) );
-	//const_cast<GIFTAG&>(tag) = *((GIFTAG*)mem);
+	if( CpuExt >= CpuExt_SSE )
+		_mm_store_ps( (float*)&tag, Aligned ? _mm_load_ps((const float*)mem) : _mm_loadu_ps((const float*)mem) );
+	else
+		const_cast<GIFTAG&>(tag) = *((GIFTAG*)mem);

 	nloop  = tag.NLOOP;
 	curreg = 0;
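The Aligned parameter exists because `_mm_load_ps` (movaps) faults if its source address is not 16-byte aligned, while `_mm_loadu_ps` (movups) accepts any address at some cost on CPUs of that era. Templating the choice lets callers with guaranteed alignment get the cheap load with no runtime test. A tiny illustrative helper (an assumption of mine, not part of the diff):

#include <xmmintrin.h>

// Aligned is a compile-time constant, so each instantiation contains
// exactly one load instruction: movaps when alignment is guaranteed,
// movups otherwise. (Hypothetical helper for illustration only.)
template< bool Aligned >
static inline __m128 Load128(const void* src)
{
	return Aligned ? _mm_load_ps((const float*)src)
	               : _mm_loadu_ps((const float*)src);
}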
@@ -373,10 +381,9 @@ static __forceinline void gsHandler(const u8* pMem)
 #define aMin(x, y) std::min(x, y)

 // Parameters:
-//   size (path1)   - difference between the end of VU memory and pMem.
-//   size (path2/3) - max size of incoming data stream, in qwc (simd128)
+//   size - max size of incoming data stream, in qwc (simd128).  If the path is PATH1, and the
+//     path does not terminate (EOP) within the specified size, it is assumed that the path must
+//     loop around to the start of VU memory and continue processing.
 __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size)
 {
 	u32 startSize = size;	// Start Size
@@ -384,7 +391,7 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s
 	while (size > 0) {
 		if (!nloop) {

-			SetTag(pMem);
+			SetTag<CpuExt_Base,false>(pMem);
 			incTag(1);
 		}
 		else
@@ -523,7 +530,7 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s
 	return size;
 }

-void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len )
+__forceinline void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len )
 {
 	uint endpos = destStart + len;
 	if( endpos < destSize )
@@ -541,7 +548,7 @@ void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint
 	}
 }

-void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len )
+__forceinline void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len )
 {
 	uint endpos = srcStart + len;
 	if( endpos < srcSize )
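Both MemCopy_Wrapped* helpers share one idea: a copy touching a circular buffer either fits in one linear run or must be split in two at the wrap point. A simplified byte-based sketch of the source-wrapping case (hypothetical helper; assumes len <= srcSize):

#include <cstring>

static void WrappedSrcCopy(const unsigned char* srcBase, unsigned& srcStart,
                           unsigned srcSize, unsigned char* dest, unsigned len)
{
	unsigned endpos = srcStart + len;
	if( endpos < srcSize ) {
		// No wrap: one linear copy.
		std::memcpy(dest, srcBase + srcStart, len);
		srcStart = endpos;
	}
	else {
		// Wraps: copy the tail of the buffer, then resume at its start.
		// (A zero-length second memcpy is harmless when endpos == srcSize.)
		unsigned firstLen = srcSize - srcStart;
		std::memcpy(dest, srcBase + srcStart, firstLen);
		std::memcpy(dest + firstLen, srcBase, len - firstLen);
		srcStart = endpos - srcSize;
	}
}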
@@ -559,16 +566,21 @@ void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128
 	}
 }

-// [TODO] optimization: If later templated, we can have Paths 1 and 3 use aligned SSE movs,
-// since only PATH2 can feed us unaligned source data.
-
 #define copyTag() do {	\
-	/*RingBuffer.m_Ring[ringpos] = *pMem128;*/	\
-	_mm_store_ps( (float*)&RingBuffer.m_Ring[ringpos], _mm_loadu_ps((float*)pMem128)); \
+	if( CpuExt >= CpuExt_SSE )	\
+		_mm_store_ps( (float*)&RingBuffer.m_Ring[ringpos], (pathidx!=GIF_PATH_2) ? _mm_load_ps((float*)pMem128) : _mm_loadu_ps((float*)pMem128)); \
+	else	\
+		RingBuffer.m_Ring[ringpos] = *pMem128;	\
 	++pMem128; --size;	\
 	ringpos = (ringpos+1)&RingBufferMask;	\
 } while(false)
-__forceinline int GIFPath::CopyTag(GIF_PATH pathidx, const u128* pMem128, u32 size)
+// Parameters:
+//   size - max size of incoming data stream, in qwc (simd128).  If the path is PATH1, and the
+//     path does not terminate (EOP) within the specified size, it is assumed that the path must
+//     loop around to the start of VU memory and continue processing.
+template< CpuExtType CpuExt, int pathidx >
+__forceinline int GIFPath::CopyTag(const u128* pMem128, u32 size)
 {
 	uint& ringpos = GetMTGS().m_packet_ringpos;
 	const uint original_ringpos = ringpos;
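Two details of copyTag() above are worth noting. First, only PATH2 can present unaligned source data (the removed TODO said exactly this), so the PATH1/PATH3 instantiations get the aligned load. Second, the ring index wraps by masking, which requires the MTGS ring size to be a power of two. A minimal illustration of the masking, with a hypothetical size rather than PCSX2's actual one:

static const unsigned RingSize = 1u << 16;	// must be a power of two (hypothetical value)
static const unsigned RingMask = RingSize - 1;

static inline unsigned NextRingPos(unsigned pos)
{
	// Equivalent to (pos + 1) % RingSize, but a single AND:
	// ..., RingSize-2, RingSize-1, 0, 1, ...
	return (pos + 1) & RingMask;
}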
@@ -578,12 +590,7 @@ __forceinline int GIFPath::CopyTag(GIF_PATH pathidx, const u128* pMem128, u32 si
 	while (size > 0) {
 		if (!nloop) {

-			// [TODO] Optimization: Use MMX intrinsics for SetTag and CopyTag, which both currently
-			// produce a series of mov eax,[src]; mov [dest],eax instructions to copy these
-			// individual qwcs.  Warning: Path2 transfers are not always QWC-aligned, but they are
-			// always aligned on an 8 byte boundary; so its probably best to use MMX here.
-
-			SetTag((u8*)pMem128);
+			SetTag<CpuExt, (pathidx!=GIF_PATH_2)>((u8*)pMem128);
 			copyTag();

 			if(nloop > 0)
@@ -863,9 +870,30 @@ __forceinline int GIFPath::CopyTag(GIF_PATH pathidx, const u128* pMem128, u32 si
 	return size;
 }

+typedef int __fastcall FnType_CopyTag(const u128* pMem, u32 size);
+
+static __aligned16 FnType_CopyTag* tbl_CopyTag[3];
+
+// Parameters:
+//   size - max size of incoming data stream, in qwc (simd128).  If the path is PATH1, and the
+//     path does not terminate (EOP) within the specified size, it is assumed that the path must
+//     loop around to the start of VU memory and continue processing.
+template< CpuExtType CpuExt, int pathidx >
+static int __fastcall _CopyTag_tmpl(const u128* pMem, u32 size)
+{
+	return s_gifPath[pathidx].CopyTag<CpuExt,pathidx>(pMem, size);
+}
+
+void GIFPath_Initialize()
+{
+	tbl_CopyTag[0] = x86caps.hasStreamingSIMDExtensions ? _CopyTag_tmpl<CpuExt_SSE, 0> : _CopyTag_tmpl<CpuExt_Base, 0>;
+	tbl_CopyTag[1] = x86caps.hasStreamingSIMDExtensions ? _CopyTag_tmpl<CpuExt_SSE, 1> : _CopyTag_tmpl<CpuExt_Base, 1>;
+	tbl_CopyTag[2] = x86caps.hasStreamingSIMDExtensions ? _CopyTag_tmpl<CpuExt_SSE, 2> : _CopyTag_tmpl<CpuExt_Base, 2>;
+}
+
 __forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size)
 {
-	return s_gifPath[pathidx].CopyTag(pathidx, pMem, size);
+	return tbl_CopyTag[pathidx](pMem, size);
 }
 // Quick version for queueing PATH1 data.