mirror of https://github.com/PCSX2/pcsx2.git
* Disable newVifUnpack, which I left enabled in the prev commit (it's not ready yet!)
* Added feature to align call targets for EErec functions and blocks on P4's and AMDs, and pack them on Core2/i7's.
* Fixed some svn:native props.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2347 96395faa-99c1-11dd-bbfe-3dabce05a288
parent b5f643950c
commit b3fead5dc9
@@ -157,9 +157,12 @@ template< typename T > void xWrite( T val );
 class ModSibBase;
 
 extern void xSetPtr( void* ptr );
-extern u8* xGetPtr();
 extern void xAlignPtr( uint bytes );
 extern void xAdvancePtr( uint bytes );
+extern void xAlignCallTarget();
+
+extern u8* xGetPtr();
+extern u8* xGetAlignedCallTarget();
 
 extern JccComparisonType xInvertCond( JccComparisonType src );
@@ -395,6 +395,32 @@ __emitinline void xAlignPtr( uint bytes )
 	x86Ptr = (u8*)( ( (uptr)x86Ptr + bytes - 1) & ~(bytes - 1) );
 }
 
+// Performs best-case alignment for the target CPU, for use prior to starting a new
+// function.  This is not meant to be used prior to jump targets, since it doesn't
+// add padding (additionally, speed benefit from jump alignment is minimal, and often
+// a loss).
+__emitinline void xAlignCallTarget()
+{
+	// Core2/i7 CPUs prefer unaligned addresses.  Checking for SSSE3 is a decent filter.
+	// (also align in debug modes for disasm convenience)
+
+	if( IsDebugBuild || !x86caps.hasSupplementalStreamingSIMD3Extensions )
+	{
+		// - P4's and earlier prefer 16 byte alignment.
+		// - AMD Athlons and Phenoms prefer 8 byte alignment, but I don't have an easy
+		//   heuristic for it yet.
+		// - AMD Phenom IIs are unknown (either prefer 8 byte, or unaligned).
+
+		xAlignPtr( 16 );
+	}
+}
+
+__emitinline u8* xGetAlignedCallTarget()
+{
+	xAlignCallTarget();
+	return x86Ptr;
+}
+
 __emitinline void xAdvancePtr( uint bytes )
 {
 	if( IsDevBuild )
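The xAlignPtr expression above is the classic power-of-two round-up idiom. As a minimal standalone sketch of the same math (names here are illustrative, not part of the emitter API):

	#include <cstdint>
	#include <cassert>

	// Round p up to the next 'bytes' boundary ('bytes' must be a power of two).
	// Adding (bytes - 1) and then clearing the low bits rounds up, and leaves an
	// already-aligned pointer unchanged.
	static uint8_t* alignUp(uint8_t* p, uintptr_t bytes)
	{
		assert((bytes & (bytes - 1)) == 0);		// power-of-two check
		return (uint8_t*)(((uintptr_t)p + bytes - 1) & ~(bytes - 1));
	}

For example, a pointer at 0x1009 aligned to 16 becomes 0x1010, while one already at 0x1010 stays put.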
@@ -58,6 +58,11 @@ __forceinline void vif1FLUSH()
 
 void vif1Init()
 {
+#ifdef newVif1
+	extern void initNewVif(int idx);
+	initNewVif(1);
+#endif
+
 	SetNewMask(g_vif1Masks, g_vif1HasMask3, 0, 0xffffffff);
 }
@@ -313,19 +318,13 @@ static int __fastcall Vif1TransDirectHL(u32 *data)
 
 	return ret;
 }
-#ifdef newVif1
-extern void initNewVif(int idx);
-extern int nVifUnpack(int idx, u32 *data);
-static int testVif = 0;
-#endif
 static int __fastcall Vif1TransUnpack(u32 *data)
 {
 #ifdef newVif1
-	if (!testVif) { initNewVif(1); testVif = 1; }
-	//int temp = nVifUnpack(1, data);
-	//if (temp >= 0) return temp;
+	extern int nVifUnpack(int idx, u32 *data);
+	return nVifUnpack(1, data);
 #endif
 
 	XMMRegisters::Freeze();
 
 	if (vif1.vifpacketsize < vif1.tag.size)
@@ -60,7 +60,7 @@ static __forceinline u32 vif_size(u8 num)
 	return (num == 0) ? 0x1000 : 0x4000;
 }
 
-#define newVif		// Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
-#define newVif1		// Use New Code for Vif1 Unpacks (needs newVif defined)
+//#define newVif	// Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
+//#define newVif1	// Use New Code for Vif1 Unpacks (needs newVif defined)
 //#define newVif0	// Use New Code for Vif0 Unpacks (not implemented)
 #endif
@@ -371,7 +371,7 @@ static DynGenFunc* _DynGen_JITCompile()
 {
 	pxAssertMsg( DispatcherReg != NULL, "Please compile the DispatcherReg subroutine *before* JITCompile.  Thanks." );
 
-	u8* retval = xGetPtr();
+	u8* retval = xGetAlignedCallTarget();
 	_DynGen_StackFrameCheck();
 
 	xMOV( ecx, &cpuRegs.pc );
@@ -388,7 +388,7 @@ static DynGenFunc* _DynGen_JITCompile()
 
 static DynGenFunc* _DynGen_JITCompileInBlock()
 {
-	u8* retval = xGetPtr();
+	u8* retval = xGetAlignedCallTarget();
 	xJMP( JITCompile );
 	return (DynGenFunc*)retval;
 }
@@ -396,7 +396,7 @@ static DynGenFunc* _DynGen_JITCompileInBlock()
 // called when jumping to variable pc address
 static DynGenFunc* _DynGen_DispatcherReg()
 {
-	u8* retval = xGetPtr();
+	u8* retval = xGetPtr();		// fallthrough target, can't align it!
 	_DynGen_StackFrameCheck();
 
 	xMOV( eax, &cpuRegs.pc );
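The "can't align it!" comment marks the key constraint: xAlignCallTarget only advances the write pointer and emits nothing, so the skipped bytes hold garbage. That is safe before a fresh function (the gap is never executed) but fatal for DispatcherReg, which is reached by falling through from the code emitted just before it. A padded variant would have to fill the gap with executable bytes, roughly like this hypothetical sketch (not emitter API):

	// Pad with single-byte x86 NOPs so execution can safely flow through the gap.
	static uint8_t* alignFallthrough(uint8_t* p, uintptr_t bytes)
	{
		while (((uintptr_t)p & (bytes - 1)) != 0)
			*p++ = 0x90;	// x86 NOP
		return p;
	}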
@@ -410,7 +410,7 @@ static DynGenFunc* _DynGen_DispatcherReg()
 
 static DynGenFunc* _DynGen_EnterRecompiledCode()
 {
-	u8* retval = xGetPtr();
+	u8* retval = xGetAlignedCallTarget();
 
 	// "standard" frame pointer setup for aligned stack: Record the original
 	// esp into ebp, and then align esp.  ebp references the original esp base
@@ -446,6 +446,8 @@ static DynGenFunc* _DynGen_EnterRecompiledCode()
 	xMOV( &s_store_ebp, ebp );
 
 	xJMP( ptr32[&DispatcherReg] );
+
+	xAlignCallTarget();
 	imm = (uptr)xGetPtr();
 	ExitRecompiledCode = (DynGenFunc*)xGetPtr();
@@ -1254,7 +1256,7 @@ void recompileNextInstruction(int delayslot)
 	//	_flushCachedRegs();
 	//	g_cpuHasConstReg = 1;
 
-	if (!delayslot && x86Ptr - recPtr > 0x1000)
+	if (!delayslot && (xGetPtr() - recPtr > 0x1000) )
 		s_nEndBlock = pc;
 }
@@ -1335,9 +1337,8 @@ static void __fastcall recRecompile( const u32 startpc )
 		recResetEE();
 	}
 
-	x86SetPtr( recPtr );
-	x86Align(16);
-	recPtr = x86Ptr;
+	xSetPtr( recPtr );
+	recPtr = xGetAlignedCallTarget();
 
 	s_nBlockFF = false;
 	if (HWADDR(startpc) == 0x81fc0)
@@ -1718,14 +1719,14 @@ StartRecomp:
 		}
 	}
 
-	pxAssert( x86Ptr < recMem+REC_CACHEMEM );
+	pxAssert( xGetPtr() < recMem+REC_CACHEMEM );
 	pxAssert( recConstBufPtr < recConstBuf + RECCONSTBUF_SIZE );
 	pxAssert( x86FpuState == 0 );
 
-	pxAssert(x86Ptr - recPtr < 0x10000);
-	s_pCurBlockEx->x86size = x86Ptr - recPtr;
+	pxAssert(xGetPtr() - recPtr < 0x10000);
+	s_pCurBlockEx->x86size = xGetPtr() - recPtr;
 
-	recPtr = x86Ptr;
+	recPtr = xGetPtr();
 
 	pxAssert( (g_cpuHasConstReg&g_cpuFlushedConstReg) == g_cpuHasConstReg );
@@ -26,7 +26,8 @@ struct nVifStruct {
 	u32 vuMemLimit;			// Use for fast AND
 	BlockBuffer* vifBlock;	// Block Buffer
 };
-nVifStruct nVif[2];
+
+static __aligned16 nVifStruct nVif[2];
 
 void initNewVif(int idx) {
 	nVif[idx].idx = idx;
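Making nVif[] 16-byte aligned (and static) matters because the struct is touched from SSE code, where aligned loads and stores require 16-byte boundaries. The __aligned16 macro appears to be PCSX2's portability wrapper over the compiler-specific attribute; it expands to roughly the following, per toolchain (illustrative sketch, check the actual headers):

	#if defined(_MSC_VER)
	#	define MY_ALIGNED16 __declspec(align(16))
	#else
	#	define MY_ALIGNED16 __attribute__((aligned(16)))
	#endif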
@@ -112,6 +113,7 @@ static void setMasks(const VIFregisters& v) {
 // has a lot of setup code to establish which unpack function to call.  The best way to
 // optimize this is to cache the unpack function's base (see fnbase below) and update it
 // when the variables it depends on are modified: writes to vif->tag.cmd and vif->usn.
+// Problem: vif->tag.cmd is modified a lot.  Like, constantly.  So won't work.
 //
 // A secondary optimization would be adding special handlers for packets where vifRegs->num==1.
 // (which would remove the loop, simplify the incVUptr code, etc).  But checking for it has
@@ -119,11 +121,13 @@ static void setMasks(const VIFregisters& v) {
 // -- air
 
 
-template< int idx, bool doMode, bool isFill >
-__releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
+//template< int idx, bool doMode, bool isFill >
+//__releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
+__releaseinline void __fastcall _nVifUnpackLoop( int idx, u8 *data, u32 size )
 {
-	// Eh... template attempt, tho not sure it helped much.  There's too much setup code (see
-	// optimization note above) -- air
+	// comment out the following 2 lines to test templated version...
+	const bool doMode = !!vifRegs->mode;
+	const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
 
 	const int usn    = !!(vif->usn);
 	const int doMask = !!(vif->tag.cmd & 0x10);
@@ -131,12 +135,13 @@ __releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
 	const u32& vift = nVifT[upkNum];
 
 	u8* dest = setVUptr(idx, vif->tag.addr);
-	const VIFUnpackFuncTable& ft = VIFfuncTable[vif->tag.cmd & 0xf];
-	UNPACKFUNCTYPE func = vif->usn ? ft.funcU : ft.funcS;
+	const VIFUnpackFuncTable& ft = VIFfuncTable[upkNum];
+	UNPACKFUNCTYPE func = usn ? ft.funcU : ft.funcS;
 
-	const nVifCall* fnbase = &nVifUpk[
-		((usn*2*16) + (doMask*16) + (upkNum)) * (4*4)
-	];
+	// Did a bunch of work to make it so I could optimize this index lookup to outside
+	// the main loop but it was for naught -- too often the loop is only 1-2 iterations,
+	// so this setup code ends up being slower (1 iter) or same speed (2 iters).
+	const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + (doMask*16) + (upkNum)) * (4*4) ];
 
 	const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
 	const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
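The fnbase expression flattens a multi-dimensional table lookup: [usn][doMask][upkNum] selects a group, and each group holds 4*4 call variants. A hypothetical helper showing how the flat offset decomposes (the strides are read off the expression itself):

	// usn: 2 values, doMask: 2 values, upkNum: 16 values, 16 variants per group.
	inline int unpackGroupOffset(int usn, int doMask, int upkNum)
	{
		const int group = (usn * 2 * 16) + (doMask * 16) + upkNum;
		return group * (4 * 4);
	}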
@@ -145,6 +150,11 @@ __releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
 		setMasks(*vifRegs);
 
 	if (vif->cl >= blockSize) {
+
+		// This condition doesn't appear to ever occur, and really it never should.
+		// Normally it wouldn't matter, but even simple setup code matters here (see
+		// optimization notes above) >_<
+
 		vif->cl = 0;
 	}
@@ -167,7 +177,6 @@ __releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
 			vifRegs->num--;
 		}
 		else {
-			//DevCon.WriteLn("SSE Unpack!");
 			int c = aMin((cycleSize - vif->cl), 3);
 			size -= vift * c;
@@ -185,10 +194,10 @@ __releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
 		}
 		incVUptr(idx, dest, 16);
 
-		// Removing this modulo was a huge speedup for God of War. (62->73 fps)
-		// (GoW uses a lot of blockSize==1 packets, resulting in tons of loops -- so the biggest
-		// factor in performance ends up being the top-level conditionals of the loop, and
-		// also the loop prep code.) --air
+		// Removing this modulo was a huge speedup for God of War start menu. (62->73 fps)
+		// (GoW and tri-ace games both use a lot of blockSize==1 packets, resulting in tons
+		// of loops -- so the biggest factor in performance ends up being the top-level
+		// conditionals of the loop, and also the loop prep code.) --air
 
 		//vif->cl = (vif->cl+1) % blockSize;
 		if( ++vif->cl == blockSize ) vif->cl = 0;
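The compare-and-reset form is equivalent to the modulo it replaces whenever vif->cl starts in [0, blockSize), but it avoids an integer division on every loop iteration. A minimal equivalence sketch:

	// Both wrap a counter within [0, blockSize); the second needs no division.
	int wrapMod(int cl, int blockSize) { return (cl + 1) % blockSize; }
	int wrapCmp(int cl, int blockSize) { ++cl; return (cl == blockSize) ? 0 : cl; }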
@@ -202,9 +211,18 @@ void _nVifUnpack(int idx, u8 *data, u32 size) {
 		return;
 	}
 	else*/ {	// filling write
 
 		vif		= nVif[idx].vif;
 		vifRegs	= nVif[idx].vifRegs;
 
+#if 1
+		_nVifUnpackLoop( idx, data, size );
+#else
+		// Eh... template attempt, tho it didn't help much.  There's too much setup code,
+		// and the template only optimizes code inside the loop, which often times seems to
+		// only be run once or twice anyway.  Better to use recompilation than templating
+		// anyway, but I'll leave it in for now for reference. -- air
+
 		const bool doMode = !!vifRegs->mode;
 		const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
@@ -231,7 +249,7 @@ void _nVifUnpack(int idx, u8 *data, u32 size) {
 	{
 		pxFailDev( "No VIF0 support yet, sorry!" );
 	}
 
 #endif
 	//if (isFill)
-	//DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok"));
+	//DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num);
@@ -85,8 +85,7 @@ struct VifUnpackIndexer
 
 	void xSetCall( int packType ) const
 	{
-		xAlignPtr(16);
-		GetCall( packType ) = (nVifCall)xGetPtr();
+		GetCall( packType ) = (nVifCall)xGetAlignedCallTarget();
 	}
 
 	void xSetNullCall( int packType ) const