* Disable newVifUnpack, which I left enabled in the previous commit (it's not ready yet!)

* Added feature to align call targets for EErec functions and blocks on P4s and AMDs, and pack them on Core2/i7s.
 * Fixed some svn:native props.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2347 96395faa-99c1-11dd-bbfe-3dabce05a288
Jake.Stine 2009-12-15 20:46:30 +00:00
parent b5f643950c
commit b3fead5dc9
17 changed files with 6908 additions and 6862 deletions

View File

@@ -157,9 +157,12 @@ template< typename T > void xWrite( T val );
class ModSibBase;
extern void xSetPtr( void* ptr );
extern u8* xGetPtr();
extern void xAlignPtr( uint bytes );
extern void xAdvancePtr( uint bytes );
extern void xAlignCallTarget();
extern u8* xGetPtr();
extern u8* xGetAlignedCallTarget();
extern JccComparisonType xInvertCond( JccComparisonType src );

View File

@@ -395,6 +395,32 @@ __emitinline void xAlignPtr( uint bytes )
x86Ptr = (u8*)( ( (uptr)x86Ptr + bytes - 1) & ~(bytes - 1) );
}
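For reference, the expression above is the standard round-up-to-power-of-two trick; a minimal standalone sketch (names are illustrative, not part of the emitter):

#include <cstdint>
#include <cassert>

// Round ptr up to the next align-byte boundary. align must be a power of
// two, which is what makes the ~(align - 1) mask valid.
static uint8_t* alignUp(uint8_t* ptr, uintptr_t align)
{
    assert((align & (align - 1)) == 0);  // power-of-two check
    return (uint8_t*)(((uintptr_t)ptr + align - 1) & ~(align - 1));
}

// alignUp((uint8_t*)0x1003, 16) -> 0x1010
// alignUp((uint8_t*)0x1010, 16) -> 0x1010 (already aligned; unchanged)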
// Performs best-case alignment for the target CPU, for use prior to starting a new
// function. This is not meant to be used prior to jump targets, since it doesn't
// add padding (additionally, the speed benefit from jump alignment is minimal, and
// often a loss).
__emitinline void xAlignCallTarget()
{
// Core2/i7 CPUs prefer unaligned addresses. Checking for SSSE3 is a decent filter.
// (also align in debug modes for disasm convenience)
if( IsDebugBuild || !x86caps.hasSupplementalStreamingSIMD3Extensions )
{
// - P4's and earlier prefer 16 byte alignment.
// - AMD Athlons and Phenoms prefer 8 byte alignment, but I don't have an easy
// heuristic for it yet.
// - AMD Phenom IIs are unknown (either prefer 8 byte, or unaligned).
xAlignPtr( 16 );
}
}
__emitinline u8* xGetAlignedCallTarget()
{
xAlignCallTarget();
return x86Ptr;
}
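A usage sketch: the aligned getter just folds the common "align, then record the entry point" pair into a single call, so the two forms below are equivalent.

u8* entry;

// long form:
xAlignCallTarget();              // pads to 16 bytes on pre-SSSE3 CPUs (and in debug builds)
entry = xGetPtr();

// short form, as used by the dispatcher generators below:
entry = xGetAlignedCallTarget();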
__emitinline void xAdvancePtr( uint bytes )
{
if( IsDevBuild )

View File

@@ -58,6 +58,11 @@ __forceinline void vif1FLUSH()
void vif1Init()
{
#ifdef newVif1
extern void initNewVif(int idx);
initNewVif(1);
#endif
SetNewMask(g_vif1Masks, g_vif1HasMask3, 0, 0xffffffff);
}
@@ -313,19 +318,13 @@ static int __fastcall Vif1TransDirectHL(u32 *data)
return ret;
}
#ifdef newVif1
extern void initNewVif(int idx);
extern int nVifUnpack(int idx, u32 *data);
static int testVif = 0;
#endif
static int __fastcall Vif1TransUnpack(u32 *data)
{
#ifdef newVif1
if (!testVif) { initNewVif(1); testVif = 1; }
//int temp = nVifUnpack(1, data);
//if (temp >= 0) return temp;
extern int nVifUnpack(int idx, u32 *data);
return nVifUnpack(1, data);
#endif
XMMRegisters::Freeze();
if (vif1.vifpacketsize < vif1.tag.size)

View File

@@ -60,7 +60,7 @@ static __forceinline u32 vif_size(u8 num)
return (num == 0) ? 0x1000 : 0x4000;
}
#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined)
//#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
//#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined)
//#define newVif0 // Use New Code for Vif0 Unpacks (not implemented)
#endif

View File

@@ -371,7 +371,7 @@ static DynGenFunc* _DynGen_JITCompile()
{
pxAssertMsg( DispatcherReg != NULL, "Please compile the DispatcherReg subroutine *before* JITCompile. Thanks." );
u8* retval = xGetPtr();
u8* retval = xGetAlignedCallTarget();
_DynGen_StackFrameCheck();
xMOV( ecx, &cpuRegs.pc );
@@ -388,7 +388,7 @@ static DynGenFunc* _DynGen_JITCompile()
static DynGenFunc* _DynGen_JITCompileInBlock()
{
u8* retval = xGetPtr();
u8* retval = xGetAlignedCallTarget();
xJMP( JITCompile );
return (DynGenFunc*)retval;
}
@@ -396,7 +396,7 @@ static DynGenFunc* _DynGen_JITCompileInBlock()
// called when jumping to variable pc address
static DynGenFunc* _DynGen_DispatcherReg()
{
u8* retval = xGetPtr();
u8* retval = xGetPtr(); // fallthrough target, can't align it!
_DynGen_StackFrameCheck();
xMOV( eax, &cpuRegs.pc );
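The "can't align it" note is load-bearing: xAlignCallTarget() only advances the write pointer (see the xAlignPtr notes above), it does not emit NOPs, so code that falls through into an aligned label would execute whatever stale bytes sit in the skipped gap. A hypothetical illustration:

// Hypothetical layout if DispatcherReg's entry were aligned:
//
//   ...end of the preceding stub, no terminating jump...
//   ?? ?? ??              <- skipped-over bytes, never written; executed as garbage
// DispatcherReg:
//   mov eax, [cpuRegs.pc]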
@@ -410,7 +410,7 @@ static DynGenFunc* _DynGen_DispatcherReg()
static DynGenFunc* _DynGen_EnterRecompiledCode()
{
u8* retval = xGetPtr();
u8* retval = xGetAlignedCallTarget();
// "standard" frame pointer setup for aligned stack: Record the original
// esp into ebp, and then align esp. ebp references the original esp base
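The setup the comment describes is roughly this pattern, sketched in the same emitter style (the actual body is truncated out of this hunk):

// sketch: record the caller's esp, then align the working stack
xMOV( ebp, esp );        // ebp keeps the original esp base
xAND( esp, -16 );        // align esp to 16 bytes for SSE-safe spills/calls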
@@ -446,6 +446,8 @@ static DynGenFunc* _DynGen_EnterRecompiledCode()
xMOV( &s_store_ebp, ebp );
xJMP( ptr32[&DispatcherReg] );
xAlignCallTarget();
imm = (uptr)xGetPtr();
ExitRecompiledCode = (DynGenFunc*)xGetPtr();
@@ -1254,7 +1256,7 @@ void recompileNextInstruction(int delayslot)
// _flushCachedRegs();
// g_cpuHasConstReg = 1;
if (!delayslot && x86Ptr - recPtr > 0x1000)
if (!delayslot && (xGetPtr() - recPtr > 0x1000) )
s_nEndBlock = pc;
}
@@ -1335,9 +1337,8 @@ static void __fastcall recRecompile( const u32 startpc )
recResetEE();
}
x86SetPtr( recPtr );
x86Align(16);
recPtr = x86Ptr;
xSetPtr( recPtr );
recPtr = xGetAlignedCallTarget();
s_nBlockFF = false;
if (HWADDR(startpc) == 0x81fc0)
@@ -1718,14 +1719,14 @@ StartRecomp:
}
}
pxAssert( x86Ptr < recMem+REC_CACHEMEM );
pxAssert( xGetPtr() < recMem+REC_CACHEMEM );
pxAssert( recConstBufPtr < recConstBuf + RECCONSTBUF_SIZE );
pxAssert( x86FpuState == 0 );
pxAssert(x86Ptr - recPtr < 0x10000);
s_pCurBlockEx->x86size = x86Ptr - recPtr;
pxAssert(xGetPtr() - recPtr < 0x10000);
s_pCurBlockEx->x86size = xGetPtr() - recPtr;
recPtr = x86Ptr;
recPtr = xGetPtr();
pxAssert( (g_cpuHasConstReg&g_cpuFlushedConstReg) == g_cpuHasConstReg );

View File

@@ -26,7 +26,8 @@ struct nVifStruct {
u32 vuMemLimit; // Use for fast AND
BlockBuffer* vifBlock; // Block Buffer
};
nVifStruct nVif[2];
static __aligned16 nVifStruct nVif[2];
void initNewVif(int idx) {
nVif[idx].idx = idx;
@@ -112,6 +113,7 @@ static void setMasks(const VIFregisters& v) {
// has a lot of setup code to establish which unpack function to call. The best way to
// optimize this is to cache the unpack function's base (see fnbase below) and update it
// when the variables it depends on are modified: writes to vif->tag.cmd and vif->usn.
// Problem: vif->tag.cmd is modified a lot. Like, constantly. So won't work.
//
// A secondary optimization would be adding special handlers for packets where vifRegs->num==1.
// (which would remove the loop, simplify the incVUptr code, etc). But checking for it has
@@ -119,11 +121,13 @@ static void setMasks(const VIFregisters& v) {
// -- air
template< int idx, bool doMode, bool isFill >
__releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
//template< int idx, bool doMode, bool isFill >
//__releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
__releaseinline void __fastcall _nVifUnpackLoop( int idx, u8 *data, u32 size )
{
// Eh... template attempt, tho not sure it helped much. There's too much setup code (see
// optimization note above) -- air
// comment out the following 2 lines to test templated version...
const bool doMode = !!vifRegs->mode;
const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
const int usn = !!(vif->usn);
const int doMask = !!(vif->tag.cmd & 0x10);
@@ -131,12 +135,13 @@ __releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
const u32& vift = nVifT[upkNum];
u8* dest = setVUptr(idx, vif->tag.addr);
const VIFUnpackFuncTable& ft = VIFfuncTable[vif->tag.cmd & 0xf];
UNPACKFUNCTYPE func = vif->usn ? ft.funcU : ft.funcS;
const VIFUnpackFuncTable& ft = VIFfuncTable[upkNum];
UNPACKFUNCTYPE func = usn ? ft.funcU : ft.funcS;
const nVifCall* fnbase = &nVifUpk[
((usn*2*16) + (doMask*16) + (upkNum)) * (4*4)
];
// Did a bunch of work to make it so I could optimize this index lookup to outside
// the main loop but it was for naught -- too often the loop is only 1-2 iterations,
// so this setup code ends up being slower (1 iter) or same speed (2 iters).
const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + (doMask*16) + (upkNum)) * (4*4) ];
const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
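For reference, the fnbase index above packs three selectors into one flat table: 2 (usn) x 2 (doMask) x 16 (upkNum) groups, each holding 4*4 partial-transfer variants. A worked example, assuming those dimensions:

// usn=1, doMask=1, upkNum=5:
//   ((1*2*16) + (1*16) + 5) * (4*4)  ==  (32 + 16 + 5) * 16  ==  848
// fnbase then points at entry 848 of nVifUpk, the start of that group's
// sixteen (cl, num) variants.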
@@ -145,6 +150,11 @@ __releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
setMasks(*vifRegs);
if (vif->cl >= blockSize) {
// This condition doesn't appear to ever occur, and really it never should.
// Normally it wouldn't matter, but even simple setup code matters here (see
// optimization notes above) >_<
vif->cl = 0;
}
@@ -167,7 +177,6 @@ __releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
vifRegs->num--;
}
else {
//DevCon.WriteLn("SSE Unpack!");
int c = aMin((cycleSize - vif->cl), 3);
size -= vift * c;
@@ -185,10 +194,10 @@ __releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
}
incVUptr(idx, dest, 16);
// Removing this modulo was a huge speedup for God of War. (62->73 fps)
// (GoW uses a lot of blockSize==1 packets, resulting in tons of loops -- so the biggest
// factor in performance ends up being the top-level conditionals of the loop, and
// also the loop prep code.) --air
// Removing this modulo was a huge speedup for God of War start menu. (62->73 fps)
// (GoW and tri-ace games both use a lot of blockSize==1 packets, resulting in tons
// of loops -- so the biggest factor in performance ends up being the top-level
// conditionals of the loop, and also the loop prep code.) --air
//vif->cl = (vif->cl+1) % blockSize;
if( ++vif->cl == blockSize ) vif->cl = 0;
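The rewrite is a classic strength reduction: vif->cl advances by exactly one here, and the reset above keeps it below blockSize, so a compare-and-reset is exactly equivalent to the modulo while dropping the integer divide from every iteration:

// equivalence (given 0 <= cl < blockSize on entry):
//   cl = (cl + 1) % blockSize;        // div/idiv every pass
//   if (++cl == blockSize) cl = 0;    // one compare, no divide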
@@ -202,9 +211,18 @@ void _nVifUnpack(int idx, u8 *data, u32 size) {
return;
}
else*/ { // filling write
vif = nVif[idx].vif;
vifRegs = nVif[idx].vifRegs;
#if 1
_nVifUnpackLoop( idx, data, size );
#else
// Eh... template attempt, tho it didn't help much. There's too much setup code,
// and the template only optimizes code inside the loop, which often times seems to
// only be run once or twice anyway. Better to use recompilation than templating
// anyway, but I'll leave it in for now for reference. -- air
const bool doMode = !!vifRegs->mode;
const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
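The rest of the disabled branch is truncated from this hunk; presumably it fans the runtime flags out into the template instantiations, along these lines (a hypothetical reconstruction, not the verbatim source; idx is shown fixed at 1 for brevity, though the real code would branch on it too):

// hypothetical dispatch into the templated loop:
if (doMode) { isFill ? _nVifUnpackLoop<1, true,  true >(data, size)
                     : _nVifUnpackLoop<1, true,  false>(data, size); }
else        { isFill ? _nVifUnpackLoop<1, false, true >(data, size)
                     : _nVifUnpackLoop<1, false, false>(data, size); }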
@@ -231,7 +249,7 @@ void _nVifUnpack(int idx, u8 *data, u32 size) {
{
pxFailDev( "No VIF0 support yet, sorry!" );
}
#endif
//if (isFill)
//DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok"));
//DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num);

View File

@@ -85,8 +85,7 @@ struct VifUnpackIndexer
void xSetCall( int packType ) const
{
xAlignPtr(16);
GetCall( packType ) = (nVifCall)xGetPtr();
GetCall( packType ) = (nVifCall)xGetAlignedCallTarget();
}
void xSetNullCall( int packType ) const