mirror of https://github.com/PCSX2/pcsx2.git
* Disable newVifUnpack, which I left enabled in the prev commit (it's not ready yet!)
* Added feature to align call targets for EErec functions and blocks on P4's and AMDs, and pack them on Core2/i7's.
* Fixed some svn:native props.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2347 96395faa-99c1-11dd-bbfe-3dabce05a288
parent b5f643950c
commit b3fead5dc9
@@ -157,9 +157,12 @@ template< typename T > void xWrite( T val );
 class ModSibBase;
 
 extern void xSetPtr( void* ptr );
-extern u8* xGetPtr();
 extern void xAlignPtr( uint bytes );
 extern void xAdvancePtr( uint bytes );
+extern void xAlignCallTarget();
+
+extern u8* xGetPtr();
+extern u8* xGetAlignedCallTarget();
 
 extern JccComparisonType xInvertCond( JccComparisonType src );
@@ -395,6 +395,32 @@ __emitinline void xAlignPtr( uint bytes )
 	x86Ptr = (u8*)( ( (uptr)x86Ptr + bytes - 1) & ~(bytes - 1) );
 }
 
+// Performs best-case alignment for the target CPU, for use prior to starting a new
+// function.  This is not meant to be used prior to jump targets, since it doesn't
+// add padding (additionally, speed benefit from jump alignment is minimal, and often
+// a loss).
+__emitinline void xAlignCallTarget()
+{
+	// Core2/i7 CPUs prefer unaligned addresses.  Checking for SSSE3 is a decent filter.
+	// (also align in debug modes for disasm convenience)
+
+	if( IsDebugBuild || !x86caps.hasSupplementalStreamingSIMD3Extensions )
+	{
+		// - P4's and earlier prefer 16 byte alignment.
+		// - AMD Athlons and Phenoms prefer 8 byte alignment, but I don't have an easy
+		//   heuristic for it yet.
+		// - AMD Phenom IIs are unknown (either prefer 8 byte, or unaligned).
+
+		xAlignPtr( 16 );
+	}
+}
+
+__emitinline u8* xGetAlignedCallTarget()
+{
+	xAlignCallTarget();
+	return x86Ptr;
+}
+
 __emitinline void xAdvancePtr( uint bytes )
 {
 	if( IsDevBuild )
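The xAlignPtr expression above is the classic power-of-two round-up idiom. As a minimal standalone sketch of the same math (names here are illustrative, not part of the emitter API):

	#include <cstdint>
	#include <cassert>

	// Round p up to the next 'bytes' boundary ('bytes' must be a power of two).
	// Adding (bytes - 1) and then clearing the low bits rounds up, and leaves an
	// already-aligned pointer unchanged.
	static uint8_t* alignUp(uint8_t* p, uintptr_t bytes)
	{
		assert((bytes & (bytes - 1)) == 0);		// power-of-two check
		return (uint8_t*)(((uintptr_t)p + bytes - 1) & ~(bytes - 1));
	}

For example, a pointer at 0x1009 aligned to 16 becomes 0x1010, while one already at 0x1010 stays put.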
@@ -58,6 +58,11 @@ __forceinline void vif1FLUSH()
 
 void vif1Init()
 {
+#ifdef newVif1
+	extern void initNewVif(int idx);
+	initNewVif(1);
+#endif
+
 	SetNewMask(g_vif1Masks, g_vif1HasMask3, 0, 0xffffffff);
 }
@@ -313,19 +318,13 @@ static int __fastcall Vif1TransDirectHL(u32 *data)
 
 	return ret;
 }
-#ifdef newVif1
-extern void initNewVif(int idx);
-extern int nVifUnpack(int idx, u32 *data);
-static int testVif = 0;
-#endif
 static int __fastcall Vif1TransUnpack(u32 *data)
 {
 #ifdef newVif1
-	if (!testVif) { initNewVif(1); testVif = 1; }
-	//int temp = nVifUnpack(1, data);
-	//if (temp >= 0) return temp;
+	extern int nVifUnpack(int idx, u32 *data);
+	return nVifUnpack(1, data);
 #endif
 
 	XMMRegisters::Freeze();
 
 	if (vif1.vifpacketsize < vif1.tag.size)
@@ -60,7 +60,7 @@ static __forceinline u32 vif_size(u8 num)
 	return (num == 0) ? 0x1000 : 0x4000;
 }
 
-#define newVif		// Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
-#define newVif1		// Use New Code for Vif1 Unpacks (needs newVif defined)
+//#define newVif	// Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
+//#define newVif1	// Use New Code for Vif1 Unpacks (needs newVif defined)
 //#define newVif0	// Use New Code for Vif0 Unpacks (not implemented)
 #endif
@@ -371,7 +371,7 @@ static DynGenFunc* _DynGen_JITCompile()
 {
 	pxAssertMsg( DispatcherReg != NULL, "Please compile the DispatcherReg subroutine *before* JITCompile.  Thanks." );
 
-	u8* retval = xGetPtr();
+	u8* retval = xGetAlignedCallTarget();
 	_DynGen_StackFrameCheck();
 
 	xMOV( ecx, &cpuRegs.pc );
@@ -388,7 +388,7 @@ static DynGenFunc* _DynGen_JITCompile()
 
 static DynGenFunc* _DynGen_JITCompileInBlock()
 {
-	u8* retval = xGetPtr();
+	u8* retval = xGetAlignedCallTarget();
 	xJMP( JITCompile );
 	return (DynGenFunc*)retval;
 }
@@ -396,7 +396,7 @@ static DynGenFunc* _DynGen_JITCompileInBlock()
 // called when jumping to variable pc address
 static DynGenFunc* _DynGen_DispatcherReg()
 {
-	u8* retval = xGetPtr();
+	u8* retval = xGetPtr();		// fallthrough target, can't align it!
 	_DynGen_StackFrameCheck();
 
 	xMOV( eax, &cpuRegs.pc );
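The "can't align it!" comment marks the key constraint: xAlignCallTarget only advances the write pointer and emits nothing, so the skipped bytes hold garbage. That is safe before a fresh function (the gap is never executed) but fatal for DispatcherReg, which is reached by falling through from the code emitted just before it. A padded variant would have to fill the gap with executable bytes, roughly like this hypothetical sketch (not emitter API):

	// Pad with single-byte x86 NOPs so execution can safely flow through the gap.
	static uint8_t* alignFallthrough(uint8_t* p, uintptr_t bytes)
	{
		while (((uintptr_t)p & (bytes - 1)) != 0)
			*p++ = 0x90;	// x86 NOP
		return p;
	}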
@@ -410,7 +410,7 @@ static DynGenFunc* _DynGen_DispatcherReg()
 
 static DynGenFunc* _DynGen_EnterRecompiledCode()
 {
-	u8* retval = xGetPtr();
+	u8* retval = xGetAlignedCallTarget();
 
 	// "standard" frame pointer setup for aligned stack: Record the original
 	// esp into ebp, and then align esp.  ebp references the original esp base
@@ -446,6 +446,8 @@ static DynGenFunc* _DynGen_EnterRecompiledCode()
 	xMOV( &s_store_ebp, ebp );
 
 	xJMP( ptr32[&DispatcherReg] );
+
+	xAlignCallTarget();
 	imm = (uptr)xGetPtr();
 	ExitRecompiledCode = (DynGenFunc*)xGetPtr();
@@ -1254,7 +1256,7 @@ void recompileNextInstruction(int delayslot)
 	//	_flushCachedRegs();
 	//	g_cpuHasConstReg = 1;
 
-	if (!delayslot && x86Ptr - recPtr > 0x1000)
+	if (!delayslot && (xGetPtr() - recPtr > 0x1000) )
 		s_nEndBlock = pc;
 }
@@ -1335,9 +1337,8 @@ static void __fastcall recRecompile( const u32 startpc )
 		recResetEE();
 	}
 
-	x86SetPtr( recPtr );
-	x86Align(16);
-	recPtr = x86Ptr;
+	xSetPtr( recPtr );
+	recPtr = xGetAlignedCallTarget();
 
 	s_nBlockFF = false;
 	if (HWADDR(startpc) == 0x81fc0)
@@ -1718,14 +1719,14 @@ StartRecomp:
 		}
 	}
 
-	pxAssert( x86Ptr < recMem+REC_CACHEMEM );
+	pxAssert( xGetPtr() < recMem+REC_CACHEMEM );
 	pxAssert( recConstBufPtr < recConstBuf + RECCONSTBUF_SIZE );
 	pxAssert( x86FpuState == 0 );
 
-	pxAssert(x86Ptr - recPtr < 0x10000);
-	s_pCurBlockEx->x86size = x86Ptr - recPtr;
+	pxAssert(xGetPtr() - recPtr < 0x10000);
+	s_pCurBlockEx->x86size = xGetPtr() - recPtr;
 
-	recPtr = x86Ptr;
+	recPtr = xGetPtr();
 
 	pxAssert( (g_cpuHasConstReg&g_cpuFlushedConstReg) == g_cpuHasConstReg );
@@ -26,7 +26,8 @@ struct nVifStruct {
 	u32 vuMemLimit;			// Use for fast AND
 	BlockBuffer* vifBlock;	// Block Buffer
 };
-nVifStruct nVif[2];
+
+static __aligned16 nVifStruct nVif[2];
 
 void initNewVif(int idx) {
 	nVif[idx].idx = idx;
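Making nVif[] 16-byte aligned (and static) matters because the struct is touched from SSE code, where aligned loads and stores require 16-byte boundaries. The __aligned16 macro appears to be PCSX2's portability wrapper over the compiler-specific attribute; it expands to roughly the following, per toolchain (illustrative sketch, check the actual headers):

	#if defined(_MSC_VER)
	#	define MY_ALIGNED16 __declspec(align(16))
	#else
	#	define MY_ALIGNED16 __attribute__((aligned(16)))
	#endif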
@@ -112,6 +113,7 @@ static void setMasks(const VIFregisters& v) {
 // has a lot of setup code to establish which unpack function to call.  The best way to
 // optimize this is to cache the unpack function's base (see fnbase below) and update it
 // when the variables it depends on are modified: writes to vif->tag.cmd and vif->usn.
+// Problem: vif->tag.cmd is modified a lot.  Like, constantly.  So won't work.
 //
 // A secondary optimization would be adding special handlers for packets where vifRegs->num==1.
 // (which would remove the loop, simplify the incVUptr code, etc).  But checking for it has
@@ -119,11 +121,13 @@ static void setMasks(const VIFregisters& v) {
 // -- air
 
 
-template< int idx, bool doMode, bool isFill >
-__releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
+//template< int idx, bool doMode, bool isFill >
+//__releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
+__releaseinline void __fastcall _nVifUnpackLoop( int idx, u8 *data, u32 size )
 {
-	// Eh... template attempt, tho not sure it helped much.  There's too much setup code (see
-	// optimization note above) -- air
+	// comment out the following 2 lines to test templated version...
+	const bool doMode = !!vifRegs->mode;
+	const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
 
 	const int usn    = !!(vif->usn);
 	const int doMask = !!(vif->tag.cmd & 0x10);
@@ -131,12 +135,13 @@ __releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
 	const u32& vift = nVifT[upkNum];
 
 	u8* dest = setVUptr(idx, vif->tag.addr);
-	const VIFUnpackFuncTable& ft = VIFfuncTable[vif->tag.cmd & 0xf];
-	UNPACKFUNCTYPE func = vif->usn ? ft.funcU : ft.funcS;
+	const VIFUnpackFuncTable& ft = VIFfuncTable[upkNum];
+	UNPACKFUNCTYPE func = usn ? ft.funcU : ft.funcS;
 
-	const nVifCall* fnbase = &nVifUpk[
-		((usn*2*16) + (doMask*16) + (upkNum)) * (4*4)
-	];
+	// Did a bunch of work to make it so I could optimize this index lookup to outside
+	// the main loop but it was for naught -- too often the loop is only 1-2 iterations,
+	// so this setup code ends up being slower (1 iter) or same speed (2 iters).
+	const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + (doMask*16) + (upkNum)) * (4*4) ];
 
 	const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
 	const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
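The fnbase expression flattens a multi-dimensional table lookup: [usn][doMask][upkNum] selects a group, and each group holds 4*4 call variants. A hypothetical helper showing how the flat offset decomposes (the strides are read off the expression itself):

	// usn: 2 values, doMask: 2 values, upkNum: 16 values, 16 variants per group.
	inline int unpackGroupOffset(int usn, int doMask, int upkNum)
	{
		const int group = (usn * 2 * 16) + (doMask * 16) + upkNum;
		return group * (4 * 4);
	}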
@@ -145,6 +150,11 @@ __releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
 		setMasks(*vifRegs);
 
 	if (vif->cl >= blockSize) {
+
+		// This condition doesn't appear to ever occur, and really it never should.
+		// Normally it wouldn't matter, but even simple setup code matters here (see
+		// optimization notes above) >_<
+
 		vif->cl = 0;
 	}
@@ -167,7 +177,6 @@ __releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
 			vifRegs->num--;
 		}
 		else {
-			//DevCon.WriteLn("SSE Unpack!");
 			int c = aMin((cycleSize - vif->cl), 3);
 			size -= vift * c;
@@ -185,10 +194,10 @@ __releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
 		}
 		incVUptr(idx, dest, 16);
 
-		// Removing this modulo was a huge speedup for God of War. (62->73 fps)
-		// (GoW uses a lot of blockSize==1 packets, resulting in tons of loops -- so the biggest
-		// factor in performance ends up being the top-level conditionals of the loop, and
-		// also the loop prep code.) --air
+		// Removing this modulo was a huge speedup for God of War start menu. (62->73 fps)
+		// (GoW and tri-ace games both use a lot of blockSize==1 packets, resulting in tons
+		// of loops -- so the biggest factor in performance ends up being the top-level
+		// conditionals of the loop, and also the loop prep code.) --air
 
 		//vif->cl = (vif->cl+1) % blockSize;
 		if( ++vif->cl == blockSize ) vif->cl = 0;
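The compare-and-reset form is equivalent to the modulo it replaces whenever vif->cl starts in [0, blockSize), but it avoids an integer division on every loop iteration. A minimal equivalence sketch:

	// Both wrap a counter within [0, blockSize); the second needs no division.
	int wrapMod(int cl, int blockSize) { return (cl + 1) % blockSize; }
	int wrapCmp(int cl, int blockSize) { ++cl; return (cl == blockSize) ? 0 : cl; }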
@@ -202,9 +211,18 @@ void _nVifUnpack(int idx, u8 *data, u32 size) {
 		return;
 	}
 	else*/ {	// filling write
 
 		vif		= nVif[idx].vif;
 		vifRegs	= nVif[idx].vifRegs;
 
+#if 1
+		_nVifUnpackLoop( idx, data, size );
+#else
+		// Eh... template attempt, tho it didn't help much.  There's too much setup code,
+		// and the template only optimizes code inside the loop, which often times seems to
+		// only be run once or twice anyway.  Better to use recompilation than templating
+		// anyway, but I'll leave it in for now for reference. -- air
+
 		const bool doMode = !!vifRegs->mode;
 		const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
@@ -231,7 +249,7 @@ void _nVifUnpack(int idx, u8 *data, u32 size) {
 	{
 		pxFailDev( "No VIF0 support yet, sorry!" );
 	}
 
 #endif
 	//if (isFill)
-	//DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok"));
+	//DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num);
@@ -85,8 +85,7 @@ struct VifUnpackIndexer
 
 	void xSetCall( int packType ) const
 	{
-		xAlignPtr(16);
-		GetCall( packType ) = (nVifCall)xGetPtr();
+		GetCall( packType ) = (nVifCall)xGetAlignedCallTarget();
 	}
 
 	void xSetNullCall( int packType ) const