Bugfix for assertion breaks not working in Devbuilds.

newVif: * Bugfix to HashBucket::find() cuts microprogram cache misses in half. * Dynarec version now uses alternating XMM registers for unmasked unpacks (very minor speedup, ~1%). git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2397 96395faa-99c1-11dd-bbfe-3dabce05a288
2009-12-25 18:00:51 +00:00 · 2009-12-25 18:00:51 +00:00 · de637fc921
parent 9473e69b7f
commit de637fc921
7 changed files with 141 additions and 101 deletions
--- a/common/include/x86emitter/x86types.h
+++ b/common/include/x86emitter/x86types.h
@ -356,6 +356,18 @@ template< typename T > void xWrite( T val );

 		bool operator==( const xRegisterSSE& src ) const	{ return this->Id == src.Id; }
 		bool operator!=( const xRegisterSSE& src ) const	{ return this->Id != src.Id; }
+		
+		xRegisterSSE& operator++()
+		{
+			++Id &= (iREGCNT_XMM-1);
+			return *this;
+		}
+
+		xRegisterSSE& operator--()
+		{
+			--Id &= (iREGCNT_XMM-1);
+			return *this;
+		}
 	};

 	class xRegisterCL : public xRegister8
--- a/common/src/Utilities/Exceptions.cpp
+++ b/common/src/Utilities/Exceptions.cpp
@ -76,6 +76,26 @@ bool pxAssertImpl_LogIt( const DiagnosticOrigin& origin, const wxChar *msg )
 	return false;
 }

+// Local stand-in for wxTrap, which isn't available on Linux builds of wxWidgets (typically the non-Debug ones).
+void pxTrap()
+{
+#if defined(__WXMSW__) && !defined(__WXMICROWIN__)
+    __debugbreak();
+#elif defined(__WXMAC__) && !defined(__DARWIN__)
+    #if __powerc
+        Debugger();
+    #else
+        SysBreak();
+    #endif
+#elif defined(_MSL_USING_MW_C_HEADERS) && _MSL_USING_MW_C_HEADERS
+    Debugger();
+#elif defined(__UNIX__)
+    raise(SIGTRAP);
+#else
+    // TODO: no debugger-break call is wired up for other platforms, so pxTrap() does nothing here.
+#endif // Win/Unix
+}
+
 DEVASSERT_INLINE void pxOnAssert( const DiagnosticOrigin& origin, const wxChar* msg )
 {
 	RecursionGuard guard( s_assert_guard );
@ -98,7 +118,7 @@ DEVASSERT_INLINE void pxOnAssert( const DiagnosticOrigin& origin, const wxChar*
 		trapit = pxDoAssert( origin, msg );
 	}

-	if( trapit ) { wxTrap(); }
+	if( trapit ) { pxTrap(); }
 }

 __forceinline void pxOnAssert( const DiagnosticOrigin& origin, const char* msg)
--- a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj
+++ b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj
@ -863,10 +863,6 @@
 									RelativePath="..\..\x86\newVif_Unpack.cpp"
 									>
 								</File>
-								<File
-									RelativePath="..\..\x86\newVif_UnpackGen.inl"
-									>
-								</File>
 								<Filter
 									Name="Dynarec"
 									>
--- a/pcsx2/x86/VifUnpackSSE.cpp
+++ b/pcsx2/x86/VifUnpackSSE.cpp
@ -32,12 +32,14 @@ static __pagealigned u8 nVifUpkExec[__pagesize*4];
 VifUnpackSSE_Base::VifUnpackSSE_Base()
 	: dstIndirect(ecx)		// parameter 1 of __fastcall
 	, srcIndirect(edx)		// parameter 2 of __fastcall
+	, workReg( xmm1 )
+	, destReg( xmm0 )
 {
 }

-void VifUnpackSSE_Base::xMovDest(const xRegisterSSE& srcReg) const {
-	if (IsUnmaskedOp())	{ xMOVAPS (ptr[dstIndirect], srcReg); }
-	else				{ doMaskWrite(srcReg); }
+void VifUnpackSSE_Base::xMovDest() const {
+	if (IsUnmaskedOp())	{ xMOVAPS (ptr[dstIndirect], destReg); }
+	else				{ doMaskWrite(destReg); }
 }

 void VifUnpackSSE_Base::xShiftR(const xRegisterSSE& regX, int n) const {
@ -56,145 +58,132 @@ void VifUnpackSSE_Base::xPMOVXX16(const xRegisterSSE& regX) const {
 }

 void VifUnpackSSE_Base::xUPK_S_32() const {
-	xMOV32     (xmm0, ptr32[srcIndirect]);
-	xPSHUF.D   (xmm1, xmm0, _v0);
-	xMovDest   (xmm1);
+	xMOV32     (workReg, ptr32[srcIndirect]);
+	xPSHUF.D   (destReg, workReg, _v0);
 }

 void VifUnpackSSE_Base::xUPK_S_16() const {
 if (x86caps.hasStreamingSIMD4Extensions) {
-	xPMOVXX16  (xmm0);
+	xPMOVXX16  (workReg);
 }
 else {
-	xMOV16     (xmm0, ptr32[srcIndirect]);
-	xPUNPCK.LWD(xmm0, xmm0);
-	xShiftR    (xmm0, 16);
+	xMOV16     (workReg, ptr32[srcIndirect]);
+	xPUNPCK.LWD(workReg, workReg);
+	xShiftR    (workReg, 16);
 }
-	xPSHUF.D   (xmm1, xmm0, _v0);
-	xMovDest   (xmm1);
+	xPSHUF.D   (destReg, workReg, _v0);
 }

 void VifUnpackSSE_Base::xUPK_S_8() const {
 if (x86caps.hasStreamingSIMD4Extensions) {
-	xPMOVXX8   (xmm0);
+	xPMOVXX8   (workReg);
 }
 else {
-	xMOV8      (xmm0, ptr32[srcIndirect]);
-	xPUNPCK.LBW(xmm0, xmm0);
-	xPUNPCK.LWD(xmm0, xmm0);
-	xShiftR    (xmm0, 24);
+	xMOV8      (workReg, ptr32[srcIndirect]);
+	xPUNPCK.LBW(workReg, workReg);
+	xPUNPCK.LWD(workReg, workReg);
+	xShiftR    (workReg, 24);
 }
-	xPSHUF.D   (xmm1, xmm0, _v0);
-	xMovDest   (xmm1);
+	xPSHUF.D   (destReg, workReg, _v0);
 }

 void VifUnpackSSE_Base::xUPK_V2_32() const {
-	xMOV64     (xmm0, ptr32[srcIndirect]);
-	xMovDest   (xmm0);
+	xMOV64     (destReg, ptr32[srcIndirect]);
 }

 void VifUnpackSSE_Base::xUPK_V2_16() const {
 if (x86caps.hasStreamingSIMD4Extensions) {
-	xPMOVXX16  (xmm0);
+	xPMOVXX16  (destReg);
 }
 else {
-	xMOV32     (xmm0, ptr32[srcIndirect]);
-	xPUNPCK.LWD(xmm0, xmm0);
-	xShiftR    (xmm0, 16);
+	xMOV32     (destReg, ptr32[srcIndirect]);
+	xPUNPCK.LWD(destReg, destReg);
+	xShiftR    (destReg, 16);
 }
-	xMovDest   (xmm0);
 }

 void VifUnpackSSE_Base::xUPK_V2_8() const {
 if (x86caps.hasStreamingSIMD4Extensions) {
-	xPMOVXX8   (xmm0);
+	xPMOVXX8   (destReg);
 }
 else {
-	xMOV16     (xmm0, ptr32[srcIndirect]);
-	xPUNPCK.LBW(xmm0, xmm0);
-	xPUNPCK.LWD(xmm0, xmm0);
-	xShiftR    (xmm0, 24);
+	xMOV16     (destReg, ptr32[srcIndirect]);
+	xPUNPCK.LBW(destReg, destReg);
+	xPUNPCK.LWD(destReg, destReg);
+	xShiftR    (destReg, 24);
 }
-	xMovDest   (xmm0);
 }

 void VifUnpackSSE_Base::xUPK_V3_32() const {
-	xMOV128    (xmm0, ptr32[srcIndirect]);
-	xMovDest   (xmm0);
+	xMOV128    (destReg, ptr32[srcIndirect]);
 }

 void VifUnpackSSE_Base::xUPK_V3_16() const {
 if (x86caps.hasStreamingSIMD4Extensions) {
-	xPMOVXX16  (xmm0);
+	xPMOVXX16  (destReg);
 }
 else {
-	xMOV64     (xmm0, ptr32[srcIndirect]);
-	xPUNPCK.LWD(xmm0, xmm0);
-	xShiftR    (xmm0, 16);
+	xMOV64     (destReg, ptr32[srcIndirect]);
+	xPUNPCK.LWD(destReg, destReg);
+	xShiftR    (destReg, 16);
 }
-	xMovDest   (xmm0);
 }

 void VifUnpackSSE_Base::xUPK_V3_8() const {
 if (x86caps.hasStreamingSIMD4Extensions) {
-	xPMOVXX8   (xmm0);
+	xPMOVXX8   (destReg);
 }
 else {
-	xMOV32     (xmm0, ptr32[srcIndirect]);
-	xPUNPCK.LBW(xmm0, xmm0);
-	xPUNPCK.LWD(xmm0, xmm0);
-	xShiftR    (xmm0, 24);
+	xMOV32     (destReg, ptr32[srcIndirect]);
+	xPUNPCK.LBW(destReg, destReg);
+	xPUNPCK.LWD(destReg, destReg);
+	xShiftR    (destReg, 24);
 }
-	xMovDest   (xmm0);
 }

 void VifUnpackSSE_Base::xUPK_V4_32() const {
-	xMOV128    (xmm0, ptr32[srcIndirect]);
-	xMovDest   (xmm0);
+	xMOV128    (destReg, ptr32[srcIndirect]);
 }

 void VifUnpackSSE_Base::xUPK_V4_16() const {
 if (x86caps.hasStreamingSIMD4Extensions) {
-	xPMOVXX16  (xmm0);
+	xPMOVXX16  (destReg);
 }
 else {
-	xMOV64     (xmm0, ptr32[srcIndirect]);
-	xPUNPCK.LWD(xmm0, xmm0);
-	xShiftR    (xmm0, 16);
+	xMOV64     (destReg, ptr32[srcIndirect]);
+	xPUNPCK.LWD(destReg, destReg);
+	xShiftR    (destReg, 16);
 }
-	xMovDest   (xmm0);
 }

 void VifUnpackSSE_Base::xUPK_V4_8() const {
 if (x86caps.hasStreamingSIMD4Extensions) {
-	xPMOVXX8   (xmm0);
+	xPMOVXX8   (destReg);
 }
 else {
-	xMOV32     (xmm0, ptr32[srcIndirect]);
-	xPUNPCK.LBW(xmm0, xmm0);
-	xPUNPCK.LWD(xmm0, xmm0);
-	xShiftR    (xmm0, 24);
+	xMOV32     (destReg, ptr32[srcIndirect]);
+	xPUNPCK.LBW(destReg, destReg);
+	xPUNPCK.LWD(destReg, destReg);
+	xShiftR    (destReg, 24);
 }
-	xMovDest   (xmm0);
 }

 void VifUnpackSSE_Base::xUPK_V4_5() const {
-	xMOV16		(xmm0, ptr32[srcIndirect]);
-	xPSHUF.D	(xmm0, xmm0, _v0);
-	xPSLL.D		(xmm0, 3);			// ABG|R5.000
-	xMOVAPS		(xmm1, xmm0);		// x|x|x|R
-	xPSRL.D		(xmm0, 8);			// ABG
-	xPSLL.D		(xmm0, 3);			// AB|G5.000
-	mVUmergeRegs(XMM1, XMM0, 0x4);	// x|x|G|R
-	xPSRL.D		(xmm0, 8);			// AB
-	xPSLL.D		(xmm0, 3);			// A|B5.000
-	mVUmergeRegs(XMM1, XMM0, 0x2);	// x|B|G|R
-	xPSRL.D		(xmm0, 8);			// A
-	xPSLL.D		(xmm0, 7);			// A.0000000
-	mVUmergeRegs(XMM1, XMM0, 0x1);	// A|B|G|R
-	xPSLL.D		(xmm1, 24); // can optimize to
-	xPSRL.D		(xmm1, 24); // single AND...
-	xMovDest	(xmm1);
+	xMOV16		(workReg, ptr32[srcIndirect]);
+	xPSHUF.D	(workReg, workReg, _v0);
+	xPSLL.D		(workReg, 3);			// ABG|R5.000
+	xMOVAPS		(destReg, workReg);		// x|x|x|R
+	xPSRL.D		(workReg, 8);			// ABG
+	xPSLL.D		(workReg, 3);			// AB|G5.000
+	mVUmergeRegs(destReg.Id, workReg.Id, 0x4);	// x|x|G|R
+	xPSRL.D		(workReg, 8);			// AB
+	xPSLL.D		(workReg, 3);			// A|B5.000
+	mVUmergeRegs(destReg.Id, workReg.Id, 0x2);	// x|B|G|R
+	xPSRL.D		(workReg, 8);			// A
+	xPSLL.D		(workReg, 7);			// A.0000000
+	mVUmergeRegs(destReg.Id, workReg.Id, 0x1);	// A|B|G|R
+	xPSLL.D		(destReg, 24); // can optimize to
+	xPSRL.D		(destReg, 24); // single AND...
 }

 void VifUnpackSSE_Base::xUnpack( int upknum ) const
@ -263,6 +252,7 @@ static void nVifGen(int usn, int mask, int curCycle) {
 		
 		ucall = (nVifCall)xGetAlignedCallTarget();
 		vpugen.xUnpack(i);
+		vpugen.xMovDest();
 		xRET();

 		pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );
--- a/pcsx2/x86/VifUnpackSSE.h
+++ b/pcsx2/x86/VifUnpackSSE.h
@ -38,18 +38,20 @@ public:
 protected:
 	xAddressInfo	dstIndirect;
 	xAddressInfo	srcIndirect;
-
+	xRegisterSSE	workReg;
+	xRegisterSSE	destReg;
+	
 public:
 	VifUnpackSSE_Base();
 	virtual ~VifUnpackSSE_Base() throw() {}

 	virtual void xUnpack( int upktype ) const;
 	virtual bool IsUnmaskedOp() const=0;
+	virtual void xMovDest() const;

 protected:
 	virtual void doMaskWrite(const xRegisterSSE& regX ) const=0;

-	virtual void xMovDest(const xRegisterSSE& srcReg) const;
 	virtual void xShiftR(const xRegisterSSE& regX, int n) const;
 	virtual void xPMOVXX8(const xRegisterSSE& regX) const;
 	virtual void xPMOVXX16(const xRegisterSSE& regX) const;
--- a/pcsx2/x86/VifUnpackSSE_Dynarec.cpp
+++ b/pcsx2/x86/VifUnpackSSE_Dynarec.cpp
@ -163,16 +163,24 @@ static void ShiftDisplacementWindow( xAddressInfo& addr, const xRegister32& modR
 	}
 	if(addImm) xADD(modReg, addImm);
 }
+static bool UsesTwoRegs[] = 
+{
+	true, true, true, true,
+	false, false, false, false,
+	false, false, false, false,
+	false, false, false, true,
+
+};

 void VifUnpackSSE_Dynarec::CompileRoutine() {
-	const int  upkNum		=  vB.upkType & 0xf;
+	const int  upkNum		=  v.vif->cmd & 0xf;
 	const u8&  vift			=  nVifT[upkNum];
 	const int  cycleSize	=  isFill ?  vB.cl : vB.wl;
 	const int  blockSize	=  isFill ?  vB.wl : vB.cl;
 	const int  skipSize		=  blockSize - cycleSize;

-	int  vNum	=  vifRegs->num;
-	vCL	=  vif->cl;
+	int  vNum	=  v.vifRegs->num;
+	vCL	=  v.vif->cl;

 	SetMasks(cycleSize);

@ -183,14 +191,25 @@ void VifUnpackSSE_Dynarec::CompileRoutine() {

 		if (vCL < cycleSize) { 
 			xUnpack(upkNum);
-			srcIndirect += vift;
+			xMovDest();
+
 			dstIndirect += 16;
+			srcIndirect += vift;
+
+			if( IsUnmaskedOp() ) {
+				++destReg;
+				++workReg;
+			}
+			
 			vNum--;
 			if (++vCL == blockSize) vCL = 0;
 		}
 		else if (isFill) {
 			DevCon.WriteLn("filling mode!");
-			VifUnpackSSE_Dynarec::FillingWrite( *this ).xUnpack(upkNum);
+			VifUnpackSSE_Dynarec fill( VifUnpackSSE_Dynarec::FillingWrite( *this ) );
+			fill.xUnpack(upkNum);
+			fill.xMovDest();
+
 			dstIndirect += 16;
 			vNum--;
 			if (++vCL == blockSize) vCL = 0;
@ -200,9 +219,10 @@ void VifUnpackSSE_Dynarec::CompileRoutine() {
 			vCL = 0;
 		}
 	}
+
 	if (doMode==2) writeBackRow();
-	xMOV(ptr32[&vif->cl],	   vCL);
-	xMOV(ptr32[&vifRegs->num], vNum);
+	xMOV(ptr32[&v.vif->cl],	   vCL);
+	xMOV(ptr32[&v.vifRegs->num], vNum);
 	xRET();
 }

@ -227,29 +247,29 @@ static _f void dVifRecLimit(int idx) {
 _f void dVifUnpack(int idx, u8 *data, u32 size, bool isFill) {

 	const nVifStruct& v		= nVif[idx];
-	const u8	upkType		= vif->cmd & 0x1f | ((!!vif->usn) << 5);
-	const int	doMask		= (upkType>>4) & 1;
+	const u8	upkType		= v.vif->cmd & 0x1f | ((!!v.vif->usn) << 5);
+	const int	doMask		= v.vif->cmd & 0x10;

-	const int	cycle_cl	= vifRegs->cycle.cl;
-	const int	cycle_wl	= vifRegs->cycle.wl;
+	const int	cycle_cl	= v.vifRegs->cycle.cl;
+	const int	cycle_wl	= v.vifRegs->cycle.wl;
 	const int	cycleSize	= isFill ? cycle_cl : cycle_wl;
 	const int	blockSize	= isFill ? cycle_wl : cycle_cl;

-	if (vif->cl >= blockSize)  vif->cl = 0;
+	if (v.vif->cl >= blockSize)  v.vif->cl = 0;

 	_vBlock.upkType   = upkType;
-	_vBlock.num		  = *(u8*)&vifRegs->num;
-	_vBlock.mode	  = *(u8*)&vifRegs->mode;
-	_vBlock.scl		  = vif->cl;
+	_vBlock.num		  = *(u8*)&v.vifRegs->num;
+	_vBlock.mode	  = *(u8*)&v.vifRegs->mode;
+	_vBlock.scl		  = v.vif->cl;
 	_vBlock.cl		  = cycle_cl;
 	_vBlock.wl		  = cycle_wl;

 	// Zero out the mask parameter if it's unused -- games leave random junk
 	// values here which cause false recblock cache misses.
-	_vBlock.mask	  = doMask ? vifRegs->mask : 0x00;
+	_vBlock.mask	  = (doMask || ((_vBlock.mode&3)!=0) ) ? v.vifRegs->mask : 0x00;

 	if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) {
-		if( u8* dest = dVifsetVUptr(v, vif->tag.addr) ) {
+		if( u8* dest = dVifsetVUptr(v, v.vif->tag.addr) ) {
 			//DevCon.WriteLn("Running Recompiled Block!");
 			((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
 		}
--- a/pcsx2/x86/newVif_HashBucket.h
+++ b/pcsx2/x86/newVif_HashBucket.h
@ -65,11 +65,11 @@ public:
 		u32 d = *((u32*)dataPtr);
 		const SizeChain<T>& bucket( mBucket[d % hSize] );

-		for (int i=bucket.Size; i; --i) {
+		for (int i=bucket.Size-1; i>0; --i) {
 			// This inline version seems about 1-2% faster in tests of games that average 1
 			// program per bucket.  Games that average more should see a bigger improvement --air
-			int result = _mm_movemask_ps( (cast_m128) _mm_cmpeq_epi32( _mm_load_si128((__m128i*)&bucket.Chain[i]), _mm_load_si128((__m128i*)dataPtr) ) ) & 0x7;
-			if( result == 0x7 ) return &bucket.Chain[i];
+			int result = _mm_movemask_ps( (cast_m128) _mm_cmpeq_epi32( _mm_load_si128((__m128i*)&bucket.Chain[i]), _mm_load_si128((__m128i*)dataPtr) ) );
+			if( (result&0x7) == 0x7 ) return &bucket.Chain[i];

 			// Dynamically generated function version, can't be inlined. :(
 			//if ((((nVifCall)((void*)nVifMemCmp))(&bucket.Chain[i], dataPtr))==7) return &bucket.Chain[i];