Merge pull request #1706 from PCSX2/greg/vif-hash

Greg/vif hash
Committed by Jonathan Li on 2016-12-21 22:30:27 +00:00 (via GitHub)
commit 10eb88f6fe
4 changed files with 175 additions and 170 deletions


@@ -55,42 +55,21 @@ _vifT extern void dVifUnpack (const u8* data, bool isFill);
#define xmmRow xmm6
#define xmmTemp xmm7
// nVifBlock - Ordered for Hashing; the 'num' field and the lower 6 bits of upkType are
// used as the hash bucket selector.
struct __aligned16 nVifBlock {
u8 num; // [00] Num Field
u8 upkType; // [01] Unpack Type [usn1:mask1:upk*4]
u8 mode; // [02] Mode Field
u8 aligned; // [03] Packet Alignment
u32 mask; // [04] Mask Field
u16 cl; // [08] CL Field
u16 wl; // [10] WL Field
uptr startPtr; // [12] Start Ptr of RecGen Code
}; // 16 bytes
#define _hSize 0x4000 // [usn*1:mask*1:upk*4:num*8] hash...
#define _tParams nVifBlock, _hSize
struct nVifStruct {
__aligned16 nVifBlock block;
// Buffer for partial transfers (should always be first to ensure alignment)
// Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword)
__aligned16 u8 buffer[256*16];
u32 bSize; // Size of 'buffer'
u32 bPtr;
uint recReserveSizeMB; // reserve size, in megabytes.
RecompiledCodeReserve* recReserve;
u8* recWritePtr; // current write pos into the reserve
HashBucket<_tParams>* vifBlocks; // Vif Blocks
int numBlocks; // # of Blocks Recompiled
// VIF0 or VIF1 - provided for debugging helpfulness only, and is generally unused.
// (templates are used for most or all VIF indexing)
u32 idx;
RecompiledCodeReserve* recReserve;
u8* recWritePtr; // current write pos into the reserve
HashBucket vifBlocks; // Vif Blocks
nVifStruct();
};
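The 256*16 sizing of the partial-transfer buffer in nVifStruct follows directly from the VIF register limits; here is a minimal sketch of that arithmetic (the constant names are mine, not PCSX2's):

```cpp
#include <cstddef>

// vifRegs.Num is an 8-bit quadword count where 0 encodes 256, and a quadword
// is 16 bytes, so the worst-case partial transfer fits in 256 * 16 bytes.
constexpr std::size_t kMaxQuadwords     = 256;
constexpr std::size_t kQuadwordSize     = 16;
constexpr std::size_t kPartialBufferSize = kMaxQuadwords * kQuadwordSize;

static_assert(kPartialBufferSize == 4096, "partial-transfer buffer is 4 KB");
```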


@@ -22,34 +22,30 @@
#include "MTVU.h"
#include "Utilities/Perf.h"
static void recReset(int idx) {
nVif[idx].vifBlocks.reset();
nVif[idx].recReserve->Reset();
nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
}
void dVifReserve(int idx) {
if(!nVif[idx].recReserve)
nVif[idx].recReserve = new RecompiledCodeReserve(pxsFmt(L"VIF%u Unpack Recompiler Cache", idx), _8mb);
nVif[idx].recReserve->Reserve( nVif[idx].recReserveSizeMB * _1mb, idx ? HostMemoryMap::VIF1rec : HostMemoryMap::VIF0rec );
nVif[idx].recReserve->Reserve( 8 * _1mb, idx ? HostMemoryMap::VIF1rec : HostMemoryMap::VIF0rec );
}
void dVifReset(int idx) {
pxAssertDev(nVif[idx].recReserve, "Dynamic VIF recompiler reserve must be created prior to VIF use or reset!");
if(!nVif[idx].vifBlocks)
nVif[idx].vifBlocks = new HashBucket<_tParams>();
else
nVif[idx].vifBlocks->clear();
nVif[idx].recReserve->Reset();
nVif[idx].numBlocks = 0;
nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
//memset(nVif[idx].recWritePtr, 0xcc, nVif[idx].recReserveSizeMB * _1mb);
recReset(idx);
}
void dVifClose(int idx) {
nVif[idx].numBlocks = 0;
if (nVif[idx].recReserve)
nVif[idx].recReserve->Reset();
safe_delete(nVif[idx].vifBlocks);
}
void dVifRelease(int idx) {
@@ -61,7 +57,8 @@ VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlo
: v(vif_)
, vB(vifBlock_)
{
isFill = (vB.cl < vB.wl);
const int wl = vB.wl ? vB.wl : 256; // a WL of 0 is treated as 256 (KH2)
isFill = (vB.cl < wl);
usn = (vB.upkType>>5) & 1;
doMask = (vB.upkType>>4) & 1;
doMode = vB.mode & 3;
@@ -201,11 +198,13 @@ void VifUnpackSSE_Dynarec::ModUnpack( int upknum, bool PostOp )
}
}
void VifUnpackSSE_Dynarec::CompileRoutine() {
const int wl = vB.wl ? vB.wl : 256; // a WL of 0 is treated as 256 (KH2)
const int upkNum = vB.upkType & 0xf;
const u8& vift = nVifT[upkNum];
const int cycleSize = isFill ? vB.cl : vB.wl;
const int blockSize = isFill ? vB.wl : vB.cl;
const int cycleSize = isFill ? vB.cl : wl;
const int blockSize = isFill ? wl : vB.cl;
const int skipSize = blockSize - cycleSize;
uint vNum = vB.num ? vB.num : 256;
@@ -261,60 +260,42 @@ void VifUnpackSSE_Dynarec::CompileRoutine() {
xRET();
}
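For reference, the upkType byte that the constructor and CompileRoutine pull apart appears to pack three pieces of state; a small decoding sketch under that assumption (helper names are mine):

```cpp
#include <cstdint>

// Assumed layout, taken from the surrounding code: bit 5 = USN flag,
// bit 4 = mask flag, and the low 4 bits select the unpack routine (upkNum).
struct DecodedUnpack {
    bool    usn;
    bool    masked;
    uint8_t upkNum;
};

static inline DecodedUnpack decode_upkType(uint8_t upkType) {
    return { ((upkType >> 5) & 1) != 0,
             ((upkType >> 4) & 1) != 0,
             static_cast<uint8_t>(upkType & 0xf) };
}
```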
_vifT static __fi u8* dVifsetVUptr(uint cl, uint wl, bool isFill) {
nVifStruct& v = nVif[idx];
vifStruct& vif = MTVU_VifX;
const VURegs& VU = vuRegs[idx];
const uint vuMemLimit = idx ? 0x4000 : 0x1000;
static u16 dVifComputeLength(uint cl, uint wl, u8 num, bool isFill) {
uint length = (num > 0) ? (num * 16) : 4096; // 0 = 256
u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10));
u8* endmem = VU.Mem + vuMemLimit;
uint length = (v.block.num > 0) ? (v.block.num * 16) : 4096; // 0 = 256
//wl = wl ? wl : 256; //0 is taken as 256 (KH2)
//if (wl == 256) isFill = true;
if (!isFill) {
uint skipSize = (cl - wl) * 16;
uint blocks = (v.block.num + (wl-1)) / wl; //Need to round up num's to calculate skip size correctly.
uint blocks = (num + (wl-1)) / wl; // Round num up to a whole number of WL blocks so the skip size is computed correctly.
length += (blocks-1) * skipSize;
}
if ((startmem + length) <= endmem) {
return startmem;
}
//Console.WriteLn("nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x", v.idx, vif.tag.addr, vif.tag.addr + (_vBlock.num * 16), _vBlock.num, wl, cl);
return NULL; // Fall Back to Interpreters which have wrap-around logic
return std::min(length, 0xFFFFu);
}
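A rough standalone sketch of the length pre-computation above, with an illustrative example (the function name and the example values are mine, not from the commit):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint16_t compute_length(unsigned cl, unsigned wl, uint8_t num, bool isFill) {
    unsigned length = (num > 0) ? (num * 16u) : 4096u; // num == 0 encodes 256 quadwords
    if (!isFill) {
        // Skipping write: after every WL quadwords written, (CL - WL) quadwords are skipped.
        unsigned skipSize = (cl - wl) * 16u;
        unsigned blocks   = (num + (wl - 1)) / wl;      // round num up to whole WL blocks
        length += (blocks - 1) * skipSize;
    }
    return static_cast<uint16_t>(std::min(length, 0xFFFFu));
}

int main() {
    // num = 8, CL = 4, WL = 2: 8*16 = 128 bytes of data plus 3 skips of 32 bytes = 224.
    std::printf("%u\n", static_cast<unsigned>(compute_length(4, 2, 8, false)));
    return 0;
}
```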
// [TODO] : Finish implementing support for VIF's growable recBlocks buffer. Currently
// it clears the buffer only.
static __fi void dVifRecLimit(int idx) {
if (nVif[idx].recWritePtr > (nVif[idx].recReserve->GetPtrEnd() - _256kb)) {
DevCon.WriteLn(L"nVif Recompiler Cache Reset! [%ls > %ls]",
pxsPtr(nVif[idx].recWritePtr), pxsPtr(nVif[idx].recReserve->GetPtrEnd())
);
nVif[idx].recReserve->Reset();
nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
}
}
_vifT static __ri bool dVifExecuteUnpack(const u8* data, bool isFill)
{
_vifT __fi nVifBlock* dVifCompile(nVifBlock& block, bool isFill) {
nVifStruct& v = nVif[idx];
VIFregisters& vifRegs = MTVU_VifXRegs;
if (nVifBlock* b = v.vifBlocks->find(&v.block)) {
if (u8* dest = dVifsetVUptr<idx>(vifRegs.cycle.cl, vifRegs.cycle.wl, isFill)) {
//DevCon.WriteLn("Running Recompiled Block!");
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
// Check the remaining cache space before compiling
if (v.recWritePtr > (v.recReserve->GetPtrEnd() - _256kb)) {
DevCon.WriteLn(L"nVif Recompiler Cache Reset! [%ls > %ls]",
pxsPtr(v.recWritePtr), pxsPtr(v.recReserve->GetPtrEnd())
);
recReset(idx);
}
else {
VIF_LOG("Running Interpreter Block");
_nVifUnpack(idx, data, vifRegs.mode, isFill);
}
return true;
}
return false;
// Compile the block now
xSetPtr(v.recWritePtr);
block.startPtr = (uptr)xGetAlignedCallTarget();
block.length = dVifComputeLength(block.cl, block.wl, block.num, isFill);
v.vifBlocks.add(block);
VifUnpackSSE_Dynarec(v, block).CompileRoutine();
Perf::vif.map((uptr)v.recWritePtr, xGetPtr() - v.recWritePtr, block.upkType /* FIXME ideally a key*/);
v.recWritePtr = xGetPtr();
return &block;
}
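dVifCompile checks the remaining reserve space before emitting any code; a generic sketch of that guard follows (structure and names assumed, not PCSX2's types):

```cpp
#include <cstddef>
#include <cstdint>

struct RecCache {
    uint8_t* start;    // beginning of the code reserve
    uint8_t* end;      // one past the end of the reserve
    uint8_t* writePtr; // next free byte
};

// Drop the whole cache when less than `headroom` bytes remain, so a single
// freshly compiled block can never run past the end of the reserve.
static void ensure_headroom(RecCache& cache, std::size_t headroom = 256 * 1024) {
    if (cache.writePtr > cache.end - headroom) {
        // The real code calls recReset(), which also clears the block hash table.
        cache.writePtr = cache.start;
    }
}
```

Doing this check before compilation (rather than after, as the removed dVifRecLimit did) means the block about to be generated always has at least the full headroom available.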
_vifT __fi void dVifUnpack(const u8* data, bool isFill) {
@@ -326,42 +307,56 @@ _vifT __fi void dVifUnpack(const u8* data, bool isFill) {
const u8 upkType = (vif.cmd & 0x1f) | (vif.usn << 5);
const int doMask = isFill? 1 : (vif.cmd & 0x10);
v.block.upkType = upkType;
v.block.num = (u8&)vifRegs.num;
v.block.mode = (u8&)vifRegs.mode;
v.block.cl = vifRegs.cycle.cl;
v.block.wl = vifRegs.cycle.wl ? vifRegs.cycle.wl : 256;
v.block.aligned = vif.start_aligned; //MTVU doesn't have a packet size!
nVifBlock block;
// Performance note: the initial code filled the u8/u16 fields of the struct
// directly. However, reading the data back as u32 in HashBucket::find causes
// various memory stalls, so it is much faster to build the values manually in
// u32 (i.e. in an x86 register).
//
// Warning: the order of the data in hash_key/key0/key1 depends on the nVifBlock
// struct layout.
u32 hash_key = (u32)(upkType & 0xFF) << 8 | (vifRegs.num & 0xFF);
u32 key1 = ((u32)vifRegs.cycle.wl << 24) | ((u32)vifRegs.cycle.cl << 16) | ((u32)(vif.start_aligned & 0xFF) << 8) | ((u32)vifRegs.mode & 0xFF);
if ((upkType & 0xf) != 9)
v.block.aligned &= 0x1;
key1 &= 0xFFFF01FF;
//DevCon.Warning("Alignment %d", v.block.aligned);
// Zero out the mask parameter if it's unused -- games leave random junk
// values here which cause false recblock cache misses.
v.block.mask = doMask ? vifRegs.mask : 0;
u32 key0 = doMask ? vifRegs.mask : 0;
//DevCon.WriteLn("nVif%d: Recompiled Block! [%d]", idx, nVif[idx].numBlocks++);
block.hash_key = hash_key;
block.key0 = key0;
block.key1 = key1;
//DevCon.WriteLn("nVif%d: Recompiled Block!", idx);
//DevCon.WriteLn(L"[num=% 3d][upkType=0x%02x][scl=%d][cl=%d][wl=%d][mode=%d][m=%d][mask=%s]",
// v.Block.num, v.Block.upkType, v.Block.scl, v.Block.cl, v.Block.wl, v.Block.mode,
// doMask >> 4, doMask ? wxsFormat( L"0x%08x", v.Block.mask ).c_str() : L"ignored"
// block.num, block.upkType, block.scl, block.cl, block.wl, block.mode,
// doMask >> 4, doMask ? wxsFormat( L"0x%08x", block.mask ).c_str() : L"ignored"
//);
if (dVifExecuteUnpack<idx>(data, isFill)) return;
// Search the cache before trying to compile the block
nVifBlock* b = v.vifBlocks.find(block);
if (unlikely(b == nullptr)) {
b = dVifCompile<idx>(block, isFill);
}
xSetPtr(v.recWritePtr);
v.block.startPtr = (uptr)xGetAlignedCallTarget();
v.vifBlocks->add(v.block);
VifUnpackSSE_Dynarec(v, v.block).CompileRoutine();
{ // Execute the block
const VURegs& VU = vuRegs[idx];
const uint vuMemLimit = idx ? 0x4000 : 0x1000;
Perf::vif.map((uptr)v.recWritePtr, xGetPtr() - v.recWritePtr, v.block.upkType /* FIXME ideally a key*/);
nVif[idx].recWritePtr = xGetPtr();
u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10));
u8* endmem = VU.Mem + vuMemLimit;
dVifRecLimit(idx);
// Run the block we just compiled. Various conditions may force us to still use
// the interpreter unpacker though, so a recursive call is the safest way here...
dVifExecuteUnpack<idx>(data, isFill);
if (likely((startmem + b->length) <= endmem)) {
// No wrap-around, so the fast dynarec path can be used
((nVifrecCall)b->startPtr)((uptr)startmem, (uptr)data);
} else {
VIF_LOG("Running Interpreter Block: nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x",
v.idx, vif.tag.addr, vif.tag.addr + (block.num * 16), block.num, block.wl, block.cl);
_nVifUnpack(idx, data, vifRegs.mode, isFill);
}
}
}
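The hash_key/key0/key1 values built above rely on the nVifBlock union aliasing its individual fields on a little-endian target. A stripped-down sketch of that aliasing (field names mirror the union; the startPtr/value pointer is omitted and the demo values are illustrative):

```cpp
#include <cstdint>

// Anonymous structs inside a union are a compiler extension (GCC/MSVC/Clang),
// the same one the original nVifBlock relies on.
union Block {
    struct {
        uint8_t  num, upkType;          // [00..01]
        uint16_t length;                // [02..03]
        uint32_t mask;                  // [04..07]
        uint8_t  mode, aligned, cl, wl; // [08..11]
    };
    struct {
        uint16_t hash_key;              // (upkType << 8) | num
        uint16_t pad;
        uint32_t key0;                  // mask
        uint32_t key1;                  // (wl << 24) | (cl << 16) | (aligned << 8) | mode
    };
};

int main() {
    Block b{};
    b.hash_key = 0x1234;
    // On a little-endian target (x86) the low byte lands in `num` and the high
    // byte in `upkType`, which is exactly what the manual key packing relies on.
    return (b.num == 0x34 && b.upkType == 0x12) ? 0 : 1;
}
```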
template void dVifUnpack<0>(const u8* data, bool isFill);


@@ -13,87 +13,122 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "x86emitter/x86_intrin.h"
#pragma once
// Create some typecast operators for SIMD operations. For some reason MSVC needs a
// handle/reference typecast to avoid error. GCC (and presumably other compilers)
// generate an error if the handle/ref is used. Honestly neither makes sense, since
// both typecasts should be perfectly valid >_<. --air
#ifdef _MSC_VER
# define cast_m128 __m128&
# define cast_m128i __m128i&
# define cast_m128d __m128d&
#else // defined(__GNUC__)
# define cast_m128 __m128
# define cast_m128i __m128i
# define cast_m128d __m128d
#endif
#include <array>
template< typename T >
struct SizeChain
{
int Size;
T* Chain;
};
// nVifBlock - Ordered for Hashing; the 'num' and 'upkType' fields are
// used as the hash bucket selector.
union nVifBlock {
// Warning: order depends on the newVifDynaRec code
struct {
u8 num; // [00] Num Field
u8 upkType; // [01] Unpack Type [usn1:mask1:upk*4]
u16 length; // [02] Extra: pre computed Length
u32 mask; // [04] Mask Field
u8 mode; // [08] Mode Field
u8 aligned; // [09] Packet Alignment
u8 cl; // [10] CL Field
u8 wl; // [11] WL Field
uptr startPtr; // [12] Start Ptr of RecGen Code
};
struct {
u16 hash_key;
u16 _pad0;
u32 key0;
u32 key1;
uptr value;
};
}; // 16 bytes
// 0x4000 would be enough, but 0x10000 allows
// * skipping the compare of the first double word (hash_key) during lookup
// * using a plain 16-bit move instead of an 'and' mask to compute the hashed key
#define hSize 0x10000 // [usn*1:mask*1:upk*4:num*8] hash...
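A small sketch of why the 0x10000 bucket count is convenient (helper name is mine): the 16-bit combination of upkType and num can index the table directly, with no modulo or masking.

```cpp
#include <cstdint>

// With 0x10000 buckets, the packed (upkType << 8) | num value is already a
// valid bucket index, so the lookup only needs a 16-bit load of hash_key.
static inline uint16_t bucket_index(uint8_t num, uint8_t upkType) {
    return static_cast<uint16_t>((upkType << 8) | num);
}
```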
// HashBucket is a container which uses a built-in hash function
// to perform quick searches.
// T is a struct data type (note: size must be in multiples of 16 bytes!)
// hSize determines the number of buckets HashBucket will use for sorting.
// to perform quick searches. It is designed around the nVifBlock structure
//
// The hash function takes the first bytes of the data and reduces them modulo
// hSize, so the most varied data should sit in the first bytes of the struct
// (hence why nVifBlock is ordered the way it is).
template<typename T, int hSize>
class HashBucket {
protected:
SizeChain<T> mBucket[hSize];
std::array<nVifBlock*, hSize> m_bucket;
public:
HashBucket() {
for (int i = 0; i < hSize; i++) {
mBucket[i].Chain = NULL;
mBucket[i].Size = 0;
m_bucket.fill(nullptr);
}
}
virtual ~HashBucket() throw() { clear(); }
int quickFind(u32 data) {
return mBucket[data % hSize].Size;
}
__fi T* find(T* dataPtr) {
u32 d = *((u32*)dataPtr);
const SizeChain<T>& bucket( mBucket[d % hSize] );
const __m128i* endpos = (__m128i*)&bucket.Chain[bucket.Size];
const __m128i data128( _mm_load_si128((__m128i*)dataPtr) );
~HashBucket() throw() { clear(); }
for( const __m128i* chainpos = (__m128i*)bucket.Chain; chainpos<endpos; chainpos+=sizeof(T) / 16u ) {
// Note: an SSE4/AVX optimization is possible here (however it requires the key
// to fit in the first 16 bytes, without the pointer)
// tmp = xor (data128, load(chainpos))
// ptest tmp tmp (zf will be set if tmp == 0, i.e equality)
__fi nVifBlock* find(const nVifBlock& dataPtr) {
nVifBlock* chainpos = m_bucket[dataPtr.hash_key];
// This inline SSE code is generally faster than using emitter code, since it inlines nicely. --air
int result = _mm_movemask_ps( (cast_m128) _mm_cmpeq_epi32( data128, _mm_load_si128(chainpos) ) );
if( (result&0x7) == 0x7 ) return (T*)chainpos;
while (true) {
if (chainpos->key0 == dataPtr.key0 && chainpos->key1 == dataPtr.key1)
return chainpos;
if (chainpos->startPtr == 0)
return nullptr;
chainpos++;
}
return NULL;
}
__fi void add(const T& dataPtr) {
u32 d = (u32&)dataPtr;
SizeChain<T>& bucket( mBucket[d % hSize] );
if( (bucket.Chain = (T*)pcsx2_aligned_realloc( bucket.Chain, sizeof(T)*(bucket.Size+1), 16, sizeof(T)*bucket.Size)) == NULL ) {
void add(const nVifBlock& dataPtr) {
u32 b = dataPtr.hash_key;
u32 size = bucket_size( dataPtr );
// Warning: the extra +1 accounts for the empty sentinel cell at the end of the chain
// Performance note: 64B align to reduce cache miss penalty in `find`
if( (m_bucket[b] = (nVifBlock*)pcsx2_aligned_realloc( m_bucket[b], sizeof(nVifBlock)*(size+2), 64, sizeof(nVifBlock)*(size+1) )) == NULL ) {
throw Exception::OutOfMemory(
wxsFormat(L"HashBucket Chain (bucket size=%d)", bucket.Size+1)
wxsFormat(L"HashBucket Chain (bucket size=%d)", size+2)
);
}
memcpy(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size );
// Replace the empty cell by the new block and create a new empty cell
memcpy(&m_bucket[b][size++], &dataPtr, sizeof(nVifBlock));
memset(&m_bucket[b][size], 0, sizeof(nVifBlock));
if( size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", b, size );
}
u32 bucket_size(const nVifBlock& dataPtr) {
nVifBlock* chainpos = m_bucket[dataPtr.hash_key];
u32 size = 0;
while (chainpos->startPtr != 0) {
size++;
chainpos++;
}
return size;
}
void clear() {
for (int i = 0; i < hSize; i++) {
safe_aligned_free(mBucket[i].Chain);
mBucket[i].Size = 0;
for (auto& bucket : m_bucket)
safe_aligned_free(bucket);
}
void reset() {
clear();
// Allocate an empty cell for all buckets
for (auto& bucket : m_bucket) {
if( (bucket = (nVifBlock*)_aligned_malloc( sizeof(nVifBlock), 64 )) == nullptr ) {
throw Exception::OutOfMemory(
wxsFormat(L"HashBucket Chain (bucket size=%d)", 1)
);
}
memset(bucket, 0, sizeof(nVifBlock));
}
}
};
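A simplified standalone sketch of the sentinel-terminated chains used by find/add above (types and names are illustrative, not the PCSX2 class): each bucket is a flat array whose last entry is zeroed, so no separate length has to be stored for lookups.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>

struct Entry { uint32_t key0, key1; uintptr_t value; };

// Walk the chain until the keys match or the zeroed sentinel entry is reached.
static Entry* chain_find(Entry* chain, uint32_t k0, uint32_t k1) {
    for (;; ++chain) {
        if (chain->key0 == k0 && chain->key1 == k1) return chain;
        if (chain->value == 0) return nullptr; // empty sentinel cell
    }
}

// `size` counts the real entries; the chain also holds one sentinel, so the new
// allocation needs size + 2 cells (the new entry plus a fresh sentinel).
static Entry* chain_add(Entry* chain, std::size_t size, const Entry& e) {
    Entry* grown = static_cast<Entry*>(std::realloc(chain, sizeof(Entry) * (size + 2)));
    if (!grown) std::abort();
    grown[size] = e;                                 // overwrite the old sentinel
    std::memset(&grown[size + 1], 0, sizeof(Entry)); // append a new sentinel
    return grown;
}
```

In this scheme an empty bucket starts life as a single zeroed Entry, which matches what reset() above allocates for every bucket.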


@@ -73,10 +73,6 @@ static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = {
nVifStruct::nVifStruct()
{
vifBlocks = NULL;
numBlocks = 0;
recReserveSizeMB = 8;
}
void reserveNewVif(int idx)