Merge pull request #1706 from PCSX2/greg/vif-hash

Greg/vif hash
2016-12-21 22:30:27 +00:00 · 2016-12-21 22:30:27 +00:00 · 10eb88f6fe
parent 5a63a62454 58e4076620
commit 10eb88f6fe
4 changed files with 175 additions and 170 deletions
--- a/pcsx2/x86/newVif.h
+++ b/pcsx2/x86/newVif.h
@ -55,42 +55,21 @@ _vifT extern void  dVifUnpack  (const u8* data, bool isFill);
 #define xmmRow  xmm6
 #define xmmTemp xmm7
 // nVifBlock - Describes one recompiled unpack routine and is also the record that is
 //             stored in / looked up from the block hash table.
 //             Ordered for Hashing; the 'num' field and the lower 6 bits of upkType are
 //             used as the hash bucket selector.
 //             The bracketed numbers on each field are byte offsets. The fields before
 //             startPtr occupy 12 bytes, so the "16 bytes" total below holds only when
 //             uptr is 4 bytes wide -- TODO confirm for other pointer sizes.
 struct __aligned16 nVifBlock {
 	u8 num; // [00] Num Field
 	u8 upkType; // [01] Unpack Type [usn1:mask1:upk*4]
 	u8 mode; // [02] Mode Field
 	u8 aligned; // [03] Packet Alignment
 	u32 mask; // [04] Mask Field
 	u16 cl; // [08] CL Field
 	u16 wl; // [10] WL Field
 	uptr startPtr; // [12] Start Ptr of RecGen Code
 }; // 16 bytes
 #define _hSize 0x4000 // [usn*1:mask*1:upk*4:num*8] hash...
 #define _tParams nVifBlock, _hSize
 struct nVifStruct {
 	__aligned16 nVifBlock   block;
 	// Buffer for partial transfers (should always be first to ensure alignment)
 	// Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword)
 	__aligned16 u8			buffer[256*16];
 	u32						bSize;			// Size of 'buffer'
 	u32						bPtr;
 	uint					recReserveSizeMB;	// reserve size, in megabytes.
 	RecompiledCodeReserve*	recReserve;
 	u8*						recWritePtr;		// current write pos into the reserve
 	HashBucket<_tParams>*	vifBlocks;		// Vif Blocks
 	int						numBlocks;		// # of Blocks Recompiled
 	// VIF0 or VIF1 - provided for debugging helpfulness only, and is generally unused.
 	// (templates are used for most or all VIF indexing)
 	u32						idx;
 	RecompiledCodeReserve*	recReserve;
 	u8*						recWritePtr;		// current write pos into the reserve
 	HashBucket				vifBlocks;		// Vif Blocks
 	nVifStruct();
 };
--- a/pcsx2/x86/newVif_Dynarec.cpp
+++ b/pcsx2/x86/newVif_Dynarec.cpp
@ -22,34 +22,30 @@
 #include "MTVU.h"
 #include "Utilities/Perf.h"
 // Returns the recompiler state of VIF unit `idx` to empty: the block hash
 // table is reset, the code reserve is reset, and the write pointer is set to
 // whatever recReserve->GetPtr() reports (presumably the start of the reserve
 // -- TODO confirm).
 // recReserve is dereferenced unconditionally, so it must already exist.
 static void recReset(int idx) {
 	nVif[idx].vifBlocks.reset();
 	nVif[idx].recReserve->Reset();
 	nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
 }
 // Creates the RecompiledCodeReserve for VIF unit `idx` if it does not exist
 // yet, then calls Reserve() on it. The second Reserve() argument is selected
 // by `idx`: HostMemoryMap::VIF1rec for a non-zero idx, VIF0rec otherwise.
 void dVifReserve(int idx) {
 	if(!nVif[idx].recReserve)
 		nVif[idx].recReserve = new RecompiledCodeReserve(pxsFmt(L"VIF%u Unpack Recompiler Cache", idx), _8mb);
-	nVif[idx].recReserve->Reserve( nVif[idx].recReserveSizeMB * _1mb, idx ? HostMemoryMap::VIF1rec : HostMemoryMap::VIF0rec );
+	nVif[idx].recReserve->Reserve( 8 * _1mb, idx ? HostMemoryMap::VIF1rec : HostMemoryMap::VIF0rec );
 }
 void dVifReset(int idx) {
 	pxAssertDev(nVif[idx].recReserve, "Dynamic VIF recompiler reserve must be created prior to VIF use or reset!");
-	if(!nVif[idx].vifBlocks)
+	recReset(idx);
 		nVif[idx].vifBlocks = new HashBucket<_tParams>();
 	else
 		nVif[idx].vifBlocks->clear();
 	nVif[idx].recReserve->Reset();
 	nVif[idx].numBlocks   =  0;
 	nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
 	//memset(nVif[idx].recWritePtr, 0xcc, nVif[idx].recReserveSizeMB * _1mb);
 }
 // Tears down the recompiler state of VIF unit `idx`: zeroes the compiled
 // block counter, resets the code reserve when one has been created, and
 // releases the block table through safe_delete (defined elsewhere;
 // presumably deletes the object and nulls the pointer -- TODO confirm).
 void dVifClose(int idx) {
 	nVif[idx].numBlocks = 0;
 	if (nVif[idx].recReserve)
 		nVif[idx].recReserve->Reset();
 	safe_delete(nVif[idx].vifBlocks);
 }
 void dVifRelease(int idx) {
@ -61,7 +57,8 @@ VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlo
 	: v(vif_)
 	, vB(vifBlock_)
 {
-	isFill		= (vB.cl < vB.wl);
+	const int wl = vB.wl ? vB.wl : 256; //0 is taken as 256 (KH2)
 	isFill		= (vB.cl < wl);
 	usn			= (vB.upkType>>5) & 1;
 	doMask		= (vB.upkType>>4) & 1;
 	doMode		= vB.mode & 3;
@ -201,11 +198,13 @@ void VifUnpackSSE_Dynarec::ModUnpack( int upknum, bool PostOp )
 	}
 }
 void VifUnpackSSE_Dynarec::CompileRoutine() {
 	const int  wl		 = vB.wl ? vB.wl : 256; //0 is taken as 256 (KH2)
 	const int  upkNum	 = vB.upkType & 0xf;
 	const u8&  vift		 = nVifT[upkNum];
-	const int  cycleSize = isFill ? vB.cl : vB.wl;
+	const int  cycleSize = isFill ? vB.cl : wl;
-	const int  blockSize = isFill ? vB.wl : vB.cl;
+	const int  blockSize = isFill ? wl : vB.cl;
 	const int  skipSize	 = blockSize - cycleSize;
 	uint vNum	= vB.num ? vB.num : 256;
@ -261,60 +260,42 @@ void VifUnpackSSE_Dynarec::CompileRoutine() {
 	xRET();
 }
-_vifT static __fi u8* dVifsetVUptr(uint cl, uint wl, bool isFill) {
+static u16 dVifComputeLength(uint cl, uint wl, u8 num, bool isFill) {
-	nVifStruct&   v          = nVif[idx];
+	uint length   = (num > 0) ? (num * 16) : 4096; // 0 = 256
 	vifStruct&    vif        = MTVU_VifX;
 	const VURegs& VU         = vuRegs[idx];
 	const uint    vuMemLimit = idx ? 0x4000 : 0x1000;
 	u8*  startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10));
 	u8*  endmem   = VU.Mem + vuMemLimit;
 	uint length   = (v.block.num > 0) ? (v.block.num * 16) : 4096; // 0 = 256
 	//wl = wl ? wl : 256; //0 is taken as 256 (KH2)
 	//if (wl == 256) isFill = true;
 	if (!isFill) {
 		uint skipSize  = (cl - wl) * 16;
-		uint blocks    = (v.block.num + (wl-1)) / wl; //Need to round up num's to calculate skip size correctly.
+		uint blocks    = (num + (wl-1)) / wl; //Need to round up num's to calculate skip size correctly.
 		length += (blocks-1) * skipSize;
 	}
-	if ((startmem + length) <= endmem) {
+	return std::min(length, 0xFFFFu);
 		return startmem;
 	}
 	//Console.WriteLn("nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x", v.idx, vif.tag.addr, vif.tag.addr + (_vBlock.num * 16), _vBlock.num, wl, cl);
 	return NULL; // Fall Back to Interpreters which have wrap-around logic
 }
-// [TODO] :  Finish implementing support for VIF's growable recBlocks buffer.  Currently
+_vifT __fi nVifBlock* dVifCompile(nVifBlock& block, bool isFill) {
 //    it clears the buffer only.
 static __fi void dVifRecLimit(int idx) {
 	if (nVif[idx].recWritePtr > (nVif[idx].recReserve->GetPtrEnd() - _256kb)) {
 		DevCon.WriteLn(L"nVif Recompiler Cache Reset! [%ls > %ls]",
 			pxsPtr(nVif[idx].recWritePtr), pxsPtr(nVif[idx].recReserve->GetPtrEnd())
 		);
 		nVif[idx].recReserve->Reset();
 		nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
 	}
 }
 _vifT static __ri bool dVifExecuteUnpack(const u8* data, bool isFill)
 {
 	nVifStruct& v = nVif[idx];
 	VIFregisters& vifRegs = MTVU_VifXRegs;
-	if (nVifBlock* b = v.vifBlocks->find(&v.block)) {
+	// Check size before the compilation
-		if (u8* dest = dVifsetVUptr<idx>(vifRegs.cycle.cl, vifRegs.cycle.wl, isFill)) {
+	if (v.recWritePtr > (v.recReserve->GetPtrEnd() - _256kb)) {
-			//DevCon.WriteLn("Running Recompiled Block!");
+		DevCon.WriteLn(L"nVif Recompiler Cache Reset! [%ls > %ls]",
-			((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
+			pxsPtr(v.recWritePtr), pxsPtr(v.recReserve->GetPtrEnd())
 		);
 		recReset(idx);
 	}
-		else {
+
-			VIF_LOG("Running Interpreter Block");
+	// Compile the block now
-			_nVifUnpack(idx, data, vifRegs.mode, isFill);
+	xSetPtr(v.recWritePtr);
-		}
+
-		return true;
+	block.startPtr = (uptr)xGetAlignedCallTarget();
-	}
+	block.length = dVifComputeLength(block.cl, block.wl, block.num, isFill);
-	return false;
+	v.vifBlocks.add(block);
 	VifUnpackSSE_Dynarec(v, block).CompileRoutine();
 	Perf::vif.map((uptr)v.recWritePtr, xGetPtr() - v.recWritePtr, block.upkType /* FIXME ideally a key*/);
 	v.recWritePtr = xGetPtr();
 	return &block;
 }
 _vifT __fi void dVifUnpack(const u8* data, bool isFill) {
@ -326,42 +307,56 @@ _vifT __fi void dVifUnpack(const u8* data, bool isFill) {
 	const u8	upkType   = (vif.cmd & 0x1f) | (vif.usn << 5);
 	const int	doMask    = isFill? 1 : (vif.cmd & 0x10);
-	v.block.upkType = upkType;
+	nVifBlock   block;
 	v.block.num     = (u8&)vifRegs.num;
 	v.block.mode    = (u8&)vifRegs.mode;
 	v.block.cl      = vifRegs.cycle.cl;
 	v.block.wl      = vifRegs.cycle.wl ? vifRegs.cycle.wl : 256;
 	v.block.aligned = vif.start_aligned;  //MTVU doesn't have a packet size!
 	// Performance note: initial code was using u8/u16 field of the struct
 	// directly. However reading back the data (as u32) in HashBucket.find
 	// leads to various memory stalls. So it is way faster to manually build the data
 	// in u32 (aka x86 register).
 	//
 	// Warning the order of data in hash_key/key0/key1 depends on the nVifBlock struct
 	u32 hash_key = (u32)(upkType & 0xFF) << 8 | (vifRegs.num & 0xFF);
 	u32 key1 = ((u32)vifRegs.cycle.wl << 24) | ((u32)vifRegs.cycle.cl << 16) | ((u32)(vif.start_aligned & 0xFF) << 8) | ((u32)vifRegs.mode & 0xFF);
 	if ((upkType & 0xf) != 9)
-		v.block.aligned &= 0x1;
+		key1 &= 0xFFFF01FF;
 	//DevCon.Warning("Alignment %d", v.block.aligned);
 	// Zero out the mask parameter if it's unused -- games leave random junk
 	// values here which cause false recblock cache misses.
-	v.block.mask	= doMask ? vifRegs.mask : 0;
+	u32 key0 = doMask ? vifRegs.mask : 0;
-	//DevCon.WriteLn("nVif%d: Recompiled Block! [%d]", idx, nVif[idx].numBlocks++);
+	block.hash_key = hash_key;
 	block.key0 = key0;
 	block.key1 = key1;
 	//DevCon.WriteLn("nVif%d: Recompiled Block!", idx);
 	//DevCon.WriteLn(L"[num=% 3d][upkType=0x%02x][scl=%d][cl=%d][wl=%d][mode=%d][m=%d][mask=%s]",
-	//	v.Block.num, v.Block.upkType, v.Block.scl, v.Block.cl, v.Block.wl, v.Block.mode,
+	//	block.num, block.upkType, block.scl, block.cl, block.wl, block.mode,
-	//	doMask >> 4, doMask ? wxsFormat( L"0x%08x", v.Block.mask ).c_str() : L"ignored"
+	//	doMask >> 4, doMask ? wxsFormat( L"0x%08x", block.mask ).c_str() : L"ignored"
 	//);
-	if (dVifExecuteUnpack<idx>(data, isFill)) return;
+	// Search in cache before trying to compile the block
 	nVifBlock*  b = v.vifBlocks.find(block);
 	if (unlikely(b == nullptr)) {
 		b = dVifCompile<idx>(block, isFill);
 	}
-	xSetPtr(v.recWritePtr);
+	{ // Execute the block
-	v.block.startPtr = (uptr)xGetAlignedCallTarget();
+		const VURegs& VU         = vuRegs[idx];
-	v.vifBlocks->add(v.block);
+		const uint    vuMemLimit = idx ? 0x4000 : 0x1000;
 	VifUnpackSSE_Dynarec(v, v.block).CompileRoutine();
-	Perf::vif.map((uptr)v.recWritePtr, xGetPtr() - v.recWritePtr, v.block.upkType /* FIXME ideally a key*/);
+		u8*  startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10));
-	nVif[idx].recWritePtr = xGetPtr();
+		u8*  endmem   = VU.Mem + vuMemLimit;
-	dVifRecLimit(idx);
+		if (likely((startmem + b->length) <= endmem)) {
-
+			// No wrapping, you can run the fast dynarec
-	// Run the block we just compiled.  Various conditions may force us to still use
+			((nVifrecCall)b->startPtr)((uptr)startmem, (uptr)data);
-	// the interpreter unpacker though, so a recursive call is the safest way here...
+		} else {
-	dVifExecuteUnpack<idx>(data, isFill);
+			VIF_LOG("Running Interpreter Block: nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x",
 					v.idx, vif.tag.addr, vif.tag.addr + (block.num * 16), block.num, block.wl, block.cl);
 			_nVifUnpack(idx, data, vifRegs.mode, isFill);
 		}
 	}
 }
 template void dVifUnpack<0>(const u8* data, bool isFill);
--- a/pcsx2/x86/newVif_HashBucket.h
+++ b/pcsx2/x86/newVif_HashBucket.h
@ -13,87 +13,122 @@
 *  If not, see <http://www.gnu.org/licenses/>.
 */
 #include "x86emitter/x86_intrin.h"
 #pragma once
-// Create some typecast operators for SIMD operations.  For some reason MSVC needs a
+#include <array>
 // handle/reference typecast to avoid error.  GCC (and presumably other compilers)
 // generate an error if the handle/ref is used.  Honestly neither makes sense, since
 // both typecasts should be perfectly valid >_<.  --air
 #ifdef _MSC_VER
 #	define cast_m128		__m128&
 #	define cast_m128i		__m128i&
 #	define cast_m128d		__m128d&
 #else // defined(__GNUC__)
 #	define cast_m128		__m128
 #	define cast_m128i		__m128i
 #	define cast_m128d		__m128d
 #endif
-template< typename T >
+// nVifBlock - Ordered for Hashing; the 'num' and 'upkType' fields are
-struct SizeChain
+//             used as the hash bucket selector.
-{
+union nVifBlock {
-	int Size;
+	// Warning: order depends on the newVifDynaRec code
-	T*  Chain;
+	struct {
-};
+		u8 num;			// [00] Num Field
 		u8 upkType; 	// [01] Unpack Type [usn1:mask1:upk*4]
 		u16 length; 	// [02] Extra: pre computed Length
 		u32 mask;		// [04] Mask Field
 		u8 mode;		// [08] Mode Field
 		u8 aligned; 	// [09] Packet Alignment
 		u8 cl;			// [10] CL Field
 		u8 wl;			// [11] WL Field
 		uptr startPtr;	// [12] Start Ptr of RecGen Code
 	};
 	struct {
 		u16 hash_key;
 		u16 _pad0;
 		u32 key0;
 		u32 key1;
 		uptr value;
 	};
 }; // 16 bytes
 // 0x4000 buckets would be enough, but using 0x10000 allows
 // * skipping the comparison of the first double word during lookup
 // * using a 16-bit move instead of an 'and' mask to compute the hashed key
 #define hSize 0x10000 // [usn*1:mask*1:upk*4:num*8] hash...
 // HashBucket is a container which uses a built-in hash function
-// to perform quick searches.
+// to perform quick searches. It is designed around the nVifBlock structure
-// T is a struct data type (note: size must be in multiples of 16 bytes!)
+//
 // hSize determines the number of buckets HashBucket will use for sorting.
 // The hash function is determined by taking the first bytes of data and
 // performing a modulus the size of hSize. So the most diverse-data should
 // be in the first bytes of the struct. (hence why nVifBlock is specifically sorted)
 template<typename T, int hSize>
 class HashBucket {
 protected:
-	SizeChain<T> mBucket[hSize];
+	std::array<nVifBlock*, hSize> m_bucket;
 public:
 	HashBucket() {
-		for (int i = 0; i < hSize; i++) {
+		m_bucket.fill(nullptr);
 			mBucket[i].Chain	= NULL;
 			mBucket[i].Size		= 0;
 	}
 	}
 	virtual ~HashBucket() throw() { clear(); }
 	int quickFind(u32 data) {
 		return mBucket[data % hSize].Size;
 	}
 	__fi T* find(T* dataPtr) {
 		u32 d = *((u32*)dataPtr);
 		const SizeChain<T>& bucket( mBucket[d % hSize] );
-		const __m128i* endpos = (__m128i*)&bucket.Chain[bucket.Size];
+	~HashBucket() throw() { clear(); }
 		const __m128i data128( _mm_load_si128((__m128i*)dataPtr) );
-		for( const __m128i* chainpos = (__m128i*)bucket.Chain; chainpos<endpos; chainpos+=sizeof(T) / 16u ) {
+	__fi nVifBlock* find(const nVifBlock& dataPtr) {
-			// Note SSE4/AVX optimization (However it requires to only have the key in the first 16B without the pointer)
+		nVifBlock* chainpos = m_bucket[dataPtr.hash_key];
 			// tmp = xor (data128, load(chainpos))
 			// ptest tmp tmp (zf will be set if tmp == 0, i.e equality)
-			// This inline SSE code is generally faster than using emitter code, since it inlines nicely. --air
+		while (true) {
-			int result = _mm_movemask_ps( (cast_m128) _mm_cmpeq_epi32( data128, _mm_load_si128(chainpos) ) );
+			if (chainpos->key0 == dataPtr.key0 && chainpos->key1 == dataPtr.key1)
-			if( (result&0x7) == 0x7 ) return (T*)chainpos;
+				return chainpos;
 			if (chainpos->startPtr == 0)
 				return nullptr;
 			chainpos++;
 		}
 		return NULL;
 	}
 	__fi void add(const T& dataPtr) {
 		u32 d = (u32&)dataPtr;
 		SizeChain<T>& bucket( mBucket[d % hSize] );
-		if( (bucket.Chain = (T*)pcsx2_aligned_realloc( bucket.Chain, sizeof(T)*(bucket.Size+1), 16, sizeof(T)*bucket.Size)) == NULL ) {
+	void add(const nVifBlock& dataPtr) {
 		u32 b = dataPtr.hash_key;
 		u32 size = bucket_size( dataPtr );
 		// Warning there is an extra +1 due to the empty cell
 		// Performance note: 64B align to reduce cache miss penalty in `find`
 		if( (m_bucket[b] = (nVifBlock*)pcsx2_aligned_realloc( m_bucket[b], sizeof(nVifBlock)*(size+2), 64, sizeof(nVifBlock)*(size+1) )) == NULL ) {
 			throw Exception::OutOfMemory(
-				wxsFormat(L"HashBucket Chain (bucket size=%d)", bucket.Size+1)
+				wxsFormat(L"HashBucket Chain (bucket size=%d)", size+2)
 			);
 		}
-		memcpy(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
+
-		if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size );
+		// Replace the empty cell by the new block and create a new empty cell
 		memcpy(&m_bucket[b][size++], &dataPtr, sizeof(nVifBlock));
 		memset(&m_bucket[b][size], 0, sizeof(nVifBlock));
 		if( size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", b, size );
 	}
 	// Counts the blocks stored in the bucket selected by dataPtr.hash_key.
 	// The chain is walked entry by entry until the first one whose startPtr is
 	// 0; that entry is the end-of-chain marker and is not included in the count.
 	// The bucket pointer is dereferenced without a null check, so the bucket
 	// must already hold at least the marker cell (reset() allocates one).
 	u32 bucket_size(const nVifBlock& dataPtr) {
 		nVifBlock* chainpos = m_bucket[dataPtr.hash_key];
 		u32 size = 0;
 		while (chainpos->startPtr != 0) {
 			size++;
 			chainpos++;
 		}
 		return size;
 	}
 	void clear() {
-		for (int i = 0; i < hSize; i++) {
+		for (auto& bucket : m_bucket)
-			safe_aligned_free(mBucket[i].Chain);
+			safe_aligned_free(bucket);
-			mBucket[i].Size = 0;
+	}
 	// Empties the table and re-creates every bucket as a single zero-filled
 	// nVifBlock allocated with 64-byte alignment. A zeroed cell has
 	// startPtr == 0, which is the value the chain-walking code treats as the
 	// end of a bucket.
 	// Throws Exception::OutOfMemory if any bucket allocation fails.
 	void reset() {
 		clear();
 		// Allocate an empty cell for all buckets
 		for (auto& bucket : m_bucket) {
 			if( (bucket = (nVifBlock*)_aligned_malloc( sizeof(nVifBlock), 64 )) == nullptr ) {
 				throw Exception::OutOfMemory(
 						wxsFormat(L"HashBucket Chain (bucket size=%d)", 1)
 						);
 			}
 			memset(bucket, 0, sizeof(nVifBlock));
 		}
 	}
 };
--- a/pcsx2/x86/newVif_Unpack.cpp
+++ b/pcsx2/x86/newVif_Unpack.cpp
@ -73,10 +73,6 @@ static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = {
 // Default state: no block table allocated, no blocks recompiled, and a
 // recompiler reserve size of 8 (the field is expressed in megabytes).
 nVifStruct::nVifStruct()
 {
 	vifBlocks	=  NULL;
 	numBlocks	=  0;
 	recReserveSizeMB = 8;
 }
 void reserveNewVif(int idx)