From 10b3d429fed8bb836481a9bbcc44b7776efd9fa9 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Tue, 13 Dec 2016 19:47:31 +0100
Subject: [PATCH] vif: new implementation of the hash bucket

The previous implementation saved both the chain pointer and the chain size.
Rationale: the size is useful to add a new element and to detect the end of
the chain. However, the Vif cache rarely misses, so 'add' is rarely called
and the end of a chain is rarely reached.

The new implementation adds a null cell at the end of the chain. As a
cell contains an x86 pointer, if that pointer is null you can conclude
that you have reached the end of the chain.

The 'add' function traverses the chain to get the current size. It is
a cold path; besides, the chain is often short (< 4).

The 'find' function only needs to check the startPtr bytes to detect the end
of the loop.

Note: SizeChain was replaced with a std::array
---
 pcsx2/x86/newVif.h            |   3 +-
 pcsx2/x86/newVif_Dynarec.cpp  |   3 +-
 pcsx2/x86/newVif_HashBucket.h | 103 +++++++++++++++++++++-------------
 3 files changed, 67 insertions(+), 42 deletions(-)
diff --git a/pcsx2/x86/newVif.h b/pcsx2/x86/newVif.h
index cd491fa941..1a1b1760fb 100644
--- a/pcsx2/x86/newVif.h
+++ b/pcsx2/x86/newVif.h
@@ -55,7 +55,6 @@ _vifT extern void  dVifUnpack  (const u8* data, bool isFill);
 #define xmmRow  xmm6
 #define xmmTemp xmm7
 
-#define _hSize 0x4000 // [usn*1:mask*1:upk*4:num*8] hash...
 struct nVifStruct {
 
 	__aligned16 nVifBlock   block;
@@ -72,7 +71,7 @@ struct nVifStruct {
 	RecompiledCodeReserve*	recReserve;
 	u8*						recWritePtr;		// current write pos into the reserve
 
-	HashBucket<_hSize>		vifBlocks;		// Vif Blocks
+	HashBucket				vifBlocks;		// Vif Blocks
 
 	nVifStruct();
 };
diff --git a/pcsx2/x86/newVif_Dynarec.cpp b/pcsx2/x86/newVif_Dynarec.cpp
index 4e06268a9c..65084d355d 100644
--- a/pcsx2/x86/newVif_Dynarec.cpp
+++ b/pcsx2/x86/newVif_Dynarec.cpp
@@ -23,7 +23,7 @@
 #include "Utilities/Perf.h"
 
 static void recReset(int idx) {
-	nVif[idx].vifBlocks.clear();
+	nVif[idx].vifBlocks.reset();
 
 	nVif[idx].recReserve->Reset();
 
@@ -341,6 +341,7 @@ _vifT __fi void dVifUnpack(const u8* data, bool isFill) {
 	v.block.cl      = vifRegs.cycle.cl;
 	v.block.wl      = vifRegs.cycle.wl ? vifRegs.cycle.wl : 256;
 	v.block.aligned = vif.start_aligned;  //MTVU doesn't have a packet size!
+	v.block.startPtr = 0; // Ease the detection of the end of the hash bucket
 
 	if ((upkType & 0xf) != 9)
 		v.block.aligned &= 0x1;
diff --git a/pcsx2/x86/newVif_HashBucket.h b/pcsx2/x86/newVif_HashBucket.h
index 34998e960d..e4f6fe79a0 100644
--- a/pcsx2/x86/newVif_HashBucket.h
+++ b/pcsx2/x86/newVif_HashBucket.h
@@ -13,9 +13,10 @@
  *  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "x86emitter/x86_intrin.h"
 #pragma once
 
+#include <array>
+
 // nVifBlock - Ordered for Hashing; the 'num' field and the lower 6 bits of upkType are
 //             used as the hash bucket selector.
 struct __aligned16 nVifBlock {
@@ -29,72 +30,96 @@ struct __aligned16 nVifBlock {
 	uptr startPtr; // [12] Start Ptr of RecGen Code
 }; // 16 bytes
 
-template< typename T >
-struct SizeChain
-{
-	int Size;
-	T*  Chain;
-};
+#define hSize 0x4000 // [usn*1:mask*1:upk*4:num*8] hash...
 
 // HashBucket is a container which uses a built-in hash function
-// to perform quick searches.
-// hSize determines the number of buckets HashBucket will use for sorting.
+// to perform quick searches. It is designed around the nVifBlock structure
+//
 // The hash function is determined by taking the first bytes of data and
 // performing a modulus the size of hSize. So the most diverse-data should
 // be in the first bytes of the struct. (hence why nVifBlock is specifically sorted)
-template<int hSize>
 class HashBucket {
 protected:
-	SizeChain<nVifBlock> mBucket[hSize];
+	std::array<nVifBlock*, hSize> m_bucket;
 
 public:
 	HashBucket() {
-		for (int i = 0; i < hSize; i++) {
-			mBucket[i].Chain	= NULL;
-			mBucket[i].Size		= 0;
-		}
+		m_bucket.fill(nullptr);
 	}
 
-	virtual ~HashBucket() throw() { clear(); }
-	int quickFind(u32 data) {
-		return mBucket[data % hSize].Size;
-	}
+	~HashBucket() throw() { clear(); }
 
 	__fi nVifBlock* find(nVifBlock* dataPtr) {
 		u32 d = *((u32*)dataPtr);
-		const SizeChain<nVifBlock>& bucket( mBucket[d % hSize] );
+		const __m128i* chainpos = (__m128i*)m_bucket[d % m_bucket.size()];
 
-		const __m128i* endpos = (__m128i*)&bucket.Chain[bucket.Size];
 		const __m128i data128( _mm_load_si128((__m128i*)dataPtr) );
 
-		for( const __m128i* chainpos = (__m128i*)bucket.Chain; chainpos<endpos; chainpos+=sizeof(nVifBlock) / 16u ) {
-			// Note SSE4/AVX optimization (However it requires to only have the key in the first 16B without the pointer)
-			// tmp = xor (data128, load(chainpos))
-			// ptest tmp tmp (zf will be set if tmp == 0, i.e equality)
-
+		int result;
+		do {
 			// This inline SSE code is generally faster than using emitter code, since it inlines nicely. --air
-			int result = _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( data128, _mm_load_si128(chainpos) ) ) );
-			if( (result&0x7) == 0x7 ) return (nVifBlock*)chainpos;
-		}
-		return NULL;
+			result = _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( data128, _mm_load_si128(chainpos) ) ) );
+			// startPtr doesn't match (aka not nullptr) hence 4th bit must be 0
+			if (result == 0x7) return (nVifBlock*)chainpos;
+
+			chainpos += sizeof(nVifBlock) / sizeof(__m128i);
+
+		} while(result < 0x8);
+
+		return nullptr;
 	}
 
-	__fi void add(const nVifBlock& dataPtr) {
+	void add(const nVifBlock& dataPtr) {
 		u32 d = (u32&)dataPtr;
-		SizeChain<nVifBlock>& bucket( mBucket[d % hSize] );
+		u32 b = d % m_bucket.size();
 
-		if( (bucket.Chain = (nVifBlock*)pcsx2_aligned_realloc( bucket.Chain, sizeof(nVifBlock)*(bucket.Size+1), 16, sizeof(nVifBlock)*bucket.Size)) == NULL ) {
+		u32 size = bucket_size( dataPtr );
+
+		// Warning there is an extra +1 due to the empty cell
+		if( (m_bucket[b] = (nVifBlock*)pcsx2_aligned_realloc( m_bucket[b], sizeof(nVifBlock)*(size+2), 16, sizeof(nVifBlock)*(size+1) )) == NULL ) {
 			throw Exception::OutOfMemory(
-				wxsFormat(L"HashBucket Chain (bucket size=%d)", bucket.Size+1)
+				wxsFormat(L"HashBucket Chain (bucket size=%d)", size+2)
 			);
 		}
-		memcpy(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(nVifBlock));
-		if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size );
+
+		// Replace the empty cell by the new block and create a new empty cell
+		memcpy(&m_bucket[b][size++], &dataPtr, sizeof(nVifBlock));
+		memset(&m_bucket[b][size], 0, sizeof(nVifBlock));
+
+		if( size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", b, size );
 	}
+
+	u32 bucket_size(const nVifBlock& dataPtr) {
+		u32 d = (u32&)dataPtr;
+		nVifBlock* chainpos = m_bucket[d % m_bucket.size()];
+
+		u32 size = 0;
+
+		while (chainpos->startPtr != 0) {
+			size++;
+			chainpos++;
+		}
+
+		return size;
+	}
+
 	void clear() {
-		for (int i = 0; i < hSize; i++) {
-			safe_aligned_free(mBucket[i].Chain);
-			mBucket[i].Size = 0;
+		for (auto& bucket : m_bucket)
+			safe_aligned_free(bucket);
+	}
+
+	void reset() {
+		clear();
+
+		// Allocate an empty cell for all buckets
+		for (auto& bucket : m_bucket) {
+			if( (bucket = (nVifBlock*)_aligned_malloc( sizeof(nVifBlock), 16 )) == nullptr ) {
+				throw Exception::OutOfMemory(
+						wxsFormat(L"HashBucket Chain (bucket size=%d)", 1)
+						);
+			}
+
+			memset(bucket, 0, sizeof(nVifBlock));
 		}
 	}
 };