vif: new implementation of the hash bucket

The previous implementation stored both the chain pointer and the chain size.
Rationale: the size is useful to add a new element and to detect the end of the chain.
However, the Vif cache rarely misses, so 'add' is rarely called and the end of a
chain is rarely reached.

The new implementation adds a null cell at the end of the chain. As a
cell contains an x86 pointer, if that pointer is null you can conclude that
you have reached the end of the chain.

The 'add' function will traverse the chain to get the current size. It is
a cold path; besides, the chain is often short (< 4).

The 'find' function only needs to check the startPtr bytes to detect the end
of the loop.

Note: SizeChain was replaced with a std::array
This commit is contained in:
Gregory Hainaut 2016-12-13 19:47:31 +01:00
parent c58b04979f
commit 10b3d429fe
3 changed files with 67 additions and 42 deletions

View File

@ -55,7 +55,6 @@ _vifT extern void dVifUnpack (const u8* data, bool isFill);
#define xmmRow xmm6
#define xmmTemp xmm7
#define _hSize 0x4000 // [usn*1:mask*1:upk*4:num*8] hash...
struct nVifStruct {
__aligned16 nVifBlock block;
@ -72,7 +71,7 @@ struct nVifStruct {
RecompiledCodeReserve* recReserve;
u8* recWritePtr; // current write pos into the reserve
HashBucket<_hSize> vifBlocks; // Vif Blocks
HashBucket vifBlocks; // Vif Blocks
nVifStruct();
};

View File

@ -23,7 +23,7 @@
#include "Utilities/Perf.h"
static void recReset(int idx) {
nVif[idx].vifBlocks.clear();
nVif[idx].vifBlocks.reset();
nVif[idx].recReserve->Reset();
@ -341,6 +341,7 @@ _vifT __fi void dVifUnpack(const u8* data, bool isFill) {
v.block.cl = vifRegs.cycle.cl;
v.block.wl = vifRegs.cycle.wl ? vifRegs.cycle.wl : 256;
v.block.aligned = vif.start_aligned; //MTVU doesn't have a packet size!
v.block.startPtr = 0; // Ease the detection of the end of the hash bucket
if ((upkType & 0xf) != 9)
v.block.aligned &= 0x1;

View File

@ -13,9 +13,10 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "x86emitter/x86_intrin.h"
#pragma once
#include <array>
// nVifBlock - Ordered for Hashing; the 'num' field and the lower 6 bits of upkType are
// used as the hash bucket selector.
struct __aligned16 nVifBlock {
@ -29,72 +30,96 @@ struct __aligned16 nVifBlock {
uptr startPtr; // [12] Start Ptr of RecGen Code
}; // 16 bytes
// Per-bucket record pairing a chain pointer with its element count.
// HashBucket::find uses &Chain[Size] as the end position of the chain.
template< typename T >
struct SizeChain
{
int Size; // number of T elements currently stored in Chain
T* Chain; // chain storage; NULL when the bucket is empty
};
#define hSize 0x4000 // [usn*1:mask*1:upk*4:num*8] hash...
// HashBucket is a container which uses a built-in hash function
// to perform quick searches.
// hSize determines the number of buckets HashBucket will use for sorting.
// to perform quick searches. It is designed around the nVifBlock structure
//
// The hash function takes the first 4 bytes of the data as a u32 and reduces
// them modulo hSize (the number of buckets). So the most diverse data should
// be in the first bytes of the struct (hence why nVifBlock is specifically ordered).
template<int hSize>
class HashBucket {
protected:
SizeChain<nVifBlock> mBucket[hSize];
std::array<nVifBlock*, hSize> m_bucket;
public:
// Starts with every bucket empty: a NULL chain of size 0 in the mBucket form,
// a null pointer per slot in the m_bucket form.
// Note that no terminating empty cell is allocated here; reset() is the
// function that allocates one per bucket.
HashBucket() {
for (int i = 0; i < hSize; i++) {
mBucket[i].Chain = NULL;
mBucket[i].Size = 0;
}
m_bucket.fill(nullptr);
}
// Releases every chain by delegating to clear().
virtual ~HashBucket() throw() { clear(); }
// Returns the Size recorded for the bucket selected by data % hSize,
// without looking at the chain contents.
int quickFind(u32 data) {
return mBucket[data % hSize].Size;
}
// Releases every chain by delegating to clear().
~HashBucket() throw() { clear(); }
// Searches the chain selected by the first 32 bits of *dataPtr for a cell whose
// first three 32-bit lanes equal those of *dataPtr.
//
// dataPtr: lookup key; it is read with _mm_load_si128, so it must be 16-byte aligned.
// Returns: pointer to the matching cell inside the chain, or NULL/nullptr when
//          the chain holds no match.
//
// The do/while form has no end pointer: it stops on the first cell whose
// startPtr lane equals the key's startPtr lane. dVifUnpack sets
// v.block.startPtr = 0 before the lookup, so that cell is the zeroed terminator.
// NOTE(review): the do/while form loads from the chain before any check, so it
// presumably requires reset() to have allocated the bucket first - confirm.
__fi nVifBlock* find(nVifBlock* dataPtr) {
// The bucket index is the first u32 of the key modulo the bucket count.
u32 d = *((u32*)dataPtr);
const SizeChain<nVifBlock>& bucket( mBucket[d % hSize] );
const __m128i* chainpos = (__m128i*)m_bucket[d % m_bucket.size()];
// One past the last stored cell, computed from the recorded chain size.
const __m128i* endpos = (__m128i*)&bucket.Chain[bucket.Size];
// The whole 16-byte key is loaded once and compared against each cell.
const __m128i data128( _mm_load_si128((__m128i*)dataPtr) );
for( const __m128i* chainpos = (__m128i*)bucket.Chain; chainpos<endpos; chainpos+=sizeof(nVifBlock) / 16u ) {
// Note SSE4/AVX optimization (However it requires to only have the key in the first 16B without the pointer)
// tmp = xor (data128, load(chainpos))
// ptest tmp tmp (zf will be set if tmp == 0, i.e equality)
int result;
do {
// This inline SSE code is generally faster than using emitter code, since it inlines nicely. --air
// result holds one bit per 32-bit lane; a set bit means that lane is equal.
int result = _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( data128, _mm_load_si128(chainpos) ) ) );
// Only the low three lanes form the key; the startPtr lane (bit 3) is masked off.
if( (result&0x7) == 0x7 ) return (nVifBlock*)chainpos;
}
return NULL;
result = _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( data128, _mm_load_si128(chainpos) ) ) );
// startPtr doesn't match (aka not nullptr) hence 4th bit must be 0
if (result == 0x7) return (nVifBlock*)chainpos;
// Advance by one nVifBlock, expressed in __m128i units.
chainpos += sizeof(nVifBlock) / sizeof(__m128i);
// Bit 3 set (result >= 0x8) means the startPtr lane matched the key's: end of chain.
} while(result < 0x8);
return nullptr;
}
// Appends a copy of dataPtr to the chain selected by its first 32 bits, growing
// the chain by one cell.
//
// dataPtr: block to store; it is copied with memcpy.
// Throws:  Exception::OutOfMemory when the reallocation returns NULL.
//
// In the m_bucket form the chain always ends with one zeroed cell, so the
// allocation is one cell larger than the number of stored blocks.
// NOTE(review): the pcsx2_aligned_realloc arguments look like
// (pointer, new byte size, alignment, old byte size) - confirm against its declaration.
__fi void add(const nVifBlock& dataPtr) {
void add(const nVifBlock& dataPtr) {
// The first u32 of the block is the hash input.
u32 d = (u32&)dataPtr;
SizeChain<nVifBlock>& bucket( mBucket[d % hSize] );
u32 b = d % m_bucket.size();
if( (bucket.Chain = (nVifBlock*)pcsx2_aligned_realloc( bucket.Chain, sizeof(nVifBlock)*(bucket.Size+1), 16, sizeof(nVifBlock)*bucket.Size)) == NULL ) {
// The count is obtained by walking the chain up to the empty cell.
u32 size = bucket_size( dataPtr );
// Warning there is an extra +1 due to the empty cell
if( (m_bucket[b] = (nVifBlock*)pcsx2_aligned_realloc( m_bucket[b], sizeof(nVifBlock)*(size+2), 16, sizeof(nVifBlock)*(size+1) )) == NULL ) {
throw Exception::OutOfMemory(
wxsFormat(L"HashBucket Chain (bucket size=%d)", bucket.Size+1)
wxsFormat(L"HashBucket Chain (bucket size=%d)", size+2)
);
}
memcpy(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(nVifBlock));
if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size );
// Replace the empty cell by the new block and create a new empty cell
memcpy(&m_bucket[b][size++], &dataPtr, sizeof(nVifBlock));
memset(&m_bucket[b][size], 0, sizeof(nVifBlock));
// After the increment, size is the number of stored blocks; more than 3 is reported.
if( size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", b, size );
}
// Counts the blocks stored in the chain that dataPtr hashes to, i.e. the number
// of cells that precede the terminating empty cell (the one with startPtr == 0).
//
// dataPtr: block whose first 32 bits select the bucket (d % m_bucket.size()).
// Returns: number of used cells, terminator excluded. A bucket that holds no
//          chain at all reports 0: the constructor fills m_bucket with nullptr
//          and only reset() allocates the terminator, so the pointer must be
//          checked before it is dereferenced.
u32 bucket_size(const nVifBlock& dataPtr) {
	u32 d = (u32&)dataPtr;
	const nVifBlock* chainpos = m_bucket[d % m_bucket.size()];

	// No chain allocated for this bucket: nothing to walk.
	if (chainpos == nullptr)
		return 0;

	// Linear walk up to the terminator; the length is not cached anywhere.
	u32 size = 0;
	for (; chainpos->startPtr != 0; ++chainpos)
		++size;
	return size;
}
// Frees the chain of every bucket with safe_aligned_free.
// No terminating empty cell is left behind; reset() calls clear() and then
// allocates a fresh one per bucket.
// NOTE(review): presumably safe_aligned_free also nulls the pointer it is
// given - confirm against its definition.
void clear() {
for (int i = 0; i < hSize; i++) {
safe_aligned_free(mBucket[i].Chain);
mBucket[i].Size = 0;
for (auto& bucket : m_bucket)
safe_aligned_free(bucket);
}
void reset() {
clear();
// Allocate an empty cell for all buckets
for (auto& bucket : m_bucket) {
if( (bucket = (nVifBlock*)_aligned_malloc( sizeof(nVifBlock), 16 )) == nullptr ) {
throw Exception::OutOfMemory(
wxsFormat(L"HashBucket Chain (bucket size=%d)", 1)
);
}
memset(bucket, 0, sizeof(nVifBlock));
}
}
};