mirror of https://github.com/PCSX2/pcsx2.git
vif: new implementation of the hash bucket
The previous implementation saved both the chain pointer and the chain size. Rationale: the size is useful to add a new element and to detect the end of the chain. The Vif cache rarely misses, so 'add' is barely called and the end of a chain is barely reached. The new implementation adds a null cell at the end of the chain. As a cell contains an x86 pointer, if it is null you can conclude that you have reached the end of the chain. The 'add' function will traverse the chain to get the current size. It is a cold path; besides, the chain is often short (< 4). The 'find' function only needs to check the startPtr bytes to detect the end of the loop. Note: SizeChain was replaced with a std::array.
This commit is contained in:
parent
c58b04979f
commit
10b3d429fe
|
@ -55,7 +55,6 @@ _vifT extern void dVifUnpack (const u8* data, bool isFill);
|
||||||
#define xmmRow xmm6
|
#define xmmRow xmm6
|
||||||
#define xmmTemp xmm7
|
#define xmmTemp xmm7
|
||||||
|
|
||||||
#define _hSize 0x4000 // [usn*1:mask*1:upk*4:num*8] hash...
|
|
||||||
struct nVifStruct {
|
struct nVifStruct {
|
||||||
|
|
||||||
__aligned16 nVifBlock block;
|
__aligned16 nVifBlock block;
|
||||||
|
@ -72,7 +71,7 @@ struct nVifStruct {
|
||||||
RecompiledCodeReserve* recReserve;
|
RecompiledCodeReserve* recReserve;
|
||||||
u8* recWritePtr; // current write pos into the reserve
|
u8* recWritePtr; // current write pos into the reserve
|
||||||
|
|
||||||
HashBucket<_hSize> vifBlocks; // Vif Blocks
|
HashBucket vifBlocks; // Vif Blocks
|
||||||
|
|
||||||
nVifStruct();
|
nVifStruct();
|
||||||
};
|
};
|
||||||
|
|
|
@ -23,7 +23,7 @@
|
||||||
#include "Utilities/Perf.h"
|
#include "Utilities/Perf.h"
|
||||||
|
|
||||||
static void recReset(int idx) {
|
static void recReset(int idx) {
|
||||||
nVif[idx].vifBlocks.clear();
|
nVif[idx].vifBlocks.reset();
|
||||||
|
|
||||||
nVif[idx].recReserve->Reset();
|
nVif[idx].recReserve->Reset();
|
||||||
|
|
||||||
|
@ -341,6 +341,7 @@ _vifT __fi void dVifUnpack(const u8* data, bool isFill) {
|
||||||
v.block.cl = vifRegs.cycle.cl;
|
v.block.cl = vifRegs.cycle.cl;
|
||||||
v.block.wl = vifRegs.cycle.wl ? vifRegs.cycle.wl : 256;
|
v.block.wl = vifRegs.cycle.wl ? vifRegs.cycle.wl : 256;
|
||||||
v.block.aligned = vif.start_aligned; //MTVU doesn't have a packet size!
|
v.block.aligned = vif.start_aligned; //MTVU doesn't have a packet size!
|
||||||
|
v.block.startPtr = 0; // Ease the detection of the end of the hash bucket
|
||||||
|
|
||||||
if ((upkType & 0xf) != 9)
|
if ((upkType & 0xf) != 9)
|
||||||
v.block.aligned &= 0x1;
|
v.block.aligned &= 0x1;
|
||||||
|
|
|
@ -13,9 +13,10 @@
|
||||||
* If not, see <http://www.gnu.org/licenses/>.
|
* If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "x86emitter/x86_intrin.h"
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <array>
|
||||||
|
|
||||||
// nVifBlock - Ordered for Hashing; the 'num' field and the lower 6 bits of upkType are
|
// nVifBlock - Ordered for Hashing; the 'num' field and the lower 6 bits of upkType are
|
||||||
// used as the hash bucket selector.
|
// used as the hash bucket selector.
|
||||||
struct __aligned16 nVifBlock {
|
struct __aligned16 nVifBlock {
|
||||||
|
@ -29,72 +30,96 @@ struct __aligned16 nVifBlock {
|
||||||
uptr startPtr; // [12] Start Ptr of RecGen Code
|
uptr startPtr; // [12] Start Ptr of RecGen Code
|
||||||
}; // 16 bytes
|
}; // 16 bytes
|
||||||
|
|
||||||
template< typename T >
|
#define hSize 0x4000 // [usn*1:mask*1:upk*4:num*8] hash...
|
||||||
struct SizeChain
|
|
||||||
{
|
|
||||||
int Size;
|
|
||||||
T* Chain;
|
|
||||||
};
|
|
||||||
|
|
||||||
// HashBucket is a container which uses a built-in hash function
|
// HashBucket is a container which uses a built-in hash function
|
||||||
// to perform quick searches.
|
// to perform quick searches. It is designed around the nVifBlock structure
|
||||||
// hSize determines the number of buckets HashBucket will use for sorting.
|
//
|
||||||
// The hash function is determined by taking the first bytes of data and
|
// The hash function is determined by taking the first bytes of data and
|
||||||
// performing a modulus the size of hSize. So the most diverse-data should
|
// performing a modulus the size of hSize. So the most diverse-data should
|
||||||
// be in the first bytes of the struct. (hence why nVifBlock is specifically sorted)
|
// be in the first bytes of the struct. (hence why nVifBlock is specifically sorted)
|
||||||
template<int hSize>
|
|
||||||
class HashBucket {
|
class HashBucket {
|
||||||
protected:
|
protected:
|
||||||
SizeChain<nVifBlock> mBucket[hSize];
|
std::array<nVifBlock*, hSize> m_bucket;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
HashBucket() {
|
HashBucket() {
|
||||||
for (int i = 0; i < hSize; i++) {
|
m_bucket.fill(nullptr);
|
||||||
mBucket[i].Chain = NULL;
|
|
||||||
mBucket[i].Size = 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual ~HashBucket() throw() { clear(); }
|
~HashBucket() throw() { clear(); }
|
||||||
int quickFind(u32 data) {
|
|
||||||
return mBucket[data % hSize].Size;
|
|
||||||
}
|
|
||||||
|
|
||||||
__fi nVifBlock* find(nVifBlock* dataPtr) {
|
__fi nVifBlock* find(nVifBlock* dataPtr) {
|
||||||
u32 d = *((u32*)dataPtr);
|
u32 d = *((u32*)dataPtr);
|
||||||
const SizeChain<nVifBlock>& bucket( mBucket[d % hSize] );
|
const __m128i* chainpos = (__m128i*)m_bucket[d % m_bucket.size()];
|
||||||
|
|
||||||
const __m128i* endpos = (__m128i*)&bucket.Chain[bucket.Size];
|
|
||||||
const __m128i data128( _mm_load_si128((__m128i*)dataPtr) );
|
const __m128i data128( _mm_load_si128((__m128i*)dataPtr) );
|
||||||
|
|
||||||
for( const __m128i* chainpos = (__m128i*)bucket.Chain; chainpos<endpos; chainpos+=sizeof(nVifBlock) / 16u ) {
|
int result;
|
||||||
// Note SSE4/AVX optimization (However it requires to only have the key in the first 16B without the pointer)
|
do {
|
||||||
// tmp = xor (data128, load(chainpos))
|
|
||||||
// ptest tmp tmp (zf will be set if tmp == 0, i.e equality)
|
|
||||||
|
|
||||||
// This inline SSE code is generally faster than using emitter code, since it inlines nicely. --air
|
// This inline SSE code is generally faster than using emitter code, since it inlines nicely. --air
|
||||||
int result = _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( data128, _mm_load_si128(chainpos) ) ) );
|
result = _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( data128, _mm_load_si128(chainpos) ) ) );
|
||||||
if( (result&0x7) == 0x7 ) return (nVifBlock*)chainpos;
|
// startPtr doesn't match (aka not nullptr) hence 4th bit must be 0
|
||||||
}
|
if (result == 0x7) return (nVifBlock*)chainpos;
|
||||||
return NULL;
|
|
||||||
|
chainpos += sizeof(nVifBlock) / sizeof(__m128i);
|
||||||
|
|
||||||
|
} while(result < 0x8);
|
||||||
|
|
||||||
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
__fi void add(const nVifBlock& dataPtr) {
|
void add(const nVifBlock& dataPtr) {
|
||||||
u32 d = (u32&)dataPtr;
|
u32 d = (u32&)dataPtr;
|
||||||
SizeChain<nVifBlock>& bucket( mBucket[d % hSize] );
|
u32 b = d % m_bucket.size();
|
||||||
|
|
||||||
if( (bucket.Chain = (nVifBlock*)pcsx2_aligned_realloc( bucket.Chain, sizeof(nVifBlock)*(bucket.Size+1), 16, sizeof(nVifBlock)*bucket.Size)) == NULL ) {
|
u32 size = bucket_size( dataPtr );
|
||||||
|
|
||||||
|
// Warning there is an extra +1 due to the empty cell
|
||||||
|
if( (m_bucket[b] = (nVifBlock*)pcsx2_aligned_realloc( m_bucket[b], sizeof(nVifBlock)*(size+2), 16, sizeof(nVifBlock)*(size+1) )) == NULL ) {
|
||||||
throw Exception::OutOfMemory(
|
throw Exception::OutOfMemory(
|
||||||
wxsFormat(L"HashBucket Chain (bucket size=%d)", bucket.Size+1)
|
wxsFormat(L"HashBucket Chain (bucket size=%d)", size+2)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
memcpy(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(nVifBlock));
|
|
||||||
if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size );
|
// Replace the empty cell by the new block and create a new empty cell
|
||||||
|
memcpy(&m_bucket[b][size++], &dataPtr, sizeof(nVifBlock));
|
||||||
|
memset(&m_bucket[b][size], 0, sizeof(nVifBlock));
|
||||||
|
|
||||||
|
if( size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", b, size );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Counts the blocks currently stored in the bucket that dataPtr hashes to.
// The bucket is selected from the first 32 bits of dataPtr, modulo the number
// of buckets. The terminating empty cell (startPtr == 0) is not counted.
// NOTE(review): the bucket head is dereferenced without a null check; the
// constructor and clear() leave heads null, so this appears to require that
// reset() has allocated the empty cell first -- confirm against callers.
u32 bucket_size(const nVifBlock& dataPtr) {
|
||||||
|
u32 d = (u32&)dataPtr;
|
||||||
|
nVifBlock* chainpos = m_bucket[d % m_bucket.size()];
|
||||||
|
|
||||||
|
u32 size = 0;
|
||||||
|
|
||||||
|
// Walk the chain until the empty cell, whose startPtr is 0, is reached.
while (chainpos->startPtr != 0) {
|
||||||
|
size++;
|
||||||
|
chainpos++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
void clear() {
|
void clear() {
|
||||||
for (int i = 0; i < hSize; i++) {
|
for (auto& bucket : m_bucket)
|
||||||
safe_aligned_free(mBucket[i].Chain);
|
safe_aligned_free(bucket);
|
||||||
mBucket[i].Size = 0;
|
}
|
||||||
|
|
||||||
|
void reset() {
|
||||||
|
clear();
|
||||||
|
|
||||||
|
// Allocate an empty cell for all buckets
|
||||||
|
for (auto& bucket : m_bucket) {
|
||||||
|
if( (bucket = (nVifBlock*)_aligned_malloc( sizeof(nVifBlock), 16 )) == nullptr ) {
|
||||||
|
throw Exception::OutOfMemory(
|
||||||
|
wxsFormat(L"HashBucket Chain (bucket size=%d)", 1)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
memset(bucket, 0, sizeof(nVifBlock));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue