Merge pull request #1706 from PCSX2/greg/vif-hash

Greg/vif hash
Committed by Jonathan Li on 2016-12-21 22:30:27 +00:00 (via GitHub)
commit 10eb88f6fe
4 changed files with 175 additions and 170 deletions


@@ -55,42 +55,21 @@ _vifT extern void dVifUnpack (const u8* data, bool isFill);
#define xmmRow xmm6
#define xmmTemp xmm7
// nVifBlock - Ordered for Hashing; the 'num' field and the lower 6 bits of upkType are
// used as the hash bucket selector.
struct __aligned16 nVifBlock {
u8 num; // [00] Num Field
u8 upkType; // [01] Unpack Type [usn1:mask1:upk*4]
u8 mode; // [02] Mode Field
u8 aligned; // [03] Packet Alignment
u32 mask; // [04] Mask Field
u16 cl; // [08] CL Field
u16 wl; // [10] WL Field
uptr startPtr; // [12] Start Ptr of RecGen Code
}; // 16 bytes
#define _hSize 0x4000 // [usn*1:mask*1:upk*4:num*8] hash...
#define _tParams nVifBlock, _hSize
struct nVifStruct {
__aligned16 nVifBlock block;
// Buffer for partial transfers (should always be first to ensure alignment)
// Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword)
__aligned16 u8 buffer[256*16];
u32 bSize; // Size of 'buffer'
u32 bPtr;
uint recReserveSizeMB; // reserve size, in megabytes.
RecompiledCodeReserve* recReserve;
u8* recWritePtr; // current write pos into the reserve
HashBucket<_tParams>* vifBlocks; // Vif Blocks
int numBlocks; // # of Blocks Recompiled
// VIF0 or VIF1 - provided for debugging helpfulness only, and is generally unused.
// (templates are used for most or all VIF indexing)
u32 idx;
RecompiledCodeReserve* recReserve;
u8* recWritePtr; // current write pos into the reserve
HashBucket vifBlocks; // Vif Blocks
nVifStruct();
};
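The 256*16 sizing of the partial-transfer buffer in nVifStruct follows directly from the VIF register limits; here is a minimal sketch of that arithmetic (the constant names are mine, not PCSX2's):

```cpp
#include <cstddef>

// vifRegs.Num is an 8-bit quadword count where 0 encodes 256, and a quadword
// is 16 bytes, so the worst-case partial transfer fits in 256 * 16 bytes.
constexpr std::size_t kMaxQuadwords     = 256;
constexpr std::size_t kQuadwordSize     = 16;
constexpr std::size_t kPartialBufferSize = kMaxQuadwords * kQuadwordSize;

static_assert(kPartialBufferSize == 4096, "partial-transfer buffer is 4 KB");
```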


@@ -22,34 +22,30 @@
#include "MTVU.h"
#include "Utilities/Perf.h"
static void recReset(int idx) {
nVif[idx].vifBlocks.reset();
nVif[idx].recReserve->Reset();
nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
}
void dVifReserve(int idx) {
if(!nVif[idx].recReserve)
nVif[idx].recReserve = new RecompiledCodeReserve(pxsFmt(L"VIF%u Unpack Recompiler Cache", idx), _8mb);
nVif[idx].recReserve->Reserve( nVif[idx].recReserveSizeMB * _1mb, idx ? HostMemoryMap::VIF1rec : HostMemoryMap::VIF0rec );
nVif[idx].recReserve->Reserve( 8 * _1mb, idx ? HostMemoryMap::VIF1rec : HostMemoryMap::VIF0rec );
}
void dVifReset(int idx) {
pxAssertDev(nVif[idx].recReserve, "Dynamic VIF recompiler reserve must be created prior to VIF use or reset!");
if(!nVif[idx].vifBlocks)
nVif[idx].vifBlocks = new HashBucket<_tParams>();
else
nVif[idx].vifBlocks->clear();
nVif[idx].recReserve->Reset();
nVif[idx].numBlocks = 0;
nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
//memset(nVif[idx].recWritePtr, 0xcc, nVif[idx].recReserveSizeMB * _1mb);
recReset(idx);
}
void dVifClose(int idx) {
nVif[idx].numBlocks = 0;
if (nVif[idx].recReserve)
nVif[idx].recReserve->Reset();
safe_delete(nVif[idx].vifBlocks);
}
void dVifRelease(int idx) {
@@ -61,7 +57,8 @@ VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlo
: v(vif_)
, vB(vifBlock_)
{
isFill = (vB.cl < vB.wl);
const int wl = vB.wl ? vB.wl : 256; // a WL of 0 is treated as 256 (KH2)
isFill = (vB.cl < wl);
usn = (vB.upkType>>5) & 1;
doMask = (vB.upkType>>4) & 1;
doMode = vB.mode & 3;
@@ -201,11 +198,13 @@ void VifUnpackSSE_Dynarec::ModUnpack( int upknum, bool PostOp )
}
}
void VifUnpackSSE_Dynarec::CompileRoutine() {
const int wl = vB.wl ? vB.wl : 256; // a WL of 0 is treated as 256 (KH2)
const int upkNum = vB.upkType & 0xf;
const u8& vift = nVifT[upkNum];
const int cycleSize = isFill ? vB.cl : vB.wl;
const int blockSize = isFill ? vB.wl : vB.cl;
const int cycleSize = isFill ? vB.cl : wl;
const int blockSize = isFill ? wl : vB.cl;
const int skipSize = blockSize - cycleSize;
uint vNum = vB.num ? vB.num : 256;
@@ -261,60 +260,42 @@ void VifUnpackSSE_Dynarec::CompileRoutine() {
xRET();
}
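For reference, the upkType byte that the constructor and CompileRoutine pull apart appears to pack three pieces of state; a small decoding sketch under that assumption (helper names are mine):

```cpp
#include <cstdint>

// Assumed layout, taken from the surrounding code: bit 5 = USN flag,
// bit 4 = mask flag, and the low 4 bits select the unpack routine (upkNum).
struct DecodedUnpack {
    bool    usn;
    bool    masked;
    uint8_t upkNum;
};

static inline DecodedUnpack decode_upkType(uint8_t upkType) {
    return { ((upkType >> 5) & 1) != 0,
             ((upkType >> 4) & 1) != 0,
             static_cast<uint8_t>(upkType & 0xf) };
}
```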
_vifT static __fi u8* dVifsetVUptr(uint cl, uint wl, bool isFill) {
nVifStruct& v = nVif[idx];
vifStruct& vif = MTVU_VifX;
const VURegs& VU = vuRegs[idx];
const uint vuMemLimit = idx ? 0x4000 : 0x1000;
static u16 dVifComputeLength(uint cl, uint wl, u8 num, bool isFill) {
uint length = (num > 0) ? (num * 16) : 4096; // 0 = 256
u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10));
u8* endmem = VU.Mem + vuMemLimit;
uint length = (v.block.num > 0) ? (v.block.num * 16) : 4096; // 0 = 256
//wl = wl ? wl : 256; //0 is taken as 256 (KH2)
//if (wl == 256) isFill = true;
if (!isFill) {
uint skipSize = (cl - wl) * 16;
uint blocks = (v.block.num + (wl-1)) / wl; //Need to round up num's to calculate skip size correctly.
uint blocks = (num + (wl-1)) / wl; // Round num up to a whole number of WL blocks so the skip size is computed correctly.
length += (blocks-1) * skipSize;
}
if ((startmem + length) <= endmem) {
return startmem;
}
//Console.WriteLn("nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x", v.idx, vif.tag.addr, vif.tag.addr + (_vBlock.num * 16), _vBlock.num, wl, cl);
return NULL; // Fall Back to Interpreters which have wrap-around logic
return std::min(length, 0xFFFFu);
}
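A rough standalone sketch of the length pre-computation above, with an illustrative example (the function name and the example values are mine, not from the commit):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint16_t compute_length(unsigned cl, unsigned wl, uint8_t num, bool isFill) {
    unsigned length = (num > 0) ? (num * 16u) : 4096u; // num == 0 encodes 256 quadwords
    if (!isFill) {
        // Skipping write: after every WL quadwords written, (CL - WL) quadwords are skipped.
        unsigned skipSize = (cl - wl) * 16u;
        unsigned blocks   = (num + (wl - 1)) / wl;      // round num up to whole WL blocks
        length += (blocks - 1) * skipSize;
    }
    return static_cast<uint16_t>(std::min(length, 0xFFFFu));
}

int main() {
    // num = 8, CL = 4, WL = 2: 8*16 = 128 bytes of data plus 3 skips of 32 bytes = 224.
    std::printf("%u\n", static_cast<unsigned>(compute_length(4, 2, 8, false)));
    return 0;
}
```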
// [TODO] : Finish implementing support for VIF's growable recBlocks buffer. Currently
// it clears the buffer only.
static __fi void dVifRecLimit(int idx) {
if (nVif[idx].recWritePtr > (nVif[idx].recReserve->GetPtrEnd() - _256kb)) {
DevCon.WriteLn(L"nVif Recompiler Cache Reset! [%ls > %ls]",
pxsPtr(nVif[idx].recWritePtr), pxsPtr(nVif[idx].recReserve->GetPtrEnd())
);
nVif[idx].recReserve->Reset();
nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
}
}
_vifT static __ri bool dVifExecuteUnpack(const u8* data, bool isFill)
{
_vifT __fi nVifBlock* dVifCompile(nVifBlock& block, bool isFill) {
nVifStruct& v = nVif[idx];
VIFregisters& vifRegs = MTVU_VifXRegs;
if (nVifBlock* b = v.vifBlocks->find(&v.block)) {
if (u8* dest = dVifsetVUptr<idx>(vifRegs.cycle.cl, vifRegs.cycle.wl, isFill)) {
//DevCon.WriteLn("Running Recompiled Block!");
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
// Check the remaining cache space before compiling
if (v.recWritePtr > (v.recReserve->GetPtrEnd() - _256kb)) {
DevCon.WriteLn(L"nVif Recompiler Cache Reset! [%ls > %ls]",
pxsPtr(v.recWritePtr), pxsPtr(v.recReserve->GetPtrEnd())
);
recReset(idx);
}
else {
VIF_LOG("Running Interpreter Block");
_nVifUnpack(idx, data, vifRegs.mode, isFill);
}
return true;
}
return false;
// Compile the block now
xSetPtr(v.recWritePtr);
block.startPtr = (uptr)xGetAlignedCallTarget();
block.length = dVifComputeLength(block.cl, block.wl, block.num, isFill);
v.vifBlocks.add(block);
VifUnpackSSE_Dynarec(v, block).CompileRoutine();
Perf::vif.map((uptr)v.recWritePtr, xGetPtr() - v.recWritePtr, block.upkType /* FIXME ideally a key*/);
v.recWritePtr = xGetPtr();
return &block;
}
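dVifCompile checks the remaining reserve space before emitting any code; a generic sketch of that guard follows (structure and names assumed, not PCSX2's types):

```cpp
#include <cstddef>
#include <cstdint>

struct RecCache {
    uint8_t* start;    // beginning of the code reserve
    uint8_t* end;      // one past the end of the reserve
    uint8_t* writePtr; // next free byte
};

// Drop the whole cache when less than `headroom` bytes remain, so a single
// freshly compiled block can never run past the end of the reserve.
static void ensure_headroom(RecCache& cache, std::size_t headroom = 256 * 1024) {
    if (cache.writePtr > cache.end - headroom) {
        // The real code calls recReset(), which also clears the block hash table.
        cache.writePtr = cache.start;
    }
}
```

Doing this check before compilation (rather than after, as the removed dVifRecLimit did) means the block about to be generated always has at least the full headroom available.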
_vifT __fi void dVifUnpack(const u8* data, bool isFill) {
@@ -326,42 +307,56 @@ _vifT __fi void dVifUnpack(const u8* data, bool isFill) {
const u8 upkType = (vif.cmd & 0x1f) | (vif.usn << 5);
const int doMask = isFill? 1 : (vif.cmd & 0x10);
v.block.upkType = upkType;
v.block.num = (u8&)vifRegs.num;
v.block.mode = (u8&)vifRegs.mode;
v.block.cl = vifRegs.cycle.cl;
v.block.wl = vifRegs.cycle.wl ? vifRegs.cycle.wl : 256;
v.block.aligned = vif.start_aligned; //MTVU doesn't have a packet size!
nVifBlock block;
// Performance note: the initial code filled the u8/u16 fields of the struct
// directly. However, reading the data back as u32 in HashBucket::find causes
// various memory stalls, so it is much faster to build the values manually in
// u32 (i.e. in an x86 register).
//
// Warning: the order of the data in hash_key/key0/key1 depends on the nVifBlock
// struct layout.
u32 hash_key = (u32)(upkType & 0xFF) << 8 | (vifRegs.num & 0xFF);
u32 key1 = ((u32)vifRegs.cycle.wl << 24) | ((u32)vifRegs.cycle.cl << 16) | ((u32)(vif.start_aligned & 0xFF) << 8) | ((u32)vifRegs.mode & 0xFF);
if ((upkType & 0xf) != 9)
v.block.aligned &= 0x1;
key1 &= 0xFFFF01FF;
//DevCon.Warning("Alignment %d", v.block.aligned);
// Zero out the mask parameter if it's unused -- games leave random junk
// values here which cause false recblock cache misses.
v.block.mask = doMask ? vifRegs.mask : 0;
u32 key0 = doMask ? vifRegs.mask : 0;
//DevCon.WriteLn("nVif%d: Recompiled Block! [%d]", idx, nVif[idx].numBlocks++);
block.hash_key = hash_key;
block.key0 = key0;
block.key1 = key1;
//DevCon.WriteLn("nVif%d: Recompiled Block!", idx);
//DevCon.WriteLn(L"[num=% 3d][upkType=0x%02x][scl=%d][cl=%d][wl=%d][mode=%d][m=%d][mask=%s]",
// v.Block.num, v.Block.upkType, v.Block.scl, v.Block.cl, v.Block.wl, v.Block.mode,
// doMask >> 4, doMask ? wxsFormat( L"0x%08x", v.Block.mask ).c_str() : L"ignored"
// block.num, block.upkType, block.scl, block.cl, block.wl, block.mode,
// doMask >> 4, doMask ? wxsFormat( L"0x%08x", block.mask ).c_str() : L"ignored"
//);
if (dVifExecuteUnpack<idx>(data, isFill)) return;
// Search the cache before trying to compile the block
nVifBlock* b = v.vifBlocks.find(block);
if (unlikely(b == nullptr)) {
b = dVifCompile<idx>(block, isFill);
}
xSetPtr(v.recWritePtr);
v.block.startPtr = (uptr)xGetAlignedCallTarget();
v.vifBlocks->add(v.block);
VifUnpackSSE_Dynarec(v, v.block).CompileRoutine();
{ // Execute the block
const VURegs& VU = vuRegs[idx];
const uint vuMemLimit = idx ? 0x4000 : 0x1000;
Perf::vif.map((uptr)v.recWritePtr, xGetPtr() - v.recWritePtr, v.block.upkType /* FIXME ideally a key*/);
nVif[idx].recWritePtr = xGetPtr();
u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10));
u8* endmem = VU.Mem + vuMemLimit;
dVifRecLimit(idx);
// Run the block we just compiled. Various conditions may force us to still use
// the interpreter unpacker though, so a recursive call is the safest way here...
dVifExecuteUnpack<idx>(data, isFill);
if (likely((startmem + b->length) <= endmem)) {
// No wrap-around, so the fast dynarec path can be used
((nVifrecCall)b->startPtr)((uptr)startmem, (uptr)data);
} else {
VIF_LOG("Running Interpreter Block: nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x",
v.idx, vif.tag.addr, vif.tag.addr + (block.num * 16), block.num, block.wl, block.cl);
_nVifUnpack(idx, data, vifRegs.mode, isFill);
}
}
}
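The hash_key/key0/key1 values built above rely on the nVifBlock union aliasing its individual fields on a little-endian target. A stripped-down sketch of that aliasing (field names mirror the union; the startPtr/value pointer is omitted and the demo values are illustrative):

```cpp
#include <cstdint>

// Anonymous structs inside a union are a compiler extension (GCC/MSVC/Clang),
// the same one the original nVifBlock relies on.
union Block {
    struct {
        uint8_t  num, upkType;          // [00..01]
        uint16_t length;                // [02..03]
        uint32_t mask;                  // [04..07]
        uint8_t  mode, aligned, cl, wl; // [08..11]
    };
    struct {
        uint16_t hash_key;              // (upkType << 8) | num
        uint16_t pad;
        uint32_t key0;                  // mask
        uint32_t key1;                  // (wl << 24) | (cl << 16) | (aligned << 8) | mode
    };
};

int main() {
    Block b{};
    b.hash_key = 0x1234;
    // On a little-endian target (x86) the low byte lands in `num` and the high
    // byte in `upkType`, which is exactly what the manual key packing relies on.
    return (b.num == 0x34 && b.upkType == 0x12) ? 0 : 1;
}
```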
template void dVifUnpack<0>(const u8* data, bool isFill);


@@ -13,87 +13,122 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "x86emitter/x86_intrin.h"
#pragma once
// Create some typecast operators for SIMD operations. For some reason MSVC needs a
// handle/reference typecast to avoid error. GCC (and presumably other compilers)
// generate an error if the handle/ref is used. Honestly neither makes sense, since
// both typecasts should be perfectly valid >_<. --air
#ifdef _MSC_VER
# define cast_m128 __m128&
# define cast_m128i __m128i&
# define cast_m128d __m128d&
#else // defined(__GNUC__)
# define cast_m128 __m128
# define cast_m128i __m128i
# define cast_m128d __m128d
#endif
#include <array>
template< typename T >
struct SizeChain
{
int Size;
T* Chain;
};
// nVifBlock - Ordered for Hashing; the 'num' and 'upkType' fields are
// used as the hash bucket selector.
union nVifBlock {
// Warning: order depends on the newVifDynaRec code
struct {
u8 num; // [00] Num Field
u8 upkType; // [01] Unpack Type [usn1:mask1:upk*4]
u16 length; // [02] Extra: pre computed Length
u32 mask; // [04] Mask Field
u8 mode; // [08] Mode Field
u8 aligned; // [09] Packet Alignment
u8 cl; // [10] CL Field
u8 wl; // [11] WL Field
uptr startPtr; // [12] Start Ptr of RecGen Code
};
struct {
u16 hash_key;
u16 _pad0;
u32 key0;
u32 key1;
uptr value;
};
}; // 16 bytes
// 0x4000 would be enough, but 0x10000 allows
// * skipping the compare of the first double word (hash_key) during lookup
// * using a plain 16-bit move instead of an 'and' mask to compute the hashed key
#define hSize 0x10000 // [usn*1:mask*1:upk*4:num*8] hash...
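A small sketch of why the 0x10000 bucket count is convenient (helper name is mine): the 16-bit combination of upkType and num can index the table directly, with no modulo or masking.

```cpp
#include <cstdint>

// With 0x10000 buckets, the packed (upkType << 8) | num value is already a
// valid bucket index, so the lookup only needs a 16-bit load of hash_key.
static inline uint16_t bucket_index(uint8_t num, uint8_t upkType) {
    return static_cast<uint16_t>((upkType << 8) | num);
}
```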
// HashBucket is a container which uses a built-in hash function
// to perform quick searches.
// T is a struct data type (note: size must be in multiples of 16 bytes!)
// hSize determines the number of buckets HashBucket will use for sorting.
// to perform quick searches. It is designed around the nVifBlock structure
//
// The hash function takes the first bytes of the data and reduces them modulo
// hSize, so the most varied data should sit in the first bytes of the struct
// (hence why nVifBlock is ordered the way it is).
template<typename T, int hSize>
class HashBucket {
protected:
SizeChain<T> mBucket[hSize];
std::array<nVifBlock*, hSize> m_bucket;
public:
HashBucket() {
for (int i = 0; i < hSize; i++) {
mBucket[i].Chain = NULL;
mBucket[i].Size = 0;
m_bucket.fill(nullptr);
}
}
virtual ~HashBucket() throw() { clear(); }
int quickFind(u32 data) {
return mBucket[data % hSize].Size;
}
__fi T* find(T* dataPtr) {
u32 d = *((u32*)dataPtr);
const SizeChain<T>& bucket( mBucket[d % hSize] );
const __m128i* endpos = (__m128i*)&bucket.Chain[bucket.Size];
const __m128i data128( _mm_load_si128((__m128i*)dataPtr) );
~HashBucket() throw() { clear(); }
for( const __m128i* chainpos = (__m128i*)bucket.Chain; chainpos<endpos; chainpos+=sizeof(T) / 16u ) {
// Note: an SSE4/AVX optimization is possible here (however it requires the key
// to fit in the first 16 bytes, without the pointer)
// tmp = xor (data128, load(chainpos))
// ptest tmp tmp (zf will be set if tmp == 0, i.e equality)
__fi nVifBlock* find(const nVifBlock& dataPtr) {
nVifBlock* chainpos = m_bucket[dataPtr.hash_key];
// This inline SSE code is generally faster than using emitter code, since it inlines nicely. --air
int result = _mm_movemask_ps( (cast_m128) _mm_cmpeq_epi32( data128, _mm_load_si128(chainpos) ) );
if( (result&0x7) == 0x7 ) return (T*)chainpos;
while (true) {
if (chainpos->key0 == dataPtr.key0 && chainpos->key1 == dataPtr.key1)
return chainpos;
if (chainpos->startPtr == 0)
return nullptr;
chainpos++;
}
return NULL;
}
__fi void add(const T& dataPtr) {
u32 d = (u32&)dataPtr;
SizeChain<T>& bucket( mBucket[d % hSize] );
if( (bucket.Chain = (T*)pcsx2_aligned_realloc( bucket.Chain, sizeof(T)*(bucket.Size+1), 16, sizeof(T)*bucket.Size)) == NULL ) {
void add(const nVifBlock& dataPtr) {
u32 b = dataPtr.hash_key;
u32 size = bucket_size( dataPtr );
// Warning: the extra +1 accounts for the empty sentinel cell at the end of the chain
// Performance note: 64B align to reduce cache miss penalty in `find`
if( (m_bucket[b] = (nVifBlock*)pcsx2_aligned_realloc( m_bucket[b], sizeof(nVifBlock)*(size+2), 64, sizeof(nVifBlock)*(size+1) )) == NULL ) {
throw Exception::OutOfMemory(
wxsFormat(L"HashBucket Chain (bucket size=%d)", bucket.Size+1)
wxsFormat(L"HashBucket Chain (bucket size=%d)", size+2)
);
}
memcpy(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size );
// Replace the empty cell by the new block and create a new empty cell
memcpy(&m_bucket[b][size++], &dataPtr, sizeof(nVifBlock));
memset(&m_bucket[b][size], 0, sizeof(nVifBlock));
if( size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", b, size );
}
u32 bucket_size(const nVifBlock& dataPtr) {
nVifBlock* chainpos = m_bucket[dataPtr.hash_key];
u32 size = 0;
while (chainpos->startPtr != 0) {
size++;
chainpos++;
}
return size;
}
void clear() {
for (int i = 0; i < hSize; i++) {
safe_aligned_free(mBucket[i].Chain);
mBucket[i].Size = 0;
for (auto& bucket : m_bucket)
safe_aligned_free(bucket);
}
void reset() {
clear();
// Allocate an empty cell for all buckets
for (auto& bucket : m_bucket) {
if( (bucket = (nVifBlock*)_aligned_malloc( sizeof(nVifBlock), 64 )) == nullptr ) {
throw Exception::OutOfMemory(
wxsFormat(L"HashBucket Chain (bucket size=%d)", 1)
);
}
memset(bucket, 0, sizeof(nVifBlock));
}
}
};
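A simplified standalone sketch of the sentinel-terminated chains used by find/add above (types and names are illustrative, not the PCSX2 class): each bucket is a flat array whose last entry is zeroed, so no separate length has to be stored for lookups.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>

struct Entry { uint32_t key0, key1; uintptr_t value; };

// Walk the chain until the keys match or the zeroed sentinel entry is reached.
static Entry* chain_find(Entry* chain, uint32_t k0, uint32_t k1) {
    for (;; ++chain) {
        if (chain->key0 == k0 && chain->key1 == k1) return chain;
        if (chain->value == 0) return nullptr; // empty sentinel cell
    }
}

// `size` counts the real entries; the chain also holds one sentinel, so the new
// allocation needs size + 2 cells (the new entry plus a fresh sentinel).
static Entry* chain_add(Entry* chain, std::size_t size, const Entry& e) {
    Entry* grown = static_cast<Entry*>(std::realloc(chain, sizeof(Entry) * (size + 2)));
    if (!grown) std::abort();
    grown[size] = e;                                 // overwrite the old sentinel
    std::memset(&grown[size + 1], 0, sizeof(Entry)); // append a new sentinel
    return grown;
}
```

In this scheme an empty bucket starts life as a single zeroed Entry, which matches what reset() above allocates for every bucket.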


@@ -73,10 +73,6 @@ static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = {
nVifStruct::nVifStruct()
{
vifBlocks = NULL;
numBlocks = 0;
recReserveSizeMB = 8;
}
void reserveNewVif(int idx)