diff --git a/pcsx2/x86/newVif.h b/pcsx2/x86/newVif.h index bdef4c468c..83b6beeb2c 100644 --- a/pcsx2/x86/newVif.h +++ b/pcsx2/x86/newVif.h @@ -55,42 +55,21 @@ _vifT extern void dVifUnpack (const u8* data, bool isFill); #define xmmRow xmm6 #define xmmTemp xmm7 -// nVifBlock - Ordered for Hashing; the 'num' field and the lower 6 bits of upkType are -// used as the hash bucket selector. -struct __aligned16 nVifBlock { - u8 num; // [00] Num Field - u8 upkType; // [01] Unpack Type [usn1:mask1:upk*4] - u8 mode; // [02] Mode Field - u8 aligned; // [03] Packet Alignment - u32 mask; // [04] Mask Field - u16 cl; // [08] CL Field - u16 wl; // [10] WL Field - uptr startPtr; // [12] Start Ptr of RecGen Code -}; // 16 bytes - -#define _hSize 0x4000 // [usn*1:mask*1:upk*4:num*8] hash... -#define _tParams nVifBlock, _hSize struct nVifStruct { - - __aligned16 nVifBlock block; - // Buffer for partial transfers (should always be first to ensure alignment) // Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword) __aligned16 u8 buffer[256*16]; u32 bSize; // Size of 'buffer' - u32 bPtr; - - uint recReserveSizeMB; // reserve size, in megabytes. - RecompiledCodeReserve* recReserve; - u8* recWritePtr; // current write pos into the reserve - - HashBucket<_tParams>* vifBlocks; // Vif Blocks - int numBlocks; // # of Blocks Recompiled // VIF0 or VIF1 - provided for debugging helpfulness only, and is generally unused. 
// (templates are used for most or all VIF indexing) u32 idx; + RecompiledCodeReserve* recReserve; + u8* recWritePtr; // current write pos into the reserve + + HashBucket vifBlocks; // Vif Blocks + nVifStruct(); }; diff --git a/pcsx2/x86/newVif_Dynarec.cpp b/pcsx2/x86/newVif_Dynarec.cpp index b9160f8ab3..d5757f7001 100644 --- a/pcsx2/x86/newVif_Dynarec.cpp +++ b/pcsx2/x86/newVif_Dynarec.cpp @@ -22,34 +22,30 @@ #include "MTVU.h" #include "Utilities/Perf.h" +static void recReset(int idx) { + nVif[idx].vifBlocks.reset(); + + nVif[idx].recReserve->Reset(); + + nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr(); +} + void dVifReserve(int idx) { if(!nVif[idx].recReserve) nVif[idx].recReserve = new RecompiledCodeReserve(pxsFmt(L"VIF%u Unpack Recompiler Cache", idx), _8mb); - nVif[idx].recReserve->Reserve( nVif[idx].recReserveSizeMB * _1mb, idx ? HostMemoryMap::VIF1rec : HostMemoryMap::VIF0rec ); + nVif[idx].recReserve->Reserve( 8 * _1mb, idx ? HostMemoryMap::VIF1rec : HostMemoryMap::VIF0rec ); } void dVifReset(int idx) { pxAssertDev(nVif[idx].recReserve, "Dynamic VIF recompiler reserve must be created prior to VIF use or reset!"); - if(!nVif[idx].vifBlocks) - nVif[idx].vifBlocks = new HashBucket<_tParams>(); - else - nVif[idx].vifBlocks->clear(); - - nVif[idx].recReserve->Reset(); - - nVif[idx].numBlocks = 0; - nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr(); - //memset(nVif[idx].recWritePtr, 0xcc, nVif[idx].recReserveSizeMB * _1mb); + recReset(idx); } void dVifClose(int idx) { - nVif[idx].numBlocks = 0; if (nVif[idx].recReserve) nVif[idx].recReserve->Reset(); - - safe_delete(nVif[idx].vifBlocks); } void dVifRelease(int idx) { @@ -61,7 +57,8 @@ VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlo : v(vif_) , vB(vifBlock_) { - isFill = (vB.cl < vB.wl); + const int wl = vB.wl ? 
vB.wl : 256; //0 is taken as 256 (KH2) + isFill = (vB.cl < wl); usn = (vB.upkType>>5) & 1; doMask = (vB.upkType>>4) & 1; doMode = vB.mode & 3; @@ -201,11 +198,13 @@ void VifUnpackSSE_Dynarec::ModUnpack( int upknum, bool PostOp ) } } + void VifUnpackSSE_Dynarec::CompileRoutine() { + const int wl = vB.wl ? vB.wl : 256; //0 is taken as 256 (KH2) const int upkNum = vB.upkType & 0xf; const u8& vift = nVifT[upkNum]; - const int cycleSize = isFill ? vB.cl : vB.wl; - const int blockSize = isFill ? vB.wl : vB.cl; + const int cycleSize = isFill ? vB.cl : wl; + const int blockSize = isFill ? wl : vB.cl; const int skipSize = blockSize - cycleSize; uint vNum = vB.num ? vB.num : 256; @@ -261,60 +260,42 @@ void VifUnpackSSE_Dynarec::CompileRoutine() { xRET(); } -_vifT static __fi u8* dVifsetVUptr(uint cl, uint wl, bool isFill) { - nVifStruct& v = nVif[idx]; - vifStruct& vif = MTVU_VifX; - const VURegs& VU = vuRegs[idx]; - const uint vuMemLimit = idx ? 0x4000 : 0x1000; +static u16 dVifComputeLength(uint cl, uint wl, u8 num, bool isFill) { + uint length = (num > 0) ? (num * 16) : 4096; // 0 = 256 - u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10)); - u8* endmem = VU.Mem + vuMemLimit; - uint length = (v.block.num > 0) ? (v.block.num * 16) : 4096; // 0 = 256 - - //wl = wl ? wl : 256; //0 is taken as 256 (KH2) - //if (wl == 256) isFill = true; if (!isFill) { uint skipSize = (cl - wl) * 16; - uint blocks = (v.block.num + (wl-1)) / wl; //Need to round up num's to calculate skip size correctly. + uint blocks = (num + (wl-1)) / wl; //Need to round up num's to calculate skip size correctly. length += (blocks-1) * skipSize; } - if ((startmem + length) <= endmem) { - return startmem; - } - //Console.WriteLn("nVif%x - VU Mem Ptr Overflow; falling back to interpreter. 
Start = %x End = %x num = %x, wl = %x, cl = %x", v.idx, vif.tag.addr, vif.tag.addr + (_vBlock.num * 16), _vBlock.num, wl, cl); - return NULL; // Fall Back to Interpreters which have wrap-around logic + return std::min(length, 0xFFFFu); } -// [TODO] : Finish implementing support for VIF's growable recBlocks buffer. Currently -// it clears the buffer only. -static __fi void dVifRecLimit(int idx) { - if (nVif[idx].recWritePtr > (nVif[idx].recReserve->GetPtrEnd() - _256kb)) { +_vifT __fi nVifBlock* dVifCompile(nVifBlock& block, bool isFill) { + nVifStruct& v = nVif[idx]; + + // Check size before the compilation + if (v.recWritePtr > (v.recReserve->GetPtrEnd() - _256kb)) { DevCon.WriteLn(L"nVif Recompiler Cache Reset! [%ls > %ls]", - pxsPtr(nVif[idx].recWritePtr), pxsPtr(nVif[idx].recReserve->GetPtrEnd()) + pxsPtr(v.recWritePtr), pxsPtr(v.recReserve->GetPtrEnd()) ); - nVif[idx].recReserve->Reset(); - nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr(); + recReset(idx); } -} -_vifT static __ri bool dVifExecuteUnpack(const u8* data, bool isFill) -{ - nVifStruct& v = nVif[idx]; - VIFregisters& vifRegs = MTVU_VifXRegs; + // Compile the block now + xSetPtr(v.recWritePtr); - if (nVifBlock* b = v.vifBlocks->find(&v.block)) { - if (u8* dest = dVifsetVUptr(vifRegs.cycle.cl, vifRegs.cycle.wl, isFill)) { - //DevCon.WriteLn("Running Recompiled Block!"); - ((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data); - } - else { - VIF_LOG("Running Interpreter Block"); - _nVifUnpack(idx, data, vifRegs.mode, isFill); - } - return true; - } - return false; + block.startPtr = (uptr)xGetAlignedCallTarget(); + block.length = dVifComputeLength(block.cl, block.wl, block.num, isFill); + v.vifBlocks.add(block); + + VifUnpackSSE_Dynarec(v, block).CompileRoutine(); + + Perf::vif.map((uptr)v.recWritePtr, xGetPtr() - v.recWritePtr, block.upkType /* FIXME ideally a key*/); + v.recWritePtr = xGetPtr(); + + return &block; } _vifT __fi void dVifUnpack(const u8* data, bool isFill) { @@ -326,42 +307,56 @@ _vifT 
__fi void dVifUnpack(const u8* data, bool isFill) { const u8 upkType = (vif.cmd & 0x1f) | (vif.usn << 5); const int doMask = isFill? 1 : (vif.cmd & 0x10); - v.block.upkType = upkType; - v.block.num = (u8&)vifRegs.num; - v.block.mode = (u8&)vifRegs.mode; - v.block.cl = vifRegs.cycle.cl; - v.block.wl = vifRegs.cycle.wl ? vifRegs.cycle.wl : 256; - v.block.aligned = vif.start_aligned; //MTVU doesn't have a packet size! + nVifBlock block; + // Performance note: initial code was using u8/u16 field of the struct + // directly. However reading back the data (as u32) in HashBucket.find + // leads to various memory stalls. So it is way faster to manually build the data + // in u32 (aka x86 register). + // + // Warning the order of data in hash_key/key0/key1 depends on the nVifBlock struct + u32 hash_key = (u32)(upkType & 0xFF) << 8 | (vifRegs.num & 0xFF); + + u32 key1 = ((u32)vifRegs.cycle.wl << 24) | ((u32)vifRegs.cycle.cl << 16) | ((u32)(vif.start_aligned & 0xFF) << 8) | ((u32)vifRegs.mode & 0xFF); if ((upkType & 0xf) != 9) - v.block.aligned &= 0x1; + key1 &= 0xFFFF01FF; - //DevCon.Warning("Alignment %d", v.block.aligned); // Zero out the mask parameter if it's unused -- games leave random junk // values here which cause false recblock cache misses. - v.block.mask = doMask ? vifRegs.mask : 0; + u32 key0 = doMask ? vifRegs.mask : 0; - //DevCon.WriteLn("nVif%d: Recompiled Block! [%d]", idx, nVif[idx].numBlocks++); + block.hash_key = hash_key; + block.key0 = key0; + block.key1 = key1; + + //DevCon.WriteLn("nVif%d: Recompiled Block!", idx); //DevCon.WriteLn(L"[num=% 3d][upkType=0x%02x][scl=%d][cl=%d][wl=%d][mode=%d][m=%d][mask=%s]", - // v.Block.num, v.Block.upkType, v.Block.scl, v.Block.cl, v.Block.wl, v.Block.mode, - // doMask >> 4, doMask ? wxsFormat( L"0x%08x", v.Block.mask ).c_str() : L"ignored" + // block.num, block.upkType, block.scl, block.cl, block.wl, block.mode, + // doMask >> 4, doMask ? 
wxsFormat( L"0x%08x", block.mask ).c_str() : L"ignored" //); - if (dVifExecuteUnpack(data, isFill)) return; + // Search in cache before trying to compile the block + nVifBlock* b = v.vifBlocks.find(block); + if (unlikely(b == nullptr)) { + b = dVifCompile(block, isFill); + } - xSetPtr(v.recWritePtr); - v.block.startPtr = (uptr)xGetAlignedCallTarget(); - v.vifBlocks->add(v.block); - VifUnpackSSE_Dynarec(v, v.block).CompileRoutine(); + { // Execute the block + const VURegs& VU = vuRegs[idx]; + const uint vuMemLimit = idx ? 0x4000 : 0x1000; - Perf::vif.map((uptr)v.recWritePtr, xGetPtr() - v.recWritePtr, v.block.upkType /* FIXME ideally a key*/); - nVif[idx].recWritePtr = xGetPtr(); + u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10)); + u8* endmem = VU.Mem + vuMemLimit; - dVifRecLimit(idx); - - // Run the block we just compiled. Various conditions may force us to still use - // the interpreter unpacker though, so a recursive call is the safest way here... - dVifExecuteUnpack(data, isFill); + if (likely((startmem + b->length) <= endmem)) { + // No wrapping, you can run the fast dynarec + ((nVifrecCall)b->startPtr)((uptr)startmem, (uptr)data); + } else { + VIF_LOG("Running Interpreter Block: nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x", + v.idx, vif.tag.addr, vif.tag.addr + (block.num * 16), block.num, block.wl, block.cl); + _nVifUnpack(idx, data, vifRegs.mode, isFill); + } + } } template void dVifUnpack<0>(const u8* data, bool isFill); diff --git a/pcsx2/x86/newVif_HashBucket.h b/pcsx2/x86/newVif_HashBucket.h index 7467005823..7dc2ecd1d7 100644 --- a/pcsx2/x86/newVif_HashBucket.h +++ b/pcsx2/x86/newVif_HashBucket.h @@ -13,87 +13,122 @@ * If not, see <http://www.gnu.org/licenses/>. */ -#include "x86emitter/x86_intrin.h" #pragma once -// Create some typecast operators for SIMD operations. For some reason MSVC needs a -// handle/reference typecast to avoid error. 
GCC (and presumably other compilers) -// generate an error if the handle/ref is used. Honestly neither makes sense, since -// both typecasts should be perfectly valid >_<. --air -#ifdef _MSC_VER -# define cast_m128 __m128& -# define cast_m128i __m128i& -# define cast_m128d __m128d& -#else // defined(__GNUC__) -# define cast_m128 __m128 -# define cast_m128i __m128i -# define cast_m128d __m128d -#endif +#include <array> -template< typename T > -struct SizeChain -{ - int Size; - T* Chain; -}; +// nVifBlock - Ordered for Hashing; the 'num' and 'upkType' fields are +// used as the hash bucket selector. +union nVifBlock { + // Warning: order depends on the newVifDynaRec code + struct { + u8 num; // [00] Num Field + u8 upkType; // [01] Unpack Type [usn1:mask1:upk*4] + u16 length; // [02] Extra: pre computed Length + u32 mask; // [04] Mask Field + u8 mode; // [08] Mode Field + u8 aligned; // [09] Packet Alignment + u8 cl; // [10] CL Field + u8 wl; // [11] WL Field + uptr startPtr; // [12] Start Ptr of RecGen Code + }; + + struct { + u16 hash_key; + u16 _pad0; + u32 key0; + u32 key1; + uptr value; + }; + +}; // 16 bytes + +// 0x4000 is enough but 0x10000 allows +// * to skip the comparison of the first double word in lookup +// * to use a 16-bit move instead of an 'and' mask to compute the hashed key +#define hSize 0x10000 // [usn*1:mask*1:upk*4:num*8] hash... // HashBucket is a container which uses a built-in hash function -// to perform quick searches. -// T is a struct data type (note: size must be in multiples of 16 bytes!) -// hSize determines the number of buckets HashBucket will use for sorting. +// to perform quick searches. It is designed around the nVifBlock structure +// // The hash function is determined by taking the first bytes of data and // performing a modulus the size of hSize. So the most diverse-data should // be in the first bytes of the struct. 
(hence why nVifBlock is specifically sorted) -template<typename T, int hSize> class HashBucket { protected: - SizeChain<T> mBucket[hSize]; + std::array<nVifBlock*, hSize> m_bucket; public: HashBucket() { - for (int i = 0; i < hSize; i++) { - mBucket[i].Chain = NULL; - mBucket[i].Size = 0; + m_bucket.fill(nullptr); + } + + ~HashBucket() throw() { clear(); } + + __fi nVifBlock* find(const nVifBlock& dataPtr) { + nVifBlock* chainpos = m_bucket[dataPtr.hash_key]; + + while (true) { + if (chainpos->key0 == dataPtr.key0 && chainpos->key1 == dataPtr.key1) + return chainpos; + + if (chainpos->startPtr == 0) + return nullptr; + + chainpos++; } } - virtual ~HashBucket() throw() { clear(); } - int quickFind(u32 data) { - return mBucket[data % hSize].Size; - } - __fi T* find(T* dataPtr) { - u32 d = *((u32*)dataPtr); - const SizeChain<T>& bucket( mBucket[d % hSize] ); - const __m128i* endpos = (__m128i*)&bucket.Chain[bucket.Size]; - const __m128i data128( _mm_load_si128((__m128i*)dataPtr) ); + void add(const nVifBlock& dataPtr) { + u32 b = dataPtr.hash_key; - for( const __m128i* chainpos = (__m128i*)bucket.Chain; chainpos& bucket( mBucket[d % hSize] ); - - if( (bucket.Chain = (T*)pcsx2_aligned_realloc( bucket.Chain, sizeof(T)*(bucket.Size+1), 16, sizeof(T)*bucket.Size)) == NULL ) { + // Warning there is an extra +1 due to the empty cell + // Performance note: 64B align to reduce cache miss penalty in `find` + if( (m_bucket[b] = (nVifBlock*)pcsx2_aligned_realloc( m_bucket[b], sizeof(nVifBlock)*(size+2), 64, sizeof(nVifBlock)*(size+1) )) == NULL ) { throw Exception::OutOfMemory( wxsFormat(L"HashBucket Chain (bucket size=%d)", size+2) ); } - memcpy(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T)); - if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size ); + + // Replace the empty cell by the new block and create a new empty cell + memcpy(&m_bucket[b][size++], &dataPtr, sizeof(nVifBlock)); + 
memset(&m_bucket[b][size], 0, sizeof(nVifBlock)); + + if( size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", b, size ); } + + u32 bucket_size(const nVifBlock& dataPtr) { + nVifBlock* chainpos = m_bucket[dataPtr.hash_key]; + + u32 size = 0; + + while (chainpos->startPtr != 0) { + size++; + chainpos++; + } + + return size; + } + void clear() { - for (int i = 0; i < hSize; i++) { - safe_aligned_free(mBucket[i].Chain); - mBucket[i].Size = 0; + for (auto& bucket : m_bucket) + safe_aligned_free(bucket); + } + + void reset() { + clear(); + + // Allocate an empty cell for all buckets + for (auto& bucket : m_bucket) { + if( (bucket = (nVifBlock*)_aligned_malloc( sizeof(nVifBlock), 64 )) == nullptr ) { + throw Exception::OutOfMemory( + wxsFormat(L"HashBucket Chain (bucket size=%d)", 1) + ); + } + + memset(bucket, 0, sizeof(nVifBlock)); } } }; diff --git a/pcsx2/x86/newVif_Unpack.cpp b/pcsx2/x86/newVif_Unpack.cpp index 0886bcf0b1..3fae1bea01 100644 --- a/pcsx2/x86/newVif_Unpack.cpp +++ b/pcsx2/x86/newVif_Unpack.cpp @@ -73,10 +73,6 @@ static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = { nVifStruct::nVifStruct() { - vifBlocks = NULL; - numBlocks = 0; - - recReserveSizeMB = 8; } void reserveNewVif(int idx)