mirror of https://github.com/PCSX2/pcsx2.git
commit
10eb88f6fe
|
@ -55,42 +55,21 @@ _vifT extern void dVifUnpack (const u8* data, bool isFill);
|
||||||
#define xmmRow xmm6
|
#define xmmRow xmm6
|
||||||
#define xmmTemp xmm7
|
#define xmmTemp xmm7
|
||||||
|
|
||||||
// nVifBlock - Ordered for Hashing; the 'num' field and the lower 6 bits of upkType are
|
|
||||||
// used as the hash bucket selector.
|
|
||||||
struct __aligned16 nVifBlock {
|
|
||||||
u8 num; // [00] Num Field
|
|
||||||
u8 upkType; // [01] Unpack Type [usn1:mask1:upk*4]
|
|
||||||
u8 mode; // [02] Mode Field
|
|
||||||
u8 aligned; // [03] Packet Alignment
|
|
||||||
u32 mask; // [04] Mask Field
|
|
||||||
u16 cl; // [08] CL Field
|
|
||||||
u16 wl; // [10] WL Field
|
|
||||||
uptr startPtr; // [12] Start Ptr of RecGen Code
|
|
||||||
}; // 16 bytes
|
|
||||||
|
|
||||||
#define _hSize 0x4000 // [usn*1:mask*1:upk*4:num*8] hash...
|
|
||||||
#define _tParams nVifBlock, _hSize
|
|
||||||
struct nVifStruct {
|
struct nVifStruct {
|
||||||
|
|
||||||
__aligned16 nVifBlock block;
|
|
||||||
|
|
||||||
// Buffer for partial transfers (should always be first to ensure alignment)
|
// Buffer for partial transfers (should always be first to ensure alignment)
|
||||||
// Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword)
|
// Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword)
|
||||||
__aligned16 u8 buffer[256*16];
|
__aligned16 u8 buffer[256*16];
|
||||||
u32 bSize; // Size of 'buffer'
|
u32 bSize; // Size of 'buffer'
|
||||||
u32 bPtr;
|
|
||||||
|
|
||||||
uint recReserveSizeMB; // reserve size, in megabytes.
|
|
||||||
RecompiledCodeReserve* recReserve;
|
|
||||||
u8* recWritePtr; // current write pos into the reserve
|
|
||||||
|
|
||||||
HashBucket<_tParams>* vifBlocks; // Vif Blocks
|
|
||||||
int numBlocks; // # of Blocks Recompiled
|
|
||||||
|
|
||||||
// VIF0 or VIF1 - provided for debugging helpfulness only, and is generally unused.
|
// VIF0 or VIF1 - provided for debugging helpfulness only, and is generally unused.
|
||||||
// (templates are used for most or all VIF indexing)
|
// (templates are used for most or all VIF indexing)
|
||||||
u32 idx;
|
u32 idx;
|
||||||
|
|
||||||
|
RecompiledCodeReserve* recReserve;
|
||||||
|
u8* recWritePtr; // current write pos into the reserve
|
||||||
|
|
||||||
|
HashBucket vifBlocks; // Vif Blocks
|
||||||
|
|
||||||
nVifStruct();
|
nVifStruct();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -22,34 +22,30 @@
|
||||||
#include "MTVU.h"
|
#include "MTVU.h"
|
||||||
#include "Utilities/Perf.h"
|
#include "Utilities/Perf.h"
|
||||||
|
|
||||||
|
static void recReset(int idx) {
|
||||||
|
nVif[idx].vifBlocks.reset();
|
||||||
|
|
||||||
|
nVif[idx].recReserve->Reset();
|
||||||
|
|
||||||
|
nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
|
||||||
|
}
|
||||||
|
|
||||||
void dVifReserve(int idx) {
|
void dVifReserve(int idx) {
|
||||||
if(!nVif[idx].recReserve)
|
if(!nVif[idx].recReserve)
|
||||||
nVif[idx].recReserve = new RecompiledCodeReserve(pxsFmt(L"VIF%u Unpack Recompiler Cache", idx), _8mb);
|
nVif[idx].recReserve = new RecompiledCodeReserve(pxsFmt(L"VIF%u Unpack Recompiler Cache", idx), _8mb);
|
||||||
|
|
||||||
nVif[idx].recReserve->Reserve( nVif[idx].recReserveSizeMB * _1mb, idx ? HostMemoryMap::VIF1rec : HostMemoryMap::VIF0rec );
|
nVif[idx].recReserve->Reserve( 8 * _1mb, idx ? HostMemoryMap::VIF1rec : HostMemoryMap::VIF0rec );
|
||||||
}
|
}
|
||||||
|
|
||||||
void dVifReset(int idx) {
|
void dVifReset(int idx) {
|
||||||
pxAssertDev(nVif[idx].recReserve, "Dynamic VIF recompiler reserve must be created prior to VIF use or reset!");
|
pxAssertDev(nVif[idx].recReserve, "Dynamic VIF recompiler reserve must be created prior to VIF use or reset!");
|
||||||
|
|
||||||
if(!nVif[idx].vifBlocks)
|
recReset(idx);
|
||||||
nVif[idx].vifBlocks = new HashBucket<_tParams>();
|
|
||||||
else
|
|
||||||
nVif[idx].vifBlocks->clear();
|
|
||||||
|
|
||||||
nVif[idx].recReserve->Reset();
|
|
||||||
|
|
||||||
nVif[idx].numBlocks = 0;
|
|
||||||
nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
|
|
||||||
//memset(nVif[idx].recWritePtr, 0xcc, nVif[idx].recReserveSizeMB * _1mb);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void dVifClose(int idx) {
|
void dVifClose(int idx) {
|
||||||
nVif[idx].numBlocks = 0;
|
|
||||||
if (nVif[idx].recReserve)
|
if (nVif[idx].recReserve)
|
||||||
nVif[idx].recReserve->Reset();
|
nVif[idx].recReserve->Reset();
|
||||||
|
|
||||||
safe_delete(nVif[idx].vifBlocks);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void dVifRelease(int idx) {
|
void dVifRelease(int idx) {
|
||||||
|
@ -61,7 +57,8 @@ VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlo
|
||||||
: v(vif_)
|
: v(vif_)
|
||||||
, vB(vifBlock_)
|
, vB(vifBlock_)
|
||||||
{
|
{
|
||||||
isFill = (vB.cl < vB.wl);
|
const int wl = vB.wl ? vB.wl : 256; //0 is taken as 256 (KH2)
|
||||||
|
isFill = (vB.cl < wl);
|
||||||
usn = (vB.upkType>>5) & 1;
|
usn = (vB.upkType>>5) & 1;
|
||||||
doMask = (vB.upkType>>4) & 1;
|
doMask = (vB.upkType>>4) & 1;
|
||||||
doMode = vB.mode & 3;
|
doMode = vB.mode & 3;
|
||||||
|
@ -201,11 +198,13 @@ void VifUnpackSSE_Dynarec::ModUnpack( int upknum, bool PostOp )
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void VifUnpackSSE_Dynarec::CompileRoutine() {
|
void VifUnpackSSE_Dynarec::CompileRoutine() {
|
||||||
|
const int wl = vB.wl ? vB.wl : 256; //0 is taken as 256 (KH2)
|
||||||
const int upkNum = vB.upkType & 0xf;
|
const int upkNum = vB.upkType & 0xf;
|
||||||
const u8& vift = nVifT[upkNum];
|
const u8& vift = nVifT[upkNum];
|
||||||
const int cycleSize = isFill ? vB.cl : vB.wl;
|
const int cycleSize = isFill ? vB.cl : wl;
|
||||||
const int blockSize = isFill ? vB.wl : vB.cl;
|
const int blockSize = isFill ? wl : vB.cl;
|
||||||
const int skipSize = blockSize - cycleSize;
|
const int skipSize = blockSize - cycleSize;
|
||||||
|
|
||||||
uint vNum = vB.num ? vB.num : 256;
|
uint vNum = vB.num ? vB.num : 256;
|
||||||
|
@ -261,60 +260,42 @@ void VifUnpackSSE_Dynarec::CompileRoutine() {
|
||||||
xRET();
|
xRET();
|
||||||
}
|
}
|
||||||
|
|
||||||
_vifT static __fi u8* dVifsetVUptr(uint cl, uint wl, bool isFill) {
|
static u16 dVifComputeLength(uint cl, uint wl, u8 num, bool isFill) {
|
||||||
nVifStruct& v = nVif[idx];
|
uint length = (num > 0) ? (num * 16) : 4096; // 0 = 256
|
||||||
vifStruct& vif = MTVU_VifX;
|
|
||||||
const VURegs& VU = vuRegs[idx];
|
|
||||||
const uint vuMemLimit = idx ? 0x4000 : 0x1000;
|
|
||||||
|
|
||||||
u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10));
|
|
||||||
u8* endmem = VU.Mem + vuMemLimit;
|
|
||||||
uint length = (v.block.num > 0) ? (v.block.num * 16) : 4096; // 0 = 256
|
|
||||||
|
|
||||||
//wl = wl ? wl : 256; //0 is taken as 256 (KH2)
|
|
||||||
//if (wl == 256) isFill = true;
|
|
||||||
if (!isFill) {
|
if (!isFill) {
|
||||||
uint skipSize = (cl - wl) * 16;
|
uint skipSize = (cl - wl) * 16;
|
||||||
uint blocks = (v.block.num + (wl-1)) / wl; //Need to round up num's to calculate skip size correctly.
|
uint blocks = (num + (wl-1)) / wl; //Need to round up num's to calculate skip size correctly.
|
||||||
length += (blocks-1) * skipSize;
|
length += (blocks-1) * skipSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((startmem + length) <= endmem) {
|
return std::min(length, 0xFFFFu);
|
||||||
return startmem;
|
|
||||||
}
|
|
||||||
//Console.WriteLn("nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x", v.idx, vif.tag.addr, vif.tag.addr + (_vBlock.num * 16), _vBlock.num, wl, cl);
|
|
||||||
return NULL; // Fall Back to Interpreters which have wrap-around logic
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// [TODO] : Finish implementing support for VIF's growable recBlocks buffer. Currently
|
_vifT __fi nVifBlock* dVifCompile(nVifBlock& block, bool isFill) {
|
||||||
// it clears the buffer only.
|
nVifStruct& v = nVif[idx];
|
||||||
static __fi void dVifRecLimit(int idx) {
|
|
||||||
if (nVif[idx].recWritePtr > (nVif[idx].recReserve->GetPtrEnd() - _256kb)) {
|
// Check size before the compilation
|
||||||
|
if (v.recWritePtr > (v.recReserve->GetPtrEnd() - _256kb)) {
|
||||||
DevCon.WriteLn(L"nVif Recompiler Cache Reset! [%ls > %ls]",
|
DevCon.WriteLn(L"nVif Recompiler Cache Reset! [%ls > %ls]",
|
||||||
pxsPtr(nVif[idx].recWritePtr), pxsPtr(nVif[idx].recReserve->GetPtrEnd())
|
pxsPtr(v.recWritePtr), pxsPtr(v.recReserve->GetPtrEnd())
|
||||||
);
|
);
|
||||||
nVif[idx].recReserve->Reset();
|
recReset(idx);
|
||||||
nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr();
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
_vifT static __ri bool dVifExecuteUnpack(const u8* data, bool isFill)
|
// Compile the block now
|
||||||
{
|
xSetPtr(v.recWritePtr);
|
||||||
nVifStruct& v = nVif[idx];
|
|
||||||
VIFregisters& vifRegs = MTVU_VifXRegs;
|
|
||||||
|
|
||||||
if (nVifBlock* b = v.vifBlocks->find(&v.block)) {
|
block.startPtr = (uptr)xGetAlignedCallTarget();
|
||||||
if (u8* dest = dVifsetVUptr<idx>(vifRegs.cycle.cl, vifRegs.cycle.wl, isFill)) {
|
block.length = dVifComputeLength(block.cl, block.wl, block.num, isFill);
|
||||||
//DevCon.WriteLn("Running Recompiled Block!");
|
v.vifBlocks.add(block);
|
||||||
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
|
|
||||||
}
|
VifUnpackSSE_Dynarec(v, block).CompileRoutine();
|
||||||
else {
|
|
||||||
VIF_LOG("Running Interpreter Block");
|
Perf::vif.map((uptr)v.recWritePtr, xGetPtr() - v.recWritePtr, block.upkType /* FIXME ideally a key*/);
|
||||||
_nVifUnpack(idx, data, vifRegs.mode, isFill);
|
v.recWritePtr = xGetPtr();
|
||||||
}
|
|
||||||
return true;
|
return &block;
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_vifT __fi void dVifUnpack(const u8* data, bool isFill) {
|
_vifT __fi void dVifUnpack(const u8* data, bool isFill) {
|
||||||
|
@ -326,42 +307,56 @@ _vifT __fi void dVifUnpack(const u8* data, bool isFill) {
|
||||||
const u8 upkType = (vif.cmd & 0x1f) | (vif.usn << 5);
|
const u8 upkType = (vif.cmd & 0x1f) | (vif.usn << 5);
|
||||||
const int doMask = isFill? 1 : (vif.cmd & 0x10);
|
const int doMask = isFill? 1 : (vif.cmd & 0x10);
|
||||||
|
|
||||||
v.block.upkType = upkType;
|
nVifBlock block;
|
||||||
v.block.num = (u8&)vifRegs.num;
|
|
||||||
v.block.mode = (u8&)vifRegs.mode;
|
|
||||||
v.block.cl = vifRegs.cycle.cl;
|
|
||||||
v.block.wl = vifRegs.cycle.wl ? vifRegs.cycle.wl : 256;
|
|
||||||
v.block.aligned = vif.start_aligned; //MTVU doesn't have a packet size!
|
|
||||||
|
|
||||||
|
// Performance note: initial code was using u8/u16 field of the struct
|
||||||
|
// directly. However reading back the data (as u32) in HashBucket.find
|
||||||
|
// leads to various memory stalls. So it is way faster to manually build the data
|
||||||
|
// in u32 (aka x86 register).
|
||||||
|
//
|
||||||
|
// Warning the order of data in hash_key/key0/key1 depends on the nVifBlock struct
|
||||||
|
u32 hash_key = (u32)(upkType & 0xFF) << 8 | (vifRegs.num & 0xFF);
|
||||||
|
|
||||||
|
u32 key1 = ((u32)vifRegs.cycle.wl << 24) | ((u32)vifRegs.cycle.cl << 16) | ((u32)(vif.start_aligned & 0xFF) << 8) | ((u32)vifRegs.mode & 0xFF);
|
||||||
if ((upkType & 0xf) != 9)
|
if ((upkType & 0xf) != 9)
|
||||||
v.block.aligned &= 0x1;
|
key1 &= 0xFFFF01FF;
|
||||||
|
|
||||||
//DevCon.Warning("Alignment %d", v.block.aligned);
|
|
||||||
// Zero out the mask parameter if it's unused -- games leave random junk
|
// Zero out the mask parameter if it's unused -- games leave random junk
|
||||||
// values here which cause false recblock cache misses.
|
// values here which cause false recblock cache misses.
|
||||||
v.block.mask = doMask ? vifRegs.mask : 0;
|
u32 key0 = doMask ? vifRegs.mask : 0;
|
||||||
|
|
||||||
//DevCon.WriteLn("nVif%d: Recompiled Block! [%d]", idx, nVif[idx].numBlocks++);
|
block.hash_key = hash_key;
|
||||||
|
block.key0 = key0;
|
||||||
|
block.key1 = key1;
|
||||||
|
|
||||||
|
//DevCon.WriteLn("nVif%d: Recompiled Block!", idx);
|
||||||
//DevCon.WriteLn(L"[num=% 3d][upkType=0x%02x][scl=%d][cl=%d][wl=%d][mode=%d][m=%d][mask=%s]",
|
//DevCon.WriteLn(L"[num=% 3d][upkType=0x%02x][scl=%d][cl=%d][wl=%d][mode=%d][m=%d][mask=%s]",
|
||||||
// v.Block.num, v.Block.upkType, v.Block.scl, v.Block.cl, v.Block.wl, v.Block.mode,
|
// block.num, block.upkType, block.scl, block.cl, block.wl, block.mode,
|
||||||
// doMask >> 4, doMask ? wxsFormat( L"0x%08x", v.Block.mask ).c_str() : L"ignored"
|
// doMask >> 4, doMask ? wxsFormat( L"0x%08x", block.mask ).c_str() : L"ignored"
|
||||||
//);
|
//);
|
||||||
|
|
||||||
if (dVifExecuteUnpack<idx>(data, isFill)) return;
|
// Search in cache before trying to compile the block
|
||||||
|
nVifBlock* b = v.vifBlocks.find(block);
|
||||||
|
if (unlikely(b == nullptr)) {
|
||||||
|
b = dVifCompile<idx>(block, isFill);
|
||||||
|
}
|
||||||
|
|
||||||
xSetPtr(v.recWritePtr);
|
{ // Execute the block
|
||||||
v.block.startPtr = (uptr)xGetAlignedCallTarget();
|
const VURegs& VU = vuRegs[idx];
|
||||||
v.vifBlocks->add(v.block);
|
const uint vuMemLimit = idx ? 0x4000 : 0x1000;
|
||||||
VifUnpackSSE_Dynarec(v, v.block).CompileRoutine();
|
|
||||||
|
|
||||||
Perf::vif.map((uptr)v.recWritePtr, xGetPtr() - v.recWritePtr, v.block.upkType /* FIXME ideally a key*/);
|
u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10));
|
||||||
nVif[idx].recWritePtr = xGetPtr();
|
u8* endmem = VU.Mem + vuMemLimit;
|
||||||
|
|
||||||
dVifRecLimit(idx);
|
if (likely((startmem + b->length) <= endmem)) {
|
||||||
|
// No wrapping, you can run the fast dynarec
|
||||||
// Run the block we just compiled. Various conditions may force us to still use
|
((nVifrecCall)b->startPtr)((uptr)startmem, (uptr)data);
|
||||||
// the interpreter unpacker though, so a recursive call is the safest way here...
|
} else {
|
||||||
dVifExecuteUnpack<idx>(data, isFill);
|
VIF_LOG("Running Interpreter Block: nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x",
|
||||||
|
v.idx, vif.tag.addr, vif.tag.addr + (block.num * 16), block.num, block.wl, block.cl);
|
||||||
|
_nVifUnpack(idx, data, vifRegs.mode, isFill);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template void dVifUnpack<0>(const u8* data, bool isFill);
|
template void dVifUnpack<0>(const u8* data, bool isFill);
|
||||||
|
|
|
@ -13,87 +13,122 @@
|
||||||
* If not, see <http://www.gnu.org/licenses/>.
|
* If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "x86emitter/x86_intrin.h"
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
// Create some typecast operators for SIMD operations. For some reason MSVC needs a
|
#include <array>
|
||||||
// handle/reference typecast to avoid error. GCC (and presumably other compilers)
|
|
||||||
// generate an error if the handle/ref is used. Honestly neither makes sense, since
|
|
||||||
// both typecasts should be perfectly valid >_<. --air
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
# define cast_m128 __m128&
|
|
||||||
# define cast_m128i __m128i&
|
|
||||||
# define cast_m128d __m128d&
|
|
||||||
#else // defined(__GNUC__)
|
|
||||||
# define cast_m128 __m128
|
|
||||||
# define cast_m128i __m128i
|
|
||||||
# define cast_m128d __m128d
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template< typename T >
|
// nVifBlock - Ordered for Hashing; the 'num' and 'upkType' fields are
|
||||||
struct SizeChain
|
// used as the hash bucket selector.
|
||||||
{
|
union nVifBlock {
|
||||||
int Size;
|
// Warning: order depends on the newVifDynaRec code
|
||||||
T* Chain;
|
struct {
|
||||||
};
|
u8 num; // [00] Num Field
|
||||||
|
u8 upkType; // [01] Unpack Type [usn1:mask1:upk*4]
|
||||||
|
u16 length; // [02] Extra: pre computed Length
|
||||||
|
u32 mask; // [04] Mask Field
|
||||||
|
u8 mode; // [08] Mode Field
|
||||||
|
u8 aligned; // [09] Packet Alignment
|
||||||
|
u8 cl; // [10] CL Field
|
||||||
|
u8 wl; // [11] WL Field
|
||||||
|
uptr startPtr; // [12] Start Ptr of RecGen Code
|
||||||
|
};
|
||||||
|
|
||||||
|
struct {
|
||||||
|
u16 hash_key;
|
||||||
|
u16 _pad0;
|
||||||
|
u32 key0;
|
||||||
|
u32 key1;
|
||||||
|
uptr value;
|
||||||
|
};
|
||||||
|
|
||||||
|
}; // 16 bytes
|
||||||
|
|
||||||
|
// 0x4000 is enough but 0x10000 allows
|
||||||
|
// * to skip the comparison of the first double word in lookup
|
||||||
|
// * to use a 16 bits move instead of an 'and' mask to compute the hashed key
|
||||||
|
#define hSize 0x10000 // [usn*1:mask*1:upk*4:num*8] hash...
|
||||||
|
|
||||||
// HashBucket is a container which uses a built-in hash function
|
// HashBucket is a container which uses a built-in hash function
|
||||||
// to perform quick searches.
|
// to perform quick searches. It is designed around the nVifBlock structure
|
||||||
// T is a struct data type (note: size must be in multiples of 16 bytes!)
|
//
|
||||||
// hSize determines the number of buckets HashBucket will use for sorting.
|
|
||||||
// The hash function is determined by taking the first bytes of data and
|
// The hash function is determined by taking the first bytes of data and
|
||||||
// performing a modulus the size of hSize. So the most diverse-data should
|
// performing a modulus the size of hSize. So the most diverse-data should
|
||||||
// be in the first bytes of the struct. (hence why nVifBlock is specifically sorted)
|
// be in the first bytes of the struct. (hence why nVifBlock is specifically sorted)
|
||||||
template<typename T, int hSize>
|
|
||||||
class HashBucket {
|
class HashBucket {
|
||||||
protected:
|
protected:
|
||||||
SizeChain<T> mBucket[hSize];
|
std::array<nVifBlock*, hSize> m_bucket;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
HashBucket() {
|
HashBucket() {
|
||||||
for (int i = 0; i < hSize; i++) {
|
m_bucket.fill(nullptr);
|
||||||
mBucket[i].Chain = NULL;
|
}
|
||||||
mBucket[i].Size = 0;
|
|
||||||
|
~HashBucket() throw() { clear(); }
|
||||||
|
|
||||||
|
__fi nVifBlock* find(const nVifBlock& dataPtr) {
|
||||||
|
nVifBlock* chainpos = m_bucket[dataPtr.hash_key];
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
if (chainpos->key0 == dataPtr.key0 && chainpos->key1 == dataPtr.key1)
|
||||||
|
return chainpos;
|
||||||
|
|
||||||
|
if (chainpos->startPtr == 0)
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
chainpos++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
virtual ~HashBucket() throw() { clear(); }
|
|
||||||
int quickFind(u32 data) {
|
|
||||||
return mBucket[data % hSize].Size;
|
|
||||||
}
|
|
||||||
__fi T* find(T* dataPtr) {
|
|
||||||
u32 d = *((u32*)dataPtr);
|
|
||||||
const SizeChain<T>& bucket( mBucket[d % hSize] );
|
|
||||||
|
|
||||||
const __m128i* endpos = (__m128i*)&bucket.Chain[bucket.Size];
|
void add(const nVifBlock& dataPtr) {
|
||||||
const __m128i data128( _mm_load_si128((__m128i*)dataPtr) );
|
u32 b = dataPtr.hash_key;
|
||||||
|
|
||||||
for( const __m128i* chainpos = (__m128i*)bucket.Chain; chainpos<endpos; chainpos+=sizeof(T) / 16u ) {
|
u32 size = bucket_size( dataPtr );
|
||||||
// Note SSE4/AVX optimization (However it requires to only have the key in the first 16B without the pointer)
|
|
||||||
// tmp = xor (data128, load(chainpos))
|
|
||||||
// ptest tmp tmp (zf will be set if tmp == 0, i.e equality)
|
|
||||||
|
|
||||||
// This inline SSE code is generally faster than using emitter code, since it inlines nicely. --air
|
// Warning there is an extra +1 due to the empty cell
|
||||||
int result = _mm_movemask_ps( (cast_m128) _mm_cmpeq_epi32( data128, _mm_load_si128(chainpos) ) );
|
// Performance note: 64B align to reduce cache miss penalty in `find`
|
||||||
if( (result&0x7) == 0x7 ) return (T*)chainpos;
|
if( (m_bucket[b] = (nVifBlock*)pcsx2_aligned_realloc( m_bucket[b], sizeof(nVifBlock)*(size+2), 64, sizeof(nVifBlock)*(size+1) )) == NULL ) {
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
__fi void add(const T& dataPtr) {
|
|
||||||
u32 d = (u32&)dataPtr;
|
|
||||||
SizeChain<T>& bucket( mBucket[d % hSize] );
|
|
||||||
|
|
||||||
if( (bucket.Chain = (T*)pcsx2_aligned_realloc( bucket.Chain, sizeof(T)*(bucket.Size+1), 16, sizeof(T)*bucket.Size)) == NULL ) {
|
|
||||||
throw Exception::OutOfMemory(
|
throw Exception::OutOfMemory(
|
||||||
wxsFormat(L"HashBucket Chain (bucket size=%d)", bucket.Size+1)
|
wxsFormat(L"HashBucket Chain (bucket size=%d)", size+2)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
memcpy(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
|
|
||||||
if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size );
|
// Replace the empty cell by the new block and create a new empty cell
|
||||||
|
memcpy(&m_bucket[b][size++], &dataPtr, sizeof(nVifBlock));
|
||||||
|
memset(&m_bucket[b][size], 0, sizeof(nVifBlock));
|
||||||
|
|
||||||
|
if( size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", b, size );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
u32 bucket_size(const nVifBlock& dataPtr) {
|
||||||
|
nVifBlock* chainpos = m_bucket[dataPtr.hash_key];
|
||||||
|
|
||||||
|
u32 size = 0;
|
||||||
|
|
||||||
|
while (chainpos->startPtr != 0) {
|
||||||
|
size++;
|
||||||
|
chainpos++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
void clear() {
|
void clear() {
|
||||||
for (int i = 0; i < hSize; i++) {
|
for (auto& bucket : m_bucket)
|
||||||
safe_aligned_free(mBucket[i].Chain);
|
safe_aligned_free(bucket);
|
||||||
mBucket[i].Size = 0;
|
}
|
||||||
|
|
||||||
|
void reset() {
|
||||||
|
clear();
|
||||||
|
|
||||||
|
// Allocate an empty cell for all buckets
|
||||||
|
for (auto& bucket : m_bucket) {
|
||||||
|
if( (bucket = (nVifBlock*)_aligned_malloc( sizeof(nVifBlock), 64 )) == nullptr ) {
|
||||||
|
throw Exception::OutOfMemory(
|
||||||
|
wxsFormat(L"HashBucket Chain (bucket size=%d)", 1)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
memset(bucket, 0, sizeof(nVifBlock));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -73,10 +73,6 @@ static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = {
|
||||||
|
|
||||||
nVifStruct::nVifStruct()
|
nVifStruct::nVifStruct()
|
||||||
{
|
{
|
||||||
vifBlocks = NULL;
|
|
||||||
numBlocks = 0;
|
|
||||||
|
|
||||||
recReserveSizeMB = 8;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void reserveNewVif(int idx)
|
void reserveNewVif(int idx)
|
||||||
|
|
Loading…
Reference in New Issue