mirror of https://github.com/PCSX2/pcsx2.git
Linux: Fix bugs in _aligned_realloc and newVif's inlined SSE HashBucket finder.
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2395 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
4b0b270776
commit
3d9bb25505
|
@ -28,7 +28,7 @@ static const uint headsize = sizeof(AlignedMallocHeader);
|
|||
|
||||
void* __fastcall pcsx2_aligned_malloc(size_t size, size_t align)
|
||||
{
|
||||
jASSUME( align < 0x10000 );
|
||||
pxAssume( align < 0x10000 );
|
||||
|
||||
u8* p = (u8*)malloc(size+align+headsize);
|
||||
|
||||
|
@ -47,15 +47,16 @@ void* __fastcall pcsx2_aligned_malloc(size_t size, size_t align)
|
|||
|
||||
// Re-allocates an aligned buffer: a fresh aligned buffer is always allocated, the
// overlapping part of the old contents is copied across, and the old buffer is released.
//
//   handle - pointer previously returned by pcsx2_aligned_malloc, or NULL.  With a NULL
//            handle there is nothing to copy or free, so the call simply allocates.
//   size   - requested size of the new buffer, in bytes.
//   align  - requested alignment; asserted to be below 0x10000.
//
// Returns the pointer produced by pcsx2_aligned_malloc for the new buffer.
void* __fastcall pcsx2_aligned_realloc(void* handle, size_t size, size_t align)
{
	pxAssume( align < 0x10000 );

	void* newbuf = pcsx2_aligned_malloc( size, align );

	if( handle != NULL )
	{
		// The bookkeeping header sits immediately below the aligned pointer handed to the caller.
		AlignedMallocHeader* header = (AlignedMallocHeader*)((uptr)handle - headsize);

		// Copy only what fits in both buffers, then release the original malloc block.
		memcpy_fast( newbuf, handle, std::min( size, header->size ) );
		free( header->baseptr );
	}
	return newbuf;
}
|
||||
|
||||
|
@ -74,7 +75,7 @@ __forceinline void pcsx2_aligned_free(void* pmem)
|
|||
// memzero_obj and stuff).
|
||||
__forceinline void _memset16_unaligned( void* dest, u16 data, size_t size )
|
||||
{
|
||||
jASSUME( (size & 0x1) == 0 );
|
||||
pxAssume( (size & 0x1) == 0 );
|
||||
|
||||
u16* dst = (u16*)dest;
|
||||
for(int i=size; i; --i, ++dst )
|
||||
|
|
|
@ -1,282 +1,282 @@
|
|||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2009 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
// newVif Dynarec - Dynamically Recompiles Vif 'unpack' Packets
|
||||
// authors: cottonvibes(@gmail.com)
|
||||
// Jake.Stine (@gmail.com)
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "VifUnpackSSE.h"
|
||||
|
||||
#if newVif
|
||||
|
||||
static __aligned16 nVifBlock _vBlock = {0};
|
||||
static __pagealigned u8 nVifMemCmp[__pagesize];
|
||||
|
||||
static void emitCustomCompare() {
|
||||
HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadWrite, false);
|
||||
memset8<0xcc>(nVifMemCmp);
|
||||
xSetPtr(nVifMemCmp);
|
||||
|
||||
xMOVAPS (xmm0, ptr32[ecx]);
|
||||
xPCMP.EQD(xmm0, ptr32[edx]);
|
||||
xMOVMSKPS(eax, xmm0);
|
||||
xAND (eax, 0x7); // ignore top 4 bytes (recBlock pointer)
|
||||
|
||||
xRET();
|
||||
HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadOnly, true);
|
||||
}
|
||||
|
||||
void dVifInit(int idx) {
|
||||
nVif[idx].idx = idx;
|
||||
nVif[idx].VU = idx ? &VU1 : &VU0;
|
||||
nVif[idx].vif = idx ? &vif1 : &vif0;
|
||||
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
|
||||
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
|
||||
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
|
||||
nVif[idx].vifCache = new BlockBuffer(_1mb*4); // 4mb Rec Cache
|
||||
nVif[idx].vifBlocks = new HashBucket<_tParams>();
|
||||
nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
|
||||
nVif[idx].recEnd = &nVif[idx].recPtr[nVif[idx].vifCache->getSize()-(_1mb/4)]; // .25mb Safe Zone
|
||||
//emitCustomCompare();
|
||||
}
|
||||
|
||||
// Loads Row/Col Data from vifRegs instead of g_vifmask
|
||||
// Useful for testing vifReg and g_vifmask inconsistency.
|
||||
static void loadRowCol(nVifStruct& v) {
|
||||
xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]);
|
||||
xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]);
|
||||
xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]);
|
||||
xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]);
|
||||
xPSHUF.D(xmm0, xmm0, _v0);
|
||||
xPSHUF.D(xmm1, xmm1, _v0);
|
||||
xPSHUF.D(xmm2, xmm2, _v0);
|
||||
xPSHUF.D(xmm6, xmm6, _v0);
|
||||
mVUmergeRegs(XMM6, XMM0, 8);
|
||||
mVUmergeRegs(XMM6, XMM1, 4);
|
||||
mVUmergeRegs(XMM6, XMM2, 2);
|
||||
xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]);
|
||||
xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]);
|
||||
xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]);
|
||||
xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]);
|
||||
xPSHUF.D(xmm2, xmm2, _v0);
|
||||
xPSHUF.D(xmm3, xmm3, _v0);
|
||||
xPSHUF.D(xmm4, xmm4, _v0);
|
||||
xPSHUF.D(xmm5, xmm5, _v0);
|
||||
}
|
||||
|
||||
VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_)
|
||||
: v(vif_)
|
||||
, vB(vifBlock_)
|
||||
{
|
||||
isFill = (vB.cl < vB.wl);
|
||||
usn = (vB.upkType>>5) & 1;
|
||||
doMask = (vB.upkType>>4) & 1;
|
||||
doMode = vB.mode & 3;
|
||||
}
|
||||
|
||||
#define makeMergeMask(x) { \
|
||||
x = ((x&0x40)>>6) | ((x&0x10)>>3) | (x&4) | ((x&1)<<3); \
|
||||
}
|
||||
|
||||
_f void VifUnpackSSE_Dynarec::SetMasks(int cS) const {
|
||||
u32 m0 = vB.mask;
|
||||
u32 m1 = m0 & 0xaaaaaaaa;
|
||||
u32 m2 =(~m1>>1) & m0;
|
||||
u32 m3 = (m1>>1) & ~m0;
|
||||
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
|
||||
u32* col = (v.idx) ? g_vifmask.Col1 : g_vifmask.Col0;
|
||||
if((m2&&doMask) || doMode) { xMOVAPS(xmmRow, ptr32[row]); }
|
||||
if (m3&&doMask) {
|
||||
xMOVAPS(xmmCol0, ptr32[col]);
|
||||
if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
|
||||
if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
|
||||
if ((cS>=4) && (m3&0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3);
|
||||
if ((cS>=1) && (m3&0x000000ff)) xPSHUF.D(xmmCol0, xmmCol0, _v0);
|
||||
}
|
||||
//if (mask||mode) loadRowCol(v);
|
||||
}
|
||||
|
||||
void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
|
||||
pxAssumeDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking.");
|
||||
int cc = aMin(vCL, 3);
|
||||
u32 m0 = (vB.mask >> (cc * 8)) & 0xff;
|
||||
u32 m1 = m0 & 0xaaaa;
|
||||
u32 m2 =(~m1>>1) & m0;
|
||||
u32 m3 = (m1>>1) & ~m0;
|
||||
u32 m4 = (m1>>1) & m0;
|
||||
makeMergeMask(m2);
|
||||
makeMergeMask(m3);
|
||||
makeMergeMask(m4);
|
||||
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect
|
||||
if (doMask&&m2) { mVUmergeRegs(regX.Id, xmmRow.Id, m2); } // Merge Row
|
||||
if (doMask&&m3) { mVUmergeRegs(regX.Id, xmmCol0.Id+cc, m3); } // Merge Col
|
||||
if (doMask&&m4) { mVUmergeRegs(regX.Id, xmmTemp.Id, m4); } // Merge Write Protect
|
||||
if (doMode) {
|
||||
u32 m5 = (~m1>>1) & ~m0;
|
||||
if (!doMask) m5 = 0xf;
|
||||
else makeMergeMask(m5);
|
||||
if (m5 < 0xf) {
|
||||
xPXOR(xmmTemp, xmmTemp);
|
||||
mVUmergeRegs(xmmTemp.Id, xmmRow.Id, m5);
|
||||
xPADD.D(regX, xmmTemp);
|
||||
if (doMode==2) mVUmergeRegs(xmmRow.Id, regX.Id, m5);
|
||||
}
|
||||
else if (m5 == 0xf) {
|
||||
xPADD.D(regX, xmmRow);
|
||||
if (doMode==2) xMOVAPS(xmmRow, regX);
|
||||
}
|
||||
}
|
||||
xMOVAPS(ptr32[dstIndirect], regX);
|
||||
}
|
||||
|
||||
void VifUnpackSSE_Dynarec::writeBackRow() const {
|
||||
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
|
||||
xMOVAPS(ptr32[row], xmmRow);
|
||||
DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
|
||||
// ToDo: Do we need to write back to vifregs.rX too!? :/
|
||||
}
|
||||
|
||||
static void ShiftDisplacementWindow( xAddressInfo& addr, const xRegister32& modReg )
|
||||
{
|
||||
// Shifts the displacement factor of a given indirect address, so that the address
|
||||
// remains in the optimal 0xf0 range (which allows for byte-form displacements when
|
||||
// generating instructions).
|
||||
|
||||
int addImm = 0;
|
||||
while( addr.Displacement >= 0x80 )
|
||||
{
|
||||
addImm += 0xf0;
|
||||
addr -= 0xf0;
|
||||
}
|
||||
if(addImm) xADD(modReg, addImm);
|
||||
}
|
||||
|
||||
void VifUnpackSSE_Dynarec::CompileRoutine() {
|
||||
const int upkNum = vB.upkType & 0xf;
|
||||
const u8& vift = nVifT[upkNum];
|
||||
const int cycleSize = isFill ? vB.cl : vB.wl;
|
||||
const int blockSize = isFill ? vB.wl : vB.cl;
|
||||
const int skipSize = blockSize - cycleSize;
|
||||
|
||||
int vNum = vifRegs->num;
|
||||
vCL = vif->cl;
|
||||
|
||||
SetMasks(cycleSize);
|
||||
|
||||
while (vNum) {
|
||||
|
||||
ShiftDisplacementWindow( srcIndirect, edx );
|
||||
ShiftDisplacementWindow( dstIndirect, ecx );
|
||||
|
||||
if (vCL < cycleSize) {
|
||||
xUnpack(upkNum);
|
||||
srcIndirect += vift;
|
||||
dstIndirect += 16;
|
||||
vNum--;
|
||||
if (++vCL == blockSize) vCL = 0;
|
||||
}
|
||||
else if (isFill) {
|
||||
DevCon.WriteLn("filling mode!");
|
||||
VifUnpackSSE_Dynarec::FillingWrite( *this ).xUnpack(upkNum);
|
||||
dstIndirect += 16;
|
||||
vNum--;
|
||||
if (++vCL == blockSize) vCL = 0;
|
||||
}
|
||||
else {
|
||||
dstIndirect += (16 * skipSize);
|
||||
vCL = 0;
|
||||
}
|
||||
}
|
||||
if (doMode==2) writeBackRow();
|
||||
xMOV(ptr32[&vif->cl], vCL);
|
||||
xMOV(ptr32[&vifRegs->num], vNum);
|
||||
xRET();
|
||||
}
|
||||
|
||||
static _f u8* dVifsetVUptr(const nVifStruct& v, int offset) {
|
||||
u8* ptr = (u8*)(v.VU->Mem + (offset & v.vuMemLimit));
|
||||
u8* endPtr = ptr + _vBlock.num * 16;
|
||||
if (endPtr > v.vuMemEnd) {
|
||||
DevCon.WriteLn("nVif - VU Mem Ptr Overflow; falling back to interpreter.");
|
||||
ptr = NULL; // Fall Back to Interpreters which have wrap-around logic
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
static _f void dVifRecLimit(int idx) {
|
||||
if (nVif[idx].recPtr > nVif[idx].recEnd) {
|
||||
DevCon.WriteLn("nVif Rec - Out of Rec Cache! [%x > %x]", nVif[idx].recPtr, nVif[idx].recEnd);
|
||||
nVif[idx].vifBlocks->clear();
|
||||
nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
|
||||
}
|
||||
}
|
||||
|
||||
_f void dVifUnpack(int idx, u8 *data, u32 size, bool isFill) {
|
||||
|
||||
const nVifStruct& v = nVif[idx];
|
||||
const u8 upkType = vif->cmd & 0x1f | ((!!vif->usn) << 5);
|
||||
const int doMask = (upkType>>4) & 1;
|
||||
|
||||
const int cycle_cl = vifRegs->cycle.cl;
|
||||
const int cycle_wl = vifRegs->cycle.wl;
|
||||
const int cycleSize = isFill ? cycle_cl : cycle_wl;
|
||||
const int blockSize = isFill ? cycle_wl : cycle_cl;
|
||||
|
||||
if (vif->cl >= blockSize) vif->cl = 0;
|
||||
|
||||
_vBlock.upkType = upkType;
|
||||
_vBlock.num = *(u8*)&vifRegs->num;
|
||||
_vBlock.mode = *(u8*)&vifRegs->mode;
|
||||
_vBlock.scl = vif->cl;
|
||||
_vBlock.cl = cycle_cl;
|
||||
_vBlock.wl = cycle_wl;
|
||||
|
||||
// Zero out the mask parameter if it's unused -- games leave random junk
|
||||
// values here which cause false recblock cache misses.
|
||||
_vBlock.mask = doMask ? vifRegs->mask : 0x00;
|
||||
|
||||
if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) {
|
||||
if( u8* dest = dVifsetVUptr(v, vif->tag.addr) ) {
|
||||
//DevCon.WriteLn("Running Recompiled Block!");
|
||||
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
|
||||
}
|
||||
else {
|
||||
//DevCon.WriteLn("Running Interpreter Block");
|
||||
_nVifUnpack(idx, data, size, isFill);
|
||||
}
|
||||
return;
|
||||
}
|
||||
static int recBlockNum = 0;
|
||||
DevCon.WriteLn("nVif: Recompiled Block! [%d]", recBlockNum++);
|
||||
DevCon.WriteLn(L"\t(num=0x%02x, upkType=0x%02x, mode=0x%02x, scl=0x%02x, cl/wl=0x%x/0x%x, mask=%s)",
|
||||
_vBlock.num, _vBlock.upkType, _vBlock.mode, _vBlock.scl, _vBlock.cl, _vBlock.wl,
|
||||
doMask ? wxsFormat( L"0x%08x", _vBlock.mask ).c_str() : L"ignored"
|
||||
);
|
||||
|
||||
xSetPtr(v.recPtr);
|
||||
_vBlock.startPtr = (uptr)xGetAlignedCallTarget();
|
||||
v.vifBlocks->add(_vBlock);
|
||||
VifUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine();
|
||||
nVif[idx].recPtr = xGetPtr();
|
||||
|
||||
dVifRecLimit(idx);
|
||||
|
||||
// Run the block we just compiled. Various conditions may force us to still use
|
||||
// the interpreter unpacker though, so a recursive call is the safest way here...
|
||||
dVifUnpack(idx, data, size, isFill);
|
||||
}
|
||||
|
||||
#endif
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2009 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
// newVif Dynarec - Dynamically Recompiles Vif 'unpack' Packets
|
||||
// authors: cottonvibes(@gmail.com)
|
||||
// Jake.Stine (@gmail.com)
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "VifUnpackSSE.h"
|
||||
|
||||
#if newVif
|
||||
|
||||
static __aligned16 nVifBlock _vBlock = {0};
|
||||
static __pagealigned u8 nVifMemCmp[__pagesize];
|
||||
|
||||
// Emits a small comparison routine into the page-aligned nVifMemCmp buffer.
// The generated code loads 16 bytes from [ecx], compares them dword-by-dword against
// [edx], and returns the resulting sign mask in eax with only its low three bits kept
// (so a return value of 7 means the first three dwords matched).
static void emitCustomCompare()
{
	// Open the buffer for writing and pre-fill it with 0xcc bytes before emitting.
	// NOTE(review): the third MemProtectStatic argument is presumably the "executable" flag -- confirm.
	HostSys::MemProtectStatic( nVifMemCmp, Protect_ReadWrite, false );
	memset8<0xcc>( nVifMemCmp );
	xSetPtr( nVifMemCmp );

	xMOVAPS  ( xmm0, ptr32[ecx] );
	xPCMP.EQD( xmm0, ptr32[edx] );
	xMOVMSKPS( eax, xmm0 );
	xAND     ( eax, 0x7 );		// ignore top 4 bytes (recBlock pointer)
	xRET();

	// Emission finished: switch the buffer back to read-only.
	HostSys::MemProtectStatic( nVifMemCmp, Protect_ReadOnly, true );
}
|
||||
|
||||
void dVifInit(int idx) {
|
||||
nVif[idx].idx = idx;
|
||||
nVif[idx].VU = idx ? &VU1 : &VU0;
|
||||
nVif[idx].vif = idx ? &vif1 : &vif0;
|
||||
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
|
||||
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
|
||||
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
|
||||
nVif[idx].vifCache = new BlockBuffer(_1mb*4); // 4mb Rec Cache
|
||||
nVif[idx].vifBlocks = new HashBucket<_tParams>();
|
||||
nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
|
||||
nVif[idx].recEnd = &nVif[idx].recPtr[nVif[idx].vifCache->getSize()-(_1mb/4)]; // .25mb Safe Zone
|
||||
//emitCustomCompare();
|
||||
}
|
||||
|
||||
// Loads Row/Col Data from vifRegs instead of g_vifmask
|
||||
// Useful for testing vifReg and g_vifmask inconsistency.
|
||||
// Emits code that loads the row registers (vifRegs r0..r3) and column registers
// (vifRegs c0..c3) into xmm registers, reading them from v.vifRegs.
// The three row values are merged into xmm6; the four column values end up in xmm2..xmm5.
// NOTE(review): the _v0 shuffle presumably replicates element 0 across all lanes -- confirm.
static void loadRowCol(nVifStruct& v)
{
	// --- Row registers: load ---
	xMOVAPS( xmm0, ptr32[&v.vifRegs->r0] );
	xMOVAPS( xmm1, ptr32[&v.vifRegs->r1] );
	xMOVAPS( xmm2, ptr32[&v.vifRegs->r2] );
	xMOVAPS( xmm6, ptr32[&v.vifRegs->r3] );

	// --- Row registers: shuffle each with _v0 ---
	xPSHUF.D( xmm0, xmm0, _v0 );
	xPSHUF.D( xmm1, xmm1, _v0 );
	xPSHUF.D( xmm2, xmm2, _v0 );
	xPSHUF.D( xmm6, xmm6, _v0 );

	// --- Row registers: fold xmm0/xmm1/xmm2 into xmm6 using merge masks 8, 4 and 2 ---
	mVUmergeRegs( XMM6, XMM0, 8 );
	mVUmergeRegs( XMM6, XMM1, 4 );
	mVUmergeRegs( XMM6, XMM2, 2 );

	// --- Column registers: load (xmm2 is reused here, after it was merged above) ---
	xMOVAPS( xmm2, ptr32[&v.vifRegs->c0] );
	xMOVAPS( xmm3, ptr32[&v.vifRegs->c1] );
	xMOVAPS( xmm4, ptr32[&v.vifRegs->c2] );
	xMOVAPS( xmm5, ptr32[&v.vifRegs->c3] );

	// --- Column registers: shuffle each with _v0 ---
	xPSHUF.D( xmm2, xmm2, _v0 );
	xPSHUF.D( xmm3, xmm3, _v0 );
	xPSHUF.D( xmm4, xmm4, _v0 );
	xPSHUF.D( xmm5, xmm5, _v0 );
}
|
||||
|
||||
// Builds a recompiler instance for one unpack block.
//   vif_      - the nVif state the block belongs to (kept as 'v').
//   vifBlock_ - the block description being compiled (kept as 'vB').
// The remaining flags are decoded from the block description:
//   isFill - set when cl is smaller than wl
//   usn    - bit 5 of upkType
//   doMask - bit 4 of upkType
//   doMode - low two bits of mode
VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_)
	: v( vif_ )
	, vB( vifBlock_ )
{
	isFill = (vB.cl < vB.wl);

	usn    = (vB.upkType>>5) & 1;
	doMask = (vB.upkType>>4) & 1;

	doMode = vB.mode & 3;
}
|
||||
|
||||
// Compresses the even bits 6/4/2/0 of x into a 4-bit merge mask, in place:
//   bit 6 -> bit 0,  bit 4 -> bit 1,  bit 2 -> bit 2,  bit 0 -> bit 3.
// x is evaluated several times and assigned to, so it must be a plain lvalue.
#define makeMergeMask(x) { \
	x = (((x) >> 6) & 1) | (((x) >> 3) & 2) | ((x) & 4) | (((x) & 1) << 3); \
}
|
||||
|
||||
// Emits the loads that prime the row/column xmm registers for the block being compiled.
//   cS - cycle size; only the column registers for cycles below cS are prepared.
// The block mask holds a 2-bit field per element.  'rowBits' marks fields whose value
// is 1 and 'colBits' marks fields whose value is 2 (each flagged on the field's low bit).
_f void VifUnpackSSE_Dynarec::SetMasks(int cS) const
{
	u32 maskBits = vB.mask;
	u32 highBits = maskBits & 0xaaaaaaaa;
	u32 rowBits  = (~highBits>>1) &  maskBits;
	u32 colBits  = ( highBits>>1) & ~maskBits;

	u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
	u32* col = (v.idx) ? g_vifmask.Col1 : g_vifmask.Col0;

	// The row register is needed when a row-masked field is used, or whenever a mode is active.
	if ((rowBits && doMask) || doMode) {
		xMOVAPS(xmmRow, ptr32[row]);
	}

	if (colBits && doMask) {
		xMOVAPS(xmmCol0, ptr32[col]);

		// xmmCol0 is the shuffle source for the others, so it is shuffled last.
		if ((cS >= 2) && (colBits & 0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
		if ((cS >= 3) && (colBits & 0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
		if ((cS >= 4) && (colBits & 0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3);
		if ((cS >= 1) && (colBits & 0x000000ff)) xPSHUF.D(xmmCol0, xmmCol0, _v0);
	}
	//if (mask||mode) loadRowCol(v);
}
|
||||
|
||||
void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
|
||||
pxAssumeDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking.");
|
||||
int cc = aMin(vCL, 3);
|
||||
u32 m0 = (vB.mask >> (cc * 8)) & 0xff;
|
||||
u32 m1 = m0 & 0xaaaa;
|
||||
u32 m2 =(~m1>>1) & m0;
|
||||
u32 m3 = (m1>>1) & ~m0;
|
||||
u32 m4 = (m1>>1) & m0;
|
||||
makeMergeMask(m2);
|
||||
makeMergeMask(m3);
|
||||
makeMergeMask(m4);
|
||||
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect
|
||||
if (doMask&&m2) { mVUmergeRegs(regX.Id, xmmRow.Id, m2); } // Merge Row
|
||||
if (doMask&&m3) { mVUmergeRegs(regX.Id, xmmCol0.Id+cc, m3); } // Merge Col
|
||||
if (doMask&&m4) { mVUmergeRegs(regX.Id, xmmTemp.Id, m4); } // Merge Write Protect
|
||||
if (doMode) {
|
||||
u32 m5 = (~m1>>1) & ~m0;
|
||||
if (!doMask) m5 = 0xf;
|
||||
else makeMergeMask(m5);
|
||||
if (m5 < 0xf) {
|
||||
xPXOR(xmmTemp, xmmTemp);
|
||||
mVUmergeRegs(xmmTemp.Id, xmmRow.Id, m5);
|
||||
xPADD.D(regX, xmmTemp);
|
||||
if (doMode==2) mVUmergeRegs(xmmRow.Id, regX.Id, m5);
|
||||
}
|
||||
else if (m5 == 0xf) {
|
||||
xPADD.D(regX, xmmRow);
|
||||
if (doMode==2) xMOVAPS(xmmRow, regX);
|
||||
}
|
||||
}
|
||||
xMOVAPS(ptr32[dstIndirect], regX);
|
||||
}
|
||||
|
||||
void VifUnpackSSE_Dynarec::writeBackRow() const {
|
||||
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
|
||||
xMOVAPS(ptr32[row], xmmRow);
|
||||
DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
|
||||
// ToDo: Do we need to write back to vifregs.rX too!? :/
|
||||
}
|
||||
|
||||
// Shifts the displacement of an indirect address so that it stays inside the optimal
// 0xf0 window (which allows byte-form displacements in the generated instructions).
//   addr   - address whose displacement is reduced, in steps of 0xf0, until it is below 0x80.
//   modReg - register that receives a single emitted ADD of the total amount removed,
//            so that the effective address is unchanged.
static void ShiftDisplacementWindow( xAddressInfo& addr, const xRegister32& modReg )
{
	int removed = 0;

	for( ; addr.Displacement >= 0x80; addr -= 0xf0 )
		removed += 0xf0;

	if( removed != 0 )
		xADD( modReg, removed );
}
|
||||
|
||||
void VifUnpackSSE_Dynarec::CompileRoutine() {
|
||||
const int upkNum = vB.upkType & 0xf;
|
||||
const u8& vift = nVifT[upkNum];
|
||||
const int cycleSize = isFill ? vB.cl : vB.wl;
|
||||
const int blockSize = isFill ? vB.wl : vB.cl;
|
||||
const int skipSize = blockSize - cycleSize;
|
||||
|
||||
int vNum = vifRegs->num;
|
||||
vCL = vif->cl;
|
||||
|
||||
SetMasks(cycleSize);
|
||||
|
||||
while (vNum) {
|
||||
|
||||
ShiftDisplacementWindow( srcIndirect, edx );
|
||||
ShiftDisplacementWindow( dstIndirect, ecx );
|
||||
|
||||
if (vCL < cycleSize) {
|
||||
xUnpack(upkNum);
|
||||
srcIndirect += vift;
|
||||
dstIndirect += 16;
|
||||
vNum--;
|
||||
if (++vCL == blockSize) vCL = 0;
|
||||
}
|
||||
else if (isFill) {
|
||||
DevCon.WriteLn("filling mode!");
|
||||
VifUnpackSSE_Dynarec::FillingWrite( *this ).xUnpack(upkNum);
|
||||
dstIndirect += 16;
|
||||
vNum--;
|
||||
if (++vCL == blockSize) vCL = 0;
|
||||
}
|
||||
else {
|
||||
dstIndirect += (16 * skipSize);
|
||||
vCL = 0;
|
||||
}
|
||||
}
|
||||
if (doMode==2) writeBackRow();
|
||||
xMOV(ptr32[&vif->cl], vCL);
|
||||
xMOV(ptr32[&vifRegs->num], vNum);
|
||||
xRET();
|
||||
}
|
||||
|
||||
// Computes the VU memory destination for the current block (_vBlock).
//   v      - nVif state supplying the VU memory base, address limit mask and end pointer.
//   offset - destination offset; it is masked with v.vuMemLimit before use.
// Returns the destination pointer, or NULL when _vBlock.num vectors of 16 bytes would
// run past v.vuMemEnd (the caller then uses the interpreter, which has wrap-around logic).
static _f u8* dVifsetVUptr(const nVifStruct& v, int offset)
{
	u8* const start = (u8*)(v.VU->Mem + (offset & v.vuMemLimit));
	u8* const stop  = start + _vBlock.num * 16;

	if (stop <= v.vuMemEnd)
		return start;

	DevCon.WriteLn("nVif - VU Mem Ptr Overflow; falling back to interpreter.");
	return NULL; // Fall Back to Interpreters which have wrap-around logic
}
|
||||
|
||||
// Resets the recompiler cache of nVif[idx] once its write pointer has moved past recEnd:
// all cached blocks are dropped and recPtr goes back to the start of the cache buffer.
static _f void dVifRecLimit(int idx)
{
	nVifStruct& rec = nVif[idx];

	if (!(rec.recPtr > rec.recEnd))
		return;

	DevCon.WriteLn("nVif Rec - Out of Rec Cache! [%x > %x]", rec.recPtr, rec.recEnd);
	rec.vifBlocks->clear();
	rec.recPtr = rec.vifCache->getBlock();
}
|
||||
|
||||
// Runs one VIF unpack through the recompiler.
//   idx    - which nVif state to use.
//   data   - source data handed to the compiled block (or to the interpreter).
//   size   - data size; only forwarded to the interpreter fallback.
//   isFill - selects which of cl/wl acts as the block size.
//
// The current unpack parameters are gathered into _vBlock and looked up in the block
// cache.  On a hit the compiled block is called, unless the destination would overflow
// VU memory, in which case the interpreter (_nVifUnpack) is used instead.  On a miss
// the block is compiled, and the function calls itself once more to execute it.
_f void dVifUnpack(int idx, u8 *data, u32 size, bool isFill) {

	const nVifStruct& v = nVif[idx];

	// Bits 0-4 come from the command, bit 5 carries the usn flag.
	const u8  upkType = (vif->cmd & 0x1f) | ((!!vif->usn) << 5);
	const int doMask  = (upkType>>4) & 1;

	const int cycle_cl  = vifRegs->cycle.cl;
	const int cycle_wl  = vifRegs->cycle.wl;
	const int blockSize = isFill ? cycle_wl : cycle_cl;

	if (vif->cl >= blockSize) vif->cl = 0;

	_vBlock.upkType = upkType;
	_vBlock.num     = *(u8*)&vifRegs->num;
	_vBlock.mode    = *(u8*)&vifRegs->mode;
	_vBlock.scl     = vif->cl;
	_vBlock.cl      = cycle_cl;
	_vBlock.wl      = cycle_wl;

	// Zero out the mask parameter if it's unused -- games leave random junk
	// values here which cause false recblock cache misses.
	_vBlock.mask    = doMask ? vifRegs->mask : 0x00;

	if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) {
		if( u8* dest = dVifsetVUptr(v, vif->tag.addr) ) {
			//DevCon.WriteLn("Running Recompiled Block!");
			((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
		}
		else {
			//DevCon.WriteLn("Running Interpreter Block");
			_nVifUnpack(idx, data, size, isFill);
		}
		return;
	}

	static int recBlockNum = 0;
	DevCon.WriteLn("nVif: Recompiled Block! [%d]", recBlockNum++);
	DevCon.WriteLn(L"\t(num=0x%02x, upkType=0x%02x, mode=0x%02x, scl=0x%02x, cl/wl=0x%x/0x%x, mask=%s)",
		_vBlock.num, _vBlock.upkType, _vBlock.mode, _vBlock.scl, _vBlock.cl, _vBlock.wl,
		doMask ? wxsFormat( L"0x%08x", _vBlock.mask ).c_str() : L"ignored"
	);

	// The block is registered under its start address before its code is emitted.
	xSetPtr(v.recPtr);
	_vBlock.startPtr = (uptr)xGetAlignedCallTarget();
	v.vifBlocks->add(_vBlock);
	VifUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine();
	nVif[idx].recPtr = xGetPtr();

	dVifRecLimit(idx);

	// Run the block we just compiled.  Various conditions may force us to still use
	// the interpreter unpacker though, so a recursive call is the safest way here...
	dVifUnpack(idx, data, size, isFill);
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,86 +1,100 @@
|
|||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2009 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2009 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "xmmintrin.h"
|
||||
#pragma once
|
||||
|
||||
template< typename T >
|
||||
struct SizeChain
|
||||
{
|
||||
int Size;
|
||||
T* Chain;
|
||||
};
|
||||
|
||||
// HashBucket is a container which uses a built-in hash function
|
||||
// to perform quick searches.
|
||||
// T is a struct data type (note: size must be in multiples of 16 bytes!)
|
||||
// hSize determines the number of buckets HashBucket will use for sorting.
|
||||
// cmpSize is the size of data to consider 2 structs equal (see find())
|
||||
// The hash function is determined by taking the first bytes of data and
|
||||
// performing a modulus the size of hSize. So the most diverse-data should
|
||||
// be in the first bytes of the struct. (hence why nVifBlock is specifically sorted)
|
||||
template<typename T, int hSize, int cmpSize>
|
||||
class HashBucket {
|
||||
protected:
|
||||
SizeChain<T> mBucket[hSize];
|
||||
|
||||
public:
|
||||
HashBucket() {
|
||||
for (int i = 0; i < hSize; i++) {
|
||||
mBucket[i].Chain = NULL;
|
||||
mBucket[i].Size = 0;
|
||||
}
|
||||
}
|
||||
~HashBucket() { clear(); }
|
||||
int quickFind(u32 data) {
|
||||
return mBucket[data % hSize].Size;
|
||||
}
|
||||
__forceinline T* find(T* dataPtr) {
|
||||
u32 d = *((u32*)dataPtr);
|
||||
const SizeChain<T>& bucket( mBucket[d % hSize] );
|
||||
|
||||
for (int i=bucket.Size; i; --i) {
|
||||
// This inline version seems about 1-2% faster in tests of games that average 1
|
||||
// program per bucket. Games that average more should see a bigger improvement --air
|
||||
int result = _mm_movemask_ps( (__m128&) _mm_cmpeq_epi32( _mm_load_si128((__m128i*)&bucket.Chain[i]), _mm_load_si128((__m128i*)dataPtr) ) ) & 0x7;
|
||||
if( result == 0x7 ) return &bucket.Chain[i];
|
||||
|
||||
// Dynamically generated function version, can't be inlined. :(
|
||||
//if ((((nVifCall)((void*)nVifMemCmp))(&bucket.Chain[i], dataPtr))==7) return &bucket.Chain[i];
|
||||
|
||||
//if (!memcmp(&bucket.Chain[i], dataPtr, sizeof(T)-4)) return &c[i]; // old school version! >_<
|
||||
}
|
||||
if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size );
|
||||
return NULL;
|
||||
}
|
||||
__forceinline void add(const T& dataPtr) {
|
||||
u32 d = (u32&)dataPtr;
|
||||
SizeChain<T>& bucket( mBucket[d % hSize] );
|
||||
|
||||
if( bucket.Chain = (T*)_aligned_realloc( bucket.Chain, sizeof(T)*(bucket.Size+1), 16), bucket.Chain==NULL ) {
|
||||
throw Exception::OutOfMemory(
|
||||
wxsFormat(L"Out of memory re-allocating hash bucket (bucket size=%d)", bucket.Size+1),
|
||||
wxEmptyString
|
||||
);
|
||||
}
|
||||
memcpy_fast(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
|
||||
}
|
||||
void clear() {
|
||||
for (int i = 0; i < hSize; i++) {
|
||||
safe_aligned_free(mBucket[i].Chain);
|
||||
mBucket[i].Size = 0;
|
||||
}
|
||||
}
|
||||
};
|
||||
#include "xmmintrin.h"
|
||||
#pragma once
|
||||
|
||||
// Typecast helpers for SIMD operations.  MSVC needs a handle/reference typecast to
// avoid an error, while GCC (and presumably other compilers) generates an error if the
// handle/ref form is used.  Honestly neither makes sense, since both typecasts should
// be perfectly valid >_<.  --air
#if defined(_MSC_VER)
#	define cast_m128		__m128&
#	define cast_m128i		__m128i&
#	define cast_m128d		__m128d&
#else	// defined(__GNUC__)
#	define cast_m128		__m128
#	define cast_m128i		__m128i
#	define cast_m128d		__m128d
#endif
|
||||
|
||||
// One hash bucket's storage: a raw array of T together with its element count.
template< typename T >
struct SizeChain
{
	int	Size;		// number of elements currently stored in Chain
	T*	Chain;		// element array; NULL while the bucket is empty
};
|
||||
|
||||
// HashBucket is a container which uses a built-in hash function
|
||||
// to perform quick searches.
|
||||
// T is a struct data type (note: size must be in multiples of 16 bytes!)
|
||||
// hSize determines the number of buckets HashBucket will use for sorting.
|
||||
// cmpSize is the size of data to consider 2 structs equal (see find())
|
||||
// The hash function is determined by taking the first bytes of data and
|
||||
// performing a modulus the size of hSize. So the most diverse-data should
|
||||
// be in the first bytes of the struct. (hence why nVifBlock is specifically sorted)
|
||||
template<typename T, int hSize, int cmpSize>
|
||||
class HashBucket {
|
||||
protected:
|
||||
SizeChain<T> mBucket[hSize];
|
||||
|
||||
public:
|
||||
HashBucket() {
|
||||
for (int i = 0; i < hSize; i++) {
|
||||
mBucket[i].Chain = NULL;
|
||||
mBucket[i].Size = 0;
|
||||
}
|
||||
}
|
||||
~HashBucket() { clear(); }
|
||||
int quickFind(u32 data) {
|
||||
return mBucket[data % hSize].Size;
|
||||
}
|
||||
__forceinline T* find(T* dataPtr) {
|
||||
u32 d = *((u32*)dataPtr);
|
||||
const SizeChain<T>& bucket( mBucket[d % hSize] );
|
||||
|
||||
for (int i=bucket.Size; i; --i) {
|
||||
// This inline version seems about 1-2% faster in tests of games that average 1
|
||||
// program per bucket. Games that average more should see a bigger improvement --air
|
||||
int result = _mm_movemask_ps( (cast_m128) _mm_cmpeq_epi32( _mm_load_si128((__m128i*)&bucket.Chain[i]), _mm_load_si128((__m128i*)dataPtr) ) ) & 0x7;
|
||||
if( result == 0x7 ) return &bucket.Chain[i];
|
||||
|
||||
// Dynamically generated function version, can't be inlined. :(
|
||||
//if ((((nVifCall)((void*)nVifMemCmp))(&bucket.Chain[i], dataPtr))==7) return &bucket.Chain[i];
|
||||
|
||||
//if (!memcmp(&bucket.Chain[i], dataPtr, sizeof(T)-4)) return &c[i]; // old school version! >_<
|
||||
}
|
||||
if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size );
|
||||
return NULL;
|
||||
}
|
||||
__forceinline void add(const T& dataPtr) {
|
||||
u32 d = (u32&)dataPtr;
|
||||
SizeChain<T>& bucket( mBucket[d % hSize] );
|
||||
|
||||
if( bucket.Chain = (T*)_aligned_realloc( bucket.Chain, sizeof(T)*(bucket.Size+1), 16), bucket.Chain==NULL ) {
|
||||
throw Exception::OutOfMemory(
|
||||
wxsFormat(L"Out of memory re-allocating hash bucket (bucket size=%d)", bucket.Size+1),
|
||||
wxEmptyString
|
||||
);
|
||||
}
|
||||
memcpy_fast(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
|
||||
}
|
||||
void clear() {
|
||||
for (int i = 0; i < hSize; i++) {
|
||||
safe_aligned_free(mBucket[i].Chain);
|
||||
mBucket[i].Size = 0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue