diff --git a/common/src/Utilities/AlignedMalloc.cpp b/common/src/Utilities/AlignedMalloc.cpp index d4bb6a1081..b5f5a05561 100644 --- a/common/src/Utilities/AlignedMalloc.cpp +++ b/common/src/Utilities/AlignedMalloc.cpp @@ -28,7 +28,7 @@ static const uint headsize = sizeof(AlignedMallocHeader); void* __fastcall pcsx2_aligned_malloc(size_t size, size_t align) { - jASSUME( align < 0x10000 ); + pxAssume( align < 0x10000 ); u8* p = (u8*)malloc(size+align+headsize); @@ -47,15 +47,16 @@ void* __fastcall pcsx2_aligned_malloc(size_t size, size_t align) void* __fastcall pcsx2_aligned_realloc(void* handle, size_t size, size_t align) { - if( handle == NULL ) return NULL; - jASSUME( align < 0x10000 ); - - AlignedMallocHeader* header = (AlignedMallocHeader*)((uptr)handle - headsize); + pxAssume( align < 0x10000 ); void* newbuf = pcsx2_aligned_malloc( size, align ); - memcpy_fast( newbuf, handle, std::min( size, header->size ) ); - free( header->baseptr ); + if( handle != NULL ) + { + AlignedMallocHeader* header = (AlignedMallocHeader*)((uptr)handle - headsize); + memcpy_fast( newbuf, handle, std::min( size, header->size ) ); + free( header->baseptr ); + } return newbuf; } @@ -74,7 +75,7 @@ __forceinline void pcsx2_aligned_free(void* pmem) // memzero_obj and stuff). __forceinline void _memset16_unaligned( void* dest, u16 data, size_t size ) { - jASSUME( (size & 0x1) == 0 ); + pxAssume( (size & 0x1) == 0 ); u16* dst = (u16*)dest; for(int i=size; i; --i, ++dst ) diff --git a/pcsx2/x86/VifUnpackSSE_Dynarec.cpp b/pcsx2/x86/VifUnpackSSE_Dynarec.cpp index dbe6f3e779..b4ee375beb 100644 --- a/pcsx2/x86/VifUnpackSSE_Dynarec.cpp +++ b/pcsx2/x86/VifUnpackSSE_Dynarec.cpp @@ -1,282 +1,282 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2009 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - -// newVif Dynarec - Dynamically Recompiles Vif 'unpack' Packets -// authors: cottonvibes(@gmail.com) -// Jake.Stine (@gmail.com) - -#include "PrecompiledHeader.h" -#include "VifUnpackSSE.h" - -#if newVif - -static __aligned16 nVifBlock _vBlock = {0}; -static __pagealigned u8 nVifMemCmp[__pagesize]; - -static void emitCustomCompare() { - HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadWrite, false); - memset8<0xcc>(nVifMemCmp); - xSetPtr(nVifMemCmp); - - xMOVAPS (xmm0, ptr32[ecx]); - xPCMP.EQD(xmm0, ptr32[edx]); - xMOVMSKPS(eax, xmm0); - xAND (eax, 0x7); // ignore top 4 bytes (recBlock pointer) - - xRET(); - HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadOnly, true); -} - -void dVifInit(int idx) { - nVif[idx].idx = idx; - nVif[idx].VU = idx ? &VU1 : &VU0; - nVif[idx].vif = idx ? &vif1 : &vif0; - nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs; - nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000)); - nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0; - nVif[idx].vifCache = new BlockBuffer(_1mb*4); // 4mb Rec Cache - nVif[idx].vifBlocks = new HashBucket<_tParams>(); - nVif[idx].recPtr = nVif[idx].vifCache->getBlock(); - nVif[idx].recEnd = &nVif[idx].recPtr[nVif[idx].vifCache->getSize()-(_1mb/4)]; // .25mb Safe Zone - //emitCustomCompare(); -} - -// Loads Row/Col Data from vifRegs instead of g_vifmask -// Useful for testing vifReg and g_vifmask inconsistency. -static void loadRowCol(nVifStruct& v) { - xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]); - xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]); - xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]); - xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]); - xPSHUF.D(xmm0, xmm0, _v0); - xPSHUF.D(xmm1, xmm1, _v0); - xPSHUF.D(xmm2, xmm2, _v0); - xPSHUF.D(xmm6, xmm6, _v0); - mVUmergeRegs(XMM6, XMM0, 8); - mVUmergeRegs(XMM6, XMM1, 4); - mVUmergeRegs(XMM6, XMM2, 2); - xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]); - xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]); - xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]); - xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]); - xPSHUF.D(xmm2, xmm2, _v0); - xPSHUF.D(xmm3, xmm3, _v0); - xPSHUF.D(xmm4, xmm4, _v0); - xPSHUF.D(xmm5, xmm5, _v0); -} - -VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_) - : v(vif_) - , vB(vifBlock_) -{ - isFill = (vB.cl < vB.wl); - usn = (vB.upkType>>5) & 1; - doMask = (vB.upkType>>4) & 1; - doMode = vB.mode & 3; -} - -#define makeMergeMask(x) { \ - x = ((x&0x40)>>6) | ((x&0x10)>>3) | (x&4) | ((x&1)<<3); \ -} - -_f void VifUnpackSSE_Dynarec::SetMasks(int cS) const { - u32 m0 = vB.mask; - u32 m1 = m0 & 0xaaaaaaaa; - u32 m2 =(~m1>>1) & m0; - u32 m3 = (m1>>1) & ~m0; - u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0; - u32* col = (v.idx) ? g_vifmask.Col1 : g_vifmask.Col0; - if((m2&&doMask) || doMode) { xMOVAPS(xmmRow, ptr32[row]); } - if (m3&&doMask) { - xMOVAPS(xmmCol0, ptr32[col]); - if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1); - if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2); - if ((cS>=4) && (m3&0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3); - if ((cS>=1) && (m3&0x000000ff)) xPSHUF.D(xmmCol0, xmmCol0, _v0); - } - //if (mask||mode) loadRowCol(v); -} - -void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const { - pxAssumeDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking."); - int cc = aMin(vCL, 3); - u32 m0 = (vB.mask >> (cc * 8)) & 0xff; - u32 m1 = m0 & 0xaaaa; - u32 m2 =(~m1>>1) & m0; - u32 m3 = (m1>>1) & ~m0; - u32 m4 = (m1>>1) & m0; - makeMergeMask(m2); - makeMergeMask(m3); - makeMergeMask(m4); - if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect - if (doMask&&m2) { mVUmergeRegs(regX.Id, xmmRow.Id, m2); } // Merge Row - if (doMask&&m3) { mVUmergeRegs(regX.Id, xmmCol0.Id+cc, m3); } // Merge Col - if (doMask&&m4) { mVUmergeRegs(regX.Id, xmmTemp.Id, m4); } // Merge Write Protect - if (doMode) { - u32 m5 = (~m1>>1) & ~m0; - if (!doMask) m5 = 0xf; - else makeMergeMask(m5); - if (m5 < 0xf) { - xPXOR(xmmTemp, xmmTemp); - mVUmergeRegs(xmmTemp.Id, xmmRow.Id, m5); - xPADD.D(regX, xmmTemp); - if (doMode==2) mVUmergeRegs(xmmRow.Id, regX.Id, m5); - } - else if (m5 == 0xf) { - xPADD.D(regX, xmmRow); - if (doMode==2) xMOVAPS(xmmRow, regX); - } - } - xMOVAPS(ptr32[dstIndirect], regX); -} - -void VifUnpackSSE_Dynarec::writeBackRow() const { - u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0; - xMOVAPS(ptr32[row], xmmRow); - DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]"); - // ToDo: Do we need to write back to vifregs.rX too!? :/ -} - -static void ShiftDisplacementWindow( xAddressInfo& addr, const xRegister32& modReg ) -{ - // Shifts the displacement factor of a given indirect address, so that the address - // remains in the optimal 0xf0 range (which allows for byte-form displacements when - // generating instructions). - - int addImm = 0; - while( addr.Displacement >= 0x80 ) - { - addImm += 0xf0; - addr -= 0xf0; - } - if(addImm) xADD(modReg, addImm); -} - -void VifUnpackSSE_Dynarec::CompileRoutine() { - const int upkNum = vB.upkType & 0xf; - const u8& vift = nVifT[upkNum]; - const int cycleSize = isFill ? vB.cl : vB.wl; - const int blockSize = isFill ? vB.wl : vB.cl; - const int skipSize = blockSize - cycleSize; - - int vNum = vifRegs->num; - vCL = vif->cl; - - SetMasks(cycleSize); - - while (vNum) { - - ShiftDisplacementWindow( srcIndirect, edx ); - ShiftDisplacementWindow( dstIndirect, ecx ); - - if (vCL < cycleSize) { - xUnpack(upkNum); - srcIndirect += vift; - dstIndirect += 16; - vNum--; - if (++vCL == blockSize) vCL = 0; - } - else if (isFill) { - DevCon.WriteLn("filling mode!"); - VifUnpackSSE_Dynarec::FillingWrite( *this ).xUnpack(upkNum); - dstIndirect += 16; - vNum--; - if (++vCL == blockSize) vCL = 0; - } - else { - dstIndirect += (16 * skipSize); - vCL = 0; - } - } - if (doMode==2) writeBackRow(); - xMOV(ptr32[&vif->cl], vCL); - xMOV(ptr32[&vifRegs->num], vNum); - xRET(); -} - -static _f u8* dVifsetVUptr(const nVifStruct& v, int offset) { - u8* ptr = (u8*)(v.VU->Mem + (offset & v.vuMemLimit)); - u8* endPtr = ptr + _vBlock.num * 16; - if (endPtr > v.vuMemEnd) { - DevCon.WriteLn("nVif - VU Mem Ptr Overflow; falling back to interpreter."); - ptr = NULL; // Fall Back to Interpreters which have wrap-around logic - } - return ptr; -} - -static _f void dVifRecLimit(int idx) { - if (nVif[idx].recPtr > nVif[idx].recEnd) { - DevCon.WriteLn("nVif Rec - Out of Rec Cache! [%x > %x]", nVif[idx].recPtr, nVif[idx].recEnd); - nVif[idx].vifBlocks->clear(); - nVif[idx].recPtr = nVif[idx].vifCache->getBlock(); - } -} - -_f void dVifUnpack(int idx, u8 *data, u32 size, bool isFill) { - - const nVifStruct& v = nVif[idx]; - const u8 upkType = vif->cmd & 0x1f | ((!!vif->usn) << 5); - const int doMask = (upkType>>4) & 1; - - const int cycle_cl = vifRegs->cycle.cl; - const int cycle_wl = vifRegs->cycle.wl; - const int cycleSize = isFill ? cycle_cl : cycle_wl; - const int blockSize = isFill ? cycle_wl : cycle_cl; - - if (vif->cl >= blockSize) vif->cl = 0; - - _vBlock.upkType = upkType; - _vBlock.num = *(u8*)&vifRegs->num; - _vBlock.mode = *(u8*)&vifRegs->mode; - _vBlock.scl = vif->cl; - _vBlock.cl = cycle_cl; - _vBlock.wl = cycle_wl; - - // Zero out the mask parameter if it's unused -- games leave random junk - // values here which cause false recblock cache misses. - _vBlock.mask = doMask ? vifRegs->mask : 0x00; - - if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) { - if( u8* dest = dVifsetVUptr(v, vif->tag.addr) ) { - //DevCon.WriteLn("Running Recompiled Block!"); - ((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data); - } - else { - //DevCon.WriteLn("Running Interpreter Block"); - _nVifUnpack(idx, data, size, isFill); - } - return; - } - static int recBlockNum = 0; - DevCon.WriteLn("nVif: Recompiled Block! [%d]", recBlockNum++); - DevCon.WriteLn(L"\t(num=0x%02x, upkType=0x%02x, mode=0x%02x, scl=0x%02x, cl/wl=0x%x/0x%x, mask=%s)", - _vBlock.num, _vBlock.upkType, _vBlock.mode, _vBlock.scl, _vBlock.cl, _vBlock.wl, - doMask ? wxsFormat( L"0x%08x", _vBlock.mask ).c_str() : L"ignored" - ); - - xSetPtr(v.recPtr); - _vBlock.startPtr = (uptr)xGetAlignedCallTarget(); - v.vifBlocks->add(_vBlock); - VifUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine(); - nVif[idx].recPtr = xGetPtr(); - - dVifRecLimit(idx); - - // Run the block we just compiled. Various conditions may force us to still use - // the interpreter unpacker though, so a recursive call is the safest way here... - dVifUnpack(idx, data, size, isFill); -} - -#endif +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . + */ + +// newVif Dynarec - Dynamically Recompiles Vif 'unpack' Packets +// authors: cottonvibes(@gmail.com) +// Jake.Stine (@gmail.com) + +#include "PrecompiledHeader.h" +#include "VifUnpackSSE.h" + +#if newVif + +static __aligned16 nVifBlock _vBlock = {0}; +static __pagealigned u8 nVifMemCmp[__pagesize]; + +static void emitCustomCompare() { + HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadWrite, false); + memset8<0xcc>(nVifMemCmp); + xSetPtr(nVifMemCmp); + + xMOVAPS (xmm0, ptr32[ecx]); + xPCMP.EQD(xmm0, ptr32[edx]); + xMOVMSKPS(eax, xmm0); + xAND (eax, 0x7); // ignore top 4 bytes (recBlock pointer) + + xRET(); + HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadOnly, true); +} + +void dVifInit(int idx) { + nVif[idx].idx = idx; + nVif[idx].VU = idx ? &VU1 : &VU0; + nVif[idx].vif = idx ? &vif1 : &vif0; + nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs; + nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000)); + nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0; + nVif[idx].vifCache = new BlockBuffer(_1mb*4); // 4mb Rec Cache + nVif[idx].vifBlocks = new HashBucket<_tParams>(); + nVif[idx].recPtr = nVif[idx].vifCache->getBlock(); + nVif[idx].recEnd = &nVif[idx].recPtr[nVif[idx].vifCache->getSize()-(_1mb/4)]; // .25mb Safe Zone + //emitCustomCompare(); +} + +// Loads Row/Col Data from vifRegs instead of g_vifmask +// Useful for testing vifReg and g_vifmask inconsistency. +static void loadRowCol(nVifStruct& v) { + xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]); + xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]); + xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]); + xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]); + xPSHUF.D(xmm0, xmm0, _v0); + xPSHUF.D(xmm1, xmm1, _v0); + xPSHUF.D(xmm2, xmm2, _v0); + xPSHUF.D(xmm6, xmm6, _v0); + mVUmergeRegs(XMM6, XMM0, 8); + mVUmergeRegs(XMM6, XMM1, 4); + mVUmergeRegs(XMM6, XMM2, 2); + xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]); + xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]); + xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]); + xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]); + xPSHUF.D(xmm2, xmm2, _v0); + xPSHUF.D(xmm3, xmm3, _v0); + xPSHUF.D(xmm4, xmm4, _v0); + xPSHUF.D(xmm5, xmm5, _v0); +} + +VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_) + : v(vif_) + , vB(vifBlock_) +{ + isFill = (vB.cl < vB.wl); + usn = (vB.upkType>>5) & 1; + doMask = (vB.upkType>>4) & 1; + doMode = vB.mode & 3; +} + +#define makeMergeMask(x) { \ + x = ((x&0x40)>>6) | ((x&0x10)>>3) | (x&4) | ((x&1)<<3); \ +} + +_f void VifUnpackSSE_Dynarec::SetMasks(int cS) const { + u32 m0 = vB.mask; + u32 m1 = m0 & 0xaaaaaaaa; + u32 m2 =(~m1>>1) & m0; + u32 m3 = (m1>>1) & ~m0; + u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0; + u32* col = (v.idx) ? g_vifmask.Col1 : g_vifmask.Col0; + if((m2&&doMask) || doMode) { xMOVAPS(xmmRow, ptr32[row]); } + if (m3&&doMask) { + xMOVAPS(xmmCol0, ptr32[col]); + if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1); + if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2); + if ((cS>=4) && (m3&0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3); + if ((cS>=1) && (m3&0x000000ff)) xPSHUF.D(xmmCol0, xmmCol0, _v0); + } + //if (mask||mode) loadRowCol(v); +} + +void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const { + pxAssumeDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking."); + int cc = aMin(vCL, 3); + u32 m0 = (vB.mask >> (cc * 8)) & 0xff; + u32 m1 = m0 & 0xaaaa; + u32 m2 =(~m1>>1) & m0; + u32 m3 = (m1>>1) & ~m0; + u32 m4 = (m1>>1) & m0; + makeMergeMask(m2); + makeMergeMask(m3); + makeMergeMask(m4); + if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect + if (doMask&&m2) { mVUmergeRegs(regX.Id, xmmRow.Id, m2); } // Merge Row + if (doMask&&m3) { mVUmergeRegs(regX.Id, xmmCol0.Id+cc, m3); } // Merge Col + if (doMask&&m4) { mVUmergeRegs(regX.Id, xmmTemp.Id, m4); } // Merge Write Protect + if (doMode) { + u32 m5 = (~m1>>1) & ~m0; + if (!doMask) m5 = 0xf; + else makeMergeMask(m5); + if (m5 < 0xf) { + xPXOR(xmmTemp, xmmTemp); + mVUmergeRegs(xmmTemp.Id, xmmRow.Id, m5); + xPADD.D(regX, xmmTemp); + if (doMode==2) mVUmergeRegs(xmmRow.Id, regX.Id, m5); + } + else if (m5 == 0xf) { + xPADD.D(regX, xmmRow); + if (doMode==2) xMOVAPS(xmmRow, regX); + } + } + xMOVAPS(ptr32[dstIndirect], regX); +} + +void VifUnpackSSE_Dynarec::writeBackRow() const { + u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0; + xMOVAPS(ptr32[row], xmmRow); + DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]"); + // ToDo: Do we need to write back to vifregs.rX too!? :/ +} + +static void ShiftDisplacementWindow( xAddressInfo& addr, const xRegister32& modReg ) +{ + // Shifts the displacement factor of a given indirect address, so that the address + // remains in the optimal 0xf0 range (which allows for byte-form displacements when + // generating instructions). + + int addImm = 0; + while( addr.Displacement >= 0x80 ) + { + addImm += 0xf0; + addr -= 0xf0; + } + if(addImm) xADD(modReg, addImm); +} + +void VifUnpackSSE_Dynarec::CompileRoutine() { + const int upkNum = vB.upkType & 0xf; + const u8& vift = nVifT[upkNum]; + const int cycleSize = isFill ? vB.cl : vB.wl; + const int blockSize = isFill ? vB.wl : vB.cl; + const int skipSize = blockSize - cycleSize; + + int vNum = vifRegs->num; + vCL = vif->cl; + + SetMasks(cycleSize); + + while (vNum) { + + ShiftDisplacementWindow( srcIndirect, edx ); + ShiftDisplacementWindow( dstIndirect, ecx ); + + if (vCL < cycleSize) { + xUnpack(upkNum); + srcIndirect += vift; + dstIndirect += 16; + vNum--; + if (++vCL == blockSize) vCL = 0; + } + else if (isFill) { + DevCon.WriteLn("filling mode!"); + VifUnpackSSE_Dynarec::FillingWrite( *this ).xUnpack(upkNum); + dstIndirect += 16; + vNum--; + if (++vCL == blockSize) vCL = 0; + } + else { + dstIndirect += (16 * skipSize); + vCL = 0; + } + } + if (doMode==2) writeBackRow(); + xMOV(ptr32[&vif->cl], vCL); + xMOV(ptr32[&vifRegs->num], vNum); + xRET(); +} + +static _f u8* dVifsetVUptr(const nVifStruct& v, int offset) { + u8* ptr = (u8*)(v.VU->Mem + (offset & v.vuMemLimit)); + u8* endPtr = ptr + _vBlock.num * 16; + if (endPtr > v.vuMemEnd) { + DevCon.WriteLn("nVif - VU Mem Ptr Overflow; falling back to interpreter."); + ptr = NULL; // Fall Back to Interpreters which have wrap-around logic + } + return ptr; +} + +static _f void dVifRecLimit(int idx) { + if (nVif[idx].recPtr > nVif[idx].recEnd) { + DevCon.WriteLn("nVif Rec - Out of Rec Cache! [%x > %x]", nVif[idx].recPtr, nVif[idx].recEnd); + nVif[idx].vifBlocks->clear(); + nVif[idx].recPtr = nVif[idx].vifCache->getBlock(); + } +} + +_f void dVifUnpack(int idx, u8 *data, u32 size, bool isFill) { + + const nVifStruct& v = nVif[idx]; + const u8 upkType = vif->cmd & 0x1f | ((!!vif->usn) << 5); + const int doMask = (upkType>>4) & 1; + + const int cycle_cl = vifRegs->cycle.cl; + const int cycle_wl = vifRegs->cycle.wl; + const int cycleSize = isFill ? cycle_cl : cycle_wl; + const int blockSize = isFill ? cycle_wl : cycle_cl; + + if (vif->cl >= blockSize) vif->cl = 0; + + _vBlock.upkType = upkType; + _vBlock.num = *(u8*)&vifRegs->num; + _vBlock.mode = *(u8*)&vifRegs->mode; + _vBlock.scl = vif->cl; + _vBlock.cl = cycle_cl; + _vBlock.wl = cycle_wl; + + // Zero out the mask parameter if it's unused -- games leave random junk + // values here which cause false recblock cache misses. + _vBlock.mask = doMask ? vifRegs->mask : 0x00; + + if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) { + if( u8* dest = dVifsetVUptr(v, vif->tag.addr) ) { + //DevCon.WriteLn("Running Recompiled Block!"); + ((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data); + } + else { + //DevCon.WriteLn("Running Interpreter Block"); + _nVifUnpack(idx, data, size, isFill); + } + return; + } + static int recBlockNum = 0; + DevCon.WriteLn("nVif: Recompiled Block! [%d]", recBlockNum++); + DevCon.WriteLn(L"\t(num=0x%02x, upkType=0x%02x, mode=0x%02x, scl=0x%02x, cl/wl=0x%x/0x%x, mask=%s)", + _vBlock.num, _vBlock.upkType, _vBlock.mode, _vBlock.scl, _vBlock.cl, _vBlock.wl, + doMask ? wxsFormat( L"0x%08x", _vBlock.mask ).c_str() : L"ignored" + ); + + xSetPtr(v.recPtr); + _vBlock.startPtr = (uptr)xGetAlignedCallTarget(); + v.vifBlocks->add(_vBlock); + VifUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine(); + nVif[idx].recPtr = xGetPtr(); + + dVifRecLimit(idx); + + // Run the block we just compiled. Various conditions may force us to still use + // the interpreter unpacker though, so a recursive call is the safest way here... + dVifUnpack(idx, data, size, isFill); +} + +#endif diff --git a/pcsx2/x86/newVif_HashBucket.h b/pcsx2/x86/newVif_HashBucket.h index 3cea0f8771..d27a15cc4c 100644 --- a/pcsx2/x86/newVif_HashBucket.h +++ b/pcsx2/x86/newVif_HashBucket.h @@ -1,86 +1,100 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2009 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . + */ -#include "xmmintrin.h" -#pragma once - -template< typename T > -struct SizeChain -{ - int Size; - T* Chain; -}; - -// HashBucket is a container which uses a built-in hash function -// to perform quick searches. -// T is a struct data type (note: size must be in multiples of 16 bytes!) -// hSize determines the number of buckets HashBucket will use for sorting. -// cmpSize is the size of data to consider 2 structs equal (see find()) -// The hash function is determined by taking the first bytes of data and -// performing a modulus the size of hSize. So the most diverse-data should -// be in the first bytes of the struct. (hence why nVifBlock is specifically sorted) -template -class HashBucket { -protected: - SizeChain mBucket[hSize]; - -public: - HashBucket() { - for (int i = 0; i < hSize; i++) { - mBucket[i].Chain = NULL; - mBucket[i].Size = 0; - } - } - ~HashBucket() { clear(); } - int quickFind(u32 data) { - return mBucket[data % hSize].Size; - } - __forceinline T* find(T* dataPtr) { - u32 d = *((u32*)dataPtr); - const SizeChain& bucket( mBucket[d % hSize] ); - - for (int i=bucket.Size; i; --i) { - // This inline version seems about 1-2% faster in tests of games that average 1 - // program per bucket. Games that average more should see a bigger improvement --air - int result = _mm_movemask_ps( (__m128&) _mm_cmpeq_epi32( _mm_load_si128((__m128i*)&bucket.Chain[i]), _mm_load_si128((__m128i*)dataPtr) ) ) & 0x7; - if( result == 0x7 ) return &bucket.Chain[i]; - - // Dynamically generated function version, can't be inlined. :( - //if ((((nVifCall)((void*)nVifMemCmp))(&bucket.Chain[i], dataPtr))==7) return &bucket.Chain[i]; - - //if (!memcmp(&bucket.Chain[i], dataPtr, sizeof(T)-4)) return &c[i]; // old school version! >_< - } - if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size ); - return NULL; - } - __forceinline void add(const T& dataPtr) { - u32 d = (u32&)dataPtr; - SizeChain& bucket( mBucket[d % hSize] ); - - if( bucket.Chain = (T*)_aligned_realloc( bucket.Chain, sizeof(T)*(bucket.Size+1), 16), bucket.Chain==NULL ) { - throw Exception::OutOfMemory( - wxsFormat(L"Out of memory re-allocating hash bucket (bucket size=%d)", bucket.Size+1), - wxEmptyString - ); - } - memcpy_fast(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T)); - } - void clear() { - for (int i = 0; i < hSize; i++) { - safe_aligned_free(mBucket[i].Chain); - mBucket[i].Size = 0; - } - } -}; +#include "xmmintrin.h" +#pragma once + +// Create some typecast operators for SIMD operations. For some reason MSVC needs a +// handle/reference typecast to avoid error. GCC (and presumably other compilers) +// generate an error if the handle/ref is used. Honestly neither makes sense, since +// both typecasts should be perfectly valid >_<. --air +#ifdef _MSC_VER +# define cast_m128 __m128& +# define cast_m128i __m128i& +# define cast_m128d __m128d& +#else // defined(__GNUC__) +# define cast_m128 __m128 +# define cast_m128i __m128i +# define cast_m128d __m128d +#endif + +template< typename T > +struct SizeChain +{ + int Size; + T* Chain; +}; + +// HashBucket is a container which uses a built-in hash function +// to perform quick searches. +// T is a struct data type (note: size must be in multiples of 16 bytes!) +// hSize determines the number of buckets HashBucket will use for sorting. +// cmpSize is the size of data to consider 2 structs equal (see find()) +// The hash function is determined by taking the first bytes of data and +// performing a modulus the size of hSize. So the most diverse-data should +// be in the first bytes of the struct. (hence why nVifBlock is specifically sorted) +template +class HashBucket { +protected: + SizeChain mBucket[hSize]; + +public: + HashBucket() { + for (int i = 0; i < hSize; i++) { + mBucket[i].Chain = NULL; + mBucket[i].Size = 0; + } + } + ~HashBucket() { clear(); } + int quickFind(u32 data) { + return mBucket[data % hSize].Size; + } + __forceinline T* find(T* dataPtr) { + u32 d = *((u32*)dataPtr); + const SizeChain& bucket( mBucket[d % hSize] ); + + for (int i=bucket.Size; i; --i) { + // This inline version seems about 1-2% faster in tests of games that average 1 + // program per bucket. Games that average more should see a bigger improvement --air + int result = _mm_movemask_ps( (cast_m128) _mm_cmpeq_epi32( _mm_load_si128((__m128i*)&bucket.Chain[i]), _mm_load_si128((__m128i*)dataPtr) ) ) & 0x7; + if( result == 0x7 ) return &bucket.Chain[i]; + + // Dynamically generated function version, can't be inlined. :( + //if ((((nVifCall)((void*)nVifMemCmp))(&bucket.Chain[i], dataPtr))==7) return &bucket.Chain[i]; + + //if (!memcmp(&bucket.Chain[i], dataPtr, sizeof(T)-4)) return &c[i]; // old school version! >_< + } + if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size ); + return NULL; + } + __forceinline void add(const T& dataPtr) { + u32 d = (u32&)dataPtr; + SizeChain& bucket( mBucket[d % hSize] ); + + if( bucket.Chain = (T*)_aligned_realloc( bucket.Chain, sizeof(T)*(bucket.Size+1), 16), bucket.Chain==NULL ) { + throw Exception::OutOfMemory( + wxsFormat(L"Out of memory re-allocating hash bucket (bucket size=%d)", bucket.Size+1), + wxEmptyString + ); + } + memcpy_fast(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T)); + } + void clear() { + for (int i = 0; i < hSize; i++) { + safe_aligned_free(mBucket[i].Chain); + mBucket[i].Size = 0; + } + } +};