From a5272f8dc91f41d72ab7e1c994675c1a9dc033b3 Mon Sep 17 00:00:00 2001 From: cottonvibes Date: Sat, 19 Dec 2009 10:00:40 +0000 Subject: [PATCH] Wrote a vif 'unpack' packet recompiler. Compatibility is probably the same as the newVif interpreter code, but it's faster. Speedwise it's similar to the old-vif unpack code (the one currently enabled by default in pcsx2). It's about 0~2% slower on my machine from my limited testing, but I assume people with SSE4.1 CPUs might have a lot better results since I added a lot of SSE4.1 optimizations... The SSE4.1 optimizations were also ported to the newVif interpreter code. Also the "filling mode" should be fast compared to the old-vif unpack code since it's SSE-optimized, but most games don't use this mode so it hasn't been tested much... git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2358 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/Vif.h | 4 +- pcsx2/windows/VCprojects/pcsx2_2008.vcproj | 16 ++ pcsx2/x86/newVif.h | 63 +++-- pcsx2/x86/newVif_BlockBuffer.h | 7 +- pcsx2/x86/newVif_Dynarec.inl | 157 ++++++++++++ pcsx2/x86/newVif_HashBucket.h | 58 +++++ pcsx2/x86/newVif_Tables.inl | 273 +++++++++++++++++++++ pcsx2/x86/newVif_Unpack.inl | 77 +++--- pcsx2/x86/newVif_UnpackGen.inl | 86 ++++--- 9 files changed, 654 insertions(+), 87 deletions(-) create mode 100644 pcsx2/x86/newVif_Dynarec.inl create mode 100644 pcsx2/x86/newVif_HashBucket.h create mode 100644 pcsx2/x86/newVif_Tables.inl diff --git a/pcsx2/Vif.h b/pcsx2/Vif.h index 7450c2d546..39bfd62a2e 100644 --- a/pcsx2/Vif.h +++ b/pcsx2/Vif.h @@ -236,7 +236,7 @@ extern bool VIF1transfer(u32 *data, int size, bool istag); extern void vifMFIFOInterrupt(); // -------------------------------------------------------------------------------------- -// VIF SEE-optimized Masking Mess +// VIF SSE-optimized Masking Mess // -------------------------------------------------------------------------------------- struct VifMaskTypes @@ -245,7 +245,7 @@ struct VifMaskTypes u32 Row1[4], Col1[4]; 
}; -extern __aligned16 VifMaskTypes g_vifmask; +extern __aligned16 VifMaskTypes g_vifmask; // This struct is used by newVif as well as oldVif code... extern void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask); diff --git a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj index 7fec7f62fd..055bbe9116 100644 --- a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj +++ b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj @@ -851,6 +851,10 @@ RelativePath="..\..\x86\newVif_BlockBuffer.h" > + + @@ -863,6 +867,18 @@ RelativePath="..\..\x86\newVif_UnpackGen.inl" > + + + + + + * vifBlocks; // Vif Blocks + nVifBlock* vifBlock; // Current Vif Block Ptr }; -// Contents of this table are doubled up for doMast(false) and doMask(true) lookups. +// Contents of this table are doubled up for doMask(false) and doMask(true) lookups. // (note: currently unused, I'm using gsize in the interp tables instead since it // seems to be faster for now, which may change when nVif isn't reliant on interpreted // unpackers anymore --air) @@ -98,10 +118,13 @@ static const u32 nVifT[32] = { 2, // V4-5 }; +#define useOldUnpack 0 // Use code in newVif_OldUnpack.inl +#define newVifDynaRec 1 // Use code in newVif_Dynarec.inl #include "newVif_OldUnpack.inl" #include "newVif_Unpack.inl" #include "newVif_UnpackGen.inl" -//#include "newVif_Dynarec.inl" +#include "newVif_Tables.inl" +#include "newVif_Dynarec.inl" #endif diff --git a/pcsx2/x86/newVif_BlockBuffer.h b/pcsx2/x86/newVif_BlockBuffer.h index b540319c7f..2c107e96ad 100644 --- a/pcsx2/x86/newVif_BlockBuffer.h +++ b/pcsx2/x86/newVif_BlockBuffer.h @@ -43,7 +43,8 @@ public: dealloc(temp, mSizeT); mSizeT = newSize; } - void clear() { mSize = 0; } - u32 getSize() { return mSize; } - u8* getBlock() { return mData; } + void clear() { mSize = 0; } + u32 getCurSize() { return mSize; } + u32 getSize() { return mSizeT; } + u8* getBlock() { return mData; } }; diff --git a/pcsx2/x86/newVif_Dynarec.inl 
b/pcsx2/x86/newVif_Dynarec.inl new file mode 100644 index 0000000000..4a4bfca7e8 --- /dev/null +++ b/pcsx2/x86/newVif_Dynarec.inl @@ -0,0 +1,157 @@ +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . + */ + +// newVif Dynarec - Dynamically Recompiles Vif 'unpack' Packets +// authors: cottonvibes(@gmail.com) +// Jake.Stine (@gmail.com) + +#pragma once + +void dVifInit(int idx) { + nVif[idx].idx = idx; + nVif[idx].VU = idx ? &VU1 : &VU0; + nVif[idx].vif = idx ? &vif1 : &vif0; + nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs; + nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000)); + nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0; + nVif[idx].vifCache = new BlockBuffer(_1mb*4); // 4mb Rec Cache + nVif[idx].vifBlocks = new HashBucket<_tParams>(); + nVif[idx].recPtr = nVif[idx].vifCache->getBlock(); + nVif[idx].recEnd = &nVif[idx].recPtr[nVif[idx].vifCache->getSize()-(_1mb/4)]; // .25mb Safe Zone +} + +_f void dVifRecLimit(int idx) { + if (nVif[idx].recPtr > nVif[idx].recEnd) { + nVif[idx].vifBlocks->clear(); + DevCon.WriteLn("nVif Rec - Out of Rec Cache! [%x > %x]", nVif[idx].recPtr, nVif[idx].recEnd); + } +} + +_f void dVifSetMasks(nVifStruct& v, int mask, int mode, int cS) { + u32 m0 = v.vifBlock->mask; + u32 m1 = m0 & 0xaaaaaaaa; + u32 m2 =(~m1>>1) & m0; + u32 m3 = (m1>>1) & ~m0; + u32* row = (v.idx) ? 
g_vifmask.Row1 : g_vifmask.Row0; + u32* col = (v.idx) ? g_vifmask.Col1 : g_vifmask.Col0; + if((m2&&mask) || mode) { xMOVAPS(xmmRow, ptr32[row]); } + if (m3&&mask) { + xMOVAPS(xmmCol0, ptr32[col]); + if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1); + if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2); + if ((cS>=4) && (m3&0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3); + if ((cS>=1) && (m3&0x000000ff)) xPSHUF.D(xmmCol0, xmmCol0, _v0); + } + //if (mask||mode) loadRowCol(v); +} + +void dVifRecompile(nVifStruct& v, nVifBlock* vB) { + const bool isFill = (vB->cl < vB->wl); + const int usn = (vB->upkType>>5)&1; + const int doMask = (vB->upkType>>4)&1; + const int upkNum = vB->upkType & 0xf; + const u32& vift = nVifT[upkNum]; + const int doMode = vifRegs->mode & 3; + const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl; + const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl; + const int skipSize = blockSize - cycleSize; + const bool simpleBlock = (vifRegs->num == 1); + const int backupCL = vif->cl; + const int backupNum = vifRegs->num; + if (vif->cl >= blockSize) vif->cl = 0; + + v.vifBlock = vB; + xSetPtr(v.recPtr); + xAlignPtr(16); + vB->startPtr = xGetPtr(); + dVifSetMasks(v, doMask, doMode, cycleSize); + + while (vifRegs->num) { + if (vif->cl < cycleSize) { + xUnpack[upkNum](&v, doMode<<1 | doMask); + if (!simpleBlock) xADD(edx, vift); + if (!simpleBlock) xADD(ecx, 16); + vifRegs->num--; + if (++vif->cl == blockSize) vif->cl = 0; + } + else if (isFill) { + DevCon.WriteLn("filling mode!"); + xUnpack[upkNum](&v, 1); + xADD(ecx, 16); + vifRegs->num--; + if (++vif->cl == blockSize) vif->cl = 0; + } + else { + xADD(ecx, 16 * skipSize); + vif->cl = 0; + } + } + if (doMode==2) writeBackRow(v); + xMOV(ptr32[&vif->cl], vif->cl); + xMOV(ptr32[&vifRegs->num], vifRegs->num); + xRET(); + v.recPtr = xGetPtr(); + vif->cl = backupCL; + vifRegs->num = backupNum; +} + +static nVifBlock _vBlock = {0}; + +_f u8* 
dVifsetVUptr(nVifStruct& v, int offset) { + u8* ptr = (u8*)(v.VU->Mem + (offset & v.vuMemLimit)); + u8* endPtr = ptr + _vBlock.num * 16; + if (endPtr > v.vuMemEnd) { + DevCon.WriteLn("nVif - VU Mem Ptr Overflow!"); + ptr = NULL; // Fall Back to Interpreters which have wrap-around logic + } + return ptr; +} + +void dVifUnpack(int idx, u8 *data, u32 size) { + + nVifStruct& v = nVif[idx]; + vif = v.vif; + vifRegs = v.vifRegs; + const u8 upkType = vif->tag.cmd & 0x1f | ((!!(vif->usn)) << 5); + + _vBlock.upkType = upkType; + _vBlock.num = *(u8*)&vifRegs->num; + _vBlock.mode = *(u8*)&vifRegs->mode; + _vBlock.scl = vif->cl; + _vBlock.cl = vifRegs->cycle.cl; + _vBlock.wl = vifRegs->cycle.wl; + _vBlock.mask = vifRegs->mask; + + if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) { + u8* dest = dVifsetVUptr(v, vif->tag.addr); + if (!dest) { + //DevCon.WriteLn("Running Interpreter Block"); + _nVifUnpack(idx, data, size); + } + else { + //DevCon.WriteLn("Running Recompiled Block!"); + ((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data); + } + return; + } + static int recBlockNum = 0; + DevCon.WriteLn("nVif: Recompiled Block! [%d]", recBlockNum++); + nVifBlock* vB = new nVifBlock(); + memcpy(vB, &_vBlock, sizeof(nVifBlock)); + dVifRecompile(v, vB); + v.vifBlocks->add(vB); + dVifRecLimit(idx); + dVifUnpack(idx, data, size); +} diff --git a/pcsx2/x86/newVif_HashBucket.h b/pcsx2/x86/newVif_HashBucket.h new file mode 100644 index 0000000000..dce8e39d5a --- /dev/null +++ b/pcsx2/x86/newVif_HashBucket.h @@ -0,0 +1,58 @@ + +#pragma once + +// HashBucket is a container which uses a built-in hash function +// to perform quick searches. +// T is a struct data type. +// hSize determines the number of buckets HashBucket will use for sorting. +// cmpSize is the size of data to consider 2 structs equal (see find()) +// The hash function is determined by taking the first bytes of data and +// performing a modulus the size of hSize. 
So the most diverse-data should +// be in the first bytes of the struct. (hence why nVifBlock is specifically sorted) +template +class HashBucket { +private: + T* mChain[hSize]; + int mSize [hSize]; +public: + HashBucket() { + for (int i = 0; i < hSize; i++) { + mChain[i] = NULL; + mSize [i] = 0; + } + } + ~HashBucket() { clear(); } + int quickFind(u32 data) { + int o = data % hSize; + return mSize[o]; + } + T* find(T* dataPtr) { + u32 d = *((u32*)dataPtr); + int o = d % hSize; + int s = mSize[o]; + T* c = mChain[o]; + for (int i = 0; i < s; i++) { + if (!memcmp(&c[i], dataPtr, cmpSize)) return &c[i]; + } + return NULL; + } + void add(T* dataPtr) { + u32 d = *(u32*)dataPtr; + int o = d % hSize; + int s = mSize[o]++; + T* c = mChain[o]; + T* n = new T[s+1]; + if (s) { + memcpy(n, c, sizeof(T) * s); + delete[] c; + } + memcpy(&n[s], dataPtr, sizeof(T)); + mChain[o] = n; + } + void clear() { + for (int i = 0; i < hSize; i++) { + safe_delete_array(mChain[i]); + mSize[i] = 0; + } + } +}; diff --git a/pcsx2/x86/newVif_Tables.inl b/pcsx2/x86/newVif_Tables.inl new file mode 100644 index 0000000000..05bcb0c5d4 --- /dev/null +++ b/pcsx2/x86/newVif_Tables.inl @@ -0,0 +1,273 @@ +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . 
+ */ + +#pragma once + +#define vUPK(x) void x(nVifStruct* v, int doMask) +#define _doUSN (v->vifBlock->upkType & 0x20) +#undef xMovDest +#undef xShiftR +#undef xPMOVXX8 +#undef xPMOVXX16 +#undef xMaskWrite +#define makeMergeMask(x) { \ + x = ((x&0x40)>>6) | ((x&0x10)>>3) | (x&4) | ((x&1)<<3); \ +} +void doMaskWrite(const xRegisterSSE& regX, nVifStruct* v, int doMask) { + if (regX.Id > 1) DevCon.WriteLn("Reg Overflow!!!"); + int doMode = doMask>>1; doMask &= 1; + int cc = aMin(v->vif->cl, 3); + u32 m0 = (v->vifBlock->mask >> (cc * 8)) & 0xff; + u32 m1 = m0 & 0xaaaa; + u32 m2 =(~m1>>1) & m0; + u32 m3 = (m1>>1) & ~m0; + u32 m4 = (m1>>1) & m0; + makeMergeMask(m2); + makeMergeMask(m3); + makeMergeMask(m4); + if (doMask&&m4) { xMOVAPS(xmmTemp, ptr32[ecx]); } // Load Write Protect + if (doMask&&m2) { mVUmergeRegs(regX.Id, xmmRow.Id, m2); } // Merge Row + if (doMask&&m3) { mVUmergeRegs(regX.Id, xmmCol0.Id+cc, m3); } // Merge Col + if (doMask&&m4) { mVUmergeRegs(regX.Id, xmmTemp.Id, m4); } // Merge Write Protect + if (doMode) { + u32 m5 = (~m1>>1) & ~m0; + if (!doMask) m5 = 0xf; + else makeMergeMask(m5); + if (m5 < 0xf) { + xPXOR(xmmTemp, xmmTemp); + mVUmergeRegs(xmmTemp.Id, xmmRow.Id, m5); + xPADD.D(regX, xmmTemp); + if (doMode==2) mVUmergeRegs(xmmRow.Id, regX.Id, m5); + } + else if (m5 == 0xf) { + xPADD.D(regX, xmmRow); + if (doMode==2) xMOVAPS(xmmRow, regX); + } + } + xMOVAPS(ptr32[ecx], regX); +} +#define xMovDest(regX) { \ + if (!doMask){ xMOVAPS (ptr32[ecx], regX); } \ + else { doMaskWrite(regX, v, doMask); } \ +} +#define xShiftR(regX, n) { \ + if (_doUSN) { xPSRL.D(regX, n); } \ + else { xPSRA.D(regX, n); } \ +} +#define xPMOVXX8(regX, src) { \ + if (_doUSN) xPMOVZX.BD(regX, src); \ + else xPMOVSX.BD(regX, src); \ +} +#define xPMOVXX16(regX, src) { \ + if (_doUSN) xPMOVZX.WD(regX, src); \ + else xPMOVSX.WD(regX, src); \ +} + +// ecx = dest, edx = src +vUPK(nVif_S_32) { + xMOV32 (xmm0, ptr32[edx]); + xPSHUF.D (xmm1, xmm0, _v0); + xMovDest (xmm1); +} + 
+vUPK(nVif_S_16) { +if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX16 (xmm0, ptr64[edx]); +} +else { + xMOV16 (xmm0, ptr32[edx]); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 16); +} + xPSHUF.D (xmm1, xmm0, _v0); + xMovDest (xmm1); +} + +vUPK(nVif_S_8) { +if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX8 (xmm0, ptr32[edx]); +} +else { + xMOV8 (xmm0, ptr32[edx]); + xPUNPCK.LBW(xmm0, xmm0); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 24); +} + xPSHUF.D (xmm1, xmm0, _v0); + xMovDest (xmm1); +} + +vUPK(nVif_V2_32) { + xMOV64 (xmm0, ptr32[edx]); + xMovDest (xmm0); +} + +vUPK(nVif_V2_16) { +if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX16 (xmm0, ptr64[edx]); +} +else { + xMOV32 (xmm0, ptr32[edx]); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 16); +} + xMovDest (xmm0); +} + +vUPK(nVif_V2_8) { +if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX8 (xmm0, ptr32[edx]); +} +else { + xMOV16 (xmm0, ptr32[edx]); + xPUNPCK.LBW(xmm0, xmm0); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 24); +} + xMovDest (xmm0); +} + +vUPK(nVif_V3_32) { + xMOV128 (xmm0, ptr32[edx]); + xMovDest (xmm0); +} + +vUPK(nVif_V3_16) { +if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX16 (xmm0, ptr64[edx]); +} +else { + xMOV64 (xmm0, ptr32[edx]); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 16); +} + xMovDest (xmm0); +} + +vUPK(nVif_V3_8) { +if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX8 (xmm0, ptr32[edx]); +} +else { + xMOV32 (xmm0, ptr32[edx]); + xPUNPCK.LBW(xmm0, xmm0); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 24); +} + xMovDest (xmm0); +} + +vUPK(nVif_V4_32) { + xMOV128 (xmm0, ptr32[edx]); + xMovDest (xmm0); +} + +vUPK(nVif_V4_16) { +if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX16 (xmm0, ptr64[edx]); +} +else { + xMOV64 (xmm0, ptr32[edx]); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 16); +} + xMovDest (xmm0); +} + +vUPK(nVif_V4_8) { +if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX8 (xmm0, ptr32[edx]); +} +else { + xMOV32 (xmm0, ptr32[edx]); + xPUNPCK.LBW(xmm0, xmm0); + 
xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 24); +} + xMovDest (xmm0); +} + +vUPK(nVif_V4_5) { + xMOV16 (xmm0, ptr32[edx]); + xPSHUF.D (xmm0, xmm0, _v0); + xPSLL.D (xmm0, 3); // ABG|R5.000 + xMOVAPS (xmm1, xmm0); // x|x|x|R + xPSRL.D (xmm0, 8); // ABG + xPSLL.D (xmm0, 3); // AB|G5.000 + mVUmergeRegs(XMM1, XMM0, 0x4); // x|x|G|R + xPSRL.D (xmm0, 8); // AB + xPSLL.D (xmm0, 3); // A|B5.000 + mVUmergeRegs(XMM1, XMM0, 0x2); // x|B|G|R + xPSRL.D (xmm0, 8); // A + xPSLL.D (xmm0, 7); // A.0000000 + mVUmergeRegs(XMM1, XMM0, 0x1); // A|B|G|R + xPSLL.D (xmm1, 24); // can optimize to + xPSRL.D (xmm1, 24); // single AND... + xMovDest (xmm1); +} + +vUPK(nVif_unkown) { + Console.Error("nVif%d - Invalid Unpack! [%d]", v->idx, v->vif->tag.cmd & 0xf); +} + +void (*xUnpack[16])(nVifStruct* v, int doMask) = { + nVif_S_32, + nVif_S_16, + nVif_S_8, + nVif_unkown, + nVif_V2_32, + nVif_V2_16, + nVif_V2_8, + nVif_unkown, + nVif_V3_32, + nVif_V3_16, + nVif_V3_8, + nVif_unkown, + nVif_V4_32, + nVif_V4_16, + nVif_V4_8, + nVif_V4_5, +}; + +// Loads Row/Col Data from vifRegs instead of g_vifmask +// Useful for testing vifReg and g_vifmask inconsistency. +void loadRowCol(nVifStruct& v) { + xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]); + xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]); + xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]); + xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]); + xPSHUF.D(xmm0, xmm0, _v0); + xPSHUF.D(xmm1, xmm1, _v0); + xPSHUF.D(xmm2, xmm2, _v0); + xPSHUF.D(xmm6, xmm6, _v0); + mVUmergeRegs(XMM6, XMM0, 8); + mVUmergeRegs(XMM6, XMM1, 4); + mVUmergeRegs(XMM6, XMM2, 2); + xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]); + xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]); + xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]); + xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]); + xPSHUF.D(xmm2, xmm2, _v0); + xPSHUF.D(xmm3, xmm3, _v0); + xPSHUF.D(xmm4, xmm4, _v0); + xPSHUF.D(xmm5, xmm5, _v0); +} + +void writeBackRow(nVifStruct& v) { + u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0; + xMOVAPS(ptr32[row], xmmRow); + DevCon.WriteLn("nVif: writing back row reg! 
[doMode = 2]"); + // ToDo: Do we need to write back to vifregs.rX too!? :/ +} diff --git a/pcsx2/x86/newVif_Unpack.inl b/pcsx2/x86/newVif_Unpack.inl index 94caca8020..d11689bea7 100644 --- a/pcsx2/x86/newVif_Unpack.inl +++ b/pcsx2/x86/newVif_Unpack.inl @@ -22,11 +22,36 @@ static __aligned16 nVifStruct nVif[2]; static _f void _nVifUnpack(int idx, u8 *data, u32 size); +void initNewVif(int idx) { + nVif[idx].idx = idx; + nVif[idx].VU = idx ? &VU1 : &VU0; + nVif[idx].vif = idx ? &vif1 : &vif0; + nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs; + nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000)); + nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0; + nVif[idx].vifCache = NULL; + + HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false); + memset8<0xcc>( nVifUpkExec ); + + xSetPtr( nVifUpkExec ); + + for (int a = 0; a < 2; a++) { + for (int b = 0; b < 2; b++) { + for (int c = 0; c < 4; c++) { + nVifGen(a, b, c); + }}} + + HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true); + if (newVifDynaRec) dVifInit(idx); +} + int nVifUnpack(int idx, u32 *data) { XMMRegisters::Freeze(); int ret = aMin(vif1.vifpacketsize, vif1.tag.size); vif1.tag.size -= ret; - _nVifUnpack(idx, (u8*)data, ret<<2); + if (newVifDynaRec) dVifUnpack(idx, (u8*)data, ret<<2); + else _nVifUnpack(idx, (u8*)data, ret<<2); if (vif1.tag.size <= 0) { vif1.tag.size = 0; vif1.cmd = 0; @@ -59,9 +84,9 @@ static u32 oldMaskIdx = -1; static u32 oldMask = 0; static void setMasks(int idx, const VIFregisters& v) { - if (idx == oldMaskIdx && oldMask == v.mask) return; - oldMaskIdx = idx; - oldMask = v.mask; + //if (idx == oldMaskIdx && oldMask == v.mask) return; + //oldMaskIdx = idx; + //oldMask = v.mask; //DevCon.WriteLn("mask"); for (int i = 0; i < 16; i++) { int m = (v.mask >> (i*2)) & 3; @@ -115,9 +140,7 @@ __releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) { const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl; const int blockSize = isFill ? 
vifRegs->cycle.wl : vifRegs->cycle.cl; const int skipSize = blockSize - cycleSize; - - //if (skipSize > 2) - //DevCon.WriteLn("[num = %d][cl = %d][bl = %d][diff = %d]", vifRegs->num, vif->cl, blockSize, skipSize); + //DevCon.WriteLn("[%d][%d][%d][num=%d][upk=%d][cl=%d][bl=%d][skip=%d]", isFill, doMask, doMode, vifRegs->num, upkNum, vif->cl, blockSize, skipSize); if (vif->cmd & 0x10) setMasks(idx, *vifRegs); @@ -141,25 +164,20 @@ __releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) { // mucks up compiler optimizations on the internal loops. >_< --air const u8* vuMemBase = (idx ? VU1 : VU0).Mem; u8* dest = setVUptr(idx, vuMemBase, vif->tag.addr); - if (vif->cl >= blockSize) vif->cl = 0; - while (vifRegs->num /*&& size*/) { + while (vifRegs->num && size) { if (vif->cl < cycleSize) { - if (doMode /*|| doMask*/) { - //if (doMask) + if (doMode) { //DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum); func((u32*)dest, (u32*)data); } else { //DevCon.WriteLn("SSE Unpack!"); - - // Opt note: removing this min check (which isn't needed right now?) is +1% - // or more. Just something to keep in mind. 
:) --air - fnbase[aMin(vif->cl, 4)](dest, data); + fnbase[aMin(vif->cl, 3)](dest, data); } data += ft.gsize; - //if( IsDebugBuild ) size -= ft.gsize; // only used below for assertion checking + //if( IsDebugBuild ) size -= ft.gsize; // only used below for assertion checking vifRegs->num--; incVUptrBy16(idx, dest, vuMemBase); @@ -187,34 +205,31 @@ static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = { { { _nVifUnpackLoop<0,false,false>, _nVifUnpackLoop<0,false,true> }, - { _nVifUnpackLoop<0,true,false>, _nVifUnpackLoop<0,true,true> }, + { _nVifUnpackLoop<0,true, false>, _nVifUnpackLoop<0,true, true> }, }, - { { _nVifUnpackLoop<1,false,false>, _nVifUnpackLoop<1,false,true> }, - { _nVifUnpackLoop<1,true,false>, _nVifUnpackLoop<1,true,true> }, + { _nVifUnpackLoop<1,true, false>, _nVifUnpackLoop<1,true, true> }, }, }; - static _f void _nVifUnpack(int idx, u8 *data, u32 size) { - /*if (nVif[idx].vifRegs->cycle.cl >= nVif[idx].vifRegs->cycle.wl) { // skipping write + + if (useOldUnpack) { if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2); else VIFunpack<1>((u32*)data, &vif1.tag, size>>2); return; } - else*/ { // filling write - vif = nVif[idx].vif; - vifRegs = nVif[idx].vifRegs; - const bool doMode = vifRegs->mode && !(vif->tag.cmd & 0x10); - const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl); + vif = nVif[idx].vif; + vifRegs = nVif[idx].vifRegs; + const bool doMode = vifRegs->mode && !(vif->tag.cmd & 0x10); + const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl); - UnpackLoopTable[idx][doMode][isFill]( data, size ); + UnpackLoopTable[idx][doMode][isFill]( data, size ); - //if (isFill) - //DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok")); - //DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num); - } + //if (isFill) + //DevCon.WriteLn("%s Write! 
[num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok")); + //DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num); } diff --git a/pcsx2/x86/newVif_UnpackGen.inl b/pcsx2/x86/newVif_UnpackGen.inl index 89e39d1c96..75a513d0fb 100644 --- a/pcsx2/x86/newVif_UnpackGen.inl +++ b/pcsx2/x86/newVif_UnpackGen.inl @@ -17,20 +17,28 @@ #define xMaskWrite(regX) { \ xMOVAPS(xmm7, ptr32[ecx]); \ - int offX = aMin(curCycle, 4); \ + int offX = aMin(curCycle, 3); \ xPAND(regX, ptr32[nVifMask[0][offX]]); \ xPAND(xmm7, ptr32[nVifMask[1][offX]]); \ xPOR (regX, ptr32[nVifMask[2][offX]]); \ xPOR (regX, xmm7); \ xMOVAPS(ptr32[ecx], regX); \ } -#define xMovDest(regX) { \ - if (mask==0) { xMOVAPS (ptr32[ecx], regX); } \ - else { xMaskWrite(regX); } \ +#define xMovDest(regX) { \ + if (!mask) { xMOVAPS (ptr32[ecx], regX); } \ + else { xMaskWrite(regX); } \ } -#define xShiftR(regX, n) { \ - if (usn) { xPSRL.D(regX, n); } \ - else { xPSRA.D(regX, n); } \ +#define xShiftR(regX, n) { \ + if (usn) { xPSRL.D(regX, n); } \ + else { xPSRA.D(regX, n); } \ +} +#define xPMOVXX8(regX, src) { \ + if (usn) xPMOVZX.BD(regX, src); \ + else xPMOVSX.BD(regX, src); \ +} +#define xPMOVXX16(regX, src) { \ + if (usn) xPMOVZX.WD(regX, src); \ + else xPMOVSX.WD(regX, src); \ } struct VifUnpackIndexer { @@ -74,18 +82,28 @@ void nVifGen(int usn, int mask, int curCycle) { xRET(); indexer.xSetCall(0x1); // S-16 + if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX16 (xmm0, ptr64[edx]); + } + else { xMOV16 (xmm0, ptr32[edx]); xPUNPCK.LWD(xmm0, xmm0); xShiftR (xmm0, 16); + } xPSHUF.D (xmm1, xmm0, _v0); xMovDest (xmm1); xRET(); indexer.xSetCall(0x2); // S-8 + if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX8 (xmm0, ptr32[edx]); + } + else { xMOV8 (xmm0, ptr32[edx]); xPUNPCK.LBW(xmm0, xmm0); xPUNPCK.LWD(xmm0, xmm0); xShiftR (xmm0, 24); + } xPSHUF.D (xmm1, xmm0, _v0); xMovDest (xmm1); xRET(); @@ -98,17 
+116,27 @@ void nVifGen(int usn, int mask, int curCycle) { xRET(); indexer.xSetCall(0x5); // V2-16 + if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX16 (xmm0, ptr64[edx]); + } + else { xMOV32 (xmm0, ptr32[edx]); xPUNPCK.LWD(xmm0, xmm0); xShiftR (xmm0, 16); + } xMovDest (xmm0); xRET(); indexer.xSetCall(0x6); // V2-8 + if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX8 (xmm0, ptr32[edx]); + } + else { xMOV16 (xmm0, ptr32[edx]); xPUNPCK.LBW(xmm0, xmm0); xPUNPCK.LWD(xmm0, xmm0); xShiftR (xmm0, 24); + } xMovDest (xmm0); xRET(); @@ -120,17 +148,27 @@ void nVifGen(int usn, int mask, int curCycle) { xRET(); indexer.xSetCall(0x9); // V3-16 + if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX16 (xmm0, ptr64[edx]); + } + else { xMOV64 (xmm0, ptr32[edx]); xPUNPCK.LWD(xmm0, xmm0); xShiftR (xmm0, 16); + } xMovDest (xmm0); xRET(); indexer.xSetCall(0xa); // V3-8 + if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX8 (xmm0, ptr32[edx]); + } + else { xMOV32 (xmm0, ptr32[edx]); xPUNPCK.LBW(xmm0, xmm0); xPUNPCK.LWD(xmm0, xmm0); xShiftR (xmm0, 24); + } xMovDest (xmm0); xRET(); @@ -142,17 +180,27 @@ void nVifGen(int usn, int mask, int curCycle) { xRET(); indexer.xSetCall(0xd); // V4-16 + if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX16 (xmm0, ptr64[edx]); + } + else { xMOV64 (xmm0, ptr32[edx]); xPUNPCK.LWD(xmm0, xmm0); xShiftR (xmm0, 16); + } xMovDest (xmm0); xRET(); indexer.xSetCall(0xe); // V4-8 + if (x86caps.hasStreamingSIMD4Extensions) { + xPMOVXX8 (xmm0, ptr32[edx]); + } + else { xMOV32 (xmm0, ptr32[edx]); xPUNPCK.LBW(xmm0, xmm0); xPUNPCK.LWD(xmm0, xmm0); xShiftR (xmm0, 24); + } xMovDest (xmm0); xRET(); @@ -190,27 +238,3 @@ void nVifGen(int usn, int mask, int curCycle) { pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) ); } - -void initNewVif(int idx) { - nVif[idx].idx = idx; - nVif[idx].VU = idx ? &VU1 : &VU0; - nVif[idx].vif = idx ? &vif1 : &vif0; - nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs; - nVif[idx].vuMemEnd = idx ? 
((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000)); - nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0; - nVif[idx].vifCache = NULL; - - HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false); - memset8<0xcc>( nVifUpkExec ); - - xSetPtr( nVifUpkExec ); - - for (int a = 0; a < 2; a++) { - for (int b = 0; b < 2; b++) { - for (int c = 0; c < 4; c++) { - nVifGen(a, b, c); - } - }} - - HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true); -}