Wrote a vif 'unpack' packet recompiler.

Compatibility is probably the same as the newVif interpreter code, but it's faster.

Speed-wise it's similar to the old-vif unpack code (the one currently enabled by default in pcsx2).
It's about 0~2% slower on my machine in my limited testing, but I assume people with SSE4.1 CPUs might get much better results, since I added a lot of SSE4.1 optimizations...
The SSE4.1 optimizations were also ported to the newVif interpreter code.

Also, the "filling mode" should be fast compared to the old-vif unpack code since it's SSE-optimized, but most games don't use this mode, so it hasn't been tested much...

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2358 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
cottonvibes 2009-12-19 10:00:40 +00:00
parent 4c2a7ae39f
commit a5272f8dc9
9 changed files with 654 additions and 87 deletions

View File

@ -236,7 +236,7 @@ extern bool VIF1transfer(u32 *data, int size, bool istag);
extern void vifMFIFOInterrupt();
// --------------------------------------------------------------------------------------
// VIF SEE-optimized Masking Mess
// VIF SSE-optimized Masking Mess
// --------------------------------------------------------------------------------------
struct VifMaskTypes
@ -245,7 +245,7 @@ struct VifMaskTypes
u32 Row1[4], Col1[4];
};
extern __aligned16 VifMaskTypes g_vifmask;
extern __aligned16 VifMaskTypes g_vifmask; // This struct is used by newVif as well as oldVif code...
extern void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask);

View File

@ -851,6 +851,10 @@
RelativePath="..\..\x86\newVif_BlockBuffer.h"
>
</File>
<File
RelativePath=".\newVif_HashBucket.h"
>
</File>
<File
RelativePath="..\..\x86\newVif_OldUnpack.inl"
>
@ -863,6 +867,18 @@
RelativePath="..\..\x86\newVif_UnpackGen.inl"
>
</File>
<Filter
Name="Dynarec"
>
<File
RelativePath="..\..\x86\newVif_Dynarec.inl"
>
</File>
<File
RelativePath="..\..\x86\newVif_Tables.inl"
>
</File>
</Filter>
</Filter>
</Filter>
<Filter

View File

@ -17,15 +17,21 @@
#ifdef newVif
#include "newVif_BlockBuffer.h"
#include "newVif_HashBucket.h"
#include "x86emitter/x86emitter.h"
using namespace x86Emitter;
extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0);
extern void nVifGen(int usn, int mask, int curCycle);
extern void _nVifUnpack (int idx, u8 *data, u32 size);
extern void dVifUnpack (int idx, u8 *data, u32 size);
extern void dVifInit (int idx);
typedef u32 (__fastcall *nVifCall)(void*, void*);
typedef u32 (__fastcall *nVifCall)(void*, void*);
typedef void (__fastcall *nVifrecCall)(uptr dest, uptr src);
static __pagealigned u8 nVifUpkExec[__pagesize*4];
static __aligned16 nVifCall nVifUpk[(2*2*16) *4 ]; // ([USN][Masking][Unpack Type]) [curCycle]
static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
static __aligned16 nVifCall nVifUpk[(2*2*16) *4]; // ([USN][Masking][Unpack Type]) [curCycle]
static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
#define _1mb (0x100000)
#define _v0 0
@ -35,28 +41,42 @@ static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Ve
#define aMax(x, y) std::max(x,y)
#define aMin(x, y) std::min(x,y)
#define _f __forceinline
#define xmmCol0 xmm2
#define xmmCol1 xmm3
#define xmmCol2 xmm4
#define xmmCol3 xmm5
#define xmmRow xmm6
#define xmmTemp xmm7
struct nVifBlock {
u8 upkType; // Unpack Type
u8 num; // Num Field
u8 mode; // Mode Field
u8 cl; // CL Field
u8 wl; // WL Field
u32 mask; // Mask Field
u8* startPtr; // Start Ptr of RecGen Code
struct nVifBlock { // Ordered for Hashing
u8 num; // Num Field
u8 upkType; // Unpack Type [usn*1:mask*1:upk*4]
u8 mode; // Mode Field
u8 scl; // Start Cycle
u8 cl; // CL Field
u8 wl; // WL Field
u32 mask; // Mask Field
u8* startPtr; // Start Ptr of RecGen Code
};
#define _hSize 0x4000 // [usn*1:mask*1:upk*4:num*8] hash...
#define _cmpS (sizeof(nVifBlock) - sizeof(uptr))
#define _tParams nVifBlock, _hSize, _cmpS
struct nVifStruct {
u32 idx; // VIF0 or VIF1
vifStruct* vif; // Vif Struct ptr
VIFregisters* vifRegs; // Vif Regs ptr
VURegs* VU; // VU Regs ptr
u8* vuMemEnd; // End of VU Memory
u32 vuMemLimit; // Use for fast AND
BlockBuffer* vifCache; // Block Buffer
u32 idx; // VIF0 or VIF1
vifStruct* vif; // Vif Struct ptr
VIFregisters* vifRegs; // Vif Regs ptr
VURegs* VU; // VU Regs ptr
u8* vuMemEnd; // End of VU Memory
u32 vuMemLimit; // Use for fast AND
u8* recPtr; // Cur Pos to recompile to
u8* recEnd; // End of Rec Cache
BlockBuffer* vifCache; // Block Buffer
HashBucket<_tParams>* vifBlocks; // Vif Blocks
nVifBlock* vifBlock; // Current Vif Block Ptr
};
// Contents of this table are doubled up for doMast(false) and doMask(true) lookups.
// Contents of this table are doubled up for doMask(false) and doMask(true) lookups.
// (note: currently unused, I'm using gsize in the interp tables instead since it
// seems to be faster for now, which may change when nVif isn't reliant on interpreted
// unpackers anymore --air)
@ -98,10 +118,13 @@ static const u32 nVifT[32] = {
2, // V4-5
};
#define useOldUnpack 0 // Use code in newVif_OldUnpack.inl
#define newVifDynaRec 1 // Use code in newVif_Dynarec.inl
#include "newVif_OldUnpack.inl"
#include "newVif_Unpack.inl"
#include "newVif_UnpackGen.inl"
//#include "newVif_Dynarec.inl"
#include "newVif_Tables.inl"
#include "newVif_Dynarec.inl"
#endif

View File

@ -43,7 +43,8 @@ public:
dealloc(temp, mSizeT);
mSizeT = newSize;
}
void clear() { mSize = 0; }
u32 getSize() { return mSize; }
u8* getBlock() { return mData; }
void clear() { mSize = 0; }
u32 getCurSize() { return mSize; }
u32 getSize() { return mSizeT; }
u8* getBlock() { return mData; }
};

View File

@ -0,0 +1,157 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
// newVif Dynarec - Dynamically Recompiles Vif 'unpack' Packets
// authors: cottonvibes(@gmail.com)
// Jake.Stine (@gmail.com)
#pragma once
void dVifInit(int idx) {
nVif[idx].idx = idx;
nVif[idx].VU = idx ? &VU1 : &VU0;
nVif[idx].vif = idx ? &vif1 : &vif0;
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
nVif[idx].vifCache = new BlockBuffer(_1mb*4); // 4mb Rec Cache
nVif[idx].vifBlocks = new HashBucket<_tParams>();
nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
nVif[idx].recEnd = &nVif[idx].recPtr[nVif[idx].vifCache->getSize()-(_1mb/4)]; // .25mb Safe Zone
}
// Resets the recompiler of VIF unit 'idx' once the emit pointer has run past recEnd
// (i.e. into the safe zone reserved at the end of the code cache).
// All known blocks are forgotten AND the emit pointer is rewound to the start of the
// cache. Without the rewind every later block would still be emitted beyond recEnd
// (eventually past the end of the buffer), and the limit check would fire after every
// single recompile, so the freshly added block would be discarded each time.
_f void dVifRecLimit(int idx) {
	nVifStruct& v = nVif[idx];
	if (v.recPtr <= v.recEnd) return;

	v.vifBlocks->clear();
	DevCon.WriteLn("nVif Rec - Out of Rec Cache! [%x > %x]", v.recPtr, v.recEnd);
	v.recPtr = v.vifCache->getBlock(); // start emitting from the beginning again
}
// Emits the block prologue that loads the row/column registers the unpackers need.
//   v    - VIF unit state; v.vifBlock->mask is the 32-bit mask (sixteen 2-bit fields)
//   mask - non-zero when the unpack uses masking
//   mode - non-zero when an add-mode is active (the row register is then always needed)
//   cS   - cycle size; limits how many per-cycle column registers are prepared
_f void dVifSetMasks(nVifStruct& v, int mask, int mode, int cS) {
u32 m0 = v.vifBlock->mask;
// m1 keeps only the high bit of every 2-bit field.
u32 m1 = m0 & 0xaaaaaaaa;
// Even bits of m2 mark fields equal to 1 (row). NOTE(review): the odd bits of m2 also
// carry the high bits of m0, so m2 is non-zero for fields equal to 2 or 3 as well;
// the row load below is therefore conservative rather than exact.
u32 m2 =(~m1>>1) & m0;
// m3 has a bit set (even positions only) for every field equal to 2 (column).
u32 m3 = (m1>>1) & ~m0;
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
u32* col = (v.idx) ? g_vifmask.Col1 : g_vifmask.Col0;
if((m2&&mask) || mode) { xMOVAPS(xmmRow, ptr32[row]); }
if (m3&&mask) {
xMOVAPS(xmmCol0, ptr32[col]);
// Each byte of the mask belongs to one write cycle; a column register is only
// prepared when that cycle exists (cS) and actually uses a column field.
// xmmCol0 is the shuffle source for the others, so it is overwritten last.
if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
if ((cS>=4) && (m3&0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3);
if ((cS>=1) && (m3&0x000000ff)) xPSHUF.D(xmmCol0, xmmCol0, _v0);
}
//if (mask||mode) loadRowCol(v);
}
// Recompiles one unpack packet into x86 code at v.recPtr and stores the entry point
// in vB->startPtr.
//   v  - VIF unit state (code cache position, current block pointer)
//   vB - block descriptor; its upkType/cl/wl fields select what is generated
// The function walks the packet at compile time using the global vif/vifRegs state:
// vif->cl and vifRegs->num are modified while emitting and restored before returning.
// The generated code itself stores the final cl/num values when it is executed.
void dVifRecompile(nVifStruct& v, nVifBlock* vB) {
const bool isFill = (vB->cl < vB->wl);
// upkType layout: bit 5 = usn, bit 4 = masking, bits 0-3 = unpack format.
const int usn = (vB->upkType>>5)&1;
const int doMask = (vB->upkType>>4)&1;
const int upkNum = vB->upkType & 0xf;
// Per-format entry of nVifT; added to the source pointer (edx) after each vector.
const u32& vift = nVifT[upkNum];
const int doMode = vifRegs->mode & 3;
const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
const int skipSize = blockSize - cycleSize;
// With a single vector there is nothing after it, so the pointer increments are omitted.
const bool simpleBlock = (vifRegs->num == 1);
// Saved so the compile-time simulation below leaves the real VIF state untouched.
const int backupCL = vif->cl;
const int backupNum = vifRegs->num;
if (vif->cl >= blockSize) vif->cl = 0;
v.vifBlock = vB;
xSetPtr(v.recPtr);
xAlignPtr(16);
vB->startPtr = xGetPtr();
dVifSetMasks(v, doMask, doMode, cycleSize);
while (vifRegs->num) {
if (vif->cl < cycleSize) {
// Regular write: the unpackers receive mode and mask flag packed as (doMode<<1 | doMask).
xUnpack[upkNum](&v, doMode<<1 | doMask);
if (!simpleBlock) xADD(edx, vift);
if (!simpleBlock) xADD(ecx, 16);
vifRegs->num--;
if (++vif->cl == blockSize) vif->cl = 0;
}
else if (isFill) {
// Filling write: only the destination pointer advances, the source is not consumed.
DevCon.WriteLn("filling mode!");
xUnpack[upkNum](&v, 1);
xADD(ecx, 16);
vifRegs->num--;
if (++vif->cl == blockSize) vif->cl = 0;
}
else {
// Skipping write: jump the destination over the skipped vectors of this cycle.
xADD(ecx, 16 * skipSize);
vif->cl = 0;
}
}
if (doMode==2) writeBackRow(v);
// Emit stores of the post-unpack cl/num values (as computed by the simulation above).
xMOV(ptr32[&vif->cl], vif->cl);
xMOV(ptr32[&vifRegs->num], vifRegs->num);
xRET();
v.recPtr = xGetPtr();
vif->cl = backupCL;
vifRegs->num = backupNum;
}
static nVifBlock _vBlock = {0};

// Translates a VIF destination offset into a pointer inside the unit's VU memory.
// The offset is masked with v.vuMemLimit first. Returns NULL (after logging) when
// writing _vBlock.num vectors of 16 bytes from that position would run past
// v.vuMemEnd, so the caller can fall back to the interpreters, which have
// wrap-around logic.
_f u8* dVifsetVUptr(nVifStruct& v, int offset) {
	u8* const start = (u8*)(v.VU->Mem + (offset & v.vuMemLimit));
	u8* const stop  = start + _vBlock.num * 16;

	if (stop <= v.vuMemEnd) return start;

	DevCon.WriteLn("nVif - VU Mem Ptr Overflow!");
	return NULL;
}
// Entry point of the unpack dynarec for VIF unit 'idx'.
//   data - source packet data
//   size - size of the data (only forwarded to the interpreter fall-back)
// Builds a lookup key (_vBlock) from the current VIF state and runs the matching
// recompiled block. If the destination would overflow VU memory the interpreter is
// used instead. If no block exists yet, one is recompiled, registered, and the
// function calls itself again to execute it.
void dVifUnpack(int idx, u8 *data, u32 size) {

	nVifStruct& v = nVif[idx];
	vif     = v.vif;
	vifRegs = v.vifRegs;

	// Key layout of upkType: bit 5 = usn, bit 4 = masking, bits 0-3 = unpack format.
	const u8 upkType = (vif->tag.cmd & 0x1f) | ((!!(vif->usn)) << 5);

	_vBlock.upkType = upkType;
	_vBlock.num     = *(u8*)&vifRegs->num;
	_vBlock.mode    = *(u8*)&vifRegs->mode;
	_vBlock.scl     = vif->cl;
	_vBlock.cl      = vifRegs->cycle.cl;
	_vBlock.wl      = vifRegs->cycle.wl;
	_vBlock.mask    = vifRegs->mask;

	if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) {
		u8* dest = dVifsetVUptr(v, vif->tag.addr);
		if (!dest) {
			//DevCon.WriteLn("Running Interpreter Block");
			_nVifUnpack(idx, data, size);
		}
		else {
			//DevCon.WriteLn("Running Recompiled Block!");
			((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
		}
		return;
	}

	static int recBlockNum = 0;
	DevCon.WriteLn("nVif: Recompiled Block! [%d]", recBlockNum++);

	// HashBucket::add() stores its own copy of the descriptor, so a temporary is
	// enough here; the previous heap allocation was never freed and leaked one
	// nVifBlock per recompiled block.
	// memcpy (not assignment) is used on purpose: lookups compare raw bytes, so the
	// padding bytes must be carried over from _vBlock as well.
	nVifBlock vB;
	memcpy(&vB, &_vBlock, sizeof(nVifBlock));
	dVifRecompile(v, &vB);
	v.vifBlocks->add(&vB);
	v.vifBlock = NULL; // dVifRecompile pointed this at the temporary above

	dVifRecLimit(idx);
	dVifUnpack(idx, data, size);
}

View File

@ -0,0 +1,58 @@
#pragma once
// HashBucket is a container which uses a built-in hash function
// to perform quick searches.
// T is a struct data type.
// hSize determines the number of buckets HashBucket will use for sorting.
// cmpSize is the size of data to consider 2 structs equal (see find())
// The hash function is determined by taking the first bytes of data and
// performing a modulus the size of hSize. So the most diverse-data should
// be in the first bytes of the struct. (hence why nVifBlock is specifically sorted)
template<typename T, int hSize, int cmpSize>
class HashBucket {
private:
T* mChain[hSize];
int mSize [hSize];
public:
HashBucket() {
for (int i = 0; i < hSize; i++) {
mChain[i] = NULL;
mSize [i] = 0;
}
}
~HashBucket() { clear(); }
int quickFind(u32 data) {
int o = data % hSize;
return mSize[o];
}
T* find(T* dataPtr) {
u32 d = *((u32*)dataPtr);
int o = d % hSize;
int s = mSize[o];
T* c = mChain[o];
for (int i = 0; i < s; i++) {
if (!memcmp(&c[i], dataPtr, cmpSize)) return &c[i];
}
return NULL;
}
void add(T* dataPtr) {
u32 d = *(u32*)dataPtr;
int o = d % hSize;
int s = mSize[o]++;
T* c = mChain[o];
T* n = new T[s+1];
if (s) {
memcpy(n, c, sizeof(T) * s);
delete[] c;
}
memcpy(&n[s], dataPtr, sizeof(T));
mChain[o] = n;
}
void clear() {
for (int i = 0; i < hSize; i++) {
safe_delete_array(mChain[i]);
mSize[i] = 0;
}
}
};

273
pcsx2/x86/newVif_Tables.inl Normal file
View File

@ -0,0 +1,273 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
// Declares an unpack code generator: 'v' is the VIF unit being compiled for and
// 'doMask' carries the mask flag (bit 0) and the add-mode (bits 1 and up).
#define vUPK(x) void x(nVifStruct* v, int doMask)
// Non-zero when the block being compiled is an unsigned unpack (bit 5 of upkType).
#define _doUSN (v->vifBlock->upkType & 0x20)
// Drop any earlier definitions of these helper macros; the ones this file needs
// are defined further down.
#undef xMovDest
#undef xShiftR
#undef xPMOVXX8
#undef xPMOVXX16
#undef xMaskWrite
// Compresses the four even bits of x (bits 0, 2, 4, 6) into a 4-bit value, in
// reversed order: bit 6 -> bit 0, bit 4 -> bit 1, bit 2 -> bit 2, bit 0 -> bit 3.
// The result is used as the field-select argument of mVUmergeRegs.
#define makeMergeMask(x) { \
x = ((x&0x40)>>6) | ((x&0x10)>>3) | (x&4) | ((x&1)<<3); \
}
// Emits the masked and/or add-mode store of one unpacked vector to [ecx].
//   regX   - register holding the unpacked vector (expected to be xmm0 or xmm1)
//   v      - VIF unit; v->vifBlock->mask and v->vif->cl select the mask byte to apply
//   doMask - bit 0: apply the write mask; bits 1 and up: add-mode (0 = none)
void doMaskWrite(const xRegisterSSE& regX, nVifStruct* v, int doMask) {
if (regX.Id > 1) DevCon.WriteLn("Reg Overflow!!!");
int doMode = doMask>>1; doMask &= 1;
// One mask byte per write cycle; cycles past the fourth reuse the last byte.
int cc = aMin(v->vif->cl, 3);
u32 m0 = (v->vifBlock->mask >> (cc * 8)) & 0xff;
// Classify the four 2-bit fields of the mask byte. Only the even bits of m2..m5
// are meaningful; makeMergeMask() extracts exactly those.
u32 m1 = m0 & 0xaaaa;
u32 m2 =(~m1>>1) & m0; // field == 1 (row)
u32 m3 = (m1>>1) & ~m0; // field == 2 (col)
u32 m4 = (m1>>1) & m0; // field == 3 (write protect)
makeMergeMask(m2);
makeMergeMask(m3);
makeMergeMask(m4);
// Write-protected fields keep what is already in memory, so it is loaded first.
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr32[ecx]); } // Load Write Protect
if (doMask&&m2) { mVUmergeRegs(regX.Id, xmmRow.Id, m2); } // Merge Row
if (doMask&&m3) { mVUmergeRegs(regX.Id, xmmCol0.Id+cc, m3); } // Merge Col
if (doMask&&m4) { mVUmergeRegs(regX.Id, xmmTemp.Id, m4); } // Merge Write Protect
if (doMode) {
// m5 selects the fields that receive the row addition: fields whose mask
// value is 0, or all four fields when masking is off.
u32 m5 = (~m1>>1) & ~m0;
if (!doMask) m5 = 0xf;
else makeMergeMask(m5);
if (m5 < 0xf) {
// Partial add: build a vector that holds the row only in the selected
// fields (zero elsewhere) and add that.
xPXOR(xmmTemp, xmmTemp);
mVUmergeRegs(xmmTemp.Id, xmmRow.Id, m5);
xPADD.D(regX, xmmTemp);
// Mode 2 also copies the sums back into the row register.
if (doMode==2) mVUmergeRegs(xmmRow.Id, regX.Id, m5);
}
else if (m5 == 0xf) {
xPADD.D(regX, xmmRow);
if (doMode==2) xMOVAPS(xmmRow, regX);
}
}
xMOVAPS(ptr32[ecx], regX);
}
// Stores regX to the destination ([ecx]): a plain aligned store when neither
// masking nor an add-mode is requested (doMask == 0), doMaskWrite() otherwise.
// Expects 'v' and 'doMask' to be in scope (they are the vUPK parameters).
#define xMovDest(regX) { \
if (!doMask){ xMOVAPS (ptr32[ecx], regX); } \
else { doMaskWrite(regX, v, doMask); } \
}
// Shifts every dword of regX right by n bits: logical shift for unsigned
// unpacks, arithmetic (sign-preserving) shift otherwise.
#define xShiftR(regX, n) { \
if (_doUSN) { xPSRL.D(regX, n); } \
else { xPSRA.D(regX, n); } \
}
// Byte -> dword extension of src into regX: zero-extend for unsigned unpacks,
// sign-extend otherwise.
#define xPMOVXX8(regX, src) { \
if (_doUSN) xPMOVZX.BD(regX, src); \
else xPMOVSX.BD(regX, src); \
}
// Word -> dword extension of src into regX: zero-extend for unsigned unpacks,
// sign-extend otherwise.
#define xPMOVXX16(regX, src) { \
if (_doUSN) xPMOVZX.WD(regX, src); \
else xPMOVSX.WD(regX, src); \
}
// ecx = dest, edx = src
// S-32: reads one 32-bit value from the source (edx), replicates it into all four
// lanes (shuffle selector _v0) and stores the vector to the destination.
vUPK(nVif_S_32) {
xMOV32 (xmm0, ptr32[edx]);
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
// S-16: extends one 16-bit source value to 32 bits (zero- or sign-extended
// depending on _doUSN), replicates it into all four lanes and stores the vector.
vUPK(nVif_S_16) {
if (x86caps.hasStreamingSIMD4Extensions) {
// SSE4.1: a single word->dword extension instruction does the widening.
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
// Fallback: duplicate the word into both halves of a dword, then shift the
// copy in the upper half down by 16 to extend it.
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
// S-8: extends one 8-bit source value to 32 bits (zero- or sign-extended
// depending on _doUSN), replicates it into all four lanes and stores the vector.
vUPK(nVif_S_8) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
// Fallback: two unpacks copy the byte into every byte of a dword, then a
// right shift by 24 leaves the extended value.
xMOV8 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
// V2-32: loads 64 bits (two 32-bit values) from the source and stores the vector.
vUPK(nVif_V2_32) {
xMOV64 (xmm0, ptr32[edx]);
xMovDest (xmm0);
}
// V2-16: widens 16-bit source values to 32 bits each (zero- or sign-extended
// depending on _doUSN) and stores the vector.
vUPK(nVif_V2_16) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
// Fallback: load 32 bits (two words), double each word, shift right by 16.
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
// V2-8: widens 8-bit source values to 32 bits each (zero- or sign-extended
// depending on _doUSN) and stores the vector.
vUPK(nVif_V2_8) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
// Fallback: load 16 bits (two bytes), spread each byte over a dword, shift right by 24.
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
// V3-32: loads a full 128 bits from the source and stores the vector.
// The emitted code is the same as for V4-32.
vUPK(nVif_V3_32) {
xMOV128 (xmm0, ptr32[edx]);
xMovDest (xmm0);
}
// V3-16: widens 16-bit source values to 32 bits each (zero- or sign-extended
// depending on _doUSN) and stores the vector.
// The emitted code is the same as for V4-16.
vUPK(nVif_V3_16) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
// Fallback: load 64 bits (four words), double each word, shift right by 16.
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
// V3-8: widens 8-bit source values to 32 bits each (zero- or sign-extended
// depending on _doUSN) and stores the vector.
// The emitted code is the same as for V4-8.
vUPK(nVif_V3_8) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
// Fallback: load 32 bits (four bytes), spread each byte over a dword, shift right by 24.
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
// V4-32: loads 128 bits (four 32-bit values) from the source and stores the vector.
vUPK(nVif_V4_32) {
xMOV128 (xmm0, ptr32[edx]);
xMovDest (xmm0);
}
// V4-16: widens four 16-bit source values to 32 bits each (zero- or sign-extended
// depending on _doUSN) and stores the vector.
vUPK(nVif_V4_16) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
// Fallback: load 64 bits (four words), double each word, shift right by 16.
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
// V4-8: widens four 8-bit source values to 32 bits each (zero- or sign-extended
// depending on _doUSN) and stores the vector.
vUPK(nVif_V4_8) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
// Fallback: load 32 bits (four bytes), spread each byte over a dword, shift right by 24.
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
// V4-5: expands one 16-bit value packed as 5:5:5:1 (R in the low bits, then G, B
// and a 1-bit A on top) into four 8-bit components, one per lane.
// The value is replicated into every lane; each step shifts the next component
// into bits 3-7 (bit 7 for A) and merges one lane of xmm0 into the result in xmm1.
vUPK(nVif_V4_5) {
xMOV16 (xmm0, ptr32[edx]);
xPSHUF.D (xmm0, xmm0, _v0);
xPSLL.D (xmm0, 3); // ABG|R5.000
xMOVAPS (xmm1, xmm0); // x|x|x|R
xPSRL.D (xmm0, 8); // ABG
xPSLL.D (xmm0, 3); // AB|G5.000
mVUmergeRegs(XMM1, XMM0, 0x4); // x|x|G|R
xPSRL.D (xmm0, 8); // AB
xPSLL.D (xmm0, 3); // A|B5.000
mVUmergeRegs(XMM1, XMM0, 0x2); // x|B|G|R
xPSRL.D (xmm0, 8); // A
xPSLL.D (xmm0, 7); // A.0000000
mVUmergeRegs(XMM1, XMM0, 0x1); // A|B|G|R
// The shift pair below clears everything above the low 8 bits of each lane.
xPSLL.D (xmm1, 24); // can optimize to
xPSRL.D (xmm1, 24); // single AND...
xMovDest (xmm1);
}
// Placeholder for the unpack format slots that have no generator: emits no code
// and only reports the offending format number (low 4 bits of the command).
vUPK(nVif_unkown) {
Console.Error("nVif%d - Invalid Unpack! [%d]", v->idx, v->vif->tag.cmd & 0xf);
}
// Code generator lookup table, indexed by the unpack format number (low 4 bits of
// upkType). Slots 3, 7 and 11 have no valid format and map to nVif_unkown.
void (*xUnpack[16])(nVifStruct* v, int doMask) = {
nVif_S_32,
nVif_S_16,
nVif_S_8,
nVif_unkown,
nVif_V2_32,
nVif_V2_16,
nVif_V2_8,
nVif_unkown,
nVif_V3_32,
nVif_V3_16,
nVif_V3_8,
nVif_unkown,
nVif_V4_32,
nVif_V4_16,
nVif_V4_8,
nVif_V4_5,
};
// Loads Row/Col Data from vifRegs instead of g_vifmask
// Useful for testing vifReg and g_vifmask inconsistency.
// Emits code that leaves the assembled row vector in xmm6 (xmmRow) and one
// replicated column value in each of xmm2..xmm5 (xmmCol0..xmmCol3).
// Clobbers xmm0 and xmm1 as scratch registers.
void loadRowCol(nVifStruct& v) {
// Load r0..r3 and replicate the first lane of each across its register.
xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]);
xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]);
xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]);
xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]);
xPSHUF.D(xmm0, xmm0, _v0);
xPSHUF.D(xmm1, xmm1, _v0);
xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm6, xmm6, _v0);
// xmm6 starts out as r3 in every lane; one lane each is replaced by r0, r1 and r2.
mVUmergeRegs(XMM6, XMM0, 8);
mVUmergeRegs(XMM6, XMM1, 4);
mVUmergeRegs(XMM6, XMM2, 2);
// xmm2 is free again after the merge above and is reused for the first column.
xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]);
xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]);
xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]);
xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]);
xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm3, xmm3, _v0);
xPSHUF.D(xmm4, xmm4, _v0);
xPSHUF.D(xmm5, xmm5, _v0);
}
// Emits a store of the row register (xmmRow) into the g_vifmask row slot of the
// unit being compiled for (Row0 for VIF0, Row1 for VIF1). Used at the end of a
// block compiled with doMode 2, where the row register was modified.
void writeBackRow(nVifStruct& v) {
	u32* rowSlot = g_vifmask.Row0;
	if (v.idx) rowSlot = g_vifmask.Row1;

	xMOVAPS(ptr32[rowSlot], xmmRow);
	DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
	// ToDo: Do we need to write back to vifregs.rX too!? :/
}

View File

@ -22,11 +22,36 @@
static __aligned16 nVifStruct nVif[2];
static _f void _nVifUnpack(int idx, u8 *data, u32 size);
// Initializes newVif for one VIF unit (idx 0 -> VIF0/VU0, non-zero -> VIF1/VU1):
// fills in the unit's state, regenerates the static unpack routines in nVifUpkExec
// and, when the dynarec is enabled, hands over to dVifInit().
void initNewVif(int idx) {
nVif[idx].idx = idx;
nVif[idx].VU = idx ? &VU1 : &VU0;
nVif[idx].vif = idx ? &vif1 : &vif0;
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
nVif[idx].vifCache = NULL;
// Make the routine buffer writable, pre-fill it with 0xcc bytes, and emit into it.
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false);
memset8<0xcc>( nVifUpkExec );
xSetPtr( nVifUpkExec );
// One set of routines per combination: a = usn, b = mask, c = curCycle
// (the parameter order of nVifGen).
for (int a = 0; a < 2; a++) {
for (int b = 0; b < 2; b++) {
for (int c = 0; c < 4; c++) {
nVifGen(a, b, c);
}}}
// Generation is finished; switch the buffer's protection to Protect_ReadOnly.
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
// dVifInit() repeats the field setup above and additionally allocates the
// recompiler cache and block table.
if (newVifDynaRec) dVifInit(idx);
}
int nVifUnpack(int idx, u32 *data) {
XMMRegisters::Freeze();
int ret = aMin(vif1.vifpacketsize, vif1.tag.size);
vif1.tag.size -= ret;
_nVifUnpack(idx, (u8*)data, ret<<2);
if (newVifDynaRec) dVifUnpack(idx, (u8*)data, ret<<2);
else _nVifUnpack(idx, (u8*)data, ret<<2);
if (vif1.tag.size <= 0) {
vif1.tag.size = 0;
vif1.cmd = 0;
@ -59,9 +84,9 @@ static u32 oldMaskIdx = -1;
static u32 oldMask = 0;
static void setMasks(int idx, const VIFregisters& v) {
if (idx == oldMaskIdx && oldMask == v.mask) return;
oldMaskIdx = idx;
oldMask = v.mask;
//if (idx == oldMaskIdx && oldMask == v.mask) return;
//oldMaskIdx = idx;
//oldMask = v.mask;
//DevCon.WriteLn("mask");
for (int i = 0; i < 16; i++) {
int m = (v.mask >> (i*2)) & 3;
@ -115,9 +140,7 @@ __releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) {
const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
const int skipSize = blockSize - cycleSize;
//if (skipSize > 2)
//DevCon.WriteLn("[num = %d][cl = %d][bl = %d][diff = %d]", vifRegs->num, vif->cl, blockSize, skipSize);
//DevCon.WriteLn("[%d][%d][%d][num=%d][upk=%d][cl=%d][bl=%d][skip=%d]", isFill, doMask, doMode, vifRegs->num, upkNum, vif->cl, blockSize, skipSize);
if (vif->cmd & 0x10) setMasks(idx, *vifRegs);
@ -141,25 +164,20 @@ __releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) {
// mucks up compiler optimizations on the internal loops. >_< --air
const u8* vuMemBase = (idx ? VU1 : VU0).Mem;
u8* dest = setVUptr(idx, vuMemBase, vif->tag.addr);
if (vif->cl >= blockSize) vif->cl = 0;
while (vifRegs->num /*&& size*/) {
while (vifRegs->num && size) {
if (vif->cl < cycleSize) {
if (doMode /*|| doMask*/) {
//if (doMask)
if (doMode) {
//DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum);
func((u32*)dest, (u32*)data);
}
else {
//DevCon.WriteLn("SSE Unpack!");
// Opt note: removing this min check (which isn't needed right now?) is +1%
// or more. Just something to keep in mind. :) --air
fnbase[aMin(vif->cl, 4)](dest, data);
fnbase[aMin(vif->cl, 3)](dest, data);
}
data += ft.gsize;
//if( IsDebugBuild ) size -= ft.gsize; // only used below for assertion checking
//if( IsDebugBuild ) size -= ft.gsize; // only used below for assertion checking
vifRegs->num--;
incVUptrBy16(idx, dest, vuMemBase);
@ -187,34 +205,31 @@ static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] =
{
{
{ _nVifUnpackLoop<0,false,false>, _nVifUnpackLoop<0,false,true> },
{ _nVifUnpackLoop<0,true,false>, _nVifUnpackLoop<0,true,true> },
{ _nVifUnpackLoop<0,true, false>, _nVifUnpackLoop<0,true, true> },
},
{
{ _nVifUnpackLoop<1,false,false>, _nVifUnpackLoop<1,false,true> },
{ _nVifUnpackLoop<1,true,false>, _nVifUnpackLoop<1,true,true> },
{ _nVifUnpackLoop<1,true, false>, _nVifUnpackLoop<1,true, true> },
},
};
static _f void _nVifUnpack(int idx, u8 *data, u32 size) {
/*if (nVif[idx].vifRegs->cycle.cl >= nVif[idx].vifRegs->cycle.wl) { // skipping write
if (useOldUnpack) {
if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2);
else VIFunpack<1>((u32*)data, &vif1.tag, size>>2);
return;
}
else*/ { // filling write
vif = nVif[idx].vif;
vifRegs = nVif[idx].vifRegs;
const bool doMode = vifRegs->mode && !(vif->tag.cmd & 0x10);
const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
vif = nVif[idx].vif;
vifRegs = nVif[idx].vifRegs;
const bool doMode = vifRegs->mode && !(vif->tag.cmd & 0x10);
const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
UnpackLoopTable[idx][doMode][isFill]( data, size );
UnpackLoopTable[idx][doMode][isFill]( data, size );
//if (isFill)
//DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok"));
//DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num);
}
//if (isFill)
//DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok"));
//DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num);
}

View File

@ -17,20 +17,28 @@
#define xMaskWrite(regX) { \
xMOVAPS(xmm7, ptr32[ecx]); \
int offX = aMin(curCycle, 4); \
int offX = aMin(curCycle, 3); \
xPAND(regX, ptr32[nVifMask[0][offX]]); \
xPAND(xmm7, ptr32[nVifMask[1][offX]]); \
xPOR (regX, ptr32[nVifMask[2][offX]]); \
xPOR (regX, xmm7); \
xMOVAPS(ptr32[ecx], regX); \
}
#define xMovDest(regX) { \
if (mask==0) { xMOVAPS (ptr32[ecx], regX); } \
else { xMaskWrite(regX); } \
#define xMovDest(regX) { \
if (!mask) { xMOVAPS (ptr32[ecx], regX); } \
else { xMaskWrite(regX); } \
}
#define xShiftR(regX, n) { \
if (usn) { xPSRL.D(regX, n); } \
else { xPSRA.D(regX, n); } \
#define xShiftR(regX, n) { \
if (usn) { xPSRL.D(regX, n); } \
else { xPSRA.D(regX, n); } \
}
#define xPMOVXX8(regX, src) { \
if (usn) xPMOVZX.BD(regX, src); \
else xPMOVSX.BD(regX, src); \
}
#define xPMOVXX16(regX, src) { \
if (usn) xPMOVZX.WD(regX, src); \
else xPMOVSX.WD(regX, src); \
}
struct VifUnpackIndexer {
@ -74,18 +82,28 @@ void nVifGen(int usn, int mask, int curCycle) {
xRET();
indexer.xSetCall(0x1); // S-16
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
xRET();
indexer.xSetCall(0x2); // S-8
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV8 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
xRET();
@ -98,17 +116,27 @@ void nVifGen(int usn, int mask, int curCycle) {
xRET();
indexer.xSetCall(0x5); // V2-16
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
xRET();
indexer.xSetCall(0x6); // V2-8
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
xRET();
@ -120,17 +148,27 @@ void nVifGen(int usn, int mask, int curCycle) {
xRET();
indexer.xSetCall(0x9); // V3-16
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
xRET();
indexer.xSetCall(0xa); // V3-8
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
xRET();
@ -142,17 +180,27 @@ void nVifGen(int usn, int mask, int curCycle) {
xRET();
indexer.xSetCall(0xd); // V4-16
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
xRET();
indexer.xSetCall(0xe); // V4-8
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
xRET();
@ -190,27 +238,3 @@ void nVifGen(int usn, int mask, int curCycle) {
pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );
}
void initNewVif(int idx) {
nVif[idx].idx = idx;
nVif[idx].VU = idx ? &VU1 : &VU0;
nVif[idx].vif = idx ? &vif1 : &vif0;
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
nVif[idx].vifCache = NULL;
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false);
memset8<0xcc>( nVifUpkExec );
xSetPtr( nVifUpkExec );
for (int a = 0; a < 2; a++) {
for (int b = 0; b < 2; b++) {
for (int c = 0; c < 4; c++) {
nVifGen(a, b, c);
}
}}
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
}