mirror of https://github.com/PCSX2/pcsx2.git
Wrote a vif 'unpack' packet recompiler.
Compatibility is probably the same as the newVif interpreter code, but it's faster. Speed-wise it's similar to the old-vif unpack code (the one currently enabled by default in pcsx2). It's about 0~2% slower on my machine in my limited testing, but I assume people with SSE4.1 CPUs might get much better results, since I added a lot of SSE4.1 optimizations... The SSE4.1 optimizations were also ported to the newVif interpreter code. Also, the "filling mode" should be fast compared to the old-vif unpack code since it's SSE-optimized, but most games don't use this mode, so it hasn't been tested much... git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2358 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
4c2a7ae39f
commit
a5272f8dc9
|
@ -236,7 +236,7 @@ extern bool VIF1transfer(u32 *data, int size, bool istag);
|
|||
extern void vifMFIFOInterrupt();
|
||||
|
||||
// --------------------------------------------------------------------------------------
|
||||
// VIF SEE-optimized Masking Mess
|
||||
// VIF SSE-optimized Masking Mess
|
||||
// --------------------------------------------------------------------------------------
|
||||
|
||||
struct VifMaskTypes
|
||||
|
@ -245,7 +245,7 @@ struct VifMaskTypes
|
|||
u32 Row1[4], Col1[4];
|
||||
};
|
||||
|
||||
extern __aligned16 VifMaskTypes g_vifmask;
|
||||
extern __aligned16 VifMaskTypes g_vifmask; // This struct is used by newVif as well as oldVif code...
|
||||
|
||||
extern void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask);
|
||||
|
||||
|
|
|
@ -851,6 +851,10 @@
|
|||
RelativePath="..\..\x86\newVif_BlockBuffer.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\newVif_HashBucket.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath="..\..\x86\newVif_OldUnpack.inl"
|
||||
>
|
||||
|
@ -863,6 +867,18 @@
|
|||
RelativePath="..\..\x86\newVif_UnpackGen.inl"
|
||||
>
|
||||
</File>
|
||||
<Filter
|
||||
Name="Dynarec"
|
||||
>
|
||||
<File
|
||||
RelativePath="..\..\x86\newVif_Dynarec.inl"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath="..\..\x86\newVif_Tables.inl"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
</Filter>
|
||||
</Filter>
|
||||
<Filter
|
||||
|
|
|
@ -17,14 +17,20 @@
|
|||
|
||||
#ifdef newVif
|
||||
#include "newVif_BlockBuffer.h"
|
||||
#include "newVif_HashBucket.h"
|
||||
#include "x86emitter/x86emitter.h"
|
||||
using namespace x86Emitter;
|
||||
extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0);
|
||||
extern void nVifGen(int usn, int mask, int curCycle);
|
||||
extern void _nVifUnpack (int idx, u8 *data, u32 size);
|
||||
extern void dVifUnpack (int idx, u8 *data, u32 size);
|
||||
extern void dVifInit (int idx);
|
||||
|
||||
typedef u32 (__fastcall *nVifCall)(void*, void*);
|
||||
typedef void (__fastcall *nVifrecCall)(uptr dest, uptr src);
|
||||
|
||||
static __pagealigned u8 nVifUpkExec[__pagesize*4];
|
||||
static __aligned16 nVifCall nVifUpk[(2*2*16) *4 ]; // ([USN][Masking][Unpack Type]) [curCycle]
|
||||
static __aligned16 nVifCall nVifUpk[(2*2*16) *4]; // ([USN][Masking][Unpack Type]) [curCycle]
|
||||
static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
|
||||
|
||||
#define _1mb (0x100000)
|
||||
|
@ -35,17 +41,27 @@ static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Ve
|
|||
#define aMax(x, y) std::max(x,y)
|
||||
#define aMin(x, y) std::min(x,y)
|
||||
#define _f __forceinline
|
||||
#define xmmCol0 xmm2
|
||||
#define xmmCol1 xmm3
|
||||
#define xmmCol2 xmm4
|
||||
#define xmmCol3 xmm5
|
||||
#define xmmRow xmm6
|
||||
#define xmmTemp xmm7
|
||||
|
||||
struct nVifBlock {
|
||||
u8 upkType; // Unpack Type
|
||||
struct nVifBlock { // Ordered for Hashing
|
||||
u8 num; // Num Field
|
||||
u8 upkType; // Unpack Type [usn*1:mask*1:upk*4]
|
||||
u8 mode; // Mode Field
|
||||
u8 scl; // Start Cycle
|
||||
u8 cl; // CL Field
|
||||
u8 wl; // WL Field
|
||||
u32 mask; // Mask Field
|
||||
u8* startPtr; // Start Ptr of RecGen Code
|
||||
};
|
||||
|
||||
#define _hSize 0x4000 // [usn*1:mask*1:upk*4:num*8] hash...
|
||||
#define _cmpS (sizeof(nVifBlock) - sizeof(uptr))
|
||||
#define _tParams nVifBlock, _hSize, _cmpS
|
||||
struct nVifStruct {
|
||||
u32 idx; // VIF0 or VIF1
|
||||
vifStruct* vif; // Vif Struct ptr
|
||||
|
@ -53,10 +69,14 @@ struct nVifStruct {
|
|||
VURegs* VU; // VU Regs ptr
|
||||
u8* vuMemEnd; // End of VU Memory
|
||||
u32 vuMemLimit; // Use for fast AND
|
||||
u8* recPtr; // Cur Pos to recompile to
|
||||
u8* recEnd; // End of Rec Cache
|
||||
BlockBuffer* vifCache; // Block Buffer
|
||||
HashBucket<_tParams>* vifBlocks; // Vif Blocks
|
||||
nVifBlock* vifBlock; // Current Vif Block Ptr
|
||||
};
|
||||
|
||||
// Contents of this table are doubled up for doMast(false) and doMask(true) lookups.
|
||||
// Contents of this table are doubled up for doMask(false) and doMask(true) lookups.
|
||||
// (note: currently unused, I'm using gsize in the interp tables instead since it
|
||||
// seems to be faster for now, which may change when nVif isn't reliant on interpreted
|
||||
// unpackers anymore --air)
|
||||
|
@ -98,10 +118,13 @@ static const u32 nVifT[32] = {
|
|||
2, // V4-5
|
||||
};
|
||||
|
||||
#define useOldUnpack 0 // Use code in newVif_OldUnpack.inl
|
||||
#define newVifDynaRec 1 // Use code in newVif_Dynarec.inl
|
||||
#include "newVif_OldUnpack.inl"
|
||||
#include "newVif_Unpack.inl"
|
||||
#include "newVif_UnpackGen.inl"
|
||||
|
||||
//#include "newVif_Dynarec.inl"
|
||||
#include "newVif_Tables.inl"
|
||||
#include "newVif_Dynarec.inl"
|
||||
|
||||
#endif
|
||||
|
|
|
@ -44,6 +44,7 @@ public:
|
|||
mSizeT = newSize;
|
||||
}
|
||||
void clear() { mSize = 0; }
|
||||
u32 getSize() { return mSize; }
|
||||
u32 getCurSize() { return mSize; }
|
||||
u32 getSize() { return mSizeT; }
|
||||
u8* getBlock() { return mData; }
|
||||
};
|
||||
|
|
|
@ -0,0 +1,157 @@
|
|||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2009 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
// newVif Dynarec - Dynamically Recompiles Vif 'unpack' Packets
|
||||
// authors: cottonvibes(@gmail.com)
|
||||
// Jake.Stine (@gmail.com)
|
||||
|
||||
#pragma once
|
||||
|
||||
void dVifInit(int idx) {
|
||||
nVif[idx].idx = idx;
|
||||
nVif[idx].VU = idx ? &VU1 : &VU0;
|
||||
nVif[idx].vif = idx ? &vif1 : &vif0;
|
||||
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
|
||||
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
|
||||
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
|
||||
nVif[idx].vifCache = new BlockBuffer(_1mb*4); // 4mb Rec Cache
|
||||
nVif[idx].vifBlocks = new HashBucket<_tParams>();
|
||||
nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
|
||||
nVif[idx].recEnd = &nVif[idx].recPtr[nVif[idx].vifCache->getSize()-(_1mb/4)]; // .25mb Safe Zone
|
||||
}
|
||||
|
||||
// Checks whether the recompiler write pointer of unit `idx` has moved past recEnd
// (the start of the safe zone at the tail of the code cache). When it has, all
// cached blocks are dropped and recPtr is rewound to the start of the cache, so that
// later blocks are emitted inside the buffer again. Without the rewind the pointer
// would keep growing past the cache and this check would fire on every call.
_f void dVifRecLimit(int idx) {
	nVifStruct& v = nVif[idx];
	if (v.recPtr > v.recEnd) {
		v.vifBlocks->clear();
		DevCon.WriteLn("nVif Rec - Out of Rec Cache! [%x > %x]", v.recPtr, v.recEnd);
		// Logged above with the overflowing value; now restart at the cache base.
		v.recPtr = v.vifCache->getBlock();
	}
}
|
||||
|
||||
// Emits the loads that preload the row/column registers for the block being
// recompiled (v.vifBlock).
//   mask - nonzero when the unpack uses masking
//   mode - nonzero when a mode is active (the row register is then always loaded)
//   cS   - cycle size; limits how many column registers are prepared
// The block's mask is made of 2-bit fields: fields equal to 1 are the ones
// doMaskWrite() merges from the row register, fields equal to 2 from a column register.
_f void dVifSetMasks(nVifStruct& v, int mask, int mode, int cS) {
	const u32 fullMask = v.vifBlock->mask;
	const u32 highBits = fullMask & 0xaaaaaaaa;        // upper bit of every field
	const u32 rowBits  = (~highBits>>1) & fullMask;    // fields == 1
	const u32 colBits  = ( highBits>>1) & ~fullMask;   // fields == 2

	u32* rowSrc;
	u32* colSrc;
	if (v.idx) { rowSrc = g_vifmask.Row1; colSrc = g_vifmask.Col1; }
	else       { rowSrc = g_vifmask.Row0; colSrc = g_vifmask.Col0; }

	if ((rowBits && mask) || mode) {
		xMOVAPS(xmmRow, ptr32[rowSrc]);
	}

	if (!(colBits && mask)) return;

	// xmmCol0 holds the loaded column data and is the shuffle source for the other
	// column registers, so it is shuffled in place only after they have been filled.
	xMOVAPS(xmmCol0, ptr32[colSrc]);
	if ((cS>=2) && (colBits&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
	if ((cS>=3) && (colBits&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
	if ((cS>=4) && (colBits&0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3);
	if ((cS>=1) && (colBits&0x000000ff)) xPSHUF.D(xmmCol0, xmmCol0, _v0);
}
|
||||
|
||||
// Recompiles one unpack packet into native code at v.recPtr and stores the entry
// point in vB->startPtr. The packet is unrolled completely: one unpack routine is
// emitted per vector while vifRegs->num and vif->cl are stepped at recompile time;
// both are restored to their original values before this function returns.
// In the emitted code ecx is the destination pointer and edx the source pointer.
void dVifRecompile(nVifStruct& v, nVifBlock* vB) {
|
||||
// Filling write when CL < WL, skipping write otherwise.
const bool isFill = (vB->cl < vB->wl);
|
||||
// upkType layout: bit 5 = usn, bit 4 = mask enable, bits 0-3 = unpack type.
// (usn is decoded here but not referenced again in this function.)
const int usn = (vB->upkType>>5)&1;
|
||||
const int doMask = (vB->upkType>>4)&1;
|
||||
const int upkNum = vB->upkType & 0xf;
|
||||
// Amount added to the source pointer (edx) after each unpacked vector.
const u32& vift = nVifT[upkNum];
|
||||
const int doMode = vifRegs->mode & 3;
|
||||
const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
|
||||
const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
|
||||
const int skipSize = blockSize - cycleSize;
|
||||
// Single-vector packets skip the pointer increments after the unpack.
const bool simpleBlock = (vifRegs->num == 1);
|
||||
// vif->cl and vifRegs->num are used as recompile-time counters below.
const int backupCL = vif->cl;
|
||||
const int backupNum = vifRegs->num;
|
||||
if (vif->cl >= blockSize) vif->cl = 0;
|
||||
|
||||
// The unpack emitters read the current block through v.vifBlock.
v.vifBlock = vB;
|
||||
xSetPtr(v.recPtr);
|
||||
// Block entry points are aligned to 16 bytes.
xAlignPtr(16);
|
||||
vB->startPtr = xGetPtr();
|
||||
dVifSetMasks(v, doMask, doMode, cycleSize);
|
||||
|
||||
while (vifRegs->num) {
|
||||
// Write cycle: unpack one vector and advance source and destination.
if (vif->cl < cycleSize) {
|
||||
xUnpack[upkNum](&v, doMode<<1 | doMask);
|
||||
if (!simpleBlock) xADD(edx, vift);
|
||||
if (!simpleBlock) xADD(ecx, 16);
|
||||
vifRegs->num--;
|
||||
if (++vif->cl == blockSize) vif->cl = 0;
|
||||
}
|
||||
// Fill cycle: the emitter is called with 1 (mask flag set, mode 0) and only
// the destination pointer advances.
else if (isFill) {
|
||||
DevCon.WriteLn("filling mode!");
|
||||
xUnpack[upkNum](&v, 1);
|
||||
xADD(ecx, 16);
|
||||
vifRegs->num--;
|
||||
if (++vif->cl == blockSize) vif->cl = 0;
|
||||
}
|
||||
// Skip cycle: move the destination past the skipped vectors without writing.
else {
|
||||
xADD(ecx, 16 * skipSize);
|
||||
vif->cl = 0;
|
||||
}
|
||||
}
|
||||
if (doMode==2) writeBackRow(v);
|
||||
// Emit stores of the final counter values computed above into vif->cl and
// vifRegs->num, then return from the generated block.
xMOV(ptr32[&vif->cl], vif->cl);
|
||||
xMOV(ptr32[&vifRegs->num], vifRegs->num);
|
||||
xRET();
|
||||
// Next block is emitted right after this one.
v.recPtr = xGetPtr();
|
||||
// Undo the recompile-time stepping of the live VIF state.
vif->cl = backupCL;
|
||||
vifRegs->num = backupNum;
|
||||
}
|
||||
|
||||
static nVifBlock _vBlock = {0};
|
||||
|
||||
// Translates a VU memory offset into a host pointer for the packet described by
// _vBlock. The offset is masked with v.vuMemLimit; if _vBlock.num vectors of 16
// bytes starting there would run past v.vuMemEnd, NULL is returned so the caller
// falls back to the interpreters, which have wrap-around logic.
_f u8* dVifsetVUptr(nVifStruct& v, int offset) {
	u8* const first = (u8*)(v.VU->Mem + (offset & v.vuMemLimit));
	u8* const last  = first + _vBlock.num * 16;

	if (last <= v.vuMemEnd) return first;

	DevCon.WriteLn("nVif - VU Mem Ptr Overflow!");
	return NULL;
}
|
||||
|
||||
void dVifUnpack(int idx, u8 *data, u32 size) {
|
||||
|
||||
nVifStruct& v = nVif[idx];
|
||||
vif = v.vif;
|
||||
vifRegs = v.vifRegs;
|
||||
const u8 upkType = vif->tag.cmd & 0x1f | ((!!(vif->usn)) << 5);
|
||||
|
||||
_vBlock.upkType = upkType;
|
||||
_vBlock.num = *(u8*)&vifRegs->num;
|
||||
_vBlock.mode = *(u8*)&vifRegs->mode;
|
||||
_vBlock.scl = vif->cl;
|
||||
_vBlock.cl = vifRegs->cycle.cl;
|
||||
_vBlock.wl = vifRegs->cycle.wl;
|
||||
_vBlock.mask = vifRegs->mask;
|
||||
|
||||
if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) {
|
||||
u8* dest = dVifsetVUptr(v, vif->tag.addr);
|
||||
if (!dest) {
|
||||
//DevCon.WriteLn("Running Interpreter Block");
|
||||
_nVifUnpack(idx, data, size);
|
||||
}
|
||||
else {
|
||||
//DevCon.WriteLn("Running Recompiled Block!");
|
||||
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
|
||||
}
|
||||
return;
|
||||
}
|
||||
static int recBlockNum = 0;
|
||||
DevCon.WriteLn("nVif: Recompiled Block! [%d]", recBlockNum++);
|
||||
nVifBlock* vB = new nVifBlock();
|
||||
memcpy(vB, &_vBlock, sizeof(nVifBlock));
|
||||
dVifRecompile(v, vB);
|
||||
v.vifBlocks->add(vB);
|
||||
dVifRecLimit(idx);
|
||||
dVifUnpack(idx, data, size);
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
// HashBucket is a container which uses a built-in hash function
|
||||
// to perform quick searches.
|
||||
// T is a struct data type.
|
||||
// hSize determines the number of buckets HashBucket will use for sorting.
|
||||
// cmpSize is the size of data to consider 2 structs equal (see find())
|
||||
// The hash function is determined by taking the first bytes of data and
|
||||
// performing a modulus the size of hSize. So the most diverse-data should
|
||||
// be in the first bytes of the struct. (hence why nVifBlock is specifically sorted)
|
||||
template<typename T, int hSize, int cmpSize>
|
||||
class HashBucket {
|
||||
private:
|
||||
T* mChain[hSize];
|
||||
int mSize [hSize];
|
||||
public:
|
||||
HashBucket() {
|
||||
for (int i = 0; i < hSize; i++) {
|
||||
mChain[i] = NULL;
|
||||
mSize [i] = 0;
|
||||
}
|
||||
}
|
||||
~HashBucket() { clear(); }
|
||||
int quickFind(u32 data) {
|
||||
int o = data % hSize;
|
||||
return mSize[o];
|
||||
}
|
||||
T* find(T* dataPtr) {
|
||||
u32 d = *((u32*)dataPtr);
|
||||
int o = d % hSize;
|
||||
int s = mSize[o];
|
||||
T* c = mChain[o];
|
||||
for (int i = 0; i < s; i++) {
|
||||
if (!memcmp(&c[i], dataPtr, cmpSize)) return &c[i];
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
void add(T* dataPtr) {
|
||||
u32 d = *(u32*)dataPtr;
|
||||
int o = d % hSize;
|
||||
int s = mSize[o]++;
|
||||
T* c = mChain[o];
|
||||
T* n = new T[s+1];
|
||||
if (s) {
|
||||
memcpy(n, c, sizeof(T) * s);
|
||||
delete[] c;
|
||||
}
|
||||
memcpy(&n[s], dataPtr, sizeof(T));
|
||||
mChain[o] = n;
|
||||
}
|
||||
void clear() {
|
||||
for (int i = 0; i < hSize; i++) {
|
||||
safe_delete_array(mChain[i]);
|
||||
mSize[i] = 0;
|
||||
}
|
||||
}
|
||||
};
|
|
@ -0,0 +1,273 @@
|
|||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2009 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#define vUPK(x) void x(nVifStruct* v, int doMask)
|
||||
#define _doUSN (v->vifBlock->upkType & 0x20)
|
||||
#undef xMovDest
|
||||
#undef xShiftR
|
||||
#undef xPMOVXX8
|
||||
#undef xPMOVXX16
|
||||
#undef xMaskWrite
|
||||
#define makeMergeMask(x) { \
|
||||
x = ((x&0x40)>>6) | ((x&0x10)>>3) | (x&4) | ((x&1)<<3); \
|
||||
}
|
||||
// Emits the masked and/or mode-adjusted store of regX to [ecx].
//   regX   - register holding the freshly unpacked vector
//   v      - unit state; v->vif->cl picks the mask byte, v->vifBlock->mask holds the mask
//   doMask - packed argument: bit 0 = mask enable, remaining bits = mode
void doMaskWrite(const xRegisterSSE& regX, nVifStruct* v, int doMask) {
|
||||
if (regX.Id > 1) DevCon.WriteLn("Reg Overflow!!!");
|
||||
// Split the packed argument into mode and mask flag.
int doMode = doMask>>1; doMask &= 1;
|
||||
// Cycle index, clamped to 3, selects one byte of the 32-bit mask.
int cc = aMin(v->vif->cl, 3);
|
||||
u32 m0 = (v->vifBlock->mask >> (cc * 8)) & 0xff;
|
||||
// m0 holds four 2-bit fields; m1 keeps the upper bit of each field.
u32 m1 = m0 & 0xaaaa;
|
||||
// m2: fields == 1, m3: fields == 2, m4: fields == 3 (each flagged in the field's low bit).
u32 m2 =(~m1>>1) & m0;
|
||||
u32 m3 = (m1>>1) & ~m0;
|
||||
u32 m4 = (m1>>1) & m0;
|
||||
// Compress bits 0/2/4/6 into a 4-bit merge mask (bit 0 -> bit 3 ... bit 6 -> bit 0).
makeMergeMask(m2);
|
||||
makeMergeMask(m3);
|
||||
makeMergeMask(m4);
|
||||
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr32[ecx]); } // Load Write Protect
|
||||
if (doMask&&m2) { mVUmergeRegs(regX.Id, xmmRow.Id, m2); } // Merge Row
|
||||
if (doMask&&m3) { mVUmergeRegs(regX.Id, xmmCol0.Id+cc, m3); } // Merge Col
|
||||
if (doMask&&m4) { mVUmergeRegs(regX.Id, xmmTemp.Id, m4); } // Merge Write Protect
|
||||
if (doMode) {
|
||||
// m5: fields == 0; with masking disabled every field is treated as selected.
u32 m5 = (~m1>>1) & ~m0;
|
||||
if (!doMask) m5 = 0xf;
|
||||
else makeMergeMask(m5);
|
||||
// Partial selection: add the row only to the selected fields via a zeroed temp.
if (m5 < 0xf) {
|
||||
xPXOR(xmmTemp, xmmTemp);
|
||||
mVUmergeRegs(xmmTemp.Id, xmmRow.Id, m5);
|
||||
xPADD.D(regX, xmmTemp);
|
||||
// Mode 2 also copies the result back into the row register.
if (doMode==2) mVUmergeRegs(xmmRow.Id, regX.Id, m5);
|
||||
}
|
||||
// All fields selected: add the whole row register directly.
else if (m5 == 0xf) {
|
||||
xPADD.D(regX, xmmRow);
|
||||
if (doMode==2) xMOVAPS(xmmRow, regX);
|
||||
}
|
||||
}
|
||||
xMOVAPS(ptr32[ecx], regX);
|
||||
}
|
||||
#define xMovDest(regX) { \
|
||||
if (!doMask){ xMOVAPS (ptr32[ecx], regX); } \
|
||||
else { doMaskWrite(regX, v, doMask); } \
|
||||
}
|
||||
#define xShiftR(regX, n) { \
|
||||
if (_doUSN) { xPSRL.D(regX, n); } \
|
||||
else { xPSRA.D(regX, n); } \
|
||||
}
|
||||
#define xPMOVXX8(regX, src) { \
|
||||
if (_doUSN) xPMOVZX.BD(regX, src); \
|
||||
else xPMOVSX.BD(regX, src); \
|
||||
}
|
||||
#define xPMOVXX16(regX, src) { \
|
||||
if (_doUSN) xPMOVZX.WD(regX, src); \
|
||||
else xPMOVSX.WD(regX, src); \
|
||||
}
|
||||
|
||||
// ecx = dest, edx = src
|
||||
// S-32 emitter: loads the source at [edx] with xMOV32, shuffles it into xmm1 using
// the _v0 pattern and stores xmm1 through xMovDest (plain or masked write to [ecx]).
vUPK(nVif_S_32) {
	xMOV32  (xmm0, ptr32[edx]);
	xPSHUF.D(xmm1, xmm0, _v0);
	xMovDest(xmm1);
}
|
||||
|
||||
// S-16 emitter. Widens the 16-bit source to 32 bits, shuffles it into xmm1 with the
// _v0 pattern and stores it through xMovDest.
// Without SSE4.1 the value is duplicated with PUNPCKLWD and shifted right by 16
// (logical for unsigned unpacks, arithmetic otherwise); with SSE4.1 a single
// zero-/sign-extending move is emitted instead.
vUPK(nVif_S_16) {
	if (!x86caps.hasStreamingSIMD4Extensions) {
		xMOV16     (xmm0, ptr32[edx]);
		xPUNPCK.LWD(xmm0, xmm0);
		xShiftR    (xmm0, 16);
	}
	else {
		xPMOVXX16  (xmm0, ptr64[edx]);
	}
	xPSHUF.D(xmm1, xmm0, _v0);
	xMovDest(xmm1);
}
|
||||
|
||||
// S-8 emitter. Widens the 8-bit source to 32 bits, shuffles it into xmm1 with the
// _v0 pattern and stores it through xMovDest.
// Without SSE4.1 the byte is duplicated with PUNPCKLBW + PUNPCKLWD and shifted right
// by 24 (logical for unsigned unpacks, arithmetic otherwise); with SSE4.1 a single
// zero-/sign-extending move is emitted instead.
vUPK(nVif_S_8) {
	if (!x86caps.hasStreamingSIMD4Extensions) {
		xMOV8      (xmm0, ptr32[edx]);
		xPUNPCK.LBW(xmm0, xmm0);
		xPUNPCK.LWD(xmm0, xmm0);
		xShiftR    (xmm0, 24);
	}
	else {
		xPMOVXX8   (xmm0, ptr32[edx]);
	}
	xPSHUF.D(xmm1, xmm0, _v0);
	xMovDest(xmm1);
}
|
||||
|
||||
// V2-32 emitter: loads the source at [edx] with xMOV64 and stores xmm0 through
// xMovDest (plain or masked write to [ecx]).
vUPK(nVif_V2_32) {
	xMOV64  (xmm0, ptr32[edx]);
	xMovDest(xmm0);
}
|
||||
|
||||
// V2-16 emitter. Widens 16-bit source elements to 32 bits in xmm0 and stores the
// result through xMovDest.
// Without SSE4.1: xMOV32 load, PUNPCKLWD duplicate, shift right by 16 (logical for
// unsigned unpacks, arithmetic otherwise). With SSE4.1: one extending move.
vUPK(nVif_V2_16) {
	if (!x86caps.hasStreamingSIMD4Extensions) {
		xMOV32     (xmm0, ptr32[edx]);
		xPUNPCK.LWD(xmm0, xmm0);
		xShiftR    (xmm0, 16);
	}
	else {
		xPMOVXX16  (xmm0, ptr64[edx]);
	}
	xMovDest(xmm0);
}
|
||||
|
||||
// V2-8 emitter. Widens 8-bit source elements to 32 bits in xmm0 and stores the
// result through xMovDest.
// Without SSE4.1: xMOV16 load, PUNPCKLBW + PUNPCKLWD duplicate, shift right by 24
// (logical for unsigned unpacks, arithmetic otherwise). With SSE4.1: one extending move.
vUPK(nVif_V2_8) {
	if (!x86caps.hasStreamingSIMD4Extensions) {
		xMOV16     (xmm0, ptr32[edx]);
		xPUNPCK.LBW(xmm0, xmm0);
		xPUNPCK.LWD(xmm0, xmm0);
		xShiftR    (xmm0, 24);
	}
	else {
		xPMOVXX8   (xmm0, ptr32[edx]);
	}
	xMovDest(xmm0);
}
|
||||
|
||||
// V3-32 emitter: loads the source at [edx] with xMOV128 and stores xmm0 through
// xMovDest (plain or masked write to [ecx]).
vUPK(nVif_V3_32) {
	xMOV128 (xmm0, ptr32[edx]);
	xMovDest(xmm0);
}
|
||||
|
||||
// V3-16 emitter. Widens 16-bit source elements to 32 bits in xmm0 and stores the
// result through xMovDest.
// Without SSE4.1: xMOV64 load, PUNPCKLWD duplicate, shift right by 16 (logical for
// unsigned unpacks, arithmetic otherwise). With SSE4.1: one extending move.
vUPK(nVif_V3_16) {
	if (!x86caps.hasStreamingSIMD4Extensions) {
		xMOV64     (xmm0, ptr32[edx]);
		xPUNPCK.LWD(xmm0, xmm0);
		xShiftR    (xmm0, 16);
	}
	else {
		xPMOVXX16  (xmm0, ptr64[edx]);
	}
	xMovDest(xmm0);
}
|
||||
|
||||
// V3-8 emitter. Widens 8-bit source elements to 32 bits in xmm0 and stores the
// result through xMovDest.
// Without SSE4.1: xMOV32 load, PUNPCKLBW + PUNPCKLWD duplicate, shift right by 24
// (logical for unsigned unpacks, arithmetic otherwise). With SSE4.1: one extending move.
vUPK(nVif_V3_8) {
	if (!x86caps.hasStreamingSIMD4Extensions) {
		xMOV32     (xmm0, ptr32[edx]);
		xPUNPCK.LBW(xmm0, xmm0);
		xPUNPCK.LWD(xmm0, xmm0);
		xShiftR    (xmm0, 24);
	}
	else {
		xPMOVXX8   (xmm0, ptr32[edx]);
	}
	xMovDest(xmm0);
}
|
||||
|
||||
// V4-32 emitter: loads the source at [edx] with xMOV128 and stores xmm0 through
// xMovDest (plain or masked write to [ecx]).
vUPK(nVif_V4_32) {
	xMOV128 (xmm0, ptr32[edx]);
	xMovDest(xmm0);
}
|
||||
|
||||
// V4-16 emitter. Widens 16-bit source elements to 32 bits in xmm0 and stores the
// result through xMovDest.
// Without SSE4.1: xMOV64 load, PUNPCKLWD duplicate, shift right by 16 (logical for
// unsigned unpacks, arithmetic otherwise). With SSE4.1: one extending move.
vUPK(nVif_V4_16) {
	if (!x86caps.hasStreamingSIMD4Extensions) {
		xMOV64     (xmm0, ptr32[edx]);
		xPUNPCK.LWD(xmm0, xmm0);
		xShiftR    (xmm0, 16);
	}
	else {
		xPMOVXX16  (xmm0, ptr64[edx]);
	}
	xMovDest(xmm0);
}
|
||||
|
||||
// V4-8 emitter. Widens 8-bit source elements to 32 bits in xmm0 and stores the
// result through xMovDest.
// Without SSE4.1: xMOV32 load, PUNPCKLBW + PUNPCKLWD duplicate, shift right by 24
// (logical for unsigned unpacks, arithmetic otherwise). With SSE4.1: one extending move.
vUPK(nVif_V4_8) {
	if (!x86caps.hasStreamingSIMD4Extensions) {
		xMOV32     (xmm0, ptr32[edx]);
		xPUNPCK.LBW(xmm0, xmm0);
		xPUNPCK.LWD(xmm0, xmm0);
		xShiftR    (xmm0, 24);
	}
	else {
		xPMOVXX8   (xmm0, ptr32[edx]);
	}
	xMovDest(xmm0);
}
|
||||
|
||||
// V4-5 emitter. Expands a packed 16-bit A|B|G|R value (three 5-bit fields plus a
// 1-bit A field) into four 32-bit lanes of xmm1 and stores them through xMovDest.
// xmm0 is the working copy that gets shifted field by field; after each step one
// more lane of xmm1 is merged in from it.
vUPK(nVif_V4_5) {
	xMOV16      (xmm0, ptr32[edx]);
	xPSHUF.D    (xmm0, xmm0, _v0);

	xPSLL.D     (xmm0, 3);            // ABG|R5.000
	xMOVAPS     (xmm1, xmm0);         // x|x|x|R

	xPSRL.D     (xmm0, 8);            // ABG
	xPSLL.D     (xmm0, 3);            // AB|G5.000
	mVUmergeRegs(XMM1, XMM0, 0x4);    // x|x|G|R

	xPSRL.D     (xmm0, 8);            // AB
	xPSLL.D     (xmm0, 3);            // A|B5.000
	mVUmergeRegs(XMM1, XMM0, 0x2);    // x|B|G|R

	xPSRL.D     (xmm0, 8);            // A
	xPSLL.D     (xmm0, 7);            // A.0000000
	mVUmergeRegs(XMM1, XMM0, 0x1);    // A|B|G|R

	// Keep only the low 8 bits of every lane (shift pair; a single AND would also do).
	xPSLL.D     (xmm1, 24);
	xPSRL.D     (xmm1, 24);
	xMovDest    (xmm1);
}
|
||||
|
||||
// Placeholder for the unused slots of the xUnpack table: emits no code and only
// reports the unit index and the low 4 bits of the current tag command.
vUPK(nVif_unkown) {
	Console.Error(
		"nVif%d - Invalid Unpack! [%d]",
		v->idx,
		v->vif->tag.cmd & 0xf
	);
}
|
||||
|
||||
// Emitter dispatch table, indexed by the low 4 bits of nVifBlock::upkType.
// Each row below covers four consecutive indices; the last slot of the first three
// rows has no unpack format and maps to nVif_unkown.
void (*xUnpack[16])(nVifStruct* v, int doMask) = {
	nVif_S_32,  nVif_S_16,  nVif_S_8,  nVif_unkown, // 0x0 - 0x3
	nVif_V2_32, nVif_V2_16, nVif_V2_8, nVif_unkown, // 0x4 - 0x7
	nVif_V3_32, nVif_V3_16, nVif_V3_8, nVif_unkown, // 0x8 - 0xb
	nVif_V4_32, nVif_V4_16, nVif_V4_8, nVif_V4_5,   // 0xc - 0xf
};
|
||||
|
||||
// Loads Row/Col Data from vifRegs instead of g_vifmask
|
||||
// Useful for testing vifReg and g_vifmask inconsistency.
|
||||
// Emits code that rebuilds the row register and the four column registers from
// v.vifRegs (r0-r3 / c0-c3) rather than from g_vifmask.
// xmm0, xmm1 and xmm2 are used as scratch while the row register is assembled;
// xmm2 is reloaded afterwards as the first column register.
void loadRowCol(nVifStruct& v) {
	// Row: load r0..r3, shuffle each with _v0, then merge r0..r2 into the register
	// that holds r3.
	xMOVAPS (xmm0,   ptr32[&v.vifRegs->r0]);
	xMOVAPS (xmm1,   ptr32[&v.vifRegs->r1]);
	xMOVAPS (xmm2,   ptr32[&v.vifRegs->r2]);
	xMOVAPS (xmmRow, ptr32[&v.vifRegs->r3]);
	xPSHUF.D(xmm0,   xmm0,   _v0);
	xPSHUF.D(xmm1,   xmm1,   _v0);
	xPSHUF.D(xmm2,   xmm2,   _v0);
	xPSHUF.D(xmmRow, xmmRow, _v0);
	mVUmergeRegs(XMM6, XMM0, 8);
	mVUmergeRegs(XMM6, XMM1, 4);
	mVUmergeRegs(XMM6, XMM2, 2);

	// Columns: one register per column value, each shuffled with _v0.
	xMOVAPS (xmmCol0, ptr32[&v.vifRegs->c0]);
	xMOVAPS (xmmCol1, ptr32[&v.vifRegs->c1]);
	xMOVAPS (xmmCol2, ptr32[&v.vifRegs->c2]);
	xMOVAPS (xmmCol3, ptr32[&v.vifRegs->c3]);
	xPSHUF.D(xmmCol0, xmmCol0, _v0);
	xPSHUF.D(xmmCol1, xmmCol1, _v0);
	xPSHUF.D(xmmCol2, xmmCol2, _v0);
	xPSHUF.D(xmmCol3, xmmCol3, _v0);
}
|
||||
|
||||
// Emits a store of the row register back into the g_vifmask row of this unit
// (Row1 when v.idx is nonzero, Row0 otherwise). The log line is printed while
// recompiling, not when the generated code runs.
void writeBackRow(nVifStruct& v) {
	u32* rowDest;
	if (v.idx) rowDest = g_vifmask.Row1;
	else       rowDest = g_vifmask.Row0;

	xMOVAPS(ptr32[rowDest], xmmRow);
	DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
	// ToDo: Do we need to write back to vifregs.rX too!? :/
}
|
|
@ -22,11 +22,36 @@
|
|||
static __aligned16 nVifStruct nVif[2];
|
||||
static _f void _nVifUnpack(int idx, u8 *data, u32 size);
|
||||
|
||||
void initNewVif(int idx) {
|
||||
nVif[idx].idx = idx;
|
||||
nVif[idx].VU = idx ? &VU1 : &VU0;
|
||||
nVif[idx].vif = idx ? &vif1 : &vif0;
|
||||
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
|
||||
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
|
||||
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
|
||||
nVif[idx].vifCache = NULL;
|
||||
|
||||
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false);
|
||||
memset8<0xcc>( nVifUpkExec );
|
||||
|
||||
xSetPtr( nVifUpkExec );
|
||||
|
||||
for (int a = 0; a < 2; a++) {
|
||||
for (int b = 0; b < 2; b++) {
|
||||
for (int c = 0; c < 4; c++) {
|
||||
nVifGen(a, b, c);
|
||||
}}}
|
||||
|
||||
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
|
||||
if (newVifDynaRec) dVifInit(idx);
|
||||
}
|
||||
|
||||
int nVifUnpack(int idx, u32 *data) {
|
||||
XMMRegisters::Freeze();
|
||||
int ret = aMin(vif1.vifpacketsize, vif1.tag.size);
|
||||
vif1.tag.size -= ret;
|
||||
_nVifUnpack(idx, (u8*)data, ret<<2);
|
||||
if (newVifDynaRec) dVifUnpack(idx, (u8*)data, ret<<2);
|
||||
else _nVifUnpack(idx, (u8*)data, ret<<2);
|
||||
if (vif1.tag.size <= 0) {
|
||||
vif1.tag.size = 0;
|
||||
vif1.cmd = 0;
|
||||
|
@ -59,9 +84,9 @@ static u32 oldMaskIdx = -1;
|
|||
static u32 oldMask = 0;
|
||||
|
||||
static void setMasks(int idx, const VIFregisters& v) {
|
||||
if (idx == oldMaskIdx && oldMask == v.mask) return;
|
||||
oldMaskIdx = idx;
|
||||
oldMask = v.mask;
|
||||
//if (idx == oldMaskIdx && oldMask == v.mask) return;
|
||||
//oldMaskIdx = idx;
|
||||
//oldMask = v.mask;
|
||||
//DevCon.WriteLn("mask");
|
||||
for (int i = 0; i < 16; i++) {
|
||||
int m = (v.mask >> (i*2)) & 3;
|
||||
|
@ -115,9 +140,7 @@ __releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) {
|
|||
const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
|
||||
const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
|
||||
const int skipSize = blockSize - cycleSize;
|
||||
|
||||
//if (skipSize > 2)
|
||||
//DevCon.WriteLn("[num = %d][cl = %d][bl = %d][diff = %d]", vifRegs->num, vif->cl, blockSize, skipSize);
|
||||
//DevCon.WriteLn("[%d][%d][%d][num=%d][upk=%d][cl=%d][bl=%d][skip=%d]", isFill, doMask, doMode, vifRegs->num, upkNum, vif->cl, blockSize, skipSize);
|
||||
|
||||
if (vif->cmd & 0x10) setMasks(idx, *vifRegs);
|
||||
|
||||
|
@ -141,22 +164,17 @@ __releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) {
|
|||
// mucks up compiler optimizations on the internal loops. >_< --air
|
||||
const u8* vuMemBase = (idx ? VU1 : VU0).Mem;
|
||||
u8* dest = setVUptr(idx, vuMemBase, vif->tag.addr);
|
||||
|
||||
if (vif->cl >= blockSize) vif->cl = 0;
|
||||
|
||||
while (vifRegs->num /*&& size*/) {
|
||||
while (vifRegs->num && size) {
|
||||
if (vif->cl < cycleSize) {
|
||||
if (doMode /*|| doMask*/) {
|
||||
//if (doMask)
|
||||
if (doMode) {
|
||||
//DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum);
|
||||
func((u32*)dest, (u32*)data);
|
||||
}
|
||||
else {
|
||||
//DevCon.WriteLn("SSE Unpack!");
|
||||
|
||||
// Opt note: removing this min check (which isn't needed right now?) is +1%
|
||||
// or more. Just something to keep in mind. :) --air
|
||||
fnbase[aMin(vif->cl, 4)](dest, data);
|
||||
fnbase[aMin(vif->cl, 3)](dest, data);
|
||||
}
|
||||
data += ft.gsize;
|
||||
//if( IsDebugBuild ) size -= ft.gsize; // only used below for assertion checking
|
||||
|
@ -187,24 +205,22 @@ static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] =
|
|||
{
|
||||
{
|
||||
{ _nVifUnpackLoop<0,false,false>, _nVifUnpackLoop<0,false,true> },
|
||||
{ _nVifUnpackLoop<0,true,false>, _nVifUnpackLoop<0,true,true> },
|
||||
{ _nVifUnpackLoop<0,true, false>, _nVifUnpackLoop<0,true, true> },
|
||||
},
|
||||
|
||||
{
|
||||
{ _nVifUnpackLoop<1,false,false>, _nVifUnpackLoop<1,false,true> },
|
||||
{ _nVifUnpackLoop<1,true,false>, _nVifUnpackLoop<1,true,true> },
|
||||
{ _nVifUnpackLoop<1,true, false>, _nVifUnpackLoop<1,true, true> },
|
||||
},
|
||||
|
||||
};
|
||||
|
||||
|
||||
static _f void _nVifUnpack(int idx, u8 *data, u32 size) {
|
||||
/*if (nVif[idx].vifRegs->cycle.cl >= nVif[idx].vifRegs->cycle.wl) { // skipping write
|
||||
|
||||
if (useOldUnpack) {
|
||||
if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2);
|
||||
else VIFunpack<1>((u32*)data, &vif1.tag, size>>2);
|
||||
return;
|
||||
}
|
||||
else*/ { // filling write
|
||||
|
||||
vif = nVif[idx].vif;
|
||||
vifRegs = nVif[idx].vifRegs;
|
||||
|
@ -216,5 +232,4 @@ static _f void _nVifUnpack(int idx, u8 *data, u32 size) {
|
|||
//if (isFill)
|
||||
//DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok"));
|
||||
//DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
|
||||
#define xMaskWrite(regX) { \
|
||||
xMOVAPS(xmm7, ptr32[ecx]); \
|
||||
int offX = aMin(curCycle, 4); \
|
||||
int offX = aMin(curCycle, 3); \
|
||||
xPAND(regX, ptr32[nVifMask[0][offX]]); \
|
||||
xPAND(xmm7, ptr32[nVifMask[1][offX]]); \
|
||||
xPOR (regX, ptr32[nVifMask[2][offX]]); \
|
||||
|
@ -25,13 +25,21 @@
|
|||
xMOVAPS(ptr32[ecx], regX); \
|
||||
}
|
||||
#define xMovDest(regX) { \
|
||||
if (mask==0) { xMOVAPS (ptr32[ecx], regX); } \
|
||||
if (!mask) { xMOVAPS (ptr32[ecx], regX); } \
|
||||
else { xMaskWrite(regX); } \
|
||||
}
|
||||
#define xShiftR(regX, n) { \
|
||||
if (usn) { xPSRL.D(regX, n); } \
|
||||
else { xPSRA.D(regX, n); } \
|
||||
}
|
||||
#define xPMOVXX8(regX, src) { \
|
||||
if (usn) xPMOVZX.BD(regX, src); \
|
||||
else xPMOVSX.BD(regX, src); \
|
||||
}
|
||||
#define xPMOVXX16(regX, src) { \
|
||||
if (usn) xPMOVZX.WD(regX, src); \
|
||||
else xPMOVSX.WD(regX, src); \
|
||||
}
|
||||
|
||||
struct VifUnpackIndexer {
|
||||
int usn, mask;
|
||||
|
@ -74,18 +82,28 @@ void nVifGen(int usn, int mask, int curCycle) {
|
|||
xRET();
|
||||
|
||||
indexer.xSetCall(0x1); // S-16
|
||||
if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
xPMOVXX16 (xmm0, ptr64[edx]);
|
||||
}
|
||||
else {
|
||||
xMOV16 (xmm0, ptr32[edx]);
|
||||
xPUNPCK.LWD(xmm0, xmm0);
|
||||
xShiftR (xmm0, 16);
|
||||
}
|
||||
xPSHUF.D (xmm1, xmm0, _v0);
|
||||
xMovDest (xmm1);
|
||||
xRET();
|
||||
|
||||
indexer.xSetCall(0x2); // S-8
|
||||
if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
xPMOVXX8 (xmm0, ptr32[edx]);
|
||||
}
|
||||
else {
|
||||
xMOV8 (xmm0, ptr32[edx]);
|
||||
xPUNPCK.LBW(xmm0, xmm0);
|
||||
xPUNPCK.LWD(xmm0, xmm0);
|
||||
xShiftR (xmm0, 24);
|
||||
}
|
||||
xPSHUF.D (xmm1, xmm0, _v0);
|
||||
xMovDest (xmm1);
|
||||
xRET();
|
||||
|
@ -98,17 +116,27 @@ void nVifGen(int usn, int mask, int curCycle) {
|
|||
xRET();
|
||||
|
||||
indexer.xSetCall(0x5); // V2-16
|
||||
if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
xPMOVXX16 (xmm0, ptr64[edx]);
|
||||
}
|
||||
else {
|
||||
xMOV32 (xmm0, ptr32[edx]);
|
||||
xPUNPCK.LWD(xmm0, xmm0);
|
||||
xShiftR (xmm0, 16);
|
||||
}
|
||||
xMovDest (xmm0);
|
||||
xRET();
|
||||
|
||||
indexer.xSetCall(0x6); // V2-8
|
||||
if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
xPMOVXX8 (xmm0, ptr32[edx]);
|
||||
}
|
||||
else {
|
||||
xMOV16 (xmm0, ptr32[edx]);
|
||||
xPUNPCK.LBW(xmm0, xmm0);
|
||||
xPUNPCK.LWD(xmm0, xmm0);
|
||||
xShiftR (xmm0, 24);
|
||||
}
|
||||
xMovDest (xmm0);
|
||||
xRET();
|
||||
|
||||
|
@ -120,17 +148,27 @@ void nVifGen(int usn, int mask, int curCycle) {
|
|||
xRET();
|
||||
|
||||
indexer.xSetCall(0x9); // V3-16
|
||||
if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
xPMOVXX16 (xmm0, ptr64[edx]);
|
||||
}
|
||||
else {
|
||||
xMOV64 (xmm0, ptr32[edx]);
|
||||
xPUNPCK.LWD(xmm0, xmm0);
|
||||
xShiftR (xmm0, 16);
|
||||
}
|
||||
xMovDest (xmm0);
|
||||
xRET();
|
||||
|
||||
indexer.xSetCall(0xa); // V3-8
|
||||
if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
xPMOVXX8 (xmm0, ptr32[edx]);
|
||||
}
|
||||
else {
|
||||
xMOV32 (xmm0, ptr32[edx]);
|
||||
xPUNPCK.LBW(xmm0, xmm0);
|
||||
xPUNPCK.LWD(xmm0, xmm0);
|
||||
xShiftR (xmm0, 24);
|
||||
}
|
||||
xMovDest (xmm0);
|
||||
xRET();
|
||||
|
||||
|
@ -142,17 +180,27 @@ void nVifGen(int usn, int mask, int curCycle) {
|
|||
xRET();
|
||||
|
||||
indexer.xSetCall(0xd); // V4-16
|
||||
if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
xPMOVXX16 (xmm0, ptr64[edx]);
|
||||
}
|
||||
else {
|
||||
xMOV64 (xmm0, ptr32[edx]);
|
||||
xPUNPCK.LWD(xmm0, xmm0);
|
||||
xShiftR (xmm0, 16);
|
||||
}
|
||||
xMovDest (xmm0);
|
||||
xRET();
|
||||
|
||||
indexer.xSetCall(0xe); // V4-8
|
||||
if (x86caps.hasStreamingSIMD4Extensions) {
|
||||
xPMOVXX8 (xmm0, ptr32[edx]);
|
||||
}
|
||||
else {
|
||||
xMOV32 (xmm0, ptr32[edx]);
|
||||
xPUNPCK.LBW(xmm0, xmm0);
|
||||
xPUNPCK.LWD(xmm0, xmm0);
|
||||
xShiftR (xmm0, 24);
|
||||
}
|
||||
xMovDest (xmm0);
|
||||
xRET();
|
||||
|
||||
|
@ -190,27 +238,3 @@ void nVifGen(int usn, int mask, int curCycle) {
|
|||
|
||||
pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );
|
||||
}
|
||||
|
||||
void initNewVif(int idx) {
|
||||
nVif[idx].idx = idx;
|
||||
nVif[idx].VU = idx ? &VU1 : &VU0;
|
||||
nVif[idx].vif = idx ? &vif1 : &vif0;
|
||||
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
|
||||
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
|
||||
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
|
||||
nVif[idx].vifCache = NULL;
|
||||
|
||||
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false);
|
||||
memset8<0xcc>( nVifUpkExec );
|
||||
|
||||
xSetPtr( nVifUpkExec );
|
||||
|
||||
for (int a = 0; a < 2; a++) {
|
||||
for (int b = 0; b < 2; b++) {
|
||||
for (int c = 0; c < 4; c++) {
|
||||
nVifGen(a, b, c);
|
||||
}
|
||||
}}
|
||||
|
||||
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue