Did some optimization and bug fixing on the new VIF unpacker. :) (It's still a bit slower than the current/old one, though.)

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2346 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2009-12-15 17:14:57 +00:00
parent 79ee87a90f
commit b5f643950c
4 changed files with 213 additions and 111 deletions

View File

@ -60,7 +60,7 @@ static __forceinline u32 vif_size(u8 num)
return (num == 0) ? 0x1000 : 0x4000; return (num == 0) ? 0x1000 : 0x4000;
} }
//#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code) #define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
//#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined) #define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined)
//#define newVif0 // Use New Code for Vif0 Unpacks (not implemented) //#define newVif0 // Use New Code for Vif0 Unpacks (not implemented)
#endif #endif

View File

@ -21,20 +21,18 @@ using namespace x86Emitter;
extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0); extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0);
extern void _nVifUnpack(int idx, u8 *data, u32 size); extern void _nVifUnpack(int idx, u8 *data, u32 size);
struct instBlock { u8 data[16*64]; };
static __pagealigned instBlock nVifUpk[2][2][4][3][16]; // [USN][Masking][curCycle][CyclesToWrite-1][Unpack Type]
static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
typedef u32 (__fastcall *nVifCall)(void*, void*); typedef u32 (__fastcall *nVifCall)(void*, void*);
#define nVifUnpackF(dest, src, usn, doMask, curCycle, cycles, unpackType) { \
(((nVifCall)((void*)&nVifUpk[usn][doMask][curCycle][cycles][unpackType]))(dest, src)); \ static __pagealigned u8 nVifUpkExec[__pagesize*16];
} static __aligned16 nVifCall nVifUpk[(2*2*16)*4*4]; // ([USN][Masking][Unpack Type]) [curCycle][CyclesToWrite-1]
static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
#define _v0 0 #define _v0 0
#define _v1 0x55 #define _v1 0x55
#define _v2 0xaa #define _v2 0xaa
#define _v3 0xff #define _v3 0xff
#define aMax(x, y) (((x) > (y) ? (x) : (y))) #define aMax(x, y) std::max(x,y)
#define aMin(x, y) (((x) < (y) ? (x) : (y))) #define aMin(x, y) std::min(x,y)
#define _f __forceinline #define _f __forceinline
#define xShiftR(regX, n) { \ #define xShiftR(regX, n) { \
@ -42,7 +40,7 @@ typedef u32 (__fastcall *nVifCall)(void*, void*);
else { xPSRA.D(regX, n); } \ else { xPSRA.D(regX, n); } \
} }
u32 nVifT[16] = { static const u32 nVifT[16] = {
4, // S-32 4, // S-32
2, // S-16 2, // S-16
1, // S-8 1, // S-8

View File

@ -36,22 +36,29 @@ void initNewVif(int idx) {
nVif[idx].vifBlock = new BlockBuffer(0x2000); // 8kb Block Buffer nVif[idx].vifBlock = new BlockBuffer(0x2000); // 8kb Block Buffer
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000)); nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0; nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
memset_8<0xcc,sizeof(nVifUpk)>(nVifUpk);
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false);
memset8<0xcc>( nVifUpkExec );
xSetPtr( nVifUpkExec );
for (int a = 0; a < 2; a++) { for (int a = 0; a < 2; a++) {
for (int b = 0; b < 2; b++) { for (int b = 0; b < 2; b++) {
for (int c = 0; c < 4; c++) { for (int c = 0; c < 4; c++) {
for (int d = 0; d < 3; d++) { for (int d = 0; d < 3; d++) {
nVifGen(a, b, c, d); //nVifUpk[2][2][4][3][16]; nVifGen(a, b, c, d);
}}}} }}}}
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
} }
int nVifUnpack(int idx, u32 *data) { int nVifUnpack(int idx, u32 *data) {
XMMRegisters::Freeze(); XMMRegisters::Freeze();
BlockBuffer* vB = nVif[idx].vifBlock; //BlockBuffer* vB = nVif[idx].vifBlock;
int ret = aMin(vif1.vifpacketsize, vif1.tag.size); int ret = aMin(vif1.vifpacketsize, vif1.tag.size);
vif1.tag.size -= ret; vif1.tag.size -= ret;
_nVifUnpack(idx, (u8*)data, ret<<2); _nVifUnpack(idx, (u8*)data, ret<<2);
if (vif1.tag.size <= 0) vif1.tag.size = 0; if (vif1.tag.size <= 0) vif1.tag.size = 0;
if (vif1.tag.size <= 0) vif1.cmd = 0; if (vif1.tag.size <= 0) vif1.cmd = 0;
XMMRegisters::Thaw(); XMMRegisters::Thaw();
return ret; return ret;
@ -70,9 +77,9 @@ _f void incVUptr(int idx, u8* &ptr, int amount) {
if ((uptr)ptr & 0xf) DevCon.WriteLn("unaligned wtf :("); if ((uptr)ptr & 0xf) DevCon.WriteLn("unaligned wtf :(");
} }
_f void setMasks(VIFregisters* v) { static void setMasks(const VIFregisters& v) {
for (int i = 0; i < 16; i++) { for (int i = 0; i < 16; i++) {
int m = (v->mask >> (i*2)) & 3; int m = (v.mask >> (i*2)) & 3;
switch (m) { switch (m) {
case 0: // Data case 0: // Data
nVifMask[0][i/4][i%4] = 0xffffffff; nVifMask[0][i/4][i%4] = 0xffffffff;
@ -82,12 +89,12 @@ _f void setMasks(VIFregisters* v) {
case 1: // Row case 1: // Row
nVifMask[0][i/4][i%4] = 0; nVifMask[0][i/4][i%4] = 0;
nVifMask[1][i/4][i%4] = 0; nVifMask[1][i/4][i%4] = 0;
nVifMask[2][i/4][i%4] = ((u32*)&v->r0)[(i%4)*4]; nVifMask[2][i/4][i%4] = ((u32*)&v.r0)[(i%4)*4];
break; break;
case 2: // Col case 2: // Col
nVifMask[0][i/4][i%4] = 0; nVifMask[0][i/4][i%4] = 0;
nVifMask[1][i/4][i%4] = 0; nVifMask[1][i/4][i%4] = 0;
nVifMask[2][i/4][i%4] = ((u32*)&v->c0)[(i/4)*4]; nVifMask[2][i/4][i%4] = ((u32*)&v.c0)[(i/4)*4];
break; break;
case 3: // Write Protect case 3: // Write Protect
nVifMask[0][i/4][i%4] = 0; nVifMask[0][i/4][i%4] = 0;
@ -98,7 +105,97 @@ _f void setMasks(VIFregisters* v) {
} }
} }
_f void _nVifUnpack(int idx, u8 *data, u32 size) { // ----------------------------------------------------------------------------
// Unpacking Optimization notes:
// ----------------------------------------------------------------------------
// Some games send a LOT of small packets. This is a problem because the new VIF unpacker
// has a lot of setup code to establish which unpack function to call. The best way to
// optimize this is to cache the unpack function's base (see fnbase below) and update it
// when the variables it depends on are modified: writes to vif->tag.cmd and vif->usn.
//
// A secondary optimization would be adding special handlers for packets where vifRegs->num==1.
// (which would remove the loop, simplify the incVUptr code, etc). But checking for it has
// to be simple enough that it doesn't offset the benefits (which I'm not sure is possible).
// -- air
// Unpack loop for one VIF packet. Each iteration handles one 16-byte
// destination vector in VU memory and the loop runs until vifRegs->num
// reaches zero.
//
// Template parameters:
//   idx    - VIF channel index; forwarded to setVUptr / incVUptr.
//   doMode - when true, every write goes through the generic unpack function
//            taken from VIFfuncTable instead of the generated nVifUpk routines.
//   isFill - selects which of cycle.cl / cycle.wl acts as the write window
//            (cycleSize) and which as the block length (blockSize); see below.
//
// Parameters:
//   data - packed source data; advanced as vectors are consumed.
//   size - remaining byte count. It is decremented for bookkeeping only; the
//          out-of-data check is currently disabled, so it never stops the loop.
template< int idx, bool doMode, bool isFill >
__releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
{
	// Eh... template attempt, tho not sure it helped much.  There's too much
	// setup code before the loop for the specialization to pay off. -- air

	const int	usn		= !!(vif->usn);
	const int	doMask	= !!(vif->tag.cmd & 0x10);
	const int	upkNum	= vif->tag.cmd & 0xf;
	const u32	vift	= nVifT[upkNum];

	u8* dest = setVUptr(idx, vif->tag.addr);

	const VIFUnpackFuncTable&	ft		= VIFfuncTable[upkNum];
	UNPACKFUNCTYPE				func	= vif->usn ? ft.funcU : ft.funcS;

	// nVifUpk is laid out as ([USN][Masking][Unpack Type]) [curCycle][CyclesToWrite-1],
	// so every (usn, mask, type) combination owns a group of 4*4 entries.
	const nVifCall* fnbase = &nVifUpk[
		((usn*2*16) + (doMask*16) + (upkNum)) * (4*4)
	];

	const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
	const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;

	if (doMask)
		setMasks(*vifRegs);

	// Normalize a stale cycle counter once, up front, so the loop itself can
	// use a cheap equality test for the wrap-around.
	if (vif->cl >= blockSize) {
		vif->cl = 0;
	}

	while (vifRegs->num > 0) {
		if (vif->cl < cycleSize) {
			//if (size <= 0) { DbgCon.WriteLn("_nVifUnpack: Out of Data!"); break; }
			if (doMode /*|| doMask*/) {
				//if (doMask)
				//DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum);
				func((u32*)dest, (u32*)data, ft.qsize);
				data += ft.gsize;
				size -= ft.gsize;
				vifRegs->num--;
			}
			else {
				//DevCon.WriteLn("SSE Unpack!");

				// The curCycle dimension only has rows 0..3.  Clamping to 4 would
				// step into the next unpack type's group (or past the end of the
				// table for the last group), so clamp to 3.  Only column 0
				// (CyclesToWrite-1 == 0) is used: one vector per call.
				fnbase[aMin(vif->cl, 3) * 4](dest, data);
				data += vift;
				size -= vift;
				vifRegs->num--;
			}
		}
		else if (isFill) {
			// Past the write window: still goes through the generic unpack
			// function, but the source pointer is not advanced.
			func((u32*)dest, (u32*)data, ft.qsize);
			vifRegs->num--;
		}
		incVUptr(idx, dest, 16);

		// Removing this modulo was a huge speedup for God of War. (62->73 fps)
		// (GoW uses a lot of blockSize==1 packets, resulting in tons of loops -- so the biggest
		// factor in performance ends up being the top-level conditionals of the loop, and
		// also the loop prep code.) --air

		//vif->cl = (vif->cl+1) % blockSize;
		if( ++vif->cl == blockSize ) vif->cl = 0;
	}
}
void _nVifUnpack(int idx, u8 *data, u32 size) {
/*if (nVif[idx].vifRegs->cycle.cl >= nVif[idx].vifRegs->cycle.wl) { // skipping write /*if (nVif[idx].vifRegs->cycle.cl >= nVif[idx].vifRegs->cycle.wl) { // skipping write
if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2); if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2);
else VIFunpack<1>((u32*)data, &vif1.tag, size>>2); else VIFunpack<1>((u32*)data, &vif1.tag, size>>2);
@ -107,82 +204,56 @@ _f void _nVifUnpack(int idx, u8 *data, u32 size) {
else*/ { // filling write else*/ { // filling write
vif = nVif[idx].vif; vif = nVif[idx].vif;
vifRegs = nVif[idx].vifRegs; vifRegs = nVif[idx].vifRegs;
int isFill = !!(vifRegs->cycle.cl < vifRegs->cycle.wl);
int usn = !!(vif->usn); const bool doMode = !!vifRegs->mode;
int doMask = !!(vif->tag.cmd & 0x10); const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
int upkNum = vif->tag.cmd & 0xf;
int doMode = !!(vifRegs->mode); //UnpackLoopTable[idx][doMode][isFill]( data, size );
if (doMask) setMasks(vifRegs);
if( idx )
{
if( doMode )
{
if( isFill )
_nVifUnpackLoop<1,true,true>( data, size );
else
_nVifUnpackLoop<1,true,false>( data, size );
}
else
{
if( isFill )
_nVifUnpackLoop<1,false,true>( data, size );
else
_nVifUnpackLoop<1,false,false>( data, size );
}
}
else
{
pxFailDev( "No VIF0 support yet, sorry!" );
}
//if (isFill) //if (isFill)
//DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok")); //DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok"));
//DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num); //DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num);
u8* dest = setVUptr(idx, vif->tag.addr);
const VIFUnpackFuncTable* ft = &VIFfuncTable[vif->tag.cmd & 0xf];
UNPACKFUNCTYPE func = vif->usn ? ft->funcU : ft->funcS;
int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
//vif->cl = 0;
while (vifRegs->num > 0) {
if (vif->cl >= blockSize) {
vif->cl = 0;
}
if (vif->cl < cycleSize) {
if (size <= 0) { DevCon.WriteLn("_nVifUnpack: Out of Data!"); break; }
if (doMode /*|| doMask*/) {
//if (doMask)
//DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum);
func((u32*)dest, (u32*)data, ft->qsize);
data += ft->gsize;
size -= ft->gsize;
vifRegs->num--;
}
else if (1) {
//DevCon.WriteLn("SSE Unpack!");
nVifUnpackF(dest, data, usn, doMask, aMin(vif->cl, 4), 0, upkNum);
data += nVifT[upkNum];
size -= nVifT[upkNum];
vifRegs->num--;
}
else {
//DevCon.WriteLn("SSE Unpack!");
int c = aMin((cycleSize - vif->cl), 3);
int t = nVifT[upkNum];
size -= t * c;
//if (c>1) { DevCon.WriteLn("C > 1!"); }
if (c<0||c>3) { DevCon.WriteLn("C wtf!"); }
if (size < 0) { DevCon.WriteLn("Size Shit"); size+=t*c;c=1;size-=t*c;}
nVifUnpackF(dest, data, usn, doMask, aMin(vif->cl, 4), c-1, upkNum);
data += t * c;
vifRegs->num -= c;
}
}
else if (isFill) {
func((u32*)dest, (u32*)data, ft->qsize);
vifRegs->num--;
}
incVUptr(idx, dest, 16);
vif->cl = (vif->cl+1) % blockSize;
}
} }
} }
//int nVifUnpack(int idx, u32 *data) { //int nVifUnpack(int idx, u32 *data) {
// XMMRegisters::Freeze(); // XMMRegisters::Freeze();
// BlockBuffer* vB = nVif[idx].vifBlock; // BlockBuffer* vB = nVif[idx].vifBlock;
// int ret = aMin(vif1.vifpacketsize, vif1.tag.size); // int ret = aMin(vif1.vifpacketsize, vif1.tag.size);
// //vB->append(data, ret<<2); // //vB->append(data, ret<<2);
// vif1.tag.size -= ret; // vif1.tag.size -= ret;
// //DevCon.WriteLn("2 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2); // //DevCon.WriteLn("2 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2);
// //if (vif1.tag.size <= 0) { // //if (vif1.tag.size <= 0) {
// //DevCon.WriteLn("3 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2); // //DevCon.WriteLn("3 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2);
// //VIFunpack<1>(vB->getBlock(), &vif1.tag, vB->getSize()>>2); // //VIFunpack<1>(vB->getBlock(), &vif1.tag, vB->getSize()>>2);
// //_nVifUnpack(idx, vB->getBlock(), vB->getSize()); // //_nVifUnpack(idx, vB->getBlock(), vB->getSize());
// _nVifUnpack(idx, (u8*)data, ret<<2); // _nVifUnpack(idx, (u8*)data, ret<<2);
// if (vif1.tag.size <= 0) vif1.tag.size = 0; // if (vif1.tag.size <= 0) vif1.tag.size = 0;
// if (vif1.tag.size <= 0) vif1.cmd = 0; // if (vif1.tag.size <= 0) vif1.cmd = 0;
// //vB->clear(); // //vB->clear();
// //} // //}
// //else { vif1.tag.size+=ret; ret = -1; vB->clear(); } // //else { vif1.tag.size+=ret; ret = -1; vB->clear(); }
// XMMRegisters::Thaw(); // XMMRegisters::Thaw();

View File

@ -66,11 +66,40 @@ void convertRGB() {
xPSRL.D (xmm2, 24); // single AND... xPSRL.D (xmm2, 24); // single AND...
} }
struct VifUnpackIndexer
{
int usn, mask;
int curCycle, cyclesToWrite;
nVifCall& GetCall( int packType ) const
{
int usnpart = usn*2*16;
int maskpart = mask*16;
int packpart = packType;
int curpart = curCycle*4;
int cycpespart = cyclesToWrite;
return nVifUpk[((usnpart+maskpart+packpart)*(4*4)) + (curpart+cycpespart)];
}
void xSetCall( int packType ) const
{
xAlignPtr(16);
GetCall( packType ) = (nVifCall)xGetPtr();
}
void xSetNullCall( int packType ) const
{
GetCall( packType ) = NULL;
}
};
// ecx = dest, edx = src // ecx = dest, edx = src
void nVifGen(int usn, int mask, int curCycle, int cycles) { void nVifGen(int usn, int mask, int curCycle, int cycles) {
HostSys::MemProtect(nVifUpk, sizeof(nVifUpk), Protect_ReadWrite, false); const VifUnpackIndexer indexer = { usn, mask, curCycle, cycles };
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x0]); // S-32 indexer.xSetCall(0x0); // S-32
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0); if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0);
if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1); if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1);
@ -78,7 +107,7 @@ void nVifGen(int usn, int mask, int curCycle, int cycles) {
if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); if (cycles>=0) xMovDest (xmm1, xmm2, xmm3);
xRET(); xRET();
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x1]); // S-16 indexer.xSetCall(0x1); // S-16
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
if (cycles>=0) xShiftR (xmm0, 16); if (cycles>=0) xShiftR (xmm0, 16);
@ -88,7 +117,7 @@ void nVifGen(int usn, int mask, int curCycle, int cycles) {
if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); if (cycles>=0) xMovDest (xmm1, xmm2, xmm3);
xRET(); xRET();
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x2]); // S-8 indexer.xSetCall(0x2); // S-8
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
@ -99,15 +128,16 @@ void nVifGen(int usn, int mask, int curCycle, int cycles) {
if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); if (cycles>=0) xMovDest (xmm1, xmm2, xmm3);
xRET(); xRET();
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x3]); // ---- indexer.xSetNullCall(0x3); // ----
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x4]); // V2-32
indexer.xSetCall(0x4); // V2-32
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x10]); if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x10]);
if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe); if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe);
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xRET(); xRET();
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x5]); // V2-16 indexer.xSetCall(0x5); // V2-16
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2); if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2);
if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
@ -118,7 +148,7 @@ void nVifGen(int usn, int mask, int curCycle, int cycles) {
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xRET(); xRET();
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x6]); // V2-8 indexer.xSetCall(0x6); // V2-8
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2); if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2);
@ -130,15 +160,16 @@ void nVifGen(int usn, int mask, int curCycle, int cycles) {
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xRET(); xRET();
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x7]); // ---- indexer.xSetNullCall(0x7); // ----
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x8]); // V3-32
indexer.xSetCall(0x8); // V3-32
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+12]); if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+12]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+24]); if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+24]);
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xRET(); xRET();
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x9]); // V3-16 indexer.xSetCall(0x9); // V3-16
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+6]); if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+6]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+12]); if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+12]);
@ -151,7 +182,7 @@ void nVifGen(int usn, int mask, int curCycle, int cycles) {
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xRET(); xRET();
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0xa]); // V3-8 indexer.xSetCall(0xa); // V3-8
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+3]); if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+3]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+6]); if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+6]);
@ -167,15 +198,16 @@ void nVifGen(int usn, int mask, int curCycle, int cycles) {
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xRET(); xRET();
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0xb]); // ---- indexer.xSetNullCall(0xb); // ----
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0xc]); // V4-32
indexer.xSetCall(0xc); // V4-32
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]); if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]); if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]);
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xRET(); xRET();
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0xd]); // V4-16 indexer.xSetCall(0xd); // V4-16
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]); if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]); if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]);
@ -188,7 +220,7 @@ void nVifGen(int usn, int mask, int curCycle, int cycles) {
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xRET(); xRET();
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0xe]); // V4-8 indexer.xSetCall(0xe); // V4-8
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+4]); if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+4]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+8]); if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+8]);
@ -206,7 +238,7 @@ void nVifGen(int usn, int mask, int curCycle, int cycles) {
// A | B5 | G5 | R5 // A | B5 | G5 | R5
// ..0.. A 0000000 | ..0.. B 000 | ..0.. G 000 | ..0.. R 000 // ..0.. A 0000000 | ..0.. B 000 | ..0.. G 000 | ..0.. R 000
xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0xf]); // V4-5 indexer.xSetCall(0xf); // V4-5
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=0) xMOVAPS (xmm1, xmm0); if (cycles>=0) xMOVAPS (xmm1, xmm0);
if (cycles>=0) convertRGB(); if (cycles>=0) convertRGB();
@ -219,5 +251,6 @@ void nVifGen(int usn, int mask, int curCycle, int cycles) {
if (cycles>=2) convertRGB(); if (cycles>=2) convertRGB();
if (cycles>=2) xMOVAPS (ptr32[ecx+0x20], xmm2); if (cycles>=2) xMOVAPS (ptr32[ecx+0x20], xmm2);
xRET(); xRET();
HostSys::MemProtect(nVifUpk, sizeof(nVifUpk), Protect_ReadOnly, true);
pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );
} }