newVif: optimizations, cleanups, and bug fixes...

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2349 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
cottonvibes 2009-12-16 02:27:53 +00:00
parent 1b4de736cc
commit e76e1c66db
3 changed files with 141 additions and 245 deletions

View File

@ -24,8 +24,8 @@ extern void _nVifUnpack(int idx, u8 *data, u32 size);
// Signature of one generated unpack routine: two pointer arguments, u32 result.
// The call sites visible in this file pass (dest, data) in that order.
typedef u32 (__fastcall *nVifCall)(void*, void*);
// Page-aligned buffer of 16 pages; the code emitter's write pointer is asserted to stay
// inside it after generation, and it is re-protected read-only once generation finishes.
static __pagealigned u8 nVifUpkExec[__pagesize*16];
// Function-pointer table, (2*2*16)*4*4 = 1024 entries in this form.
static __aligned16 nVifCall nVifUpk[(2*2*16)*4*4]; // ([USN][Masking][Unpack Type]) [curCycle][CyclesToWrite-1]
static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
// Function-pointer table, (2*2*16)*4 = 256 entries in this form (no CyclesToWrite dimension).
static __aligned16 nVifCall nVifUpk[(2*2*16)*4]; // ([USN][Masking][Unpack Type]) [curCycle]
static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
#define _v0 0
#define _v1 0x55

View File

@ -46,21 +46,21 @@ void initNewVif(int idx) {
for (int a = 0; a < 2; a++) {
for (int b = 0; b < 2; b++) {
for (int c = 0; c < 4; c++) {
for (int d = 0; d < 3; d++) {
nVifGen(a, b, c, d);
}}}}
nVifGen(a, b, c);
}}}
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
}
// Hands the next chunk of unpack data to _nVifUnpack and updates the VIF1 tag state.
//
// idx  - unpacker index, forwarded unchanged to _nVifUnpack.
// data - source data; passed on as a byte pointer.
//
// The amount consumed is aMin(vif1.vifpacketsize, vif1.tag.size); that amount is
// subtracted from vif1.tag.size and also returned to the caller. _nVifUnpack receives
// it shifted left by 2 (presumably a 32-bit-word count converted to bytes -- TODO confirm).
// When no tag data remains, tag.size is clamped to 0 and vif1.cmd is cleared.
//
// NOTE(review): idx is only forwarded; every register access in this function is
// hard-wired to vif1, so calling it with a different idx still updates vif1 state.
int nVifUnpack(int idx, u32 *data) {
// The XMM register state is frozen for the duration of the unpack and thawed before returning.
XMMRegisters::Freeze();
//BlockBuffer* vB = nVif[idx].vifBlock;
int ret = aMin(vif1.vifpacketsize, vif1.tag.size);
int ret = aMin(vif1.vifpacketsize, vif1.tag.size);
vif1.tag.size -= ret;
_nVifUnpack(idx, (u8*)data, ret<<2);
if (vif1.tag.size <= 0) vif1.tag.size = 0;
if (vif1.tag.size <= 0) vif1.cmd = 0;
if (vif1.tag.size <= 0) {
vif1.tag.size = 0;
vif1.cmd = 0;
}
XMMRegisters::Thaw();
return ret;
}
@ -78,7 +78,14 @@ _f void incVUptr(int idx, u8* &ptr, int amount) {
if ((uptr)ptr & 0xf) DevCon.WriteLn("unaligned wtf :(");
}
static void setMasks(const VIFregisters& v) {
static u32 oldMaskIdx = -1;
static u32 oldMask = 0;
static void setMasks(int idx, const VIFregisters& v) {
if (idx == oldMaskIdx && oldMask == v.mask) return;
oldMaskIdx = idx;
oldMask = v.mask;
//DevCon.WriteLn("mask");
for (int i = 0; i < 16; i++) {
int m = (v.mask >> (i*2)) & 3;
switch (m) {
@ -120,14 +127,8 @@ static void setMasks(const VIFregisters& v) {
// to be simple enough that it doesn't offset the benefits (which I'm not sure is possible).
// -- air
//template< int idx, bool doMode, bool isFill >
//__releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size )
__releaseinline void __fastcall _nVifUnpackLoop( int idx, u8 *data, u32 size )
{
// comment out the following 2 lines to test templated version...
const bool doMode = !!vifRegs->mode;
const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
template< int idx, bool doMode, bool isFill >
__releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) {
const int usn = !!(vif->usn);
const int doMask = !!(vif->tag.cmd & 0x10);
@ -141,67 +142,49 @@ __releaseinline void __fastcall _nVifUnpackLoop( int idx, u8 *data, u32 size )
// Did a bunch of work to make it so I could optimize this index lookup to outside
// the main loop but it was for naught -- too often the loop is only 1-2 iterations,
// so this setup code ends up being slower (1 iter) or same speed (2 iters).
const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + (doMask*16) + (upkNum)) * (4*4) ];
const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + (doMask*16) + (upkNum)) * (4*1) ];
const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
const int skipSize = blockSize - cycleSize;
//if (skipSize > 2)
//DevCon.WriteLn("[num = %d][cl = %d][bl = %d][diff = %d]", vifRegs->num, vif->cl, blockSize, skipSize);
if (doMask)
setMasks(*vifRegs);
// This condition doesn't appear to ever occur, and really it never should.
// Normally it wouldn't matter, but even simple setup code matters here (see
// optimization notes above) >_<
if (vif->cl >= blockSize) vif->cl = 0;
if (doMask) setMasks(idx, *vifRegs);
if (vif->cl >= blockSize) {
// This condition doesn't appear to ever occur, and really it never should.
// Normally it wouldn't matter, but even simple setup code matters here (see
// optimization notes above) >_<
vif->cl = 0;
}
while (vifRegs->num > 0) {
if (vif->cl < cycleSize) {
//if (size <= 0) { DbgCon.WriteLn("_nVifUnpack: Out of Data!"); break; }
while (vifRegs->num /*&& size*/) {
if (vif->cl < cycleSize) {
if (doMode /*|| doMask*/) {
//if (doMask)
//DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum);
func((u32*)dest, (u32*)data, ft.qsize);
data += ft.gsize;
size -= ft.gsize;
vifRegs->num--;
}
else if (1) {
//DevCon.WriteLn("SSE Unpack!");
fnbase[aMin(vif->cl, 4) * 4](dest, data);
data += vift;
size -= vift;
vifRegs->num--;
}
else {
//DevCon.WriteLn("SSE Unpack!");
int c = aMin((cycleSize - vif->cl), 3);
size -= vift * c;
//if (c>1) { DevCon.WriteLn("C > 1!"); }
if (c<0||c>3) { DbgCon.WriteLn("C wtf!"); }
if (size < 0) { DbgCon.WriteLn("Size Shit"); size+=vift*c;c=1;size-=vift*c;}
fnbase[(aMin(vif->cl, 4) * 4) + c-1](dest, data);
data += vift * c;
vifRegs->num -= c;
fnbase[aMin(vif->cl, 4)](dest, data);
}
data += vift;
size -= vift;
vifRegs->num--;
incVUptr(idx, dest, 16);
if (++vif->cl == blockSize) vif->cl = 0;
}
else if (isFill) {
func((u32*)dest, (u32*)data, ft.qsize);
vifRegs->num--;
incVUptr(idx, dest, 16);
if (++vif->cl == blockSize) vif->cl = 0;
}
else {
incVUptr(idx, dest, 16 * skipSize);
vif->cl = 0;
}
incVUptr(idx, dest, 16);
// Removing this modulo was a huge speedup for God of War start menu. (62->73 fps)
// (GoW and tri-ace games both use a lot of blockSize==1 packets, resulting in tons
// of loops -- so the biggest factor in performance ends up being the top-level
// conditionals of the loop, and also the loop prep code.) --air
//vif->cl = (vif->cl+1) % blockSize;
if( ++vif->cl == blockSize ) vif->cl = 0;
}
//if (size > 0) DevCon.WriteLn("size = %d", size);
}
void _nVifUnpack(int idx, u8 *data, u32 size) {
@ -212,68 +195,42 @@ void _nVifUnpack(int idx, u8 *data, u32 size) {
}
else*/ { // filling write
vif = nVif[idx].vif;
vifRegs = nVif[idx].vifRegs;
#if 1
_nVifUnpackLoop( idx, data, size );
#else
// Eh... template attempt, tho it didn't help much. There's too much setup code,
// and the template only optimizes code inside the loop, which often times seems to
// only be run once or twice anyway. Better to use recompilation than templating
// anyway, but I'll leave it in for now for reference. -- air
const bool doMode = !!vifRegs->mode;
const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
vif = nVif[idx].vif;
vifRegs = nVif[idx].vifRegs;
const bool doMode = !!vifRegs->mode;
const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
//UnpackLoopTable[idx][doMode][isFill]( data, size );
if( idx )
{
if( doMode )
{
if( isFill )
_nVifUnpackLoop<1,true,true>( data, size );
else
_nVifUnpackLoop<1,true,false>( data, size );
if (idx) {
if (doMode) {
if (isFill) _nVifUnpackLoop<1,true,true> (data, size);
else _nVifUnpackLoop<1,true,false> (data, size);
}
else
{
if( isFill )
_nVifUnpackLoop<1,false,true>( data, size );
else
_nVifUnpackLoop<1,false,false>( data, size );
else {
if (isFill) _nVifUnpackLoop<1,false,true> (data, size);
else _nVifUnpackLoop<1,false,false>(data, size);
}
}
else
{
pxFailDev( "No VIF0 support yet, sorry!" );
}
#endif
else pxFailDev( "No VIF0 support yet, sorry!" );
//if (isFill)
//DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok"));
//DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num);
//DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num);
}
}
//int nVifUnpack(int idx, u32 *data) {
// XMMRegisters::Freeze();
// BlockBuffer* vB = nVif[idx].vifBlock;
// int ret = aMin(vif1.vifpacketsize, vif1.tag.size);
// //vB->append(data, ret<<2);
// vif1.tag.size -= ret;
// //DevCon.WriteLn("2 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2);
// //if (vif1.tag.size <= 0) {
// //DevCon.WriteLn("3 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2);
// //VIFunpack<1>(vB->getBlock(), &vif1.tag, vB->getSize()>>2);
// //_nVifUnpack(idx, vB->getBlock(), vB->getSize());
// _nVifUnpack(idx, (u8*)data, ret<<2);
// if (vif1.tag.size <= 0) vif1.tag.size = 0;
// if (vif1.tag.size <= 0) vif1.cmd = 0;
// //vB->clear();
// //}
// //else { vif1.tag.size+=ret; ret = -1; vB->clear(); }
// XMMRegisters::Thaw();
// return ret;
//}
//data += ft.gsize;
//size -= ft.gsize;
//vifRegs->num--;
//else {
// //DevCon.WriteLn("SSE Unpack!");
// int c = aMin((cycleSize - vif->cl), 3);
// size -= vift * c;
// //if (c>1) { DevCon.WriteLn("C > 1!"); }
// if (c<0||c>3) { DbgCon.WriteLn("C wtf!"); }
// if (size < 0) { DbgCon.WriteLn("Size Shit"); size+=vift*c;c=1;size-=vift*c;}
// fnbase[(aMin(vif->cl, 4) * 4) + c-1](dest, data);
// data += vift * c;
// vifRegs->num -= c;
//}

View File

@ -29,17 +29,9 @@
if (x==2) xMOVAPS(ptr32[ecx+0x20], regX); \
}
// xMovDest: emits the store of unpacked vector register(s) to the destination at [ecx].
// When `mask` is 0 the store is a plain xMOVAPS; otherwise it goes through xMaskWrite.
// The three-operand form stores up to three registers at [ecx], [ecx+0x10] and [ecx+0x20],
// gated on `cycles`; the single-operand form stores reg0 at [ecx] only.
// `mask` (and `cycles`, for the three-operand form) are not macro parameters: they must be
// in scope wherever the macro is expanded.
#define xMovDest(reg0, reg1, reg2) { \
if (mask==0) { \
if (cycles>=0) { xMOVAPS (ptr32[ecx], reg0); } \
if (cycles>=1) { xMOVAPS (ptr32[ecx+0x10], reg1); } \
if (cycles>=2) { xMOVAPS (ptr32[ecx+0x20], reg2); } \
} \
else { \
if (cycles>=0) { xMaskWrite(reg0, 0); } \
if (cycles>=1) { xMaskWrite(reg1, 1); } \
if (cycles>=2) { xMaskWrite(reg2, 2); } \
} \
#define xMovDest(reg0) { \
if (mask==0) { xMOVAPS (ptr32[ecx], reg0); } \
else { xMaskWrite(reg0, 0); } \
}
// xmm2 gets result
@ -66,189 +58,136 @@ void convertRGB() {
xPSRL.D (xmm2, 24); // single AND...
}
// Maps a (usn, mask, unpack type, cycle) combination onto a slot of the nVifUpk
// function-pointer table, and fills that slot during code generation.
struct VifUnpackIndexer
{
struct VifUnpackIndexer {
int usn, mask;
int curCycle, cyclesToWrite;
// Returns a reference to the nVifUpk slot selected by this indexer's fields and packType.
// The type index is usn*32 + mask*16 + packType, scaled by the per-type stride.
nVifCall& GetCall( int packType ) const
{
nVifCall& GetCall(int packType) const {
int usnpart = usn*2*16;
int maskpart = mask*16;
int packpart = packType;
int curpart = curCycle;
// NOTE(review): combined with the stride-4 return line below, curCycle*4 advances one whole
// type-stride per cycle, so curCycle=1 lands on the next unpack type's first slot. With
// usn=1, mask=1, packType=0xf, curCycle=3 the index is 63*4 + 12 = 264, which is past the end
// of the 256-entry nVifUpk[(2*2*16)*4] declaration. The unpack loop reads the table as
// base*4 + aMin(vif->cl, 4), which suggests plain curCycle was intended here -- confirm.
int curpart = curCycle*4;
// cycpespart is only consumed by the (4*4)-stride return line; the stride-4 line ignores it.
int cycpespart = cyclesToWrite;
return nVifUpk[((usnpart+maskpart+packpart)*(4*4)) + (curpart+cycpespart)];
return nVifUpk[((usnpart+maskpart+packpart)*4) + (curpart)];
}
// Points the slot for packType at the emitter's current aligned call target, i.e. the
// routine about to be generated.
void xSetCall( int packType ) const
{
void xSetCall(int packType) const {
GetCall( packType ) = (nVifCall)xGetAlignedCallTarget();
}
// Marks the slot for packType as having no routine (NULL).
// NOTE(review): no NULL check is visible where the unpack loop calls through the table;
// presumably these unpack types never reach it -- confirm.
void xSetNullCall( int packType ) const
{
void xSetNullCall(int packType) const {
GetCall( packType ) = NULL;
}
};
// Load helpers used by the unpack generators, named by source width in bits.
// xMOVSS doesn't seem to have all overloads defined with new emitter
// NOTE(review): xMOVSSS ignores both regX and loc -- the expansion is always
// SSE_MOVSS_Rm_to_XMM(0, 2, 0). Presumably that encodes "movss xmm0, [edx+0]", which matches
// every use in this file (all pass xmm0 and ptr32[edx]); confirm against the legacy emitter.
// Any other operands would silently emit the same fixed instruction.
#define xMOVSSS(regX, loc) SSE_MOVSS_Rm_to_XMM(0, 2, 0)
// The 8-, 16- and 32-bit helpers all forward to xMOVSSS, so they emit the same load.
#define xMOV8(regX, loc) xMOVSSS(regX, loc)
#define xMOV16(regX, loc) xMOVSSS(regX, loc)
#define xMOV32(regX, loc) xMOVSSS(regX, loc)
// The 64- and 128-bit helpers both expand to xMOVUPS with the operands given.
#define xMOV64(regX, loc) xMOVUPS(regX, loc)
#define xMOV128(regX, loc) xMOVUPS(regX, loc)
// ecx = dest, edx = src
void nVifGen(int usn, int mask, int curCycle, int cycles) {
const VifUnpackIndexer indexer = { usn, mask, curCycle, cycles };
void nVifGen(int usn, int mask, int curCycle) {
const VifUnpackIndexer indexer = { usn, mask, curCycle, 0 };
indexer.xSetCall(0x0); // S-32
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0);
if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1);
if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2);
if (cycles>=0) xMovDest (xmm1, xmm2, xmm3);
xMOV32 (xmm0, ptr32[edx]);
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
xRET();
indexer.xSetCall(0x1); // S-16
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
if (cycles>=0) xShiftR (xmm0, 16);
if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0);
if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1);
if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2);
if (cycles>=0) xMovDest (xmm1, xmm2, xmm3);
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
xRET();
indexer.xSetCall(0x2); // S-8
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
if (cycles>=0) xShiftR (xmm0, 24);
if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0);
if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1);
if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2);
if (cycles>=0) xMovDest (xmm1, xmm2, xmm3);
xMOV8 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
xRET();
indexer.xSetNullCall(0x3); // ----
indexer.xSetCall(0x4); // V2-32
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x10]);
if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe);
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xMOV64 (xmm0, ptr32[edx]);
xMovDest (xmm0);
xRET();
indexer.xSetCall(0x5); // V2-16
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2);
if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
if (cycles>=0) xShiftR (xmm0, 16);
if (cycles>=2) xShiftR (xmm2, 16);
if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe);
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
xMovDest (xmm0);
xRET();
indexer.xSetCall(0x6); // V2-8
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2);
if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
if (cycles>=0) xShiftR (xmm0, 24);
if (cycles>=2) xShiftR (xmm2, 24);
if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe);
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
xMovDest (xmm0);
xRET();
indexer.xSetNullCall(0x7); // ----
indexer.xSetCall(0x8); // V3-32
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+12]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+24]);
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xMOV128 (xmm0, ptr32[edx]);
xMovDest (xmm0);
xRET();
indexer.xSetCall(0x9); // V3-16
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+6]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+12]);
if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1);
if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
if (cycles>=0) xShiftR (xmm0, 16);
if (cycles>=1) xShiftR (xmm1, 16);
if (cycles>=2) xShiftR (xmm2, 16);
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
xMovDest (xmm0);
xRET();
indexer.xSetCall(0xa); // V3-8
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+3]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+6]);
if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1);
if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2);
if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1);
if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
if (cycles>=0) xShiftR (xmm0, 24);
if (cycles>=1) xShiftR (xmm1, 24);
if (cycles>=2) xShiftR (xmm2, 24);
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
xMovDest (xmm0);
xRET();
indexer.xSetNullCall(0xb); // ----
indexer.xSetCall(0xc); // V4-32
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]);
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xMOV128 (xmm0, ptr32[edx]);
xMovDest (xmm0);
xRET();
indexer.xSetCall(0xd); // V4-16
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]);
if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1);
if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
if (cycles>=0) xShiftR (xmm0, 16);
if (cycles>=1) xShiftR (xmm1, 16);
if (cycles>=2) xShiftR (xmm2, 16);
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
xMovDest (xmm0);
xRET();
indexer.xSetCall(0xe); // V4-8
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+4]);
if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+8]);
if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0);
if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1);
if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2);
if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0);
if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1);
if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2);
if (cycles>=0) xShiftR (xmm0, 24);
if (cycles>=1) xShiftR (xmm1, 24);
if (cycles>=2) xShiftR (xmm2, 24);
if (cycles>=0) xMovDest (xmm0, xmm1, xmm2);
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
xMovDest (xmm0);
xRET();
// A | B5 | G5 | R5
// ..0.. A 0000000 | ..0.. B 000 | ..0.. G 000 | ..0.. R 000
indexer.xSetCall(0xf); // V4-5
if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]);
if (cycles>=0) xMOVAPS (xmm1, xmm0);
if (cycles>=0) convertRGB();
if (cycles>=0) xMOVAPS (ptr32[ecx], xmm2);
if (cycles>=1) xMOVAPS (xmm1, xmm0);
if (cycles>=1) xPSRL.D (xmm1, 16);
if (cycles>=1) convertRGB();
if (cycles>=1) xMOVAPS (ptr32[ecx+0x10], xmm2);
if (cycles>=2) xPSHUF.D (xmm1, xmm0, _v1);
if (cycles>=2) convertRGB();
if (cycles>=2) xMOVAPS (ptr32[ecx+0x20], xmm2);
xMOV16 (xmm0, ptr32[edx]);
xMOVAPS (xmm1, xmm0);
convertRGB();
xMovDest (xmm2);
xRET();
pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );