From e76e1c66dba1bec5158b8dc72d99303b53a01a36 Mon Sep 17 00:00:00 2001 From: cottonvibes Date: Wed, 16 Dec 2009 02:27:53 +0000 Subject: [PATCH] newVif: optimizations, cleanups, and bug fixes... git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2349 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/x86/newVif.h | 4 +- pcsx2/x86/newVif_Unpack.inl | 181 +++++++++++------------------ pcsx2/x86/newVif_UnpackGen.inl | 201 ++++++++++++--------------------- 3 files changed, 141 insertions(+), 245 deletions(-) diff --git a/pcsx2/x86/newVif.h b/pcsx2/x86/newVif.h index 6d369a8d00..e814738cd9 100644 --- a/pcsx2/x86/newVif.h +++ b/pcsx2/x86/newVif.h @@ -24,8 +24,8 @@ extern void _nVifUnpack(int idx, u8 *data, u32 size); typedef u32 (__fastcall *nVifCall)(void*, void*); static __pagealigned u8 nVifUpkExec[__pagesize*16]; -static __aligned16 nVifCall nVifUpk[(2*2*16)*4*4]; // ([USN][Masking][Unpack Type]) [curCycle][CyclesToWrite-1] -static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector] +static __aligned16 nVifCall nVifUpk[(2*2*16)*4]; // ([USN][Masking][Unpack Type]) [curCycle] +static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector] #define _v0 0 #define _v1 0x55 diff --git a/pcsx2/x86/newVif_Unpack.inl b/pcsx2/x86/newVif_Unpack.inl index 6d1296f8c0..54f36e3d87 100644 --- a/pcsx2/x86/newVif_Unpack.inl +++ b/pcsx2/x86/newVif_Unpack.inl @@ -46,21 +46,21 @@ void initNewVif(int idx) { for (int a = 0; a < 2; a++) { for (int b = 0; b < 2; b++) { for (int c = 0; c < 4; c++) { - for (int d = 0; d < 3; d++) { - nVifGen(a, b, c, d); - }}}} + nVifGen(a, b, c); + }}} HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true); } int nVifUnpack(int idx, u32 *data) { XMMRegisters::Freeze(); - //BlockBuffer* vB = nVif[idx].vifBlock; - int ret = aMin(vif1.vifpacketsize, vif1.tag.size); + int ret = aMin(vif1.vifpacketsize, vif1.tag.size); vif1.tag.size -= ret; _nVifUnpack(idx, (u8*)data, ret<<2); - if (vif1.tag.size <= 0) vif1.tag.size = 0; - if (vif1.tag.size <= 0) vif1.cmd = 0; + if (vif1.tag.size <= 0) { + vif1.tag.size = 0; + vif1.cmd = 0; + } XMMRegisters::Thaw(); return ret; } @@ -78,7 +78,14 @@ _f void incVUptr(int idx, u8* &ptr, int amount) { if ((uptr)ptr & 0xf) DevCon.WriteLn("unaligned wtf :("); } -static void setMasks(const VIFregisters& v) { +static u32 oldMaskIdx = -1; +static u32 oldMask = 0; + +static void setMasks(int idx, const VIFregisters& v) { + if (idx == oldMaskIdx && oldMask == v.mask) return; + oldMaskIdx = idx; + oldMask = v.mask; + //DevCon.WriteLn("mask"); for (int i = 0; i < 16; i++) { int m = (v.mask >> (i*2)) & 3; switch (m) { @@ -120,14 +127,8 @@ static void setMasks(const VIFregisters& v) { // to be simple enough that it doesn't offset the benefits (which I'm not sure is possible). // -- air - -//template< int idx, bool doMode, bool isFill > -//__releaseinline void __fastcall _nVifUnpackLoop( u8 *data, u32 size ) -__releaseinline void __fastcall _nVifUnpackLoop( int idx, u8 *data, u32 size ) -{ - // comment out the following 2 lines to test templated version... - const bool doMode = !!vifRegs->mode; - const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl); +template< int idx, bool doMode, bool isFill > +__releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) { const int usn = !!(vif->usn); const int doMask = !!(vif->tag.cmd & 0x10); @@ -141,67 +142,49 @@ __releaseinline void __fastcall _nVifUnpackLoop( int idx, u8 *data, u32 size ) // Did a bunch of work to make it so I could optimize this index lookup to outside // the main loop but it was for naught -- too often the loop is only 1-2 iterations, // so this setup code ends up being slower (1 iter) or same speed (2 iters). - const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + (doMask*16) + (upkNum)) * (4*4) ]; + const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + (doMask*16) + (upkNum)) * (4*1) ]; const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl; const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl; + const int skipSize = blockSize - cycleSize; + //if (skipSize > 2) + //DevCon.WriteLn("[num = %d][cl = %d][bl = %d][diff = %d]", vifRegs->num, vif->cl, blockSize, skipSize); - if (doMask) - setMasks(*vifRegs); + // This condition doesn't appear to ever occur, and really it never should. + // Normally it wouldn't matter, but even simple setup code matters here (see + // optimization notes above) >_< + if (vif->cl >= blockSize) vif->cl = 0; + if (doMask) setMasks(idx, *vifRegs); - if (vif->cl >= blockSize) { - - // This condition doesn't appear to ever occur, and really it never should. - // Normally it wouldn't matter, but even simple setup code matters here (see - // optimization notes above) >_< - - vif->cl = 0; - } - - while (vifRegs->num > 0) { - if (vif->cl < cycleSize) { - //if (size <= 0) { DbgCon.WriteLn("_nVifUnpack: Out of Data!"); break; } + while (vifRegs->num /*&& size*/) { + if (vif->cl < cycleSize) { if (doMode /*|| doMask*/) { //if (doMask) //DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum); func((u32*)dest, (u32*)data, ft.qsize); - data += ft.gsize; - size -= ft.gsize; - vifRegs->num--; - } - else if (1) { - //DevCon.WriteLn("SSE Unpack!"); - fnbase[aMin(vif->cl, 4) * 4](dest, data); - data += vift; - size -= vift; - vifRegs->num--; } else { //DevCon.WriteLn("SSE Unpack!"); - int c = aMin((cycleSize - vif->cl), 3); - size -= vift * c; - //if (c>1) { DevCon.WriteLn("C > 1!"); } - if (c<0||c>3) { DbgCon.WriteLn("C wtf!"); } - if (size < 0) { DbgCon.WriteLn("Size Shit"); size+=vift*c;c=1;size-=vift*c;} - fnbase[(aMin(vif->cl, 4) * 4) + c-1](dest, data); - data += vift * c; - vifRegs->num -= c; + fnbase[aMin(vif->cl, 4)](dest, data); } + data += vift; + size -= vift; + vifRegs->num--; + incVUptr(idx, dest, 16); + if (++vif->cl == blockSize) vif->cl = 0; } else if (isFill) { func((u32*)dest, (u32*)data, ft.qsize); vifRegs->num--; + incVUptr(idx, dest, 16); + if (++vif->cl == blockSize) vif->cl = 0; + } + else { + incVUptr(idx, dest, 16 * skipSize); + vif->cl = 0; } - incVUptr(idx, dest, 16); - - // Removing this modulo was a huge speedup for God of War start menu. (62->73 fps) - // (GoW and tri-ace games both use a lot of blockSize==1 packets, resulting in tons - // of loops -- so the biggest factor in performance ends up being the top-level - // conditionals of the loop, and also the loop prep code.) --air - - //vif->cl = (vif->cl+1) % blockSize; - if( ++vif->cl == blockSize ) vif->cl = 0; } + //if (size > 0) DevCon.WriteLn("size = %d", size); } void _nVifUnpack(int idx, u8 *data, u32 size) { @@ -212,68 +195,42 @@ void _nVifUnpack(int idx, u8 *data, u32 size) { } else*/ { // filling write - vif = nVif[idx].vif; - vifRegs = nVif[idx].vifRegs; - -#if 1 - _nVifUnpackLoop( idx, data, size ); -#else - // Eh... template attempt, tho it didn't help much. There's too much setup code, - // and the template only optimizes code inside the loop, which often times seems to - // only be run once or twice anyway. Better to use recompilation than templating - // anyway, but I'll leave it in for now for reference. -- air - - const bool doMode = !!vifRegs->mode; - const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl); + vif = nVif[idx].vif; + vifRegs = nVif[idx].vifRegs; + const bool doMode = !!vifRegs->mode; + const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl); //UnpackLoopTable[idx][doMode][isFill]( data, size ); - if( idx ) - { - if( doMode ) - { - if( isFill ) - _nVifUnpackLoop<1,true,true>( data, size ); - else - _nVifUnpackLoop<1,true,false>( data, size ); + if (idx) { + if (doMode) { + if (isFill) _nVifUnpackLoop<1,true,true> (data, size); + else _nVifUnpackLoop<1,true,false> (data, size); } - else - { - if( isFill ) - _nVifUnpackLoop<1,false,true>( data, size ); - else - _nVifUnpackLoop<1,false,false>( data, size ); + else { + if (isFill) _nVifUnpackLoop<1,false,true> (data, size); + else _nVifUnpackLoop<1,false,false>(data, size); } } - else - { - pxFailDev( "No VIF0 support yet, sorry!" ); - } -#endif + else pxFailDev( "No VIF0 support yet, sorry!" ); + //if (isFill) //DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok")); - //DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num); - + //DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num); } } -//int nVifUnpack(int idx, u32 *data) { -// XMMRegisters::Freeze(); -// BlockBuffer* vB = nVif[idx].vifBlock; -// int ret = aMin(vif1.vifpacketsize, vif1.tag.size); -// //vB->append(data, ret<<2); -// vif1.tag.size -= ret; -// //DevCon.WriteLn("2 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2); -// //if (vif1.tag.size <= 0) { -// //DevCon.WriteLn("3 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2); -// //VIFunpack<1>(vB->getBlock(), &vif1.tag, vB->getSize()>>2); -// //_nVifUnpack(idx, vB->getBlock(), vB->getSize()); -// _nVifUnpack(idx, (u8*)data, ret<<2); -// if (vif1.tag.size <= 0) vif1.tag.size = 0; -// if (vif1.tag.size <= 0) vif1.cmd = 0; -// //vB->clear(); -// //} -// //else { vif1.tag.size+=ret; ret = -1; vB->clear(); } -// XMMRegisters::Thaw(); -// return ret; -//} +//data += ft.gsize; +//size -= ft.gsize; +//vifRegs->num--; +//else { +// //DevCon.WriteLn("SSE Unpack!"); +// int c = aMin((cycleSize - vif->cl), 3); +// size -= vift * c; +// //if (c>1) { DevCon.WriteLn("C > 1!"); } +// if (c<0||c>3) { DbgCon.WriteLn("C wtf!"); } +// if (size < 0) { DbgCon.WriteLn("Size Shit"); size+=vift*c;c=1;size-=vift*c;} +// fnbase[(aMin(vif->cl, 4) * 4) + c-1](dest, data); +// data += vift * c; +// vifRegs->num -= c; +//} \ No newline at end of file diff --git a/pcsx2/x86/newVif_UnpackGen.inl b/pcsx2/x86/newVif_UnpackGen.inl index e735704e62..b3dad6655b 100644 --- a/pcsx2/x86/newVif_UnpackGen.inl +++ b/pcsx2/x86/newVif_UnpackGen.inl @@ -29,17 +29,9 @@ if (x==2) xMOVAPS(ptr32[ecx+0x20], regX); \ } -#define xMovDest(reg0, reg1, reg2) { \ - if (mask==0) { \ - if (cycles>=0) { xMOVAPS (ptr32[ecx], reg0); } \ - if (cycles>=1) { xMOVAPS (ptr32[ecx+0x10], reg1); } \ - if (cycles>=2) { xMOVAPS (ptr32[ecx+0x20], reg2); } \ - } \ - else { \ - if (cycles>=0) { xMaskWrite(reg0, 0); } \ - if (cycles>=1) { xMaskWrite(reg1, 1); } \ - if (cycles>=2) { xMaskWrite(reg2, 2); } \ - } \ +#define xMovDest(reg0) { \ + if (mask==0) { xMOVAPS (ptr32[ecx], reg0); } \ + else { xMaskWrite(reg0, 0); } \ } // xmm2 gets result @@ -66,189 +58,136 @@ void convertRGB() { xPSRL.D (xmm2, 24); // single AND... } -struct VifUnpackIndexer -{ +struct VifUnpackIndexer { int usn, mask; int curCycle, cyclesToWrite; - nVifCall& GetCall( int packType ) const - { + nVifCall& GetCall(int packType) const { int usnpart = usn*2*16; int maskpart = mask*16; int packpart = packType; + int curpart = curCycle; - int curpart = curCycle*4; - int cycpespart = cyclesToWrite; - - return nVifUpk[((usnpart+maskpart+packpart)*(4*4)) + (curpart+cycpespart)]; + return nVifUpk[((usnpart+maskpart+packpart)*4) + (curpart)]; } - void xSetCall( int packType ) const - { + void xSetCall(int packType) const { GetCall( packType ) = (nVifCall)xGetAlignedCallTarget(); } - void xSetNullCall( int packType ) const - { + void xSetNullCall(int packType) const { GetCall( packType ) = NULL; } }; +// xMOVSS doesn't seem to have all overloads defined with new emitter +#define xMOVSSS(regX, loc) SSE_MOVSS_Rm_to_XMM(0, 2, 0) + +#define xMOV8(regX, loc) xMOVSSS(regX, loc) +#define xMOV16(regX, loc) xMOVSSS(regX, loc) +#define xMOV32(regX, loc) xMOVSSS(regX, loc) +#define xMOV64(regX, loc) xMOVUPS(regX, loc) +#define xMOV128(regX, loc) xMOVUPS(regX, loc) // ecx = dest, edx = src -void nVifGen(int usn, int mask, int curCycle, int cycles) { - const VifUnpackIndexer indexer = { usn, mask, curCycle, cycles }; +void nVifGen(int usn, int mask, int curCycle) { + const VifUnpackIndexer indexer = { usn, mask, curCycle, 0 }; indexer.xSetCall(0x0); // S-32 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0); - if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1); - if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2); - if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); + xMOV32 (xmm0, ptr32[edx]); + xPSHUF.D (xmm1, xmm0, _v0); + xMovDest (xmm1); xRET(); indexer.xSetCall(0x1); // S-16 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=0) xShiftR (xmm0, 16); - if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0); - if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1); - if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2); - if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); + xMOV16 (xmm0, ptr32[edx]); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 16); + xPSHUF.D (xmm1, xmm0, _v0); + xMovDest (xmm1); xRET(); indexer.xSetCall(0x2); // S-8 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=0) xShiftR (xmm0, 24); - if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0); - if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1); - if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2); - if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); + xMOV8 (xmm0, ptr32[edx]); + xPUNPCK.LBW(xmm0, xmm0); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 24); + xPSHUF.D (xmm1, xmm0, _v0); + xMovDest (xmm1); xRET(); indexer.xSetNullCall(0x3); // ---- indexer.xSetCall(0x4); // V2-32 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x10]); - if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xMOV64 (xmm0, ptr32[edx]); + xMovDest (xmm0); xRET(); indexer.xSetCall(0x5); // V2-16 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); - if (cycles>=0) xShiftR (xmm0, 16); - if (cycles>=2) xShiftR (xmm2, 16); - if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xMOV32 (xmm0, ptr32[edx]); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 16); + xMovDest (xmm0); xRET(); indexer.xSetCall(0x6); // V2-8 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); - if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); - if (cycles>=0) xShiftR (xmm0, 24); - if (cycles>=2) xShiftR (xmm2, 24); - if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xMOV16 (xmm0, ptr32[edx]); + xPUNPCK.LBW(xmm0, xmm0); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 24); + xMovDest (xmm0); xRET(); indexer.xSetNullCall(0x7); // ---- indexer.xSetCall(0x8); // V3-32 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+12]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+24]); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xMOV128 (xmm0, ptr32[edx]); + xMovDest (xmm0); xRET(); indexer.xSetCall(0x9); // V3-16 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+6]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+12]); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); - if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); - if (cycles>=0) xShiftR (xmm0, 16); - if (cycles>=1) xShiftR (xmm1, 16); - if (cycles>=2) xShiftR (xmm2, 16); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xMOV64 (xmm0, ptr32[edx]); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 16); + xMovDest (xmm0); xRET(); indexer.xSetCall(0xa); // V3-8 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+3]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+6]); - if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); - if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1); - if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); - if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); - if (cycles>=0) xShiftR (xmm0, 24); - if (cycles>=1) xShiftR (xmm1, 24); - if (cycles>=2) xShiftR (xmm2, 24); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xMOV32 (xmm0, ptr32[edx]); + xPUNPCK.LBW(xmm0, xmm0); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 24); + xMovDest (xmm0); xRET(); indexer.xSetNullCall(0xb); // ---- indexer.xSetCall(0xc); // V4-32 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xMOV128 (xmm0, ptr32[edx]); + xMovDest (xmm0); xRET(); indexer.xSetCall(0xd); // V4-16 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); - if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); - if (cycles>=0) xShiftR (xmm0, 16); - if (cycles>=1) xShiftR (xmm1, 16); - if (cycles>=2) xShiftR (xmm2, 16); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xMOV64 (xmm0, ptr32[edx]); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 16); + xMovDest (xmm0); xRET(); indexer.xSetCall(0xe); // V4-8 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+4]); - if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+8]); - if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); - if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1); - if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2); - if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); - if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); - if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); - if (cycles>=0) xShiftR (xmm0, 24); - if (cycles>=1) xShiftR (xmm1, 24); - if (cycles>=2) xShiftR (xmm2, 24); - if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xMOV32 (xmm0, ptr32[edx]); + xPUNPCK.LBW(xmm0, xmm0); + xPUNPCK.LWD(xmm0, xmm0); + xShiftR (xmm0, 24); + xMovDest (xmm0); xRET(); // A | B5 | G5 | R5 // ..0.. A 0000000 | ..0.. B 000 | ..0.. G 000 | ..0.. R 000 indexer.xSetCall(0xf); // V4-5 - if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); - if (cycles>=0) xMOVAPS (xmm1, xmm0); - if (cycles>=0) convertRGB(); - if (cycles>=0) xMOVAPS (ptr32[ecx], xmm2); - if (cycles>=1) xMOVAPS (xmm1, xmm0); - if (cycles>=1) xPSRL.D (xmm1, 16); - if (cycles>=1) convertRGB(); - if (cycles>=1) xMOVAPS (ptr32[ecx+0x10], xmm2); - if (cycles>=2) xPSHUF.D (xmm1, xmm0, _v1); - if (cycles>=2) convertRGB(); - if (cycles>=2) xMOVAPS (ptr32[ecx+0x20], xmm2); + xMOV16 (xmm0, ptr32[edx]); + xMOVAPS (xmm1, xmm0); + convertRGB(); + xMovDest (xmm2); xRET(); pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );