/* Pcsx2 - Pc Ps2 Emulator
 * Copyright (C) 2002-2003 Pcsx2 Team
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include
#include

#include "Common.h"
#include "ix86/ix86.h"
#include "Vif.h"
#include "VUmicro.h"
#include

VIFregisters *_vifRegs;
u32* _vifMaskRegs = NULL;
__declspec(align(16)) u32 g_vifRow0[4], g_vifCol0[4], g_vifRow1[4], g_vifCol1[4];
u32* _vifRow = NULL;
vifStruct *_vif;

static int n;
static int i;

__inline static int _limit( int a, int max ) { return ( a > max ? max : a ); }

#define _UNPACKpart( offnum, func ) \
    if ( ( size > 0 ) && ( _vifRegs->offset == offnum ) ) { \
        func; \
        size--; \
        _vifRegs->offset++; \
    }

#define _UNPACKpart_nosize( offnum, func ) \
    if ( ( _vifRegs->offset == offnum ) ) { \
        func; \
        _vifRegs->offset++; \
    }

static void _writeX( u32 *dest, u32 data ) {
    //int n;
    switch ( _vif->cl ) {
        case 0:  n = 0; break;
        case 1:  n = 8; break;
        case 2:  n = 16; break;
        default: n = 24; break;
    }
#ifdef VIF_LOG
    VIF_LOG("_writeX %x = %x (writecycle=%d; mask %x; mode %d)\n", (u32)dest-(u32)VU1.Mem, data, _vif->cl, ( _vifRegs->mask >> n ) & 0x3,_vifRegs->mode);
#endif
    switch ( ( _vifRegs->mask >> n ) & 0x3 ) {
        case 0:
            if (_vifRegs->mode == 1) {
                *dest = data + _vifRegs->r0;
            } else if (_vifRegs->mode == 2) {
                _vifRegs->r0 = data + _vifRegs->r0;
                *dest = _vifRegs->r0;
            } else {
                *dest = data;
            }
            break;
        case 1:
            *dest = _vifRegs->r0;
            break;
        case 2:
            switch ( _vif->cl ) {
                case 0:  *dest = _vifRegs->c0; break;
                case 1:  *dest = _vifRegs->c1; break;
                case 2:  *dest = _vifRegs->c2; break;
                default: *dest = _vifRegs->c3; break;
            }
            break;
    }
#ifdef VIF_LOG
    VIF_LOG("_writeX-done : Data %x : Row %x\n", *dest, _vifRegs->r0);
#endif
}

static void _writeY( u32 *dest, u32 data ) {
    //int n;
    switch ( _vif->cl ) {
        case 0:  n = 2; break;
        case 1:  n = 10; break;
        case 2:  n = 18; break;
        default: n = 26; break;
    }
#ifdef VIF_LOG
    VIF_LOG("_writeY %x = %x (writecycle=%d; mask %x; mode %d)\n", (u32)dest-(u32)VU1.Mem, data, _vif->cl, ( _vifRegs->mask >> n ) & 0x3,_vifRegs->mode);
#endif
    switch ( ( _vifRegs->mask >> n ) & 0x3 ) {
        case 0:
            if (_vifRegs->mode == 1) {
                *dest = data + _vifRegs->r1;
            } else if (_vifRegs->mode == 2) {
                _vifRegs->r1 = data + _vifRegs->r1;
                *dest = _vifRegs->r1;
            } else {
                *dest = data;
            }
            break;
        case 1:
            *dest = _vifRegs->r1;
            break;
        case 2:
            switch ( _vif->cl ) {
                case 0:  *dest = _vifRegs->c0; break;
                case 1:  *dest = _vifRegs->c1; break;
                case 2:  *dest = _vifRegs->c2; break;
                default: *dest = _vifRegs->c3; break;
            }
            break;
    }
#ifdef VIF_LOG
    VIF_LOG("_writeY-done : Data %x : Row %x\n", *dest, _vifRegs->r1);
#endif
}

static void _writeZ( u32 *dest, u32 data ) {
    //int n;
    switch ( _vif->cl ) {
        case 0:  n = 4; break;
        case 1:  n = 12; break;
        case 2:  n = 20; break;
        default: n = 28; break;
    }
#ifdef VIF_LOG
    VIF_LOG("_writeZ %x = %x (writecycle=%d; mask %x; mode %d)\n", (u32)dest-(u32)VU1.Mem, data, _vif->cl, ( _vifRegs->mask >> n ) & 0x3,_vifRegs->mode);
#endif
    switch ( (
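/* The 32-bit VIFn_MASK register holds one mask byte per write cycle (byte 0
 * for cl==0 ... byte 3 for cl>=3); inside a byte, bits 1:0 select X, 3:2 Y,
 * 5:4 Z and 7:6 W, which is where the n = 0/8/16/24 (+0/2/4/6) offsets used
 * by _writeX.._writeW come from.  A 2-bit value of 0 writes the unpacked data
 * (offset by, or accumulated into, the row register when mode is 1 or 2),
 * 1 writes the row register, 2 writes the column register for the current
 * cycle, and 3 leaves the destination untouched.  A minimal scalar sketch of
 * that lookup (helper name is made up, not part of this file):
 *
 *   static int mask_bits(u32 mask, int cl, int field)  // field: 0=X .. 3=W
 *   {
 *       int cycle = (cl > 3) ? 3 : cl;                 // cl >= 3 reuses byte 3
 *       return (mask >> (cycle * 8 + field * 2)) & 3;  // 0=data 1=row 2=col 3=skip
 *   }
 */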
_vifRegs->mask >> n ) & 0x3 ) { case 0: if (_vifRegs->mode == 1) { *dest = data + _vifRegs->r2; } else if (_vifRegs->mode == 2) { _vifRegs->r2 = data + _vifRegs->r2; *dest = _vifRegs->r2; } else { *dest = data; } break; case 1: *dest = _vifRegs->r2; break; case 2: switch ( _vif->cl ) { case 0: *dest = _vifRegs->c0; break; case 1: *dest = _vifRegs->c1; break; case 2: *dest = _vifRegs->c2; break; default: *dest = _vifRegs->c3; break; } break; } } static void _writeW( u32 *dest, u32 data ) { //int n; switch ( _vif->cl ) { case 0: n = 6; break; case 1: n = 14; break; case 2: n = 22; break; default: n = 30; break; } #ifdef VIF_LOG VIF_LOG("_writeW %x = %x (writecycle=%d; mask %x; mode %d)\n", (u32)dest-(u32)VU1.Mem, data, _vif->cl, ( _vifRegs->mask >> n ) & 0x3,_vifRegs->mode); #endif switch ( ( _vifRegs->mask >> n ) & 0x3 ) { case 0: if (_vifRegs->mode == 1) { *dest = data + _vifRegs->r3; } else if (_vifRegs->mode == 2) { _vifRegs->r3 = data + _vifRegs->r3; *dest = _vifRegs->r3; } else { *dest = data; } break; case 1: *dest = _vifRegs->r3; break; case 2: switch ( _vif->cl ) { case 0: *dest = _vifRegs->c0; break; case 1: *dest = _vifRegs->c1; break; case 2: *dest = _vifRegs->c2; break; default: *dest = _vifRegs->c3; break; } break; } } static void writeX( u32 *dest, u32 data ) { if (_vifRegs->code & 0x10000000) { _writeX(dest, data); return; } if (_vifRegs->mode == 1) { *dest = data + _vifRegs->r0; } else if (_vifRegs->mode == 2) { _vifRegs->r0 = data + _vifRegs->r0; *dest = _vifRegs->r0; } else { *dest = data; } #ifdef VIF_LOG VIF_LOG("writeX %8.8x : Mode %d, r0 = %x, writing %8.8x\n", *dest,_vifRegs->mode,_vifRegs->r0,data + _vifRegs->r0); #endif } static void writeY( u32 *dest, u32 data ) { if (_vifRegs->code & 0x10000000) { _writeY(dest, data); return; } if (_vifRegs->mode == 1) { *dest = data + _vifRegs->r1; } else if (_vifRegs->mode == 2) { _vifRegs->r1 = data + _vifRegs->r1; *dest = _vifRegs->r1; } else { *dest = data; } #ifdef VIF_LOG VIF_LOG("writeY %8.8x : Mode %d, r1 = %x, writing %8.8x\n", *dest,_vifRegs->mode,_vifRegs->r1,data + _vifRegs->r1); #endif } static void writeZ( u32 *dest, u32 data ) { if (_vifRegs->code & 0x10000000) { _writeZ(dest, data); return; } if (_vifRegs->mode == 1) { *dest = data + _vifRegs->r2; } else if (_vifRegs->mode == 2) { _vifRegs->r2 = data + _vifRegs->r2; *dest = _vifRegs->r2; } else { *dest = data; } #ifdef VIF_LOG VIF_LOG("writeZ %8.8x : Mode %d, r2 = %x, writing %8.8x\n", *dest,_vifRegs->mode,_vifRegs->r2,data + _vifRegs->r2); #endif } static void writeW( u32 *dest, u32 data ) { if (_vifRegs->code & 0x10000000) { _writeW(dest, data); return; } if (_vifRegs->mode == 1) { *dest = data + _vifRegs->r3; } else if (_vifRegs->mode == 2) { _vifRegs->r3 = data + _vifRegs->r3; *dest = _vifRegs->r3; } else { *dest = data; } #ifdef VIF_LOG VIF_LOG("writeW %8.8x : Mode %d, r3 = %x, writing %8.8x\n", *dest,_vifRegs->mode,_vifRegs->r3,data + _vifRegs->r3); #endif } void UNPACK_S_32(u32 *dest, u32 *data) { writeX(dest++, *data); writeY(dest++, *data); writeZ(dest++, *data); writeW(dest++, *data++); } int UNPACK_S_32part(u32 *dest, u32 *data, int size) { u32 *_data = data; while (size > 0) { _UNPACKpart(0, writeX(dest++, *data) ); _UNPACKpart(1, writeY(dest++, *data) ); _UNPACKpart(2, writeZ(dest++, *data) ); _UNPACKpart(3, writeW(dest++, *data++) ); if (_vifRegs->offset == 4) _vifRegs->offset = 0; } return (u32)data - (u32)_data; } #define _UNPACK_S_16(format) \ format *sdata = (format*)data; \ \ \ writeX(dest++, *sdata); \ writeY(dest++, *sdata); \ writeZ(dest++, 
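/* The *part variants above and below resume an UNPACK whose payload was split
 * across DMA transfers: _vifRegs->offset remembers which of the four output
 * fields (0=X .. 3=W) comes next, and each function returns how many source
 * bytes it consumed.  For example, _UNPACKpart(1, writeY(dest++, *data))
 * expands to roughly:
 *
 *   if ( ( size > 0 ) && ( _vifRegs->offset == 1 ) ) {
 *       writeY(dest++, *data);
 *       size--;
 *       _vifRegs->offset++;
 *   }
 *
 * so a call that runs out of input mid-vector leaves offset pointing at the
 * field to fill when the rest of the data arrives.
 */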
*sdata); \ writeW(dest++, *sdata++); void UNPACK_S_16s( u32 *dest, u32 *data ) { _UNPACK_S_16( s16 ); } void UNPACK_S_16u( u32 *dest, u32 *data ) { _UNPACK_S_16( u16 ); } #define _UNPACK_S_16part(format) \ format *sdata = (format*)data; \ while (size > 0) { \ _UNPACKpart(0, writeX(dest++, *sdata) ); \ _UNPACKpart(1, writeY(dest++, *sdata) ); \ _UNPACKpart(2, writeZ(dest++, *sdata) ); \ _UNPACKpart(3, writeW(dest++, *sdata++) ); \ if (_vifRegs->offset == 4) _vifRegs->offset = 0; \ } \ return (u32)sdata - (u32)data; int UNPACK_S_16spart(u32 *dest, u32 *data, int size) { _UNPACK_S_16part(s16); } int UNPACK_S_16upart(u32 *dest, u32 *data, int size) { _UNPACK_S_16part(u16); } #define _UNPACK_S_8(format) \ format *cdata = (format*)data; \ \ \ writeX(dest++, *cdata); \ writeY(dest++, *cdata); \ writeZ(dest++, *cdata); \ writeW(dest++, *cdata++); void UNPACK_S_8s(u32 *dest, u32 *data) { _UNPACK_S_8(s8); } void UNPACK_S_8u(u32 *dest, u32 *data) { _UNPACK_S_8(u8); } #define _UNPACK_S_8part(format) \ format *cdata = (format*)data; \ while (size > 0) { \ _UNPACKpart(0, writeX(dest++, *cdata) ); \ _UNPACKpart(1, writeY(dest++, *cdata) ); \ _UNPACKpart(2, writeZ(dest++, *cdata) ); \ _UNPACKpart(3, writeW(dest++, *cdata++) ); \ if (_vifRegs->offset == 4) _vifRegs->offset = 0; \ } \ return (u32)cdata - (u32)data; int UNPACK_S_8spart(u32 *dest, u32 *data, int size) { _UNPACK_S_8part(s8); } int UNPACK_S_8upart(u32 *dest, u32 *data, int size) { _UNPACK_S_8part(u8); } void UNPACK_V2_32( u32 *dest, u32 *data ) { writeX(dest++, *data++); writeY(dest++, *data++); writeZ(dest++, 0); writeW(dest++, 0); } int UNPACK_V2_32part( u32 *dest, u32 *data, int size ) { u32 *_data = data; while (size > 0) { _UNPACKpart(0, writeX(dest++, *data++)); _UNPACKpart(1, writeY(dest++, *data++)); _UNPACKpart_nosize(2, writeZ(dest++, 0)); _UNPACKpart_nosize(3, writeW(dest++, 0)); if (_vifRegs->offset == 4) _vifRegs->offset = 0; } return (u32)data - (u32)_data; } #define _UNPACK_V2_16(format) \ format *sdata = (format*)data; \ \ \ writeX(dest++, *sdata++); \ writeY(dest++, *sdata++); \ writeZ(dest++, 0); \ writeW(dest++, 0); \ void UNPACK_V2_16s(u32 *dest, u32 *data) { _UNPACK_V2_16(s16); } void UNPACK_V2_16u(u32 *dest, u32 *data) { _UNPACK_V2_16(u16); } #define _UNPACK_V2_16part(format) \ format *sdata = (format*)data; \ \ while(size > 0) { \ _UNPACKpart(0, writeX(dest++, *sdata++)); \ _UNPACKpart(1, writeY(dest++, *sdata++)); \ _UNPACKpart_nosize(2,writeZ(dest++, 0)); \ _UNPACKpart_nosize(3,writeW(dest++, 0)); \ if (_vifRegs->offset == 4) _vifRegs->offset = 0; \ } \ return (u32)sdata - (u32)data; int UNPACK_V2_16spart(u32 *dest, u32 *data, int size) { _UNPACK_V2_16part(s16); } int UNPACK_V2_16upart(u32 *dest, u32 *data, int size) { _UNPACK_V2_16part(u16); } #define _UNPACK_V2_8(format) \ format *cdata = (format*)data; \ \ \ writeX(dest++, *cdata++); \ writeY(dest++, *cdata++); \ writeZ(dest++, 0); \ writeW(dest++, 0); void UNPACK_V2_8s(u32 *dest, u32 *data) { _UNPACK_V2_8(s8); } void UNPACK_V2_8u(u32 *dest, u32 *data) { _UNPACK_V2_8(u8); } #define _UNPACK_V2_8part(format) \ format *cdata = (format*)data; \ while(size > 0) { \ _UNPACKpart(0, writeX(dest++, *cdata++)); \ _UNPACKpart(1, writeY(dest++, *cdata++)); \ _UNPACKpart_nosize(2,writeZ(dest++, 0)); \ _UNPACKpart_nosize(3,writeW(dest++, 0)); \ if (_vifRegs->offset == 4) _vifRegs->offset = 0; \ } \ return (u32)cdata - (u32)data; int UNPACK_V2_8spart(u32 *dest, u32 *data, int size) { _UNPACK_V2_8part(s8); } int UNPACK_V2_8upart(u32 *dest, u32 *data, int size) { 
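/* V2 formats carry only X and Y in the stream; Z and W are filled with 0 by
 * the unpacker.  In the partial variants those two generated fields go
 * through _UNPACKpart_nosize, which advances _vifRegs->offset without
 * consuming any input, e.g. _UNPACKpart_nosize(2, writeZ(dest++, 0)) expands
 * to roughly:
 *
 *   if ( ( _vifRegs->offset == 2 ) ) {
 *       writeZ(dest++, 0);
 *       _vifRegs->offset++;
 *   }
 */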
_UNPACK_V2_8part(u8); } void UNPACK_V3_32(u32 *dest, u32 *data) { writeX(dest++, *data++); writeY(dest++, *data++); writeZ(dest++, *data++); writeW(dest++, 0); } int UNPACK_V3_32part(u32 *dest, u32 *data, int size) { u32 *_data = data; while (size > 0) { _UNPACKpart(0, writeX(dest++, *data++); ); _UNPACKpart(1, writeY(dest++, *data++); ); _UNPACKpart(2, writeZ(dest++, *data++); ); _UNPACKpart_nosize(3, writeW(dest++, 0); ); if (_vifRegs->offset == 4) _vifRegs->offset = 0; } return (u32)data - (u32)_data; } #define _UNPACK_V3_16(format) \ format *sdata = (format*)data; \ \ \ writeX(dest++, *sdata++); \ writeY(dest++, *sdata++); \ writeZ(dest++, *sdata++); \ writeW(dest++, 0); void UNPACK_V3_16s(u32 *dest, u32 *data) { _UNPACK_V3_16(s16); } void UNPACK_V3_16u(u32 *dest, u32 *data) { _UNPACK_V3_16(u16); } #define _UNPACK_V3_16part(format) \ format *sdata = (format*)data; \ \ while(size > 0) { \ _UNPACKpart(0, writeX(dest++, *sdata++)); \ _UNPACKpart(1, writeY(dest++, *sdata++)); \ _UNPACKpart(2, writeZ(dest++, *sdata++)); \ _UNPACKpart_nosize(3,writeW(dest++, 0)); \ if (_vifRegs->offset == 4) _vifRegs->offset = 0; \ } \ return (u32)sdata - (u32)data; int UNPACK_V3_16spart(u32 *dest, u32 *data, int size) { _UNPACK_V3_16part(s16); } int UNPACK_V3_16upart(u32 *dest, u32 *data, int size) { _UNPACK_V3_16part(u16); } #define _UNPACK_V3_8(format) \ format *cdata = (format*)data; \ \ \ writeX(dest++, *cdata++); \ writeY(dest++, *cdata++); \ writeZ(dest++, *cdata++); \ writeW(dest++, 0); void UNPACK_V3_8s(u32 *dest, u32 *data) { _UNPACK_V3_8(s8); } void UNPACK_V3_8u(u32 *dest, u32 *data) { _UNPACK_V3_8(u8); } #define _UNPACK_V3_8part(format) \ format *cdata = (format*)data; \ while(size > 0) { \ _UNPACKpart(0, writeX(dest++, *cdata++)); \ _UNPACKpart(1, writeY(dest++, *cdata++)); \ _UNPACKpart(2, writeZ(dest++, *cdata++)); \ _UNPACKpart_nosize(3,writeW(dest++, 0)); \ if (_vifRegs->offset == 4) _vifRegs->offset = 0; \ } \ return (u32)cdata - (u32)data; int UNPACK_V3_8spart(u32 *dest, u32 *data, int size) { _UNPACK_V3_8part(s8); } int UNPACK_V3_8upart(u32 *dest, u32 *data, int size) { _UNPACK_V3_8part(u8); } void UNPACK_V4_32( u32 *dest, u32 *data ) { writeX(dest++, *data++); writeY(dest++, *data++); writeZ(dest++, *data++); writeW(dest++, *data++); } int UNPACK_V4_32part(u32 *dest, u32 *data, int size) { u32 *_data = data; while (size > 0) { _UNPACKpart(0, writeX(dest++, *data++) ); _UNPACKpart(1, writeY(dest++, *data++) ); _UNPACKpart(2, writeZ(dest++, *data++) ); _UNPACKpart(3, writeW(dest++, *data++) ); if (_vifRegs->offset == 4) _vifRegs->offset = 0; } return (u32)data - (u32)_data; } #define _UNPACK_V4_16(format) \ format *sdata = (format*)data; \ \ \ writeX(dest++, *sdata++); \ writeY(dest++, *sdata++); \ writeZ(dest++, *sdata++); \ writeW(dest++, *sdata++); void UNPACK_V4_16s(u32 *dest, u32 *data) { _UNPACK_V4_16(s16); } void UNPACK_V4_16u(u32 *dest, u32 *data) { _UNPACK_V4_16(u16); } #define _UNPACK_V4_16part(format) \ format *sdata = (format*)data; \ while (size > 0) { \ _UNPACKpart(0, writeX(dest++, *sdata++) ); \ _UNPACKpart(1, writeY(dest++, *sdata++) ); \ _UNPACKpart(2, writeZ(dest++, *sdata++) ); \ _UNPACKpart(3, writeW(dest++, *sdata++) ); \ if (_vifRegs->offset == 4) _vifRegs->offset = 0; \ } \ return (u32)sdata - (u32)data; int UNPACK_V4_16spart(u32 *dest, u32 *data, int size) { _UNPACK_V4_16part(s16); } int UNPACK_V4_16upart(u32 *dest, u32 *data, int size) { _UNPACK_V4_16part(u16); } #define _UNPACK_V4_8(format) \ format *cdata = (format*)data; \ \ \ writeX(dest++, *cdata++); \ 
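/* The s/u pairs differ only in the element type the source pointer is cast
 * to: with s8/s16 the usual C promotions sign-extend each element before the
 * 32-bit write, with u8/u16 they zero-extend it.  The SSE path later in this
 * file makes the same distinction by instantiating UNPACK_RIGHTSHIFT as
 * either psrad (arithmetic, signed) or psrld (logical, unsigned) after
 * widening with punpckl*.  Scalar equivalent for one byte element:
 *
 *   // format == s8:  *dest = (u32)(s32)*cdata;   // sign-extended
 *   // format == u8:  *dest = (u32)*cdata;        // zero-extended
 */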
    writeY(dest++, *cdata++); \
    writeZ(dest++, *cdata++); \
    writeW(dest++, *cdata++);

void UNPACK_V4_8s(u32 *dest, u32 *data) {
    _UNPACK_V4_8(s8);
}

void UNPACK_V4_8u(u32 *dest, u32 *data) {
    _UNPACK_V4_8(u8);
}

#define _UNPACK_V4_8part(format) \
    format *cdata = (format*)data; \
    while (size > 0) { \
        _UNPACKpart(0, writeX(dest++, *cdata++) ); \
        _UNPACKpart(1, writeY(dest++, *cdata++) ); \
        _UNPACKpart(2, writeZ(dest++, *cdata++) ); \
        _UNPACKpart(3, writeW(dest++, *cdata++) ); \
        if (_vifRegs->offset == 4) _vifRegs->offset = 0; \
    } \
    return (u32)cdata - (u32)data;

int UNPACK_V4_8spart(u32 *dest, u32 *data, int size) {
    _UNPACK_V4_8part(s8);
}

int UNPACK_V4_8upart(u32 *dest, u32 *data, int size) {
    _UNPACK_V4_8part(u8);
}

// V4-5: each 16-bit source value packs R in bits 0-4, G in 5-9, B in 10-14
// and A in bit 15; the shifts below scale each component into an 8-bit field
// before it goes through the normal write/mask logic.
void UNPACK_V4_5(u32 *dest, u32 *data) {
    u16 *sdata = (u16*)data;
    u32 rgba;

    rgba = *sdata++;
    writeX(dest++, (rgba & 0x001f) << 3);
    writeY(dest++, (rgba & 0x03e0) >> 2);
    writeZ(dest++, (rgba & 0x7c00) >> 7);
    writeW(dest++, (rgba & 0x8000) >> 8);
}

int UNPACK_V4_5part(u32 *dest, u32 *data, int size) {
    u16 *sdata = (u16*)data;
    u32 rgba;

    while (size > 0) {
        rgba = *sdata++;
        _UNPACKpart(0, writeX(dest++, (rgba & 0x001f) << 3); );
        _UNPACKpart(1, writeY(dest++, (rgba & 0x03e0) >> 2); );
        _UNPACKpart(2, writeZ(dest++, (rgba & 0x7c00) >> 7); );
        _UNPACKpart(3, writeW(dest++, (rgba & 0x8000) >> 8); );
        if (_vifRegs->offset == 4) _vifRegs->offset = 0;
    }
    return (u32)sdata - (u32)data;
}

#if (defined(__i386__) || defined(__x86_64__))

// sse2 highly optimized vif (~200 separate functions are built) zerofrog(@gmail.com)
#include <xmmintrin.h>   // SSE intrinsics used below (_mm_storel_pi/_mm_storeh_pi)
#include <emmintrin.h>   // SSE2 intrinsics used below (__m128i, _mm_load_si128, _mm_unpack*_epi16)

__declspec(align(16)) u32 g_vif1Masks[64], g_vif0Masks[64];
u32 g_vif1HasMask3[4] = {0}, g_vif0HasMask3[4] = {0};

//static const u32 writearr[4] = { 0xffffffff, 0, 0, 0 };
//static const u32 rowarr[4] = { 0, 0xffffffff, 0, 0 };
//static const u32 colarr[4] = { 0, 0, 0xffffffff, 0 };
//static const u32 updatearr[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0 };

// arranged in writearr, rowarr, colarr, updatearr
static __declspec(align(16)) u32 s_maskarr[16][4] = {
    0xffffffff, 0x00000000, 0x00000000, 0xffffffff,
    0xffff0000, 0x0000ffff, 0x00000000, 0xffffffff,
    0xffff0000, 0x00000000, 0x0000ffff, 0xffffffff,
    0xffff0000, 0x00000000, 0x00000000, 0xffff0000,
    0x0000ffff, 0xffff0000, 0x00000000, 0xffffffff,
    0x00000000, 0xffffffff, 0x00000000, 0xffffffff,
    0x00000000, 0xffff0000, 0x0000ffff, 0xffffffff,
    0x00000000, 0xffff0000, 0x00000000, 0xffff0000,
    0x0000ffff, 0x00000000, 0xffff0000, 0xffffffff,
    0x00000000, 0x0000ffff, 0xffff0000, 0xffffffff,
    0x00000000, 0x00000000, 0xffffffff, 0xffffffff,
    0x00000000, 0x00000000, 0xffff0000, 0xffff0000,
    0x0000ffff, 0x00000000, 0x00000000, 0x0000ffff,
    0x00000000, 0x0000ffff, 0x00000000, 0x0000ffff,
    0x00000000, 0x00000000, 0x0000ffff, 0x0000ffff,
    0x00000000, 0x00000000, 0x00000000, 0x00000000
};

u8 s_maskwrite[256];

void SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
{
    u32 prev = 0;
    FreezeXMMRegs(1);
    for(i = 0; i < 4; ++i, mask >>= 8, oldmask >>= 8, vif1masks += 16) {
        prev |= s_maskwrite[mask&0xff];//((mask&3)==3)||((mask&0xc)==0xc)||((mask&0x30)==0x30)||((mask&0xc0)==0xc0);
        hasmask[i] = prev;

        if( (mask&0xff) != (oldmask&0xff) ) {
            __m128i r0, r1, r2, r3;
            r0 = _mm_load_si128((__m128i*)&s_maskarr[mask&15][0]);
            r2 = _mm_unpackhi_epi16(r0, r0);
            r0 = _mm_unpacklo_epi16(r0, r0);

            r1 = _mm_load_si128((__m128i*)&s_maskarr[(mask>>4)&15][0]);
            r3 = _mm_unpackhi_epi16(r1, r1);
            r1 = _mm_unpacklo_epi16(r1, r1);

            _mm_storel_pi((__m64*)&vif1masks[0], *(__m128*)&r0);
            _mm_storel_pi((__m64*)&vif1masks[2], *(__m128*)&r1);
            _mm_storeh_pi((__m64*)&vif1masks[4],
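/* Layout of the table SetNewMask fills in: g_vif0Masks/g_vif1Masks hold
 * 16 u32 (4 qwords) per write cycle, expanded from the 16-bit patterns in
 * s_maskarr by the punpcklwd/punpckhwd above.  As read back by the
 * UNPACK_Setup_*_SSE macros further down, the per-cycle qwords appear to be:
 *
 *   vifmasks[CL*16 +  0.. 3]  write-enable mask (fields with mask bits 00)
 *   vifmasks[CL*16 +  4.. 7]  row-select mask   (fields with mask bits 01)
 *   vifmasks[CL*16 +  8..11]  col-select mask   (fields with mask bits 10)
 *   vifmasks[CL*16 + 12..15]  store mask        (every field except bits 11)
 *
 * hasmask[i] accumulates, per cycle, whether any mask byte so far contains a
 * "do not write" (11) field, which is what selects the WriteMask code path.
 */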
*(__m128*)&r0); _mm_storeh_pi((__m64*)&vif1masks[6], *(__m128*)&r1); _mm_storel_pi((__m64*)&vif1masks[8], *(__m128*)&r2); _mm_storel_pi((__m64*)&vif1masks[10], *(__m128*)&r3); _mm_storeh_pi((__m64*)&vif1masks[12], *(__m128*)&r2); _mm_storeh_pi((__m64*)&vif1masks[14], *(__m128*)&r3); } } } // msvc++ #define VIF_SRC ecx #define VIF_INC edx #define VIF_DST edi // writing masks #define UNPACK_Write0_Regular(r0, CL, DEST_OFFSET, MOVDQA) \ { \ __asm MOVDQA qword ptr [VIF_DST+(DEST_OFFSET)], r0 \ } \ #define UNPACK_Write1_Regular(r0, CL, DEST_OFFSET, MOVDQA) \ { \ __asm MOVDQA qword ptr [VIF_DST], r0 \ __asm add VIF_DST, VIF_INC \ } \ #define UNPACK_Write0_Mask UNPACK_Write0_Regular #define UNPACK_Write1_Mask UNPACK_Write1_Regular #define UNPACK_Write0_WriteMask(r0, CL, DEST_OFFSET, MOVDQA) \ { \ /* masked write (dest needs to be in edi) */ \ __asm movdqa XMM_WRITEMASK, qword ptr [eax + 64*(CL) + 48] \ /*__asm maskmovdqu r0, XMM_WRITEMASK*/ \ __asm pand r0, XMM_WRITEMASK \ __asm pandn XMM_WRITEMASK, qword ptr [VIF_DST] \ __asm por r0, XMM_WRITEMASK \ __asm MOVDQA qword ptr [VIF_DST], r0 \ __asm add VIF_DST, 16 \ } \ #define UNPACK_Write1_WriteMask(r0, CL, DEST_OFFSET, MOVDQA) \ { \ __asm movdqa XMM_WRITEMASK, qword ptr [eax + 64*(0) + 48] \ /* masked write (dest needs to be in edi) */ \ /*__asm maskmovdqu r0, XMM_WRITEMASK*/ \ __asm pand r0, XMM_WRITEMASK \ __asm pandn XMM_WRITEMASK, qword ptr [VIF_DST] \ __asm por r0, XMM_WRITEMASK \ __asm MOVDQA qword ptr [VIF_DST], r0 \ __asm add VIF_DST, VIF_INC \ } \ #define UNPACK_Mask_SSE_0(r0) \ { \ __asm pand r0, XMM_WRITEMASK \ __asm por r0, XMM_ROWCOLMASK \ } \ // once a qword is uncomprssed, applies masks and saves // note: modifying XMM_WRITEMASK #define UNPACK_Mask_SSE_1(r0) \ { \ /* dest = row + write (only when mask=0), otherwise write */ \ __asm pand r0, XMM_WRITEMASK \ __asm por r0, XMM_ROWCOLMASK \ __asm pand XMM_WRITEMASK, XMM_ROW \ __asm paddd r0, XMM_WRITEMASK \ } \ #define UNPACK_Mask_SSE_2(r0) \ { \ /* dest = row + write (only when mask=0), otherwise write \ row = row + write (only when mask = 0), otherwise row */ \ __asm pand r0, XMM_WRITEMASK \ __asm pand XMM_WRITEMASK, XMM_ROW \ __asm paddd XMM_ROW, r0 \ __asm por r0, XMM_ROWCOLMASK \ __asm paddd r0, XMM_WRITEMASK \ } \ #define UNPACK_WriteMask_SSE_0 UNPACK_Mask_SSE_0 #define UNPACK_WriteMask_SSE_1 UNPACK_Mask_SSE_1 #define UNPACK_WriteMask_SSE_2 UNPACK_Mask_SSE_2 #define UNPACK_Regular_SSE_0(r0) #define UNPACK_Regular_SSE_1(r0) \ { \ __asm paddd r0, XMM_ROW \ } \ #define UNPACK_Regular_SSE_2(r0) \ { \ __asm paddd r0, XMM_ROW \ __asm movdqa XMM_ROW, r0 \ } \ // setting up masks #define UNPACK_Setup_Mask_SSE(CL) \ { \ __asm mov eax, _vifMaskRegs \ __asm movdqa XMM_ROWMASK, qword ptr [eax + 64*(CL) + 16] \ __asm movdqa XMM_ROWCOLMASK, qword ptr [eax + 64*(CL) + 32] \ __asm movdqa XMM_WRITEMASK, qword ptr [eax + 64*(CL)] \ __asm pand XMM_ROWMASK, XMM_ROW \ __asm pand XMM_ROWCOLMASK, XMM_COL \ __asm por XMM_ROWCOLMASK, XMM_ROWMASK \ } \ #define UNPACK_Start_Setup_Mask_SSE_0(CL) UNPACK_Setup_Mask_SSE(CL); #define UNPACK_Start_Setup_Mask_SSE_1(CL) \ { \ __asm mov eax, _vifMaskRegs \ __asm movdqa XMM_ROWMASK, qword ptr [eax + 64*(CL) + 16] \ __asm movdqa XMM_ROWCOLMASK, qword ptr [eax + 64*(CL) + 32] \ __asm pand XMM_ROWMASK, XMM_ROW \ __asm pand XMM_ROWCOLMASK, XMM_COL \ __asm por XMM_ROWCOLMASK, XMM_ROWMASK \ } \ #define UNPACK_Start_Setup_Mask_SSE_2(CL) #define UNPACK_Setup_Mask_SSE_0_1(CL) #define UNPACK_Setup_Mask_SSE_1_1(CL) \ { \ __asm mov eax, _vifMaskRegs \ __asm movdqa XMM_WRITEMASK, 
qword ptr [eax + 64*(0)] \ } \ #define UNPACK_Setup_Mask_SSE_2_1(CL) { \ /* ignore CL, since vif.cycle.wl == 1 */ \ __asm mov eax, _vifMaskRegs \ __asm movdqa XMM_ROWMASK, qword ptr [eax + 64*(0) + 16] \ __asm movdqa XMM_ROWCOLMASK, qword ptr [eax + 64*(0) + 32] \ __asm movdqa XMM_WRITEMASK, qword ptr [eax + 64*(0)] \ __asm pand XMM_ROWMASK, XMM_ROW \ __asm pand XMM_ROWCOLMASK, XMM_COL \ __asm por XMM_ROWCOLMASK, XMM_ROWMASK \ } \ #define UNPACK_Setup_Mask_SSE_0_0(CL) UNPACK_Setup_Mask_SSE(CL) #define UNPACK_Setup_Mask_SSE_1_0(CL) UNPACK_Setup_Mask_SSE(CL) #define UNPACK_Setup_Mask_SSE_2_0(CL) UNPACK_Setup_Mask_SSE(CL) // write mask always destroys XMM_WRITEMASK, so 0_0 = 1_0 #define UNPACK_Setup_WriteMask_SSE_0_0(CL) UNPACK_Setup_Mask_SSE(CL) #define UNPACK_Setup_WriteMask_SSE_1_0(CL) UNPACK_Setup_Mask_SSE(CL) #define UNPACK_Setup_WriteMask_SSE_2_0(CL) UNPACK_Setup_Mask_SSE(CL) #define UNPACK_Setup_WriteMask_SSE_0_1(CL) UNPACK_Setup_Mask_SSE_1_1(CL) #define UNPACK_Setup_WriteMask_SSE_1_1(CL) UNPACK_Setup_Mask_SSE_1_1(CL) #define UNPACK_Setup_WriteMask_SSE_2_1(CL) UNPACK_Setup_Mask_SSE_2_1(CL) #define UNPACK_Start_Setup_WriteMask_SSE_0(CL) UNPACK_Start_Setup_Mask_SSE_1(CL) #define UNPACK_Start_Setup_WriteMask_SSE_1(CL) UNPACK_Start_Setup_Mask_SSE_1(CL) #define UNPACK_Start_Setup_WriteMask_SSE_2(CL) UNPACK_Start_Setup_Mask_SSE_2(CL) #define UNPACK_Start_Setup_Regular_SSE_0(CL) #define UNPACK_Start_Setup_Regular_SSE_1(CL) #define UNPACK_Start_Setup_Regular_SSE_2(CL) #define UNPACK_Setup_Regular_SSE_0_0(CL) #define UNPACK_Setup_Regular_SSE_1_0(CL) #define UNPACK_Setup_Regular_SSE_2_0(CL) #define UNPACK_Setup_Regular_SSE_0_1(CL) #define UNPACK_Setup_Regular_SSE_1_1(CL) #define UNPACK_Setup_Regular_SSE_2_1(CL) #define UNPACK_INC_DST_0_Regular(qw) __asm add VIF_DST, (16*qw) #define UNPACK_INC_DST_1_Regular(qw) #define UNPACK_INC_DST_0_Mask(qw) __asm add VIF_DST, (16*qw) #define UNPACK_INC_DST_1_Mask(qw) #define UNPACK_INC_DST_0_WriteMask(qw) #define UNPACK_INC_DST_1_WriteMask(qw) // unpacks for 1,2,3,4 elements (V3 uses this directly) #define UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType) { \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+0); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \ \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \ \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+2); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R2); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R2, CL+2, 32, movdqa); \ \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+3); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R3); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R3, CL+3, 48, movdqa); \ \ UNPACK_INC_DST_##TOTALCL##_##MaskType##(4) \ } \ // V3 uses this directly #define UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType) { \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \ \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \ \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+2); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R2); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R2, CL+2, 32, movdqa); \ \ UNPACK_INC_DST_##TOTALCL##_##MaskType##(3) \ } \ 
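/* UNPACK1/2/3/4_SSE are the building blocks every format-specific loader
 * below feeds: for each qword they run a Setup step (fetch the per-cycle
 * masks), a transform step (Regular = plain store, plus row add/accumulate
 * for mode 1/2; Mask/WriteMask = apply the precomputed write/row/col masks),
 * and a Write step.  TOTALCL == 1 means "skip ahead after every qword" (the
 * wl == 1 case of the skipping write), TOTALCL == 0 means "write the group at
 * fixed offsets and skip once afterwards".  The _4A/_3A/... variants assume a
 * 16-byte aligned source (movdqa / punpck straight from memory) while the
 * plain ones use movdqu.  For instance, UNPACK1_SSE(0, 1, Mask, 1) pastes
 * together, roughly:
 *
 *   UNPACK_Setup_Mask_SSE_1_1(0);               // load the cycle-0 write mask
 *   UNPACK_Mask_SSE_1(XMM_R0);                  // mask data, merge row/col, add row
 *   UNPACK_Write1_Mask(XMM_R0, 0, 0, movdqa);   // store, then add VIF_INC
 *   UNPACK_INC_DST_1_Mask(1)                    // no-op when TOTALCL == 1
 */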
#define UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType) { \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \ \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \ \ UNPACK_INC_DST_##TOTALCL##_##MaskType##(2) \ } \ #define UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType) { \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \ \ UNPACK_INC_DST_##TOTALCL##_##MaskType##(1) \ } \ // S-32 // only when cl==1 #define UNPACK_S_32SSE_4x(CL, TOTALCL, MaskType, ModeType, MOVDQA) { \ { \ __asm MOVDQA XMM_R3, qword ptr [VIF_SRC] \ \ __asm pshufd XMM_R0, XMM_R3, 0 \ __asm pshufd XMM_R1, XMM_R3, 0x55 \ __asm pshufd XMM_R2, XMM_R3, 0xaa \ __asm pshufd XMM_R3, XMM_R3, 0xff \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 16 \ } \ } #define UNPACK_S_32SSE_4A(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqa) #define UNPACK_S_32SSE_4(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqu) #define UNPACK_S_32SSE_3x(CL, TOTALCL, MaskType, ModeType, MOVDQA) { \ { \ __asm MOVDQA XMM_R2, qword ptr [VIF_SRC] \ \ __asm pshufd XMM_R0, XMM_R2, 0 \ __asm pshufd XMM_R1, XMM_R2, 0x55 \ __asm pshufd XMM_R2, XMM_R2, 0xaa \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 12 \ } \ } \ #define UNPACK_S_32SSE_3A(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqa) #define UNPACK_S_32SSE_3(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqu) #define UNPACK_S_32SSE_2(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movq XMM_R1, qword ptr [VIF_SRC] \ \ __asm pshufd XMM_R0, XMM_R1, 0 \ __asm pshufd XMM_R1, XMM_R1, 0x55 \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 8 \ } \ } \ #define UNPACK_S_32SSE_2A UNPACK_S_32SSE_2 #define UNPACK_S_32SSE_1(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movd XMM_R0, dword ptr [VIF_SRC] \ __asm pshufd XMM_R0, XMM_R0, 0 \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 4 \ } \ } \ #define UNPACK_S_32SSE_1A UNPACK_S_32SSE_1 // S-16 #define UNPACK_S_16SSE_4(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movq XMM_R3, qword ptr [VIF_SRC] \ __asm punpcklwd XMM_R3, XMM_R3 \ __asm UNPACK_RIGHTSHIFT XMM_R3, 16 \ \ __asm pshufd XMM_R0, XMM_R3, 0 \ __asm pshufd XMM_R1, XMM_R3, 0x55 \ __asm pshufd XMM_R2, XMM_R3, 0xaa \ __asm pshufd XMM_R3, XMM_R3, 0xff \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 8 \ } \ } #define UNPACK_S_16SSE_4A UNPACK_S_16SSE_4 #define UNPACK_S_16SSE_3(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movq XMM_R2, qword ptr [VIF_SRC] \ __asm punpcklwd XMM_R2, XMM_R2 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 16 \ \ __asm pshufd XMM_R0, XMM_R2, 0 \ __asm pshufd XMM_R1, XMM_R2, 0x55 \ __asm pshufd XMM_R2, XMM_R2, 0xaa \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 6 \ } \ } \ #define UNPACK_S_16SSE_3A UNPACK_S_16SSE_3 #define UNPACK_S_16SSE_2(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movd XMM_R1, dword ptr [VIF_SRC] \ __asm punpcklwd XMM_R1, XMM_R1 \ __asm UNPACK_RIGHTSHIFT XMM_R1, 16 \ \ __asm pshufd 
XMM_R0, XMM_R1, 0 \ __asm pshufd XMM_R1, XMM_R1, 0x55 \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 4 \ } \ } \ #define UNPACK_S_16SSE_2A UNPACK_S_16SSE_2 #define UNPACK_S_16SSE_1(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movd XMM_R0, dword ptr [VIF_SRC] \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm pshufd XMM_R0, XMM_R0, 0 \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 2 \ } \ } \ #define UNPACK_S_16SSE_1A UNPACK_S_16SSE_1 // S-8 #define UNPACK_S_8SSE_4(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movd XMM_R3, dword ptr [VIF_SRC] \ __asm punpcklbw XMM_R3, XMM_R3 \ __asm punpcklwd XMM_R3, XMM_R3 \ __asm UNPACK_RIGHTSHIFT XMM_R3, 24 \ \ __asm pshufd XMM_R0, XMM_R3, 0 \ __asm pshufd XMM_R1, XMM_R3, 0x55 \ __asm pshufd XMM_R2, XMM_R3, 0xaa \ __asm pshufd XMM_R3, XMM_R3, 0xff \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 4 \ } \ } #define UNPACK_S_8SSE_4A UNPACK_S_8SSE_4 #define UNPACK_S_8SSE_3(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movd XMM_R2, dword ptr [VIF_SRC] \ __asm punpcklbw XMM_R2, XMM_R2 \ __asm punpcklwd XMM_R2, XMM_R2 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 24 \ \ __asm pshufd XMM_R0, XMM_R2, 0 \ __asm pshufd XMM_R1, XMM_R2, 0x55 \ __asm pshufd XMM_R2, XMM_R2, 0xaa \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 3 \ } \ } \ #define UNPACK_S_8SSE_3A UNPACK_S_8SSE_3 #define UNPACK_S_8SSE_2(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movd XMM_R1, dword ptr [VIF_SRC] \ __asm punpcklbw XMM_R1, XMM_R1 \ __asm punpcklwd XMM_R1, XMM_R1 \ __asm UNPACK_RIGHTSHIFT XMM_R1, 24 \ \ __asm pshufd XMM_R0, XMM_R1, 0 \ __asm pshufd XMM_R1, XMM_R1, 0x55 \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 2 \ } \ } \ #define UNPACK_S_8SSE_2A UNPACK_S_8SSE_2 #define UNPACK_S_8SSE_1(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movd XMM_R0, dword ptr [VIF_SRC] \ __asm punpcklbw XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ __asm pshufd XMM_R0, XMM_R0, 0 \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm inc VIF_SRC \ } \ } \ #define UNPACK_S_8SSE_1A UNPACK_S_8SSE_1 // V2-32 #define UNPACK_V2_32SSE_4A(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm MOVDQA XMM_R0, qword ptr [VIF_SRC] \ __asm MOVDQA XMM_R2, qword ptr [VIF_SRC+16] \ \ __asm pshufd XMM_R1, XMM_R0, 0xee \ __asm pshufd XMM_R3, XMM_R2, 0xee \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 32 \ } \ } #define UNPACK_V2_32SSE_4(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ __asm movq XMM_R1, qword ptr [VIF_SRC+8] \ __asm movq XMM_R2, qword ptr [VIF_SRC+16] \ __asm movq XMM_R3, qword ptr [VIF_SRC+24] \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 32 \ } \ } #define UNPACK_V2_32SSE_3A(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm MOVDQA XMM_R0, qword ptr [VIF_SRC] \ __asm movq XMM_R2, qword ptr [VIF_SRC+16] \ __asm pshufd XMM_R1, XMM_R0, 0xee \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 24 \ } \ } \ #define UNPACK_V2_32SSE_3(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ __asm movq XMM_R1, qword ptr [VIF_SRC+8] \ __asm movq XMM_R2, qword ptr [VIF_SRC+16] \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 24 \ } \ } \ #define UNPACK_V2_32SSE_2(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movq XMM_R0, qword 
ptr [VIF_SRC] \ __asm movq XMM_R1, qword ptr [VIF_SRC+8] \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 16 \ } \ } \ #define UNPACK_V2_32SSE_2A UNPACK_V2_32SSE_2 #define UNPACK_V2_32SSE_1(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 8 \ } \ } \ #define UNPACK_V2_32SSE_1A UNPACK_V2_32SSE_1 // V2-16 #define UNPACK_V2_16SSE_4A(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm punpcklwd XMM_R0, qword ptr [VIF_SRC] \ __asm punpckhwd XMM_R2, qword ptr [VIF_SRC] \ \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 16 \ \ /* move the lower 64 bits down*/ \ __asm pshufd XMM_R1, XMM_R0, 0xee \ __asm pshufd XMM_R3, XMM_R2, 0xee \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 16 \ } \ } #define UNPACK_V2_16SSE_4(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movdqu XMM_R0, qword ptr [VIF_SRC] \ \ __asm punpckhwd XMM_R2, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 16 \ \ /* move the lower 64 bits down*/ \ __asm pshufd XMM_R1, XMM_R0, 0xee \ __asm pshufd XMM_R3, XMM_R2, 0xee \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 16 \ } \ } #define UNPACK_V2_16SSE_3A(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm punpcklwd XMM_R0, qword ptr [VIF_SRC] \ __asm punpckhwd XMM_R2, qword ptr [VIF_SRC] \ \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 16 \ \ /* move the lower 64 bits down*/ \ __asm pshufd XMM_R1, XMM_R0, 0xee \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 12 \ } \ } \ #define UNPACK_V2_16SSE_3(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movdqu XMM_R0, qword ptr [VIF_SRC] \ \ __asm punpckhwd XMM_R2, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 16 \ \ /* move the lower 64 bits down*/ \ __asm pshufd XMM_R1, XMM_R0, 0xee \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 12 \ } \ } \ #define UNPACK_V2_16SSE_2A(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm punpcklwd XMM_R0, qword ptr [VIF_SRC] \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ \ /* move the lower 64 bits down*/ \ __asm pshufd XMM_R1, XMM_R0, 0xee \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 8 \ } \ } \ #define UNPACK_V2_16SSE_2(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ \ /* move the lower 64 bits down*/ \ __asm pshufd XMM_R1, XMM_R0, 0xee \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 8 \ } \ } \ #define UNPACK_V2_16SSE_1A(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm punpcklwd XMM_R0, dword ptr [VIF_SRC] \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 4 \ } \ } \ #define UNPACK_V2_16SSE_1(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movd XMM_R0, dword ptr [VIF_SRC] \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 4 \ } \ } \ // V2-8 #define UNPACK_V2_8SSE_4(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ \ __asm punpcklbw XMM_R0, XMM_R0 \ __asm punpckhwd XMM_R2, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ \ __asm 
UNPACK_RIGHTSHIFT XMM_R0, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 24 \ \ /* move the lower 64 bits down*/ \ __asm pshufd XMM_R1, XMM_R0, 0xee \ __asm pshufd XMM_R3, XMM_R2, 0xee \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 8 \ } \ } #define UNPACK_V2_8SSE_4A UNPACK_V2_8SSE_4 #define UNPACK_V2_8SSE_3(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ \ __asm punpcklbw XMM_R0, XMM_R0 \ __asm punpckhwd XMM_R2, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 24 \ \ /* move the lower 64 bits down*/ \ __asm pshufd XMM_R1, XMM_R0, 0xee \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 6 \ } \ } \ #define UNPACK_V2_8SSE_3A UNPACK_V2_8SSE_3 #define UNPACK_V2_8SSE_2(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movd XMM_R0, dword ptr [VIF_SRC] \ __asm punpcklbw XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ \ /* move the lower 64 bits down*/ \ __asm pshufd XMM_R1, XMM_R0, 0xee \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 4 \ } \ } \ #define UNPACK_V2_8SSE_2A UNPACK_V2_8SSE_2 #define UNPACK_V2_8SSE_1(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movd XMM_R0, dword ptr [VIF_SRC] \ __asm punpcklbw XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 2 \ } \ } \ #define UNPACK_V2_8SSE_1A UNPACK_V2_8SSE_1 // V3-32 #define UNPACK_V3_32SSE_4x(CL, TOTALCL, MaskType, ModeType, MOVDQA) { \ { \ __asm MOVDQA XMM_R0, qword ptr [VIF_SRC] \ __asm movdqu XMM_R1, qword ptr [VIF_SRC+12] \ } \ { \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+0); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \ \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \ } \ { \ __asm movdqu XMM_R2, qword ptr [VIF_SRC+24] \ __asm movdqu XMM_R3, qword ptr [VIF_SRC+36] \ } \ { \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+2); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R2); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R2, CL+2, 32, movdqa); \ \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+3); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R3); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R3, CL+3, 48, movdqa); \ \ UNPACK_INC_DST_##TOTALCL##_##MaskType##(4) \ } \ { \ __asm add VIF_SRC, 48 \ } \ } #define UNPACK_V3_32SSE_4A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqa) #define UNPACK_V3_32SSE_4(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqu) #define UNPACK_V3_32SSE_3x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \ { \ { \ __asm MOVDQA XMM_R0, qword ptr [VIF_SRC] \ __asm movdqu XMM_R1, qword ptr [VIF_SRC+12] \ } \ { \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \ \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \ UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \ } \ { \ __asm movdqu XMM_R2, qword ptr [VIF_SRC+24] \ } \ { \ UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+2); \ 
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R2); \ UNPACK_Write##TOTALCL##_##MaskType##(XMM_R2, CL+2, 32, movdqa); \ \ UNPACK_INC_DST_##TOTALCL##_##MaskType##(3) \ } \ { \ __asm add VIF_SRC, 36 \ } \ } \ #define UNPACK_V3_32SSE_3A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqa) #define UNPACK_V3_32SSE_3(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqu) #define UNPACK_V3_32SSE_2x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \ { \ { \ __asm MOVDQA XMM_R0, qword ptr [VIF_SRC] \ __asm movdqu XMM_R1, qword ptr [VIF_SRC+12] \ } \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 24 \ } \ } \ #define UNPACK_V3_32SSE_2A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_2x(CL, TOTALCL, MaskType, ModeType, movdqa) #define UNPACK_V3_32SSE_2(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_2x(CL, TOTALCL, MaskType, ModeType, movdqu) #define UNPACK_V3_32SSE_1x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \ { \ { \ __asm MOVDQA XMM_R0, qword ptr [VIF_SRC] \ } \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 12 \ } \ } \ #define UNPACK_V3_32SSE_1A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_1x(CL, TOTALCL, MaskType, ModeType, movdqa) #define UNPACK_V3_32SSE_1(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_1x(CL, TOTALCL, MaskType, ModeType, movdqu) // V3-16 #define UNPACK_V3_16SSE_4(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ __asm movq XMM_R1, qword ptr [VIF_SRC+6] \ \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm movq XMM_R2, qword ptr [VIF_SRC+12] \ __asm punpcklwd XMM_R1, XMM_R1 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm movq XMM_R3, qword ptr [VIF_SRC+18] \ __asm UNPACK_RIGHTSHIFT XMM_R1, 16 \ __asm punpcklwd XMM_R2, XMM_R2 \ __asm punpcklwd XMM_R3, XMM_R3 \ \ __asm UNPACK_RIGHTSHIFT XMM_R2, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R3, 16 \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 24 \ } \ } #define UNPACK_V3_16SSE_4A UNPACK_V3_16SSE_4 #define UNPACK_V3_16SSE_3(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ __asm movq XMM_R1, qword ptr [VIF_SRC+6] \ \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm movq XMM_R2, qword ptr [VIF_SRC+12] \ __asm punpcklwd XMM_R1, XMM_R1 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm punpcklwd XMM_R2, XMM_R2 \ \ __asm UNPACK_RIGHTSHIFT XMM_R1, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 16 \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 18 \ } \ } \ #define UNPACK_V3_16SSE_3A UNPACK_V3_16SSE_3 #define UNPACK_V3_16SSE_2(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ __asm movq XMM_R1, qword ptr [VIF_SRC+6] \ \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R1, XMM_R1 \ \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R1, 16 \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 12 \ } \ } \ #define UNPACK_V3_16SSE_2A UNPACK_V3_16SSE_2 #define UNPACK_V3_16SSE_1(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 6 \ } \ } \ #define UNPACK_V3_16SSE_1A UNPACK_V3_16SSE_1 // V3-8 #define UNPACK_V3_8SSE_4(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movq XMM_R1, qword ptr [VIF_SRC] \ __asm movq XMM_R3, qword ptr [VIF_SRC+6] \ \ __asm punpcklbw XMM_R1, XMM_R1 \ __asm punpcklbw 
XMM_R3, XMM_R3 \ __asm punpcklwd XMM_R0, XMM_R1 \ __asm psrldq XMM_R1, 6 \ __asm punpcklwd XMM_R2, XMM_R3 \ __asm psrldq XMM_R3, 6 \ __asm punpcklwd XMM_R1, XMM_R1 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ __asm punpcklwd XMM_R3, XMM_R3 \ \ __asm UNPACK_RIGHTSHIFT XMM_R2, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R1, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R3, 24 \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 12 \ } \ } #define UNPACK_V3_8SSE_4A UNPACK_V3_8SSE_4 #define UNPACK_V3_8SSE_3(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movd XMM_R0, word ptr [VIF_SRC] \ __asm movd XMM_R1, dword ptr [VIF_SRC+3] \ \ __asm punpcklbw XMM_R0, XMM_R0 \ __asm movd XMM_R2, dword ptr [VIF_SRC+6] \ __asm punpcklbw XMM_R1, XMM_R1 \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm punpcklbw XMM_R2, XMM_R2 \ \ __asm punpcklwd XMM_R1, XMM_R1 \ __asm punpcklwd XMM_R2, XMM_R2 \ \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R1, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 24 \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 9 \ } \ } \ #define UNPACK_V3_8SSE_3A UNPACK_V3_8SSE_3 #define UNPACK_V3_8SSE_2(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movd XMM_R0, dword ptr [VIF_SRC] \ __asm movd XMM_R1, dword ptr [VIF_SRC+3] \ \ __asm punpcklbw XMM_R0, XMM_R0 \ __asm punpcklbw XMM_R1, XMM_R1 \ \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R1, XMM_R1 \ \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R1, 24 \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 6 \ } \ } \ #define UNPACK_V3_8SSE_2A UNPACK_V3_8SSE_2 #define UNPACK_V3_8SSE_1(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movd XMM_R0, dword ptr [VIF_SRC] \ __asm punpcklbw XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 3 \ } \ } \ #define UNPACK_V3_8SSE_1A UNPACK_V3_8SSE_1 // V4-32 #define UNPACK_V4_32SSE_4A(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movdqa XMM_R0, qword ptr [VIF_SRC] \ __asm movdqa XMM_R1, qword ptr [VIF_SRC+16] \ __asm movdqa XMM_R2, qword ptr [VIF_SRC+32] \ __asm movdqa XMM_R3, qword ptr [VIF_SRC+48] \ } \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 64 \ } \ } #define UNPACK_V4_32SSE_4(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movdqu XMM_R0, qword ptr [VIF_SRC] \ __asm movdqu XMM_R1, qword ptr [VIF_SRC+16] \ __asm movdqu XMM_R2, qword ptr [VIF_SRC+32] \ __asm movdqu XMM_R3, qword ptr [VIF_SRC+48] \ } \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 64 \ } \ } #define UNPACK_V4_32SSE_3A(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movdqa XMM_R0, qword ptr [VIF_SRC] \ __asm movdqa XMM_R1, qword ptr [VIF_SRC+16] \ __asm movdqa XMM_R2, qword ptr [VIF_SRC+32] \ } \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 48 \ } \ } #define UNPACK_V4_32SSE_3(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movdqu XMM_R0, qword ptr [VIF_SRC] \ __asm movdqu XMM_R1, qword ptr [VIF_SRC+16] \ __asm movdqu XMM_R2, qword ptr [VIF_SRC+32] \ } \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 48 \ } \ } #define UNPACK_V4_32SSE_2A(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movdqa XMM_R0, qword ptr [VIF_SRC] \ __asm movdqa XMM_R1, qword ptr [VIF_SRC+16] \ } \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 32 \ } \ } #define UNPACK_V4_32SSE_2(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movdqu XMM_R0, qword 
ptr [VIF_SRC] \ __asm movdqu XMM_R1, qword ptr [VIF_SRC+16] \ } \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 32 \ } \ } #define UNPACK_V4_32SSE_1A(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movdqa XMM_R0, qword ptr [VIF_SRC] \ } \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 16 \ } \ } #define UNPACK_V4_32SSE_1(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movdqu XMM_R0, qword ptr [VIF_SRC] \ } \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 16 \ } \ } // V4-16 #define UNPACK_V4_16SSE_4A(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm punpcklwd XMM_R0, qword ptr [VIF_SRC] \ __asm punpckhwd XMM_R1, qword ptr [VIF_SRC] \ __asm punpcklwd XMM_R2, qword ptr [VIF_SRC+16] \ __asm punpckhwd XMM_R3, qword ptr [VIF_SRC+16] \ \ __asm UNPACK_RIGHTSHIFT XMM_R1, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R3, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 16 \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 32 \ } \ } #define UNPACK_V4_16SSE_4(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movdqu XMM_R0, qword ptr [VIF_SRC] \ __asm movdqu XMM_R2, qword ptr [VIF_SRC+16] \ \ __asm punpckhwd XMM_R1, XMM_R0 \ __asm punpckhwd XMM_R3, XMM_R2 \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R2, XMM_R2 \ \ __asm UNPACK_RIGHTSHIFT XMM_R1, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R3, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 16 \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 32 \ } \ } #define UNPACK_V4_16SSE_3A(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm punpcklwd XMM_R0, qword ptr [VIF_SRC] \ __asm punpckhwd XMM_R1, qword ptr [VIF_SRC] \ __asm punpcklwd XMM_R2, qword ptr [VIF_SRC+16] \ \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R1, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 16 \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 24 \ } \ } \ #define UNPACK_V4_16SSE_3(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movdqu XMM_R0, qword ptr [VIF_SRC] \ __asm movq XMM_R2, qword ptr [VIF_SRC+16] \ \ __asm punpckhwd XMM_R1, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R2, XMM_R2 \ \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R1, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 16 \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 24 \ } \ } \ #define UNPACK_V4_16SSE_2A(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm punpcklwd XMM_R0, qword ptr [VIF_SRC] \ __asm punpckhwd XMM_R1, qword ptr [VIF_SRC] \ \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R1, 16 \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 16 \ } \ } \ #define UNPACK_V4_16SSE_2(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ __asm movq XMM_R1, qword ptr [VIF_SRC+8] \ \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R1, XMM_R1 \ \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ __asm UNPACK_RIGHTSHIFT XMM_R1, 16 \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 16 \ } \ } \ #define UNPACK_V4_16SSE_1A(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm punpcklwd XMM_R0, qword ptr [VIF_SRC] \ __asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 8 \ } \ } \ #define UNPACK_V4_16SSE_1(CL, TOTALCL, MaskType, ModeType) \ { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ __asm punpcklwd XMM_R0, XMM_R0 \ 
__asm UNPACK_RIGHTSHIFT XMM_R0, 16 \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 8 \ } \ } \ // V4-8 #define UNPACK_V4_8SSE_4A(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm punpcklbw XMM_R0, qword ptr [VIF_SRC] \ __asm punpckhbw XMM_R2, qword ptr [VIF_SRC] \ \ __asm punpckhwd XMM_R1, XMM_R0 \ __asm punpckhwd XMM_R3, XMM_R2 \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R2, XMM_R2 \ \ __asm UNPACK_RIGHTSHIFT XMM_R1, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R3, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 24 \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 16 \ } \ } #define UNPACK_V4_8SSE_4(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movdqu XMM_R0, qword ptr [VIF_SRC] \ \ __asm punpckhbw XMM_R2, XMM_R0 \ __asm punpcklbw XMM_R0, XMM_R0 \ \ __asm punpckhwd XMM_R3, XMM_R2 \ __asm punpckhwd XMM_R1, XMM_R0 \ __asm punpcklwd XMM_R2, XMM_R2 \ __asm punpcklwd XMM_R0, XMM_R0 \ \ __asm UNPACK_RIGHTSHIFT XMM_R3, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 24 \ \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R1, 24 \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 16 \ } \ } #define UNPACK_V4_8SSE_3A(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm punpcklbw XMM_R0, qword ptr [VIF_SRC] \ __asm punpckhbw XMM_R2, qword ptr [VIF_SRC] \ \ __asm punpckhwd XMM_R1, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R2, XMM_R2 \ \ __asm UNPACK_RIGHTSHIFT XMM_R1, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 24 \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 12 \ } \ } \ #define UNPACK_V4_8SSE_3(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ __asm movd XMM_R2, dword ptr [VIF_SRC+8] \ \ __asm punpcklbw XMM_R0, XMM_R0 \ __asm punpcklbw XMM_R2, XMM_R2 \ \ __asm punpckhwd XMM_R1, XMM_R0 \ __asm punpcklwd XMM_R2, XMM_R2 \ __asm punpcklwd XMM_R0, XMM_R0 \ \ __asm UNPACK_RIGHTSHIFT XMM_R1, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R2, 24 \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 12 \ } \ } \ #define UNPACK_V4_8SSE_2A(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm punpcklbw XMM_R0, qword ptr [VIF_SRC] \ \ __asm punpckhwd XMM_R1, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ \ __asm UNPACK_RIGHTSHIFT XMM_R1, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 8 \ } \ } \ #define UNPACK_V4_8SSE_2(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movq XMM_R0, qword ptr [VIF_SRC] \ \ __asm punpcklbw XMM_R0, XMM_R0 \ \ __asm punpckhwd XMM_R1, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ \ __asm UNPACK_RIGHTSHIFT XMM_R1, 24 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 8 \ } \ } \ #define UNPACK_V4_8SSE_1A(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm punpcklbw XMM_R0, qword ptr [VIF_SRC] \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 4 \ } \ } \ #define UNPACK_V4_8SSE_1(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm movd XMM_R0, dword ptr [VIF_SRC] \ __asm punpcklbw XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm UNPACK_RIGHTSHIFT XMM_R0, 24 \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 4 \ } \ } \ // V4-5 __declspec(align(16)) static u32 s_TempDecompress[4] 
= {0}; #define DECOMPRESS_RGBA(OFFSET) { \ /* R */ \ __asm mov bl, al \ __asm shl bl, 3 \ __asm mov byte ptr [s_TempDecompress+OFFSET], bl \ /* G */ \ __asm mov bx, ax \ __asm shr bx, 2 \ __asm and bx, 0xf8 \ __asm mov byte ptr [s_TempDecompress+OFFSET+1], bl \ /* B */ \ __asm mov bx, ax \ __asm shr bx, 7 \ __asm and bx, 0xf8 \ __asm mov byte ptr [s_TempDecompress+OFFSET+2], bl \ __asm mov bx, ax \ __asm shr bx, 8 \ __asm and bx, 0x80 \ __asm mov byte ptr [s_TempDecompress+OFFSET+3], bl \ } \ #define UNPACK_V4_5SSE_4(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm mov eax, dword ptr [VIF_SRC] \ } \ DECOMPRESS_RGBA(0); \ { \ __asm shr eax, 16 \ } \ DECOMPRESS_RGBA(4); \ { \ __asm mov ax, word ptr [VIF_SRC+4] \ } \ DECOMPRESS_RGBA(8); \ { \ __asm mov ax, word ptr [VIF_SRC+6] \ } \ DECOMPRESS_RGBA(12); \ { \ __asm movdqa XMM_R0, qword ptr [s_TempDecompress] \ \ __asm punpckhbw XMM_R2, XMM_R0 \ __asm punpcklbw XMM_R0, XMM_R0 \ \ __asm punpckhwd XMM_R3, XMM_R2 \ __asm punpckhwd XMM_R1, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R2, XMM_R2 \ \ __asm psrld XMM_R0, 24 \ __asm psrld XMM_R1, 24 \ __asm psrld XMM_R2, 24 \ __asm psrld XMM_R3, 24 \ } \ \ UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 8 \ } \ } #define UNPACK_V4_5SSE_4A UNPACK_V4_5SSE_4 #define UNPACK_V4_5SSE_3(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm mov eax, dword ptr [VIF_SRC] \ } \ DECOMPRESS_RGBA(0); \ { \ __asm shr eax, 16 \ } \ DECOMPRESS_RGBA(4); \ { \ __asm mov ax, word ptr [VIF_SRC+4] \ } \ DECOMPRESS_RGBA(8); \ { \ __asm movdqa XMM_R0, qword ptr [s_TempDecompress] \ \ __asm punpckhbw XMM_R2, XMM_R0 \ __asm punpcklbw XMM_R0, XMM_R0 \ \ __asm punpckhwd XMM_R1, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R2, XMM_R2 \ \ __asm psrld XMM_R0, 24 \ __asm psrld XMM_R1, 24 \ __asm psrld XMM_R2, 24 \ } \ \ UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 6 \ } \ } \ #define UNPACK_V4_5SSE_3A UNPACK_V4_5SSE_3 #define UNPACK_V4_5SSE_2(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm mov eax, dword ptr [VIF_SRC] \ } \ DECOMPRESS_RGBA(0); \ { \ __asm shr eax, 16 \ } \ DECOMPRESS_RGBA(4); \ { \ __asm movq XMM_R0, qword ptr [s_TempDecompress] \ \ __asm punpcklbw XMM_R0, XMM_R0 \ \ __asm punpckhwd XMM_R1, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ \ __asm psrld XMM_R0, 24 \ __asm psrld XMM_R1, 24 \ } \ \ UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 4 \ } \ } \ #define UNPACK_V4_5SSE_2A UNPACK_V4_5SSE_2 #define UNPACK_V4_5SSE_1(CL, TOTALCL, MaskType, ModeType) { \ { \ __asm mov ax, word ptr [VIF_SRC] \ } \ DECOMPRESS_RGBA(0); \ { \ __asm movd XMM_R0, dword ptr [s_TempDecompress] \ __asm punpcklbw XMM_R0, XMM_R0 \ __asm punpcklwd XMM_R0, XMM_R0 \ \ __asm psrld XMM_R0, 24 \ } \ \ UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ { \ __asm add VIF_SRC, 2 \ } \ } \ #define UNPACK_V4_5SSE_1A UNPACK_V4_5SSE_1 #pragma warning(disable:4731) #ifdef _DEBUG #define PUSHESI __asm mov s_saveesi, esi #define POPESI __asm mov esi, s_saveesi #else #define PUSHESI #define POPESI #endif #define PUSHEDI #define POPEDI #define PUSHEBP //__asm mov dword ptr [esp-4], ebp #define POPEBP //__asm mov ebp, dword ptr [esp-4] //__asm mov eax, pr0 \ ///* load row */ \ //__asm movss XMM_ROW, dword ptr [eax] \ //__asm movss XMM_R1, dword ptr [eax+4] \ //__asm punpckldq XMM_ROW, XMM_R1 \ //__asm movss XMM_R0, dword ptr [eax+8] \ //__asm movss XMM_R1, dword ptr [eax+12] \ //__asm punpckldq XMM_R0, XMM_R1 \ //__asm punpcklqdq XMM_ROW, XMM_R0 \ //\ //__asm mov eax, pc0 \ 
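/* V4-5 in the SSE path is handled in two stages: DECOMPRESS_RGBA above
 * unpacks each 16-bit source value with scalar shifts into one byte per
 * component in s_TempDecompress (the same <<3 / >>2 / >>7 / >>8 scaling as
 * the C version of UNPACK_V4_5), and the movdqa/punpcklbw/punpcklwd/psrld 24
 * sequence then spreads those bytes into one dword per field so the result
 * can go through the normal UNPACK4/3/2/1_SSE mask-and-store machinery.
 */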
//__asm movss XMM_R3, dword ptr [eax] \ //__asm movss XMM_R1, dword ptr [eax+4] \ //__asm punpckldq XMM_R3, XMM_R1 \ //__asm movss XMM_R0, dword ptr [eax+8] \ //__asm movss XMM_R1, dword ptr [eax+12] \ //__asm punpckldq XMM_R0, XMM_R1 \ //__asm punpcklqdq XMM_R3, XMM_R0 \ #define SAVE_ROW_REG_BASE { \ { \ /* save the row reg */ \ __asm mov eax, _vifRow \ __asm movdqa qword ptr [eax], XMM_ROW \ __asm mov eax, _vifRegs \ __asm movss dword ptr [eax+0x100], XMM_ROW \ __asm psrldq XMM_ROW, 4 \ __asm movss dword ptr [eax+0x110], XMM_ROW \ __asm psrldq XMM_ROW, 4 \ __asm movss dword ptr [eax+0x120], XMM_ROW \ __asm psrldq XMM_ROW, 4 \ __asm movss dword ptr [eax+0x130], XMM_ROW \ } \ } \ #define SAVE_NO_REG extern int g_nCounters[4]; static int tempcl = 0, incdest; static int s_saveesi, s_saveinc; // qsize - bytes of compressed size of 1 decompressed qword #define defUNPACK_SkippingWrite(name, MaskType, ModeType, qsize, sign, SAVE_ROW_REG) \ int UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType##(u32* dest, u32* data, int dmasize) \ { \ incdest = ((_vifRegs->cycle.cl - _vifRegs->cycle.wl)<<4); \ \ switch( _vifRegs->cycle.wl ) { \ case 1: \ { \ /*__asm inc dword ptr [g_nCounters] \ __asm mov eax, dmasize \ __asm add dword ptr [g_nCounters+4], eax*/ \ PUSHESI \ PUSHEDI \ __asm mov esi, dmasize \ } \ UNPACK_Start_Setup_##MaskType##_SSE_##ModeType##(0); \ { \ __asm cmp esi, qsize \ __asm jl C1_Done3 \ \ /* move source and dest */ \ __asm mov VIF_DST, dest \ __asm mov VIF_SRC, data \ __asm mov VIF_INC, incdest \ __asm add VIF_INC, 16 \ } \ \ /* first align VIF_SRC to 16 bytes */ \ C1_Align16: \ { \ __asm test VIF_SRC, 15 \ __asm jz C1_UnpackAligned \ } \ UNPACK_##name##SSE_1(0, 1, MaskType, ModeType); \ { \ __asm cmp esi, (2*qsize) \ __asm jl C1_DoneWithDec \ __asm sub esi, qsize \ __asm jmp C1_Align16 \ } \ C1_UnpackAligned: \ { \ __asm cmp esi, (2*qsize) \ __asm jl C1_Unpack1 \ __asm cmp esi, (3*qsize) \ __asm jl C1_Unpack2 \ __asm cmp esi, (4*qsize) \ __asm jl C1_Unpack3 \ __asm prefetchnta [eax + 192] \ } \ C1_Unpack4: \ UNPACK_##name##SSE_4A(0, 1, MaskType, ModeType); \ { \ __asm cmp esi, (8*qsize) \ __asm jl C1_DoneUnpack4 \ __asm sub esi, (4*qsize) \ __asm jmp C1_Unpack4 \ } \ C1_DoneUnpack4: \ { \ __asm sub esi, (4*qsize) \ __asm cmp esi, qsize \ __asm jl C1_Done3 \ __asm cmp esi, (2*qsize) \ __asm jl C1_Unpack1 \ __asm cmp esi, (3*qsize) \ __asm jl C1_Unpack2 \ /* fall through */ \ } \ C1_Unpack3: \ UNPACK_##name##SSE_3A(0, 1, MaskType, ModeType); \ { \ __asm sub esi, (3*qsize) \ __asm jmp C1_Done3 \ } \ C1_Unpack2: \ UNPACK_##name##SSE_2A(0, 1, MaskType, ModeType); \ { \ __asm sub esi, (2*qsize) \ __asm jmp C1_Done3 \ } \ C1_Unpack1: \ UNPACK_##name##SSE_1A(0, 1, MaskType, ModeType); \ C1_DoneWithDec: \ { \ __asm sub esi, qsize \ } \ C1_Done3: \ { \ POPEDI \ __asm mov dmasize, esi \ POPESI \ } \ SAVE_ROW_REG; \ return dmasize; \ case 2: \ { \ /*__asm inc dword ptr [g_nCounters+4]*/ \ PUSHESI \ PUSHEDI \ __asm mov VIF_INC, incdest \ __asm mov esi, dmasize \ __asm cmp esi, (2*qsize) \ /* move source and dest */ \ __asm mov VIF_DST, dest \ __asm mov VIF_SRC, data \ __asm jl C2_Done3 \ } \ C2_Unpack: \ UNPACK_##name##SSE_2(0, 0, MaskType, ModeType); \ \ { \ __asm add VIF_DST, VIF_INC /* take into account wl */ \ __asm cmp esi, (4*qsize) \ __asm jl C2_Done2 \ __asm sub esi, (2*qsize) \ __asm jmp C2_Unpack /* unpack next */ \ } \ C2_Done2: \ { \ __asm sub esi, (2*qsize) \ } \ C2_Done3: \ { \ __asm cmp esi, qsize \ __asm jl C2_Done4 \ } \ /* execute left over qw */ \ 
        UNPACK_##name##SSE_1(0, 0, MaskType, ModeType); \
        { \
            __asm sub esi, qsize \
        } \
C2_Done4: \
        { \
            POPEDI \
            __asm mov dmasize, esi \
            POPESI \
        } \
        SAVE_ROW_REG; \
        return dmasize; \
        \
    case 3: \
        { \
            /*__asm inc dword ptr [g_nCounters+8]*/ \
            PUSHESI \
            PUSHEDI \
            __asm mov VIF_INC, incdest \
            __asm mov esi, dmasize \
            __asm cmp esi, (3*qsize) \
            /* move source and dest */ \
            __asm mov VIF_DST, dest \
            __asm mov VIF_SRC, data \
            __asm jl C3_Done5 \
        } \
C3_Unpack: \
        UNPACK_##name##SSE_3(0, 0, MaskType, ModeType); \
        \
        { \
            __asm add VIF_DST, VIF_INC /* take into account wl */ \
            __asm cmp esi, (6*qsize) \
            __asm jl C3_Done2 \
            __asm sub esi, (3*qsize) \
            __asm jmp C3_Unpack /* unpack next */ \
        } \
C3_Done2: \
        { \
            __asm sub esi, (3*qsize) \
        } \
C3_Done5: \
        { \
            __asm cmp esi, qsize \
            __asm jl C3_Done4 \
            /* execute left over qw */ \
            __asm cmp esi, (2*qsize) \
            __asm jl C3_Done3 \
        } \
        \
        /* process 2 qws */ \
        UNPACK_##name##SSE_2(0, 0, MaskType, ModeType); \
        { \
            __asm sub esi, (2*qsize) \
            __asm jmp C3_Done4 \
        } \
        \
C3_Done3: \
        /* process 1 qw */ \
        { \
            __asm sub esi, qsize \
        } \
        UNPACK_##name##SSE_1(0, 0, MaskType, ModeType); \
C3_Done4: \
        { \
            POPEDI \
            __asm mov dmasize, esi \
            POPESI \
        } \
        SAVE_ROW_REG; \
        return dmasize; \
        \
    default: /* >= 4 */ \
        tempcl = _vifRegs->cycle.wl-3; \
        { \
            /*__asm inc dword ptr [g_nCounters+12]*/ \
            PUSHESI \
            PUSHEDI \
            __asm mov VIF_INC, tempcl \
            __asm mov s_saveinc, VIF_INC \
            __asm mov esi, dmasize \
            __asm cmp esi, qsize \
            __asm jl C4_Done \
            /* move source and dest */ \
            __asm mov VIF_DST, dest \
            __asm mov VIF_SRC, data \
        } \
C4_Unpack: \
        { \
            __asm cmp esi, (3*qsize) \
            __asm jge C4_Unpack3 \
            __asm cmp esi, (2*qsize) \
            __asm jge C4_Unpack2 \
        } \
        UNPACK_##name##SSE_1(0, 0, MaskType, ModeType); \
        /* not enough data left */ \
        { \
            __asm sub esi, qsize \
            __asm jmp C4_Done \
        } \
C4_Unpack2: \
        UNPACK_##name##SSE_2(0, 0, MaskType, ModeType); \
        /* not enough data left */ \
        { \
            __asm sub esi, (2*qsize) \
            __asm jmp C4_Done \
        } \
C4_Unpack3: \
        UNPACK_##name##SSE_3(0, 0, MaskType, ModeType); \
        { \
            __asm sub esi, (3*qsize) \
            /* more data left, process 1qw at a time */ \
            __asm mov VIF_INC, s_saveinc \
        } \
C4_UnpackX: \
        { \
            /* check if any data left */ \
            __asm cmp esi, qsize \
            __asm jl C4_Done \
            \
        } \
        UNPACK_##name##SSE_1(3, 0, MaskType, ModeType); \
        { \
            __asm sub esi, qsize \
            __asm cmp VIF_INC, 1 \
            __asm je C4_DoneLoop \
            __asm sub VIF_INC, 1 \
            __asm jmp C4_UnpackX \
        } \
C4_DoneLoop: \
        { \
            __asm add VIF_DST, incdest /* take into account wl */ \
            __asm cmp esi, qsize \
            __asm jl C4_Done \
            __asm jmp C4_Unpack /* unpack next */ \
        } \
C4_Done: \
        { \
            POPEDI \
            __asm mov dmasize, esi \
            POPESI \
        } \
        SAVE_ROW_REG; \
        return dmasize; \
    } \
    \
    return dmasize; \
} \

//{ \
//    /*__asm inc dword ptr [g_nCounters] \
//    __asm mov eax, dmasize \
//    __asm add dword ptr [g_nCounters+4], eax*/ \
//    PUSHESI \
//    PUSHEDI \
//    __asm mov esi, dmasize \
//    } \
//    UNPACK_Start_Setup_##MaskType##_SSE_##ModeType##(0); \
//    { \
//    __asm cmp esi, qsize \
//    __asm jl C1_Done3 \
//    \
//    /* move source and dest */ \
//    __asm mov VIF_DST, dest \
//    __asm mov VIF_SRC, data \
//    __asm mov VIF_INC, incdest \
//    __asm cmp esi, (2*qsize) \
//    __asm jl C1_Unpack1 \
//    __asm cmp esi, (3*qsize) \
//    __asm jl C1_Unpack2 \
//    __asm imul VIF_INC, 3 \
//    __asm prefetchnta [eax + 192] \
//    } \
//C1_Unpack3: \
//    UNPACK_##name##SSE_3(0, 1, MaskType, ModeType); \
//    { \
//    __asm add VIF_DST, VIF_INC /* take into account wl */ \
//    __asm cmp esi, (6*qsize) \
//    __asm jl C1_DoneUnpack3 \
//    __asm sub esi, (3*qsize) \
//    __asm jmp C1_Unpack3 \
//    } \
//C1_DoneUnpack3: \
//    { \
//    __asm sub esi, (3*qsize) \
//    __asm mov VIF_INC, dword ptr [esp] /* restore old ptr */ \
//    __asm cmp esi, (2*qsize) \
//    __asm jl C1_Unpack1 \
//    /* fall through */ \
//    } \
//C1_Unpack2: \
//    UNPACK_##name##SSE_2(0, 1, MaskType, ModeType); \
//    { \
//    __asm add VIF_DST, VIF_INC \
//    __asm add VIF_DST, VIF_INC \
//    __asm sub esi, (2*qsize) \
//    __asm jmp C1_Done3 \
//    } \
//C1_Unpack1: \
//    UNPACK_##name##SSE_1(0, 1, MaskType, ModeType); \
//    { \
//    __asm add VIF_DST, VIF_INC /* take into account wl */ \
//    __asm sub esi, qsize \
//    } \
//C1_Done3: \
//    { \
//    POPEDI \
//    __asm mov dmasize, esi \
//    POPESI \
//    } \
//    SAVE_ROW_REG; \

//while(size >= qsize) {
//    funcP( dest, (u32*)cdata, chans);
//    cdata += ft->qsize;
//    size -= ft->qsize;
//
//    if (vif->cl >= wl) {
//        dest += incdest;
//        vif->cl = 0;
//    }
//    else {
//        dest += 4;
//        vif->cl++;
//    }
//}

#define UNPACK_RIGHTSHIFT psrld

#define defUNPACK_SkippingWrite2(name, qsize) \
    defUNPACK_SkippingWrite(name, Regular, 0, qsize, u, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, Regular, 1, qsize, u, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, Regular, 2, qsize, u, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, Mask, 0, qsize, u, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, Mask, 1, qsize, u, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, Mask, 2, qsize, u, SAVE_ROW_REG_BASE); \
    defUNPACK_SkippingWrite(name, WriteMask, 0, qsize, u, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, WriteMask, 1, qsize, u, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, WriteMask, 2, qsize, u, SAVE_ROW_REG_BASE); \

defUNPACK_SkippingWrite2(S_32, 4);
defUNPACK_SkippingWrite2(S_16, 2);
defUNPACK_SkippingWrite2(S_8, 1);
defUNPACK_SkippingWrite2(V2_32, 8);
defUNPACK_SkippingWrite2(V2_16, 4);
defUNPACK_SkippingWrite2(V2_8, 2);
defUNPACK_SkippingWrite2(V3_32, 12);
defUNPACK_SkippingWrite2(V3_16, 6);
defUNPACK_SkippingWrite2(V3_8, 3);
defUNPACK_SkippingWrite2(V4_32, 16);
defUNPACK_SkippingWrite2(V4_16, 8);
defUNPACK_SkippingWrite2(V4_8, 4);
defUNPACK_SkippingWrite2(V4_5, 2);

#undef UNPACK_RIGHTSHIFT
#undef defUNPACK_SkippingWrite2

#define UNPACK_RIGHTSHIFT psrad

#define defUNPACK_SkippingWrite2(name, qsize) \
    defUNPACK_SkippingWrite(name, Mask, 0, qsize, s, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, Regular, 0, qsize, s, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, Regular, 1, qsize, s, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, Regular, 2, qsize, s, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, Mask, 1, qsize, s, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, Mask, 2, qsize, s, SAVE_ROW_REG_BASE); \
    defUNPACK_SkippingWrite(name, WriteMask, 0, qsize, s, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, WriteMask, 1, qsize, s, SAVE_NO_REG); \
    defUNPACK_SkippingWrite(name, WriteMask, 2, qsize, s, SAVE_ROW_REG_BASE); \

defUNPACK_SkippingWrite2(S_16, 2);
defUNPACK_SkippingWrite2(S_8, 1);
defUNPACK_SkippingWrite2(V2_16, 4);
defUNPACK_SkippingWrite2(V2_8, 2);
defUNPACK_SkippingWrite2(V3_16, 6);
defUNPACK_SkippingWrite2(V3_8, 3);
defUNPACK_SkippingWrite2(V4_16, 8);
defUNPACK_SkippingWrite2(V4_8, 4);

#undef UNPACK_RIGHTSHIFT
#undef defUNPACK_SkippingWrite2

#endif

static int cycles;

int mfifoVIF1rbTransfer() {
    u32 maddr = psHu32(DMAC_RBOR);
    int msize = psHu32(DMAC_RBSR)+16, ret;
    u32 *src;

    /* Check if the transfer should wrap around the ring buffer */
    if ((vif1ch->madr+(vif1ch->qwc << 4)) > (maddr+msize)) {
        int s1 = (maddr+msize) - vif1ch->madr;
        int s2 = (vif1ch->qwc << 4) - s1;

        /* it does, so first copy 's1' bytes from 'addr' to 'data' */
        src = (u32*)PSM(vif1ch->madr);
        if (src == NULL) return -1;
        ret = VIF1transfer(src, s1/4, 0);
        assert(ret == 0); // vif stall code not implemented

        /* and second copy 's2' bytes from 'maddr' to '&data[s1]' */
        src = (u32*)PSM(maddr);
        if (src == NULL) return -1;
        ret = VIF1transfer(src, s2/4, 0);
        assert(ret == 0); // vif stall code not implemented
    }
    else {
        /* it doesn't, so just transfer 'qwc*4' words from 'vif1ch->madr' to VIF1 */
        src = (u32*)PSM(vif1ch->madr);
        if (src == NULL) return -1;
        ret = VIF1transfer(src, vif1ch->qwc << 2, 0);
        assert(ret == 0); // vif stall code not implemented
    }

    vif1ch->madr += (vif1ch->qwc << 4);
    vif1ch->madr = psHu32(DMAC_RBOR) + (vif1ch->madr & psHu32(DMAC_RBSR));

    return 0;
}

int mfifoVIF1chain() {
    u32 maddr = psHu32(DMAC_RBOR);
    int msize = psHu32(DMAC_RBSR)+16, ret;
    u32 *pMem;

    /* Is QWC = 0? if so there is nothing to transfer */
    if (vif1ch->qwc == 0) return 0;

    if (vif1ch->madr >= maddr && vif1ch->madr < (maddr+msize)) {
        if (mfifoVIF1rbTransfer() == -1) return -1;
    }
    else {
        pMem = (u32*)dmaGetAddr(vif1ch->madr);
        if (pMem == NULL) return -1;
        ret = VIF1transfer(pMem, vif1ch->qwc << 2, 0);
        assert(ret == 0); // vif stall code not implemented
        vif1ch->madr += (vif1ch->qwc << 4);
    }

    cycles += (vif1ch->qwc) * BIAS; /* guessing */
    vif1ch->qwc = 0;

    return 0;
}

void mfifoVIF1transfer(int qwc) {
    u32 *ptag;
    int id;
    int done = 0, ret;

    cycles = 0;

#ifdef VIF_LOG
    VIF_LOG("mfifoVIF1transfer %x\n", vif1ch->chcr);
#endif

    if ((vif1ch->chcr & 0x100) == 0) SysPrintf("MFIFO VIF1 not ready!\n");

    while (qwc > 0 && done == 0) {
        vif1ch->tadr = psHu32(DMAC_RBOR) + (vif1ch->tadr & psHu32(DMAC_RBSR));
        ptag = (u32*)dmaGetAddr(vif1ch->tadr);

        id = (ptag[0] >> 28) & 0x7;
        vif1ch->qwc = (ptag[0] & 0xffff);
        vif1ch->madr = ptag[1];
        cycles += 2;

        if (vif1ch->chcr & 0x40) {
            ret = VIF1transfer(ptag+2, 2, 1);
            assert(ret == 0); // vif stall code not implemented
        }

        vif1ch->chcr = ( vif1ch->chcr & 0xFFFF ) | ( (*ptag) & 0xFFFF0000 );

#ifdef VIF_LOG
        VIF_LOG("dmaChain %8.8x_%8.8x size=%d, id=%d, addr=%lx\n",
                ptag[1], ptag[0], vif1ch->qwc, id, vif1ch->madr);
#endif

        switch (id) {
            case 0: // refe
                vif1ch->tadr += 16;
                qwc = 0;
                INT(10,cycles);
                done = 1;
                break;

            case 1: // cnt
                vif1ch->madr = vif1ch->tadr + 16;
                qwc -= vif1ch->qwc + 1;
                // Set the taddr to the next tag
                vif1ch->tadr += 16 + (vif1ch->qwc * 16);
                break;

            case 3: // ref
            case 4: // refs
                vif1ch->tadr += 16;
                qwc--;
                break;

            case 7: // end
                vif1ch->madr = vif1ch->tadr + 16;
                vif1ch->tadr = vif1ch->madr + (vif1ch->qwc * 16);
                qwc = 0;
                INT(10,cycles);
                done = 1;
                break;
        }

        if (mfifoVIF1chain() == -1) {
            break;
        }

        if ((vif1ch->chcr & 0x80) && (ptag[0] >> 31)) {
#ifdef VIF_LOG
            VIF_LOG("dmaIrq Set\n");
#endif
            //SysPrintf("mfifoVIF1transfer: dmaIrq Set\n");
            qwc = 0;
            done = 1;
            INT(10,cycles);
        }
    }

    if (done == 0 && qwc == 0) hwDmacIrq(14);
    vif1ch->tadr = psHu32(DMAC_RBOR) + (vif1ch->tadr & (psHu32(DMAC_RBSR)));
    //hwDmacIrq(1);

    // restore
    FreezeXMMRegs(0);
    FreezeMMXRegs(0);
}

int vifMFIFOInterrupt()
{
    vif1ch->chcr &= ~0x100;
    hwDmacIrq(DMAC_VIF1);

//  vif1ch->chcr &= ~0x100;
//  vif1Regs->stat&= ~0x1F000000; // FQC=0
//  hwDmacIrq(DMAC_VIF1);
//
//  if (vif1.irq > 0) {
//      vif1.irq--;
//      hwIntcIrq(5); // VIF1 Intc
//  }

    return 1;
}
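
/*
 * Illustrative note (not part of the original code path): the DECOMPRESS_RGBA
 * and UNPACK_V4_5SSE_* macros above expand each 16-bit RGB5A1 element of a
 * V4-5 unpack into four byte channels in s_TempDecompress before the SSE
 * shuffles widen them to one channel per 32-bit lane. A minimal plain-C
 * sketch of that per-element expansion is kept below under #if 0; the helper
 * name is hypothetical and the function is never compiled or called.
 */
#if 0
static void DecompressRGB5A1_Reference(u16 v, u8 out[4])
{
    out[0] = (u8)((v & 0x1f) << 3);         /* R: bits 0-4  -> top 5 bits of the byte */
    out[1] = (u8)(((v >> 5) & 0x1f) << 3);  /* G: bits 5-9  */
    out[2] = (u8)(((v >> 10) & 0x1f) << 3); /* B: bits 10-14 */
    out[3] = (u8)((v >> 15) << 7);          /* A: bit 15 -> 0x00 or 0x80 */
}
#endif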