/* * Copyright (C) 2007-2009 Gabest * http://www.gabest.org * * This Program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This Program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with GNU Make; see the file COPYING. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA. * http://www.gnu.org/copyleft/gpl.html * */ #pragma once #include "GS.h" #include "GSTables.h" #include "GSVector.h" class GSBlock { #if _M_SSE >= 0x501 static const GSVector8i m_r16mask; #else static const GSVector4i m_r16mask; #endif static const GSVector4i m_r8mask; static const GSVector4i m_r4mask; #if _M_SSE >= 0x501 static const GSVector8i m_xxxa; static const GSVector8i m_xxbx; static const GSVector8i m_xgxx; static const GSVector8i m_rxxx; #else static const GSVector4i m_xxxa; static const GSVector4i m_xxbx; static const GSVector4i m_xgxx; static const GSVector4i m_rxxx; #endif static const GSVector4i m_uw8hmask0; static const GSVector4i m_uw8hmask1; static const GSVector4i m_uw8hmask2; static const GSVector4i m_uw8hmask3; public: template __forceinline static void WriteColumn32(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch) { const uint8* RESTRICT s0 = &src[srcpitch * 0]; const uint8* RESTRICT s1 = &src[srcpitch * 1]; #if _M_SSE >= 0x501 GSVector8i v0, v1; if (alignment == 32) { v0 = GSVector8i::load(s0).acbd(); v1 = GSVector8i::load(s1).acbd(); GSVector8i::sw64(v0, v1); } else { if (alignment == 16) { v0 = GSVector8i::load(&s0[0], &s0[16]).acbd(); v1 = GSVector8i::load(&s1[0], &s1[16]).acbd(); GSVector8i::sw64(v0, v1); } else { //v0 = GSVector8i::load(&s0[0], &s0[16], &s0[8], &s0[24]); //v1 = GSVector8i::load(&s1[0], &s1[16], &s1[8], &s1[24]); GSVector4i v4 = GSVector4i::load(&s0[0], &s1[0]); GSVector4i v5 = GSVector4i::load(&s0[8], &s1[8]); GSVector4i v6 = GSVector4i::load(&s0[16], &s1[16]); GSVector4i v7 = GSVector4i::load(&s0[24], &s1[24]); if (mask == 0xffffffff) { // just write them out directly ((GSVector4i*)dst)[i * 4 + 0] = v4; ((GSVector4i*)dst)[i * 4 + 1] = v5; ((GSVector4i*)dst)[i * 4 + 2] = v6; ((GSVector4i*)dst)[i * 4 + 3] = v7; return; } v0 = GSVector8i::cast(v4).insert<1>(v5); v1 = GSVector8i::cast(v6).insert<1>(v7); } } if (mask == 0xffffffff) { ((GSVector8i*)dst)[i * 2 + 0] = v0; ((GSVector8i*)dst)[i * 2 + 1] = v1; } else { GSVector8i v2((int)mask); if (mask == 0xff000000 || mask == 0x00ffffff) { ((GSVector8i*)dst)[i * 2 + 0] = ((GSVector8i*)dst)[i * 2 + 0].blend8(v0, v2); ((GSVector8i*)dst)[i * 2 + 1] = ((GSVector8i*)dst)[i * 2 + 1].blend8(v1, v2); } else { ((GSVector8i*)dst)[i * 2 + 0] = ((GSVector8i*)dst)[i * 2 + 0].blend(v0, v2); ((GSVector8i*)dst)[i * 2 + 1] = ((GSVector8i*)dst)[i * 2 + 1].blend(v1, v2); } } #else GSVector4i v0, v1, v2, v3; if (alignment != 0) { v0 = GSVector4i::load(&s0[0]); v1 = GSVector4i::load(&s0[16]); v2 = GSVector4i::load(&s1[0]); v3 = GSVector4i::load(&s1[16]); GSVector4i::sw64(v0, v2, v1, v3); } else { v0 = GSVector4i::load(&s0[0], &s1[0]); v1 = GSVector4i::load(&s0[8], &s1[8]); v2 = GSVector4i::load(&s0[16], &s1[16]); v3 = GSVector4i::load(&s0[24], &s1[24]); } if (mask == 0xffffffff) { ((GSVector4i*)dst)[i * 4 + 0] = v0; ((GSVector4i*)dst)[i * 4 + 1] = v1; ((GSVector4i*)dst)[i * 4 + 2] = v2; ((GSVector4i*)dst)[i * 4 + 3] = v3; } else { GSVector4i v4((int)mask); if (mask == 0xff000000 || mask == 0x00ffffff) { ((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend8(v0, v4); ((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend8(v1, v4); ((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend8(v2, v4); ((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend8(v3, v4); } else { ((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend(v0, v4); ((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend(v1, v4); ((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend(v2, v4); ((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend(v3, v4); } } #endif } template __forceinline static void WriteColumn16(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch) { const uint8* RESTRICT s0 = &src[srcpitch * 0]; const uint8* RESTRICT s1 = &src[srcpitch * 1]; // for(int j = 0; j < 16; j++) {((uint16*)s0)[j] = columnTable16[0][j]; ((uint16*)s1)[j] = columnTable16[1][j];} #if _M_SSE >= 0x501 GSVector8i v0, v1; if (alignment == 32) { v0 = GSVector8i::load(s0); v1 = GSVector8i::load(s1); GSVector8i::sw128(v0, v1); GSVector8i::sw16(v0, v1); } else { if (alignment == 16) { v0 = GSVector8i::load(&s0[0], &s1[0]); v1 = GSVector8i::load(&s0[16], &s1[16]); } else { v0 = GSVector8i::load(&s0[0], &s0[8], &s1[0], &s1[8]); v1 = GSVector8i::load(&s0[16], &s0[24], &s1[16], &s1[24]); } GSVector8i::sw16(v0, v1); } v0 = v0.acbd(); v1 = v1.acbd(); ((GSVector8i*)dst)[i * 2 + 0] = v0; ((GSVector8i*)dst)[i * 2 + 1] = v1; #else GSVector4i v0, v1, v2, v3; if (alignment != 0) { v0 = GSVector4i::load(&s0[0]); v1 = GSVector4i::load(&s0[16]); v2 = GSVector4i::load(&s1[0]); v3 = GSVector4i::load(&s1[16]); GSVector4i::sw16(v0, v1, v2, v3); GSVector4i::sw64(v0, v1, v2, v3); } else { v0 = GSVector4i::loadl(&s0[0]).upl16(GSVector4i::loadl(&s0[16])); v2 = GSVector4i::loadl(&s0[8]).upl16(GSVector4i::loadl(&s0[24])); v1 = GSVector4i::loadl(&s1[0]).upl16(GSVector4i::loadl(&s1[16])); v3 = GSVector4i::loadl(&s1[8]).upl16(GSVector4i::loadl(&s1[24])); GSVector4i::sw64(v0, v1, v2, v3); } ((GSVector4i*)dst)[i * 4 + 0] = v0; ((GSVector4i*)dst)[i * 4 + 1] = v2; ((GSVector4i*)dst)[i * 4 + 2] = v1; ((GSVector4i*)dst)[i * 4 + 3] = v3; #endif } template __forceinline static void WriteColumn8(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch) { // TODO: read unaligned as WriteColumn32 does and try saving a few shuffles #if _M_SSE >= 0x501 GSVector4i v4 = GSVector4i::load(&src[srcpitch * 0]); GSVector4i v5 = GSVector4i::load(&src[srcpitch * 1]); GSVector4i v6 = GSVector4i::load(&src[srcpitch * 2]); GSVector4i v7 = GSVector4i::load(&src[srcpitch * 3]); GSVector8i v0(v4, v5); GSVector8i v1(v6, v7); if ((i & 1) == 0) { v1 = v1.yxwz(); } else { v0 = v0.yxwz(); } GSVector8i::sw8(v0, v1); GSVector8i::sw16(v0, v1); v0 = v0.acbd(); v1 = v1.acbd(); ((GSVector8i*)dst)[i * 2 + 0] = v0; ((GSVector8i*)dst)[i * 2 + 1] = v1; #else GSVector4i v0 = GSVector4i::load(&src[srcpitch * 0]); GSVector4i v1 = GSVector4i::load(&src[srcpitch * 1]); GSVector4i v2 = GSVector4i::load(&src[srcpitch * 2]); GSVector4i v3 = GSVector4i::load(&src[srcpitch * 3]); if ((i & 1) == 0) { v2 = v2.yxwz(); v3 = v3.yxwz(); } else { v0 = v0.yxwz(); v1 = v1.yxwz(); } GSVector4i::sw8(v0, v2, v1, v3); GSVector4i::sw16(v0, v1, v2, v3); GSVector4i::sw64(v0, v1, v2, v3); ((GSVector4i*)dst)[i * 4 + 0] = v0; ((GSVector4i*)dst)[i * 4 + 1] = v2; ((GSVector4i*)dst)[i * 4 + 2] = v1; ((GSVector4i*)dst)[i * 4 + 3] = v3; #endif } template __forceinline static void WriteColumn4(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch) { //printf("WriteColumn4\n"); // TODO: read unaligned as WriteColumn32 does and try saving a few shuffles // TODO: pshufb GSVector4i v0 = GSVector4i::load(&src[srcpitch * 0]); GSVector4i v1 = GSVector4i::load(&src[srcpitch * 1]); GSVector4i v2 = GSVector4i::load(&src[srcpitch * 2]); GSVector4i v3 = GSVector4i::load(&src[srcpitch * 3]); if ((i & 1) == 0) { v2 = v2.yxwzlh(); v3 = v3.yxwzlh(); } else { v0 = v0.yxwzlh(); v1 = v1.yxwzlh(); } GSVector4i::sw4(v0, v2, v1, v3); GSVector4i::sw8(v0, v1, v2, v3); GSVector4i::sw8(v0, v2, v1, v3); GSVector4i::sw64(v0, v2, v1, v3); ((GSVector4i*)dst)[i * 4 + 0] = v0; ((GSVector4i*)dst)[i * 4 + 1] = v1; ((GSVector4i*)dst)[i * 4 + 2] = v2; ((GSVector4i*)dst)[i * 4 + 3] = v3; } template static void WriteColumn32(int y, uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch) { switch ((y >> 1) & 3) { case 0: WriteColumn32<0, alignment, mask>(dst, src, srcpitch); break; case 1: WriteColumn32<1, alignment, mask>(dst, src, srcpitch); break; case 2: WriteColumn32<2, alignment, mask>(dst, src, srcpitch); break; case 3: WriteColumn32<3, alignment, mask>(dst, src, srcpitch); break; default: __assume(0); } } template static void WriteColumn16(int y, uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch) { switch ((y >> 1) & 3) { case 0: WriteColumn16<0, alignment>(dst, src, srcpitch); break; case 1: WriteColumn16<1, alignment>(dst, src, srcpitch); break; case 2: WriteColumn16<2, alignment>(dst, src, srcpitch); break; case 3: WriteColumn16<3, alignment>(dst, src, srcpitch); break; default: __assume(0); } } template static void WriteColumn8(int y, uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch) { switch ((y >> 2) & 3) { case 0: WriteColumn8<0, alignment>(dst, src, srcpitch); break; case 1: WriteColumn8<1, alignment>(dst, src, srcpitch); break; case 2: WriteColumn8<2, alignment>(dst, src, srcpitch); break; case 3: WriteColumn8<3, alignment>(dst, src, srcpitch); break; default: __assume(0); } } template static void WriteColumn4(int y, uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch) { switch ((y >> 2) & 3) { case 0: WriteColumn4<0, alignment>(dst, src, srcpitch); break; case 1: WriteColumn4<1, alignment>(dst, src, srcpitch); break; case 2: WriteColumn4<2, alignment>(dst, src, srcpitch); break; case 3: WriteColumn4<3, alignment>(dst, src, srcpitch); break; default: __assume(0); } } template static void WriteBlock32(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch) { WriteColumn32<0, alignment, mask>(dst, src, srcpitch); src += srcpitch * 2; WriteColumn32<1, alignment, mask>(dst, src, srcpitch); src += srcpitch * 2; WriteColumn32<2, alignment, mask>(dst, src, srcpitch); src += srcpitch * 2; WriteColumn32<3, alignment, mask>(dst, src, srcpitch); } template static void WriteBlock16(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch) { WriteColumn16<0, alignment>(dst, src, srcpitch); src += srcpitch * 2; WriteColumn16<1, alignment>(dst, src, srcpitch); src += srcpitch * 2; WriteColumn16<2, alignment>(dst, src, srcpitch); src += srcpitch * 2; WriteColumn16<3, alignment>(dst, src, srcpitch); } template static void WriteBlock8(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch) { WriteColumn8<0, alignment>(dst, src, srcpitch); src += srcpitch * 4; WriteColumn8<1, alignment>(dst, src, srcpitch); src += srcpitch * 4; WriteColumn8<2, alignment>(dst, src, srcpitch); src += srcpitch * 4; WriteColumn8<3, alignment>(dst, src, srcpitch); } template static void WriteBlock4(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch) { WriteColumn4<0, alignment>(dst, src, srcpitch); src += srcpitch * 4; WriteColumn4<1, alignment>(dst, src, srcpitch); src += srcpitch * 4; WriteColumn4<2, alignment>(dst, src, srcpitch); src += srcpitch * 4; WriteColumn4<3, alignment>(dst, src, srcpitch); } template __forceinline static void ReadColumn32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { #if _M_SSE >= 0x501 const GSVector8i* s = (const GSVector8i*)src; GSVector8i v0 = s[i * 2 + 0]; GSVector8i v1 = s[i * 2 + 1]; GSVector8i::sw128(v0, v1); GSVector8i::sw64(v0, v1); GSVector8i::store(&dst[dstpitch * 0], v0); GSVector8i::store(&dst[dstpitch * 1], v1); #else const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0 = s[i * 4 + 0]; GSVector4i v1 = s[i * 4 + 1]; GSVector4i v2 = s[i * 4 + 2]; GSVector4i v3 = s[i * 4 + 3]; GSVector4i::sw64(v0, v1, v2, v3); GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; GSVector4i::store(&d0[0], v0); GSVector4i::store(&d0[1], v1); GSVector4i::store(&d1[0], v2); GSVector4i::store(&d1[1], v3); #endif } template __forceinline static void ReadColumn16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { #if _M_SSE >= 0x501 const GSVector8i* s = (const GSVector8i*)src; GSVector8i v0 = s[i * 2 + 0].shuffle8(m_r16mask); GSVector8i v1 = s[i * 2 + 1].shuffle8(m_r16mask); GSVector8i::sw128(v0, v1); GSVector8i::sw32(v0, v1); v0 = v0.acbd(); v1 = v1.acbd(); GSVector8i::store(&dst[dstpitch * 0], v0); GSVector8i::store(&dst[dstpitch * 1], v1); #elif _M_SSE >= 0x301 const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0 = s[i * 4 + 0].shuffle8(m_r16mask); GSVector4i v1 = s[i * 4 + 1].shuffle8(m_r16mask); GSVector4i v2 = s[i * 4 + 2].shuffle8(m_r16mask); GSVector4i v3 = s[i * 4 + 3].shuffle8(m_r16mask); GSVector4i::sw32(v0, v1, v2, v3); GSVector4i::sw64(v0, v1, v2, v3); GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; GSVector4i::store(&d0[0], v0); GSVector4i::store(&d0[1], v2); GSVector4i::store(&d1[0], v1); GSVector4i::store(&d1[1], v3); #endif } template __forceinline static void ReadColumn8(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { //for(int j = 0; j < 64; j++) ((uint8*)src)[j] = (uint8)j; #if 0 //_M_SSE >= 0x501 const GSVector8i* s = (const GSVector8i*)src; GSVector8i v0 = s[i * 2 + 0]; GSVector8i v1 = s[i * 2 + 1]; GSVector8i::sw8(v0, v1); GSVector8i::sw16(v0, v1); GSVector8i::sw8(v0, v1); GSVector8i::sw128(v0, v1); GSVector8i::sw16(v0, v1); v0 = v0.acbd(); v1 = v1.acbd(); v1 = v1.yxwz(); GSVector8i::storel(&dst[dstpitch * 0], v0); GSVector8i::storeh(&dst[dstpitch * 1], v0); GSVector8i::storel(&dst[dstpitch * 2], v1); GSVector8i::storeh(&dst[dstpitch * 3], v1); // TODO: not sure if this is worth it, not in this form, there should be a shorter path #else const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; if ((i & 1) == 0) { v0 = s[i * 4 + 0]; v1 = s[i * 4 + 1]; v2 = s[i * 4 + 2]; v3 = s[i * 4 + 3]; } else { v2 = s[i * 4 + 0]; v3 = s[i * 4 + 1]; v0 = s[i * 4 + 2]; v1 = s[i * 4 + 3]; } v0 = v0.shuffle8(m_r8mask); v1 = v1.shuffle8(m_r8mask); v2 = v2.shuffle8(m_r8mask); v3 = v3.shuffle8(m_r8mask); GSVector4i::sw16(v0, v1, v2, v3); GSVector4i::sw32(v0, v1, v3, v2); GSVector4i::store(&dst[dstpitch * 0], v0); GSVector4i::store(&dst[dstpitch * 1], v3); GSVector4i::store(&dst[dstpitch * 2], v1); GSVector4i::store(&dst[dstpitch * 3], v2); #endif } template __forceinline static void ReadColumn4(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { //printf("ReadColumn4\n"); const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0 = s[i * 4 + 0].xzyw(); GSVector4i v1 = s[i * 4 + 1].xzyw(); GSVector4i v2 = s[i * 4 + 2].xzyw(); GSVector4i v3 = s[i * 4 + 3].xzyw(); GSVector4i::sw64(v0, v1, v2, v3); GSVector4i::sw4(v0, v2, v1, v3); GSVector4i::sw8(v0, v1, v2, v3); v0 = v0.shuffle8(m_r4mask); v1 = v1.shuffle8(m_r4mask); v2 = v2.shuffle8(m_r4mask); v3 = v3.shuffle8(m_r4mask); if ((i & 1) == 0) { GSVector4i::sw16rh(v0, v1, v2, v3); } else { GSVector4i::sw16rl(v0, v1, v2, v3); } GSVector4i::store(&dst[dstpitch * 0], v0); GSVector4i::store(&dst[dstpitch * 1], v1); GSVector4i::store(&dst[dstpitch * 2], v2); GSVector4i::store(&dst[dstpitch * 3], v3); } static void ReadColumn32(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { switch ((y >> 1) & 3) { case 0: ReadColumn32<0>(src, dst, dstpitch); break; case 1: ReadColumn32<1>(src, dst, dstpitch); break; case 2: ReadColumn32<2>(src, dst, dstpitch); break; case 3: ReadColumn32<3>(src, dst, dstpitch); break; default: __assume(0); } } static void ReadColumn16(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { switch ((y >> 1) & 3) { case 0: ReadColumn16<0>(src, dst, dstpitch); break; case 1: ReadColumn16<1>(src, dst, dstpitch); break; case 2: ReadColumn16<2>(src, dst, dstpitch); break; case 3: ReadColumn16<3>(src, dst, dstpitch); break; default: __assume(0); } } static void ReadColumn8(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { switch ((y >> 2) & 3) { case 0: ReadColumn8<0>(src, dst, dstpitch); break; case 1: ReadColumn8<1>(src, dst, dstpitch); break; case 2: ReadColumn8<2>(src, dst, dstpitch); break; case 3: ReadColumn8<3>(src, dst, dstpitch); break; default: __assume(0); } } static void ReadColumn4(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { switch ((y >> 2) & 3) { case 0: ReadColumn4<0>(src, dst, dstpitch); break; case 1: ReadColumn4<1>(src, dst, dstpitch); break; case 2: ReadColumn4<2>(src, dst, dstpitch); break; case 3: ReadColumn4<3>(src, dst, dstpitch); break; default: __assume(0); } } static void ReadBlock32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { ReadColumn32<0>(src, dst, dstpitch); dst += dstpitch * 2; ReadColumn32<1>(src, dst, dstpitch); dst += dstpitch * 2; ReadColumn32<2>(src, dst, dstpitch); dst += dstpitch * 2; ReadColumn32<3>(src, dst, dstpitch); } static void ReadBlock16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { ReadColumn16<0>(src, dst, dstpitch); dst += dstpitch * 2; ReadColumn16<1>(src, dst, dstpitch); dst += dstpitch * 2; ReadColumn16<2>(src, dst, dstpitch); dst += dstpitch * 2; ReadColumn16<3>(src, dst, dstpitch); } static void ReadBlock8(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { ReadColumn8<0>(src, dst, dstpitch); dst += dstpitch * 4; ReadColumn8<1>(src, dst, dstpitch); dst += dstpitch * 4; ReadColumn8<2>(src, dst, dstpitch); dst += dstpitch * 4; ReadColumn8<3>(src, dst, dstpitch); } static void ReadBlock4(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { ReadColumn4<0>(src, dst, dstpitch); dst += dstpitch * 4; ReadColumn4<1>(src, dst, dstpitch); dst += dstpitch * 4; ReadColumn4<2>(src, dst, dstpitch); dst += dstpitch * 4; ReadColumn4<3>(src, dst, dstpitch); } __forceinline static void ReadBlock4P(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { //printf("ReadBlock4P\n"); const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; GSVector4i mask(0x0f0f0f0f); for (int i = 0; i < 2; i++) { // col 0, 2 v0 = s[i * 8 + 0]; v1 = s[i * 8 + 1]; v2 = s[i * 8 + 2]; v3 = s[i * 8 + 3]; GSVector4i::sw8(v0, v1, v2, v3); GSVector4i::sw16(v0, v1, v2, v3); GSVector4i::sw8(v0, v2, v1, v3); GSVector4i::store(&dst[dstpitch * 0 + 0], (v0 & mask)); GSVector4i::store(&dst[dstpitch * 0 + 16], (v1 & mask)); GSVector4i::store(&dst[dstpitch * 1 + 0], (v2 & mask)); GSVector4i::store(&dst[dstpitch * 1 + 16], (v3 & mask)); dst += dstpitch * 2; GSVector4i::store(&dst[dstpitch * 0 + 0], (v0.andnot(mask)).yxwz() >> 4); GSVector4i::store(&dst[dstpitch * 0 + 16], (v1.andnot(mask)).yxwz() >> 4); GSVector4i::store(&dst[dstpitch * 1 + 0], (v2.andnot(mask)).yxwz() >> 4); GSVector4i::store(&dst[dstpitch * 1 + 16], (v3.andnot(mask)).yxwz() >> 4); dst += dstpitch * 2; // col 1, 3 v0 = s[i * 8 + 4]; v1 = s[i * 8 + 5]; v2 = s[i * 8 + 6]; v3 = s[i * 8 + 7]; GSVector4i::sw8(v0, v1, v2, v3); GSVector4i::sw16(v0, v1, v2, v3); GSVector4i::sw8(v0, v2, v1, v3); GSVector4i::store(&dst[dstpitch * 0 + 0], (v0 & mask).yxwz()); GSVector4i::store(&dst[dstpitch * 0 + 16], (v1 & mask).yxwz()); GSVector4i::store(&dst[dstpitch * 1 + 0], (v2 & mask).yxwz()); GSVector4i::store(&dst[dstpitch * 1 + 16], (v3 & mask).yxwz()); dst += dstpitch * 2; GSVector4i::store(&dst[dstpitch * 0 + 0], (v0.andnot(mask)) >> 4); GSVector4i::store(&dst[dstpitch * 0 + 16], (v1.andnot(mask)) >> 4); GSVector4i::store(&dst[dstpitch * 1 + 0], (v2.andnot(mask)) >> 4); GSVector4i::store(&dst[dstpitch * 1 + 16], (v3.andnot(mask)) >> 4); dst += dstpitch * 2; } } __forceinline static void ReadBlock8HP(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { #if _M_SSE >= 0x501 uint8* RESTRICT d0 = &dst[dstpitch * 0]; uint8* RESTRICT d1 = &dst[dstpitch * 4]; const GSVector8i* s = (const GSVector8i*)src; GSVector8i v0, v1, v2, v3; GSVector4i v4, v5; v0 = s[0].acbd(); v1 = s[1].acbd(); v2 = s[2].acbd(); v3 = s[3].acbd(); v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24)); v4 = v0.extract<0>(); v5 = v0.extract<1>(); GSVector4i::storel(&d0[dstpitch * 0], v4); GSVector4i::storel(&d0[dstpitch * 1], v5); GSVector4i::storeh(&d0[dstpitch * 2], v4); GSVector4i::storeh(&d0[dstpitch * 3], v5); v0 = s[4].acbd(); v1 = s[5].acbd(); v2 = s[6].acbd(); v3 = s[7].acbd(); v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24)); v4 = v0.extract<0>(); v5 = v0.extract<1>(); GSVector4i::storel(&d1[dstpitch * 0], v4); GSVector4i::storel(&d1[dstpitch * 1], v5); GSVector4i::storeh(&d1[dstpitch * 2], v4); GSVector4i::storeh(&d1[dstpitch * 3], v5); #else const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; for (int i = 0; i < 4; i++) { v0 = s[i * 4 + 0]; v1 = s[i * 4 + 1]; v2 = s[i * 4 + 2]; v3 = s[i * 4 + 3]; GSVector4i::sw64(v0, v1, v2, v3); v0 = ((v0 >> 24).ps32(v1 >> 24)).pu16((v2 >> 24).ps32(v3 >> 24)); GSVector4i::storel(dst, v0); dst += dstpitch; GSVector4i::storeh(dst, v0); dst += dstpitch; } #endif } __forceinline static void ReadBlock4HLP(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { #if _M_SSE >= 0x501 uint8* RESTRICT d0 = &dst[dstpitch * 0]; uint8* RESTRICT d1 = &dst[dstpitch * 4]; const GSVector8i* s = (const GSVector8i*)src; GSVector8i v0, v1, v2, v3; GSVector4i v4, v5; GSVector8i mask(0x0f0f0f0f); v0 = s[0].acbd(); v1 = s[1].acbd(); v2 = s[2].acbd(); v3 = s[3].acbd(); v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24)) & mask; v4 = v0.extract<0>(); v5 = v0.extract<1>(); GSVector4i::storel(&d0[dstpitch * 0], v4); GSVector4i::storel(&d0[dstpitch * 1], v5); GSVector4i::storeh(&d0[dstpitch * 2], v4); GSVector4i::storeh(&d0[dstpitch * 3], v5); v0 = s[4].acbd(); v1 = s[5].acbd(); v2 = s[6].acbd(); v3 = s[7].acbd(); v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24)) & mask; v4 = v0.extract<0>(); v5 = v0.extract<1>(); GSVector4i::storel(&d1[dstpitch * 0], v4); GSVector4i::storel(&d1[dstpitch * 1], v5); GSVector4i::storeh(&d1[dstpitch * 2], v4); GSVector4i::storeh(&d1[dstpitch * 3], v5); #else const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; GSVector4i mask(0x0f0f0f0f); for (int i = 0; i < 4; i++) { v0 = s[i * 4 + 0]; v1 = s[i * 4 + 1]; v2 = s[i * 4 + 2]; v3 = s[i * 4 + 3]; GSVector4i::sw64(v0, v1, v2, v3); v0 = ((v0 >> 24).ps32(v1 >> 24)).pu16((v2 >> 24).ps32(v3 >> 24)) & mask; GSVector4i::storel(dst, v0); dst += dstpitch; GSVector4i::storeh(dst, v0); dst += dstpitch; } #endif } __forceinline static void ReadBlock4HHP(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { #if _M_SSE >= 0x501 uint8* RESTRICT d0 = &dst[dstpitch * 0]; uint8* RESTRICT d1 = &dst[dstpitch * 4]; const GSVector8i* s = (const GSVector8i*)src; GSVector8i v0, v1, v2, v3; GSVector4i v4, v5; v0 = s[0].acbd(); v1 = s[1].acbd(); v2 = s[2].acbd(); v3 = s[3].acbd(); v0 = (v0 >> 28).ps32(v1 >> 28).pu16((v2 >> 28).ps32(v3 >> 28)); v4 = v0.extract<0>(); v5 = v0.extract<1>(); GSVector4i::storel(&d0[dstpitch * 0], v4); GSVector4i::storel(&d0[dstpitch * 1], v5); GSVector4i::storeh(&d0[dstpitch * 2], v4); GSVector4i::storeh(&d0[dstpitch * 3], v5); v0 = s[4].acbd(); v1 = s[5].acbd(); v2 = s[6].acbd(); v3 = s[7].acbd(); v0 = (v0 >> 28).ps32(v1 >> 28).pu16((v2 >> 28).ps32(v3 >> 28)); v4 = v0.extract<0>(); v5 = v0.extract<1>(); GSVector4i::storel(&d1[dstpitch * 0], v4); GSVector4i::storel(&d1[dstpitch * 1], v5); GSVector4i::storeh(&d1[dstpitch * 2], v4); GSVector4i::storeh(&d1[dstpitch * 3], v5); #else const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; for (int i = 0; i < 4; i++) { v0 = s[i * 4 + 0]; v1 = s[i * 4 + 1]; v2 = s[i * 4 + 2]; v3 = s[i * 4 + 3]; GSVector4i::sw64(v0, v1, v2, v3); v0 = ((v0 >> 28).ps32(v1 >> 28)).pu16((v2 >> 28).ps32(v3 >> 28)); GSVector4i::storel(dst, v0); dst += dstpitch; GSVector4i::storeh(dst, v0); dst += dstpitch; } #endif } template __forceinline static V Expand24to32(const V& c, const V& TA0) { return c | (AEM ? TA0.andnot(c == V::zero()) : TA0); // TA0 & (c != GSVector4i::zero()) } template __forceinline static V Expand16to32(const V& c, const V& TA0, const V& TA1) { return ((c & m_rxxx) << 3) | ((c & m_xgxx) << 6) | ((c & m_xxbx) << 9) | (AEM ? TA0.blend8(TA1, c.sra16(15)).andnot(c == V::zero()) : TA0.blend(TA1, c.sra16(15))); } template static void ExpandBlock24(const uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) { #if _M_SSE >= 0x501 const GSVector8i* s = (const GSVector8i*)src; GSVector8i TA0(TEXA.TA0 << 24); GSVector8i mask = GSVector8i::x00ffffff(); for (int i = 0; i < 4; i++, dst += dstpitch * 2) { GSVector8i v0 = s[i * 2 + 0] & mask; GSVector8i v1 = s[i * 2 + 1] & mask; GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0]; GSVector8i* d1 = (GSVector8i*)&dst[dstpitch * 1]; d0[0] = Expand24to32(v0, TA0); d1[0] = Expand24to32(v1, TA0); } #else const GSVector4i* s = (const GSVector4i*)src; GSVector4i TA0(TEXA.TA0 << 24); GSVector4i mask = GSVector4i::x00ffffff(); for (int i = 0; i < 4; i++, dst += dstpitch * 2) { GSVector4i v0 = s[i * 4 + 0] & mask; GSVector4i v1 = s[i * 4 + 1] & mask; GSVector4i v2 = s[i * 4 + 2] & mask; GSVector4i v3 = s[i * 4 + 3] & mask; GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; d0[0] = Expand24to32(v0, TA0); d0[1] = Expand24to32(v1, TA0); d1[0] = Expand24to32(v2, TA0); d1[1] = Expand24to32(v3, TA0); } #endif } template static void ExpandBlock16(const uint16* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) // do not inline, uses too many xmm regs { #if _M_SSE >= 0x501 const GSVector8i* s = (const GSVector8i*)src; GSVector8i TA0(TEXA.TA0 << 24); GSVector8i TA1(TEXA.TA1 << 24); for (int i = 0; i < 8; i++, dst += dstpitch) { GSVector8i v = s[i].acbd(); ((GSVector8i*)dst)[0] = Expand16to32(v.upl16(v), TA0, TA1); ((GSVector8i*)dst)[1] = Expand16to32(v.uph16(v), TA0, TA1); } #else const GSVector4i* s = (const GSVector4i*)src; GSVector4i TA0(TEXA.TA0 << 24); GSVector4i TA1(TEXA.TA1 << 24); for (int i = 0; i < 8; i++, dst += dstpitch) { GSVector4i v0 = s[i * 2 + 0]; ((GSVector4i*)dst)[0] = Expand16to32(v0.upl16(v0), TA0, TA1); ((GSVector4i*)dst)[1] = Expand16to32(v0.uph16(v0), TA0, TA1); GSVector4i v1 = s[i * 2 + 1]; ((GSVector4i*)dst)[2] = Expand16to32(v1.upl16(v1), TA0, TA1); ((GSVector4i*)dst)[3] = Expand16to32(v1.uph16(v1), TA0, TA1); } #endif } __forceinline static void ExpandBlock8_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { for (int j = 0; j < 16; j++, dst += dstpitch) { ((const GSVector4i*)src)[j].gather32_8(pal, (GSVector4i*)dst); } } __forceinline static void ExpandBlock8_16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { for (int j = 0; j < 16; j++, dst += dstpitch) { ((const GSVector4i*)src)[j].gather16_8(pal, (GSVector4i*)dst); } } __forceinline static void ExpandBlock4_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint64* RESTRICT pal) { for (int j = 0; j < 16; j++, dst += dstpitch) { ((const GSVector4i*)src)[j].gather64_8(pal, (GSVector4i*)dst); } } __forceinline static void ExpandBlock4_16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint64* RESTRICT pal) { for (int j = 0; j < 16; j++, dst += dstpitch) { ((const GSVector4i*)src)[j].gather32_8(pal, (GSVector4i*)dst); } } __forceinline static void ExpandBlock8H_32(uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { for (int j = 0; j < 8; j++, dst += dstpitch) { const GSVector4i* s = (const GSVector4i*)src; ((GSVector4i*)dst)[0] = (s[j * 2 + 0] >> 24).gather32_32<>(pal); ((GSVector4i*)dst)[1] = (s[j * 2 + 1] >> 24).gather32_32<>(pal); } } __forceinline static void ExpandBlock8H_16(uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { for (int j = 0; j < 8; j++, dst += dstpitch) { const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0 = (s[j * 2 + 0] >> 24).gather32_32<>(pal); GSVector4i v1 = (s[j * 2 + 1] >> 24).gather32_32<>(pal); ((GSVector4i*)dst)[0] = v0.pu32(v1); } } __forceinline static void ExpandBlock4HL_32(uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { for (int j = 0; j < 8; j++, dst += dstpitch) { const GSVector4i* s = (const GSVector4i*)src; ((GSVector4i*)dst)[0] = ((s[j * 2 + 0] >> 24) & 0xf).gather32_32<>(pal); ((GSVector4i*)dst)[1] = ((s[j * 2 + 1] >> 24) & 0xf).gather32_32<>(pal); } } __forceinline static void ExpandBlock4HL_16(uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { for (int j = 0; j < 8; j++, dst += dstpitch) { const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0 = ((s[j * 2 + 0] >> 24) & 0xf).gather32_32<>(pal); GSVector4i v1 = ((s[j * 2 + 1] >> 24) & 0xf).gather32_32<>(pal); ((GSVector4i*)dst)[0] = v0.pu32(v1); } } __forceinline static void ExpandBlock4HH_32(uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { for (int j = 0; j < 8; j++, dst += dstpitch) { const GSVector4i* s = (const GSVector4i*)src; ((GSVector4i*)dst)[0] = (s[j * 2 + 0] >> 28).gather32_32<>(pal); ((GSVector4i*)dst)[1] = (s[j * 2 + 1] >> 28).gather32_32<>(pal); } } __forceinline static void ExpandBlock4HH_16(uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { for (int j = 0; j < 8; j++, dst += dstpitch) { const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0 = (s[j * 2 + 0] >> 28).gather32_32<>(pal); GSVector4i v1 = (s[j * 2 + 1] >> 28).gather32_32<>(pal); ((GSVector4i*)dst)[0] = v0.pu32(v1); } } __forceinline static void UnpackAndWriteBlock24(const uint8* RESTRICT src, int srcpitch, uint8* RESTRICT dst) { #if _M_SSE >= 0x501 const uint8* RESTRICT s0 = &src[srcpitch * 0]; const uint8* RESTRICT s1 = &src[srcpitch * 1]; const uint8* RESTRICT s2 = &src[srcpitch * 2]; const uint8* RESTRICT s3 = &src[srcpitch * 3]; GSVector8i v0, v1, v2, v3, v4, v5, v6; GSVector8i mask = GSVector8i::x00ffffff(); v4 = GSVector8i::load(s0, s0 + 8, s2, s2 + 8); v5 = GSVector8i::load(s0 + 16, s1, s2 + 16, s3); v6 = GSVector8i::load(s1 + 8, s1 + 16, s3 + 8, s3 + 16); v0 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd(); v4 = v4.srl<12>(v5); v1 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd(); v4 = v5.srl<8>(v6); v2 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd(); v4 = v6.srl<4>(); v3 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd(); GSVector8i::sw64(v0, v2, v1, v3); ((GSVector8i*)dst)[0] = ((GSVector8i*)dst)[0].blend8(v0, mask); ((GSVector8i*)dst)[1] = ((GSVector8i*)dst)[1].blend8(v2, mask); ((GSVector8i*)dst)[2] = ((GSVector8i*)dst)[2].blend8(v1, mask); ((GSVector8i*)dst)[3] = ((GSVector8i*)dst)[3].blend8(v3, mask); src += srcpitch * 4; s0 = &src[srcpitch * 0]; s1 = &src[srcpitch * 1]; s2 = &src[srcpitch * 2]; s3 = &src[srcpitch * 3]; v4 = GSVector8i::load(s0, s0 + 8, s2, s2 + 8); v5 = GSVector8i::load(s0 + 16, s1, s2 + 16, s3); v6 = GSVector8i::load(s1 + 8, s1 + 16, s3 + 8, s3 + 16); v0 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd(); v4 = v4.srl<12>(v5); v1 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd(); v4 = v5.srl<8>(v6); v2 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd(); v4 = v6.srl<4>(); v3 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd(); GSVector8i::sw64(v0, v2, v1, v3); ((GSVector8i*)dst)[4] = ((GSVector8i*)dst)[4].blend8(v0, mask); ((GSVector8i*)dst)[5] = ((GSVector8i*)dst)[5].blend8(v2, mask); ((GSVector8i*)dst)[6] = ((GSVector8i*)dst)[6].blend8(v1, mask); ((GSVector8i*)dst)[7] = ((GSVector8i*)dst)[7].blend8(v3, mask); #else GSVector4i v0, v1, v2, v3, v4, v5, v6; GSVector4i mask = GSVector4i::x00ffffff(); for (int i = 0; i < 4; i++, src += srcpitch * 2) { v4 = GSVector4i::load(src); v5 = GSVector4i::load(src + 16, src + srcpitch); v6 = GSVector4i::load(src + srcpitch + 8); v0 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())); v4 = v4.srl<12>(v5); v1 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())); v4 = v5.srl<8>(v6); v2 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())); v4 = v6.srl<4>(); v3 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())); GSVector4i::sw64(v0, v2, v1, v3); ((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend8(v0, mask); ((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend8(v1, mask); ((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend8(v2, mask); ((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend8(v3, mask); } #endif } __forceinline static void UnpackAndWriteBlock8H(const uint8* RESTRICT src, int srcpitch, uint8* RESTRICT dst) { GSVector4i v4, v5, v6, v7; #if _M_SSE >= 0x501 GSVector8i v0, v1, v2, v3; GSVector8i mask = GSVector8i::xff000000(); v4 = GSVector4i::loadl(&src[srcpitch * 0]); v5 = GSVector4i::loadl(&src[srcpitch * 1]); v6 = GSVector4i::loadl(&src[srcpitch * 2]); v7 = GSVector4i::loadl(&src[srcpitch * 3]); v2 = GSVector8i::cast(v4.upl16(v5)); v3 = GSVector8i::cast(v6.upl16(v7)); v0 = v2.u8to32c() << 24; v1 = v2.bbbb().u8to32c() << 24; v2 = v3.u8to32c() << 24; v3 = v3.bbbb().u8to32c() << 24; ((GSVector8i*)dst)[0] = ((GSVector8i*)dst)[0].blend8(v0, mask); ((GSVector8i*)dst)[1] = ((GSVector8i*)dst)[1].blend8(v1, mask); ((GSVector8i*)dst)[2] = ((GSVector8i*)dst)[2].blend8(v2, mask); ((GSVector8i*)dst)[3] = ((GSVector8i*)dst)[3].blend8(v3, mask); src += srcpitch * 4; v4 = GSVector4i::loadl(&src[srcpitch * 0]); v5 = GSVector4i::loadl(&src[srcpitch * 1]); v6 = GSVector4i::loadl(&src[srcpitch * 2]); v7 = GSVector4i::loadl(&src[srcpitch * 3]); v2 = GSVector8i::cast(v4.upl16(v5)); v3 = GSVector8i::cast(v6.upl16(v7)); v0 = v2.u8to32c() << 24; v1 = v2.bbbb().u8to32c() << 24; v2 = v3.u8to32c() << 24; v3 = v3.bbbb().u8to32c() << 24; ((GSVector8i*)dst)[4] = ((GSVector8i*)dst)[4].blend8(v0, mask); ((GSVector8i*)dst)[5] = ((GSVector8i*)dst)[5].blend8(v1, mask); ((GSVector8i*)dst)[6] = ((GSVector8i*)dst)[6].blend8(v2, mask); ((GSVector8i*)dst)[7] = ((GSVector8i*)dst)[7].blend8(v3, mask); #elif _M_SSE >= 0x301 GSVector4i v0, v1, v2, v3; GSVector4i mask = GSVector4i::xff000000(); GSVector4i mask0 = m_uw8hmask0; GSVector4i mask1 = m_uw8hmask1; GSVector4i mask2 = m_uw8hmask2; GSVector4i mask3 = m_uw8hmask3; for (int i = 0; i < 4; i++, src += srcpitch * 2) { v4 = GSVector4i::load(src, src + srcpitch); v0 = v4.shuffle8(mask0); v1 = v4.shuffle8(mask1); v2 = v4.shuffle8(mask2); v3 = v4.shuffle8(mask3); ((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend8(v0, mask); ((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend8(v1, mask); ((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend8(v2, mask); ((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend8(v3, mask); } #endif } __forceinline static void UnpackAndWriteBlock4HL(const uint8* RESTRICT src, int srcpitch, uint8* RESTRICT dst) { //printf("4HL\n"); if (0) { uint8* s = (uint8*)src; for (int j = 0; j < 8; j++, s += srcpitch) for (int i = 0; i < 4; i++) s[i] = (columnTable32[j][i * 2] & 0x0f) | (columnTable32[j][i * 2 + 1] << 4); } GSVector4i v4, v5, v6; #if _M_SSE >= 0x501 GSVector8i v0, v1, v2, v3; GSVector8i mask(0x0f000000); v6 = GSVector4i(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]); v4 = v6.upl8(v6 >> 4); v5 = v6.uph8(v6 >> 4); v2 = GSVector8i::cast(v4.upl16(v5)); v3 = GSVector8i::cast(v4.uph16(v5)); v0 = v2.u8to32c() << 24; v1 = v2.bbbb().u8to32c() << 24; v2 = v3.u8to32c() << 24; v3 = v3.bbbb().u8to32c() << 24; ((GSVector8i*)dst)[0] = ((GSVector8i*)dst)[0].blend(v0, mask); ((GSVector8i*)dst)[1] = ((GSVector8i*)dst)[1].blend(v1, mask); ((GSVector8i*)dst)[2] = ((GSVector8i*)dst)[2].blend(v2, mask); ((GSVector8i*)dst)[3] = ((GSVector8i*)dst)[3].blend(v3, mask); src += srcpitch * 4; v6 = GSVector4i(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]); v4 = v6.upl8(v6 >> 4); v5 = v6.uph8(v6 >> 4); v2 = GSVector8i::cast(v4.upl16(v5)); v3 = GSVector8i::cast(v4.uph16(v5)); v0 = v2.u8to32c() << 24; v1 = v2.bbbb().u8to32c() << 24; v2 = v3.u8to32c() << 24; v3 = v3.bbbb().u8to32c() << 24; ((GSVector8i*)dst)[4] = ((GSVector8i*)dst)[4].blend(v0, mask); ((GSVector8i*)dst)[5] = ((GSVector8i*)dst)[5].blend(v1, mask); ((GSVector8i*)dst)[6] = ((GSVector8i*)dst)[6].blend(v2, mask); ((GSVector8i*)dst)[7] = ((GSVector8i*)dst)[7].blend(v3, mask); #elif _M_SSE >= 0x301 GSVector4i v0, v1, v2, v3; GSVector4i mask = GSVector4i(0x0f000000); GSVector4i mask0 = m_uw8hmask0; GSVector4i mask1 = m_uw8hmask1; GSVector4i mask2 = m_uw8hmask2; GSVector4i mask3 = m_uw8hmask3; for (int i = 0; i < 2; i++, src += srcpitch * 4) { GSVector4i v(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 3]); v4 = v.upl8(v >> 4); v5 = v.uph8(v >> 4); v0 = v4.shuffle8(mask0); v1 = v4.shuffle8(mask1); v2 = v4.shuffle8(mask2); v3 = v4.shuffle8(mask3); ((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask); ((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask); ((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask); ((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask); v0 = v5.shuffle8(mask0); v1 = v5.shuffle8(mask1); v2 = v5.shuffle8(mask2); v3 = v5.shuffle8(mask3); ((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask); ((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask); ((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask); ((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask); } #endif } __forceinline static void UnpackAndWriteBlock4HH(const uint8* RESTRICT src, int srcpitch, uint8* RESTRICT dst) { GSVector4i v4, v5, v6; #if _M_SSE >= 0x501 GSVector8i v0, v1, v2, v3; GSVector8i mask = GSVector8i::xf0000000(); v6 = GSVector4i(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]); v4 = (v6 << 4).upl8(v6); v5 = (v6 << 4).uph8(v6); v2 = GSVector8i::cast(v4.upl16(v5)); v3 = GSVector8i::cast(v4.uph16(v5)); v0 = v2.u8to32c() << 24; v1 = v2.bbbb().u8to32c() << 24; v2 = v3.u8to32c() << 24; v3 = v3.bbbb().u8to32c() << 24; ((GSVector8i*)dst)[0] = ((GSVector8i*)dst)[0].blend(v0, mask); ((GSVector8i*)dst)[1] = ((GSVector8i*)dst)[1].blend(v1, mask); ((GSVector8i*)dst)[2] = ((GSVector8i*)dst)[2].blend(v2, mask); ((GSVector8i*)dst)[3] = ((GSVector8i*)dst)[3].blend(v3, mask); src += srcpitch * 4; v6 = GSVector4i(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]); v4 = (v6 << 4).upl8(v6); v5 = (v6 << 4).uph8(v6); v2 = GSVector8i::cast(v4.upl16(v5)); v3 = GSVector8i::cast(v4.uph16(v5)); v0 = v2.u8to32c() << 24; v1 = v2.bbbb().u8to32c() << 24; v2 = v3.u8to32c() << 24; v3 = v3.bbbb().u8to32c() << 24; ((GSVector8i*)dst)[4] = ((GSVector8i*)dst)[4].blend(v0, mask); ((GSVector8i*)dst)[5] = ((GSVector8i*)dst)[5].blend(v1, mask); ((GSVector8i*)dst)[6] = ((GSVector8i*)dst)[6].blend(v2, mask); ((GSVector8i*)dst)[7] = ((GSVector8i*)dst)[7].blend(v3, mask); #elif _M_SSE >= 0x301 GSVector4i v0, v1, v2, v3; GSVector4i mask = GSVector4i::xf0000000(); GSVector4i mask0 = m_uw8hmask0; GSVector4i mask1 = m_uw8hmask1; GSVector4i mask2 = m_uw8hmask2; GSVector4i mask3 = m_uw8hmask3; for (int i = 0; i < 2; i++, src += srcpitch * 4) { GSVector4i v(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 3]); v4 = (v << 4).upl8(v); v5 = (v << 4).uph8(v); v0 = v4.shuffle8(mask0); v1 = v4.shuffle8(mask1); v2 = v4.shuffle8(mask2); v3 = v4.shuffle8(mask3); ((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask); ((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask); ((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask); ((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask); v0 = v5.shuffle8(mask0); v1 = v5.shuffle8(mask1); v2 = v5.shuffle8(mask2); v3 = v5.shuffle8(mask3); ((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask); ((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask); ((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask); ((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask); } #endif } template __forceinline static void ReadAndExpandBlock24(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) { #if _M_SSE >= 0x501 const GSVector8i* s = (const GSVector8i*)src; GSVector8i TA0(TEXA.TA0 << 24); GSVector8i mask = GSVector8i::x00ffffff(); GSVector8i v0, v1, v2, v3; v0 = s[0] & mask; v1 = s[1] & mask; v2 = s[2] & mask; v3 = s[3] & mask; GSVector8i::sw128(v0, v1); GSVector8i::sw64(v0, v1); GSVector8i::sw128(v2, v3); GSVector8i::sw64(v2, v3); *(GSVector8i*)&dst[dstpitch * 0] = Expand24to32(v0, TA0); *(GSVector8i*)&dst[dstpitch * 1] = Expand24to32(v1, TA0); *(GSVector8i*)&dst[dstpitch * 2] = Expand24to32(v2, TA0); *(GSVector8i*)&dst[dstpitch * 3] = Expand24to32(v3, TA0); v0 = s[4] & mask; v1 = s[5] & mask; v2 = s[6] & mask; v3 = s[7] & mask; GSVector8i::sw128(v0, v1); GSVector8i::sw64(v0, v1); GSVector8i::sw128(v2, v3); GSVector8i::sw64(v2, v3); dst += dstpitch * 4; *(GSVector8i*)&dst[dstpitch * 0] = Expand24to32(v0, TA0); *(GSVector8i*)&dst[dstpitch * 1] = Expand24to32(v1, TA0); *(GSVector8i*)&dst[dstpitch * 2] = Expand24to32(v2, TA0); *(GSVector8i*)&dst[dstpitch * 3] = Expand24to32(v3, TA0); #else const GSVector4i* s = (const GSVector4i*)src; GSVector4i TA0(TEXA.TA0 << 24); GSVector4i mask = GSVector4i::x00ffffff(); for (int i = 0; i < 4; i++, dst += dstpitch * 2) { GSVector4i v0 = s[i * 4 + 0]; GSVector4i v1 = s[i * 4 + 1]; GSVector4i v2 = s[i * 4 + 2]; GSVector4i v3 = s[i * 4 + 3]; GSVector4i::sw64(v0, v1, v2, v3); v0 &= mask; v1 &= mask; v2 &= mask; v3 &= mask; GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; d0[0] = Expand24to32(v0, TA0); d0[1] = Expand24to32(v1, TA0); d1[0] = Expand24to32(v2, TA0); d1[1] = Expand24to32(v3, TA0); } #endif } template __forceinline static void ReadAndExpandBlock16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) { #if _M_SSE >= 0x501 const GSVector8i* s = (const GSVector8i*)src; GSVector8i TA0(TEXA.TA0 << 24); GSVector8i TA1(TEXA.TA1 << 24); for (int i = 0; i < 4; i++, dst += dstpitch * 2) { GSVector8i v0 = s[i * 2 + 0].shuffle8(m_r16mask); GSVector8i v1 = s[i * 2 + 1].shuffle8(m_r16mask); GSVector8i::sw128(v0, v1); GSVector8i::sw32(v0, v1); GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0]; GSVector8i* d1 = (GSVector8i*)&dst[dstpitch * 1]; d0[0] = Expand16to32(v0.upl16(v0), TA0, TA1); d0[1] = Expand16to32(v0.uph16(v0), TA0, TA1); d1[0] = Expand16to32(v1.upl16(v1), TA0, TA1); d1[1] = Expand16to32(v1.uph16(v1), TA0, TA1); } #else alignas(32) uint16 block[16 * 8]; ReadBlock16(src, (uint8*)block, sizeof(block) / 8); ExpandBlock16(block, dst, dstpitch, TEXA); #endif } __forceinline static void ReadAndExpandBlock8_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { //printf("ReadAndExpandBlock8_32\n"); const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; GSVector4i mask = m_r8mask; for (int i = 0; i < 2; i++) { v0 = s[i * 8 + 0].shuffle8(mask); v1 = s[i * 8 + 1].shuffle8(mask); v2 = s[i * 8 + 2].shuffle8(mask); v3 = s[i * 8 + 3].shuffle8(mask); GSVector4i::sw16(v0, v1, v2, v3); GSVector4i::sw32(v0, v1, v3, v2); v0.gather32_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v3.gather32_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v1.gather32_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v2.gather32_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v2 = s[i * 8 + 4].shuffle8(mask); v3 = s[i * 8 + 5].shuffle8(mask); v0 = s[i * 8 + 6].shuffle8(mask); v1 = s[i * 8 + 7].shuffle8(mask); GSVector4i::sw16(v0, v1, v2, v3); GSVector4i::sw32(v0, v1, v3, v2); v0.gather32_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v3.gather32_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v1.gather32_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v2.gather32_8<>(pal, (GSVector4i*)dst); dst += dstpitch; } } // TODO: ReadAndExpandBlock8_16 __forceinline static void ReadAndExpandBlock4_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint64* RESTRICT pal) { //printf("ReadAndExpandBlock4_32\n"); const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; GSVector4i mask = m_r4mask; for (int i = 0; i < 2; i++) { v0 = s[i * 8 + 0].xzyw(); v1 = s[i * 8 + 1].xzyw(); v2 = s[i * 8 + 2].xzyw(); v3 = s[i * 8 + 3].xzyw(); GSVector4i::sw64(v0, v1, v2, v3); GSVector4i::sw4(v0, v2, v1, v3); GSVector4i::sw8(v0, v1, v2, v3); v0 = v0.shuffle8(mask); v1 = v1.shuffle8(mask); v2 = v2.shuffle8(mask); v3 = v3.shuffle8(mask); GSVector4i::sw16rh(v0, v1, v2, v3); v0.gather64_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v1.gather64_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v2.gather64_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v3.gather64_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v0 = s[i * 8 + 4].xzyw(); v1 = s[i * 8 + 5].xzyw(); v2 = s[i * 8 + 6].xzyw(); v3 = s[i * 8 + 7].xzyw(); GSVector4i::sw64(v0, v1, v2, v3); GSVector4i::sw4(v0, v2, v1, v3); GSVector4i::sw8(v0, v1, v2, v3); v0 = v0.shuffle8(mask); v1 = v1.shuffle8(mask); v2 = v2.shuffle8(mask); v3 = v3.shuffle8(mask); GSVector4i::sw16rl(v0, v1, v2, v3); v0.gather64_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v1.gather64_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v2.gather64_8<>(pal, (GSVector4i*)dst); dst += dstpitch; v3.gather64_8<>(pal, (GSVector4i*)dst); dst += dstpitch; } } // TODO: ReadAndExpandBlock4_16 __forceinline static void ReadAndExpandBlock8H_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { //printf("ReadAndExpandBlock8H_32\n"); const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; for (int i = 0; i < 4; i++) { v0 = s[i * 4 + 0]; v1 = s[i * 4 + 1]; v2 = s[i * 4 + 2]; v3 = s[i * 4 + 3]; GSVector4i::sw64(v0, v1, v2, v3); (v0 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[0]); (v1 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[16]); dst += dstpitch; (v2 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[0]); (v3 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[16]); dst += dstpitch; } } // TODO: ReadAndExpandBlock8H_16 __forceinline static void ReadAndExpandBlock4HL_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { //printf("ReadAndExpandBlock4HL_32\n"); const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; for (int i = 0; i < 4; i++) { v0 = s[i * 4 + 0]; v1 = s[i * 4 + 1]; v2 = s[i * 4 + 2]; v3 = s[i * 4 + 3]; GSVector4i::sw64(v0, v1, v2, v3); ((v0 >> 24) & 0xf).gather32_32<>(pal, (GSVector4i*)&dst[0]); ((v1 >> 24) & 0xf).gather32_32<>(pal, (GSVector4i*)&dst[16]); dst += dstpitch; ((v2 >> 24) & 0xf).gather32_32<>(pal, (GSVector4i*)&dst[0]); ((v3 >> 24) & 0xf).gather32_32<>(pal, (GSVector4i*)&dst[16]); dst += dstpitch; } } // TODO: ReadAndExpandBlock4HL_16 __forceinline static void ReadAndExpandBlock4HH_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { //printf("ReadAndExpandBlock4HH_32\n"); const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; for (int i = 0; i < 4; i++) { v0 = s[i * 4 + 0]; v1 = s[i * 4 + 1]; v2 = s[i * 4 + 2]; v3 = s[i * 4 + 3]; GSVector4i::sw64(v0, v1, v2, v3); (v0 >> 28).gather32_32<>(pal, (GSVector4i*)&dst[0]); (v1 >> 28).gather32_32<>(pal, (GSVector4i*)&dst[16]); dst += dstpitch; (v2 >> 28).gather32_32<>(pal, (GSVector4i*)&dst[0]); (v3 >> 28).gather32_32<>(pal, (GSVector4i*)&dst[16]); dst += dstpitch; } } // TODO: ReadAndExpandBlock4HH_16 };