From e2169bc1dac1ade8212df0a3767de311c22b5e21 Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Wed, 7 Apr 2021 17:35:34 -0500 Subject: [PATCH] GS: Consolidate repeated BlockH code --- pcsx2/GS/GSBlock.h | 661 +++++++++++++----------------------------- pcsx2/GS/GSVector4i.h | 25 ++ pcsx2/GS/GSVector8i.h | 25 ++ 3 files changed, 256 insertions(+), 455 deletions(-) diff --git a/pcsx2/GS/GSBlock.h b/pcsx2/GS/GSBlock.h index 03e2f8ef6c..d8b2198b36 100644 --- a/pcsx2/GS/GSBlock.h +++ b/pcsx2/GS/GSBlock.h @@ -855,47 +855,40 @@ public: #endif } - __forceinline static void ReadBlock8HP(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch) + template + __forceinline static void ReadBlockHP(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch) { #if _M_SSE >= 0x501 - u8* RESTRICT d0 = &dst[dstpitch * 0]; - u8* RESTRICT d1 = &dst[dstpitch * 4]; - const GSVector8i* s = (const GSVector8i*)src; GSVector8i v0, v1, v2, v3; GSVector4i v4, v5; - v0 = s[0].acbd(); - v1 = s[1].acbd(); - v2 = s[2].acbd(); - v3 = s[3].acbd(); + GSVector8i maskvec = GSVector8i(mask); - v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24)); + for (int i = 0; i < 2; i++) + { + v0 = s[i * 4 + 0].acbd(); + v1 = s[i * 4 + 1].acbd(); + v2 = s[i * 4 + 2].acbd(); + v3 = s[i * 4 + 3].acbd(); - v4 = v0.extract<0>(); - v5 = v0.extract<1>(); + v0 = (v0 >> shift).ps32(v1 >> shift).pu16((v2 >> shift).ps32(v3 >> shift)); - GSVector4i::storel(&d0[dstpitch * 0], v4); - GSVector4i::storel(&d0[dstpitch * 1], v5); - GSVector4i::storeh(&d0[dstpitch * 2], v4); - GSVector4i::storeh(&d0[dstpitch * 3], v5); + if (mask != 0xffffffff) + v0 = v0 & maskvec; - v0 = s[4].acbd(); - v1 = s[5].acbd(); - v2 = s[6].acbd(); - v3 = s[7].acbd(); + v4 = v0.extract<0>(); + v5 = v0.extract<1>(); - v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24)); + GSVector4i::storel(&dst[dstpitch * 0], v4); + GSVector4i::storel(&dst[dstpitch * 1], v5); + GSVector4i::storeh(&dst[dstpitch * 2], v4); + GSVector4i::storeh(&dst[dstpitch * 3], v5); - v4 = v0.extract<0>(); - v5 = v0.extract<1>(); - - GSVector4i::storel(&d1[dstpitch * 0], v4); - GSVector4i::storel(&d1[dstpitch * 1], v5); - GSVector4i::storeh(&d1[dstpitch * 2], v4); - GSVector4i::storeh(&d1[dstpitch * 3], v5); + dst += dstpitch * 4; + } #else @@ -903,6 +896,8 @@ public: GSVector4i v0, v1, v2, v3; + GSVector4i maskvec(mask); + for (int i = 0; i < 4; i++) { v0 = s[i * 4 + 0]; @@ -912,7 +907,10 @@ public: GSVector4i::sw64(v0, v1, v2, v3); - v0 = ((v0 >> 24).ps32(v1 >> 24)).pu16((v2 >> 24).ps32(v3 >> 24)); + v0 = ((v0 >> shift).ps32(v1 >> shift)).pu16((v2 >> shift).ps32(v3 >> shift)); + + if (mask != 0xffffffff) + v0 = v0 & maskvec; GSVector4i::storel(dst, v0); @@ -926,149 +924,19 @@ public: #endif } + __forceinline static void ReadBlock8HP(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch) + { + ReadBlockHP<24, 0xffffffff>(src, dst, dstpitch); + } + __forceinline static void ReadBlock4HLP(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch) { -#if _M_SSE >= 0x501 - - u8* RESTRICT d0 = &dst[dstpitch * 0]; - u8* RESTRICT d1 = &dst[dstpitch * 4]; - - const GSVector8i* s = (const GSVector8i*)src; - - GSVector8i v0, v1, v2, v3; - GSVector4i v4, v5; - GSVector8i mask(0x0f0f0f0f); - - v0 = s[0].acbd(); - v1 = s[1].acbd(); - v2 = s[2].acbd(); - v3 = s[3].acbd(); - - v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24)) & mask; - - v4 = v0.extract<0>(); - v5 = v0.extract<1>(); - - GSVector4i::storel(&d0[dstpitch * 0], v4); - GSVector4i::storel(&d0[dstpitch * 1], v5); - GSVector4i::storeh(&d0[dstpitch * 2], v4); - GSVector4i::storeh(&d0[dstpitch * 3], v5); - - v0 = s[4].acbd(); - v1 = s[5].acbd(); - v2 = s[6].acbd(); - v3 = s[7].acbd(); - - v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24)) & mask; - - v4 = v0.extract<0>(); - v5 = v0.extract<1>(); - - GSVector4i::storel(&d1[dstpitch * 0], v4); - GSVector4i::storel(&d1[dstpitch * 1], v5); - GSVector4i::storeh(&d1[dstpitch * 2], v4); - GSVector4i::storeh(&d1[dstpitch * 3], v5); - -#else - - const GSVector4i* s = (const GSVector4i*)src; - - GSVector4i v0, v1, v2, v3; - - GSVector4i mask(0x0f0f0f0f); - - for (int i = 0; i < 4; i++) - { - v0 = s[i * 4 + 0]; - v1 = s[i * 4 + 1]; - v2 = s[i * 4 + 2]; - v3 = s[i * 4 + 3]; - - GSVector4i::sw64(v0, v1, v2, v3); - - v0 = ((v0 >> 24).ps32(v1 >> 24)).pu16((v2 >> 24).ps32(v3 >> 24)) & mask; - - GSVector4i::storel(dst, v0); - - dst += dstpitch; - - GSVector4i::storeh(dst, v0); - - dst += dstpitch; - } - -#endif + ReadBlockHP<24, 0x0f0f0f0f>(src, dst, dstpitch); } __forceinline static void ReadBlock4HHP(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch) { -#if _M_SSE >= 0x501 - - u8* RESTRICT d0 = &dst[dstpitch * 0]; - u8* RESTRICT d1 = &dst[dstpitch * 4]; - - const GSVector8i* s = (const GSVector8i*)src; - - GSVector8i v0, v1, v2, v3; - GSVector4i v4, v5; - - v0 = s[0].acbd(); - v1 = s[1].acbd(); - v2 = s[2].acbd(); - v3 = s[3].acbd(); - - v0 = (v0 >> 28).ps32(v1 >> 28).pu16((v2 >> 28).ps32(v3 >> 28)); - - v4 = v0.extract<0>(); - v5 = v0.extract<1>(); - - GSVector4i::storel(&d0[dstpitch * 0], v4); - GSVector4i::storel(&d0[dstpitch * 1], v5); - GSVector4i::storeh(&d0[dstpitch * 2], v4); - GSVector4i::storeh(&d0[dstpitch * 3], v5); - - v0 = s[4].acbd(); - v1 = s[5].acbd(); - v2 = s[6].acbd(); - v3 = s[7].acbd(); - - v0 = (v0 >> 28).ps32(v1 >> 28).pu16((v2 >> 28).ps32(v3 >> 28)); - - v4 = v0.extract<0>(); - v5 = v0.extract<1>(); - - GSVector4i::storel(&d1[dstpitch * 0], v4); - GSVector4i::storel(&d1[dstpitch * 1], v5); - GSVector4i::storeh(&d1[dstpitch * 2], v4); - GSVector4i::storeh(&d1[dstpitch * 3], v5); - -#else - - const GSVector4i* s = (const GSVector4i*)src; - - GSVector4i v0, v1, v2, v3; - - for (int i = 0; i < 4; i++) - { - v0 = s[i * 4 + 0]; - v1 = s[i * 4 + 1]; - v2 = s[i * 4 + 2]; - v3 = s[i * 4 + 3]; - - GSVector4i::sw64(v0, v1, v2, v3); - - v0 = ((v0 >> 28).ps32(v1 >> 28)).pu16((v2 >> 28).ps32(v3 >> 28)); - - GSVector4i::storel(dst, v0); - - dst += dstpitch; - - GSVector4i::storeh(dst, v0); - - dst += dstpitch; - } - -#endif + ReadBlockHP<28, 0xffffffff>(src, dst, dstpitch); } template @@ -1232,77 +1100,74 @@ public: } } - __forceinline static void ExpandBlock8H_32(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) + template + __forceinline static void ExpandBlockH_32(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) { for (int j = 0; j < 8; j++, dst += dstpitch) { const GSVector4i* s = (const GSVector4i*)src; - ((GSVector4i*)dst)[0] = (s[j * 2 + 0] >> 24).gather32_32<>(pal); - ((GSVector4i*)dst)[1] = (s[j * 2 + 1] >> 24).gather32_32<>(pal); + GSVector4i v0 = s[j * 2 + 0] >> shift; + GSVector4i v1 = s[j * 2 + 1] >> shift; + if (mask != 0xff) + { + v0 = v0 & mask; + v1 = v1 & mask; + } + ((GSVector4i*)dst)[0] = v0.gather32_32<>(pal); + ((GSVector4i*)dst)[1] = v1.gather32_32<>(pal); } } + template + __forceinline static void ExpandBlockH_16(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) + { + for (int j = 0; j < 8; j++, dst += dstpitch) + { + const GSVector4i* s = (const GSVector4i*)src; + + GSVector4i v0 = s[j * 2 + 0] >> shift; + GSVector4i v1 = s[j * 2 + 1] >> shift; + if (mask != 0xff) + { + v0 = v0 & mask; + v1 = v1 & mask; + } + v0 = v0.gather32_32<>(pal); + v1 = v1.gather32_32<>(pal); + + ((GSVector4i*)dst)[0] = v0.pu32(v1); + } + } + + __forceinline static void ExpandBlock8H_32(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) + { + ExpandBlockH_32<24, 0xff>(src, dst, dstpitch, pal); + } + __forceinline static void ExpandBlock8H_16(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) { - for (int j = 0; j < 8; j++, dst += dstpitch) - { - - const GSVector4i* s = (const GSVector4i*)src; - - GSVector4i v0 = (s[j * 2 + 0] >> 24).gather32_32<>(pal); - GSVector4i v1 = (s[j * 2 + 1] >> 24).gather32_32<>(pal); - - ((GSVector4i*)dst)[0] = v0.pu32(v1); - } + ExpandBlockH_16<24, 0xff>(src, dst, dstpitch, pal); } __forceinline static void ExpandBlock4HL_32(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) { - for (int j = 0; j < 8; j++, dst += dstpitch) - { - const GSVector4i* s = (const GSVector4i*)src; - - ((GSVector4i*)dst)[0] = ((s[j * 2 + 0] >> 24) & 0xf).gather32_32<>(pal); - ((GSVector4i*)dst)[1] = ((s[j * 2 + 1] >> 24) & 0xf).gather32_32<>(pal); - } + ExpandBlockH_32<24, 0x0f>(src, dst, dstpitch, pal); } __forceinline static void ExpandBlock4HL_16(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) { - for (int j = 0; j < 8; j++, dst += dstpitch) - { - const GSVector4i* s = (const GSVector4i*)src; - - GSVector4i v0 = ((s[j * 2 + 0] >> 24) & 0xf).gather32_32<>(pal); - GSVector4i v1 = ((s[j * 2 + 1] >> 24) & 0xf).gather32_32<>(pal); - - ((GSVector4i*)dst)[0] = v0.pu32(v1); - } + ExpandBlockH_16<24, 0x0f>(src, dst, dstpitch, pal); } __forceinline static void ExpandBlock4HH_32(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) { - for (int j = 0; j < 8; j++, dst += dstpitch) - { - const GSVector4i* s = (const GSVector4i*)src; - - ((GSVector4i*)dst)[0] = (s[j * 2 + 0] >> 28).gather32_32<>(pal); - ((GSVector4i*)dst)[1] = (s[j * 2 + 1] >> 28).gather32_32<>(pal); - } + ExpandBlockH_32<28, 0xff>(src, dst, dstpitch, pal); } __forceinline static void ExpandBlock4HH_16(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) { - for (int j = 0; j < 8; j++, dst += dstpitch) - { - const GSVector4i* s = (const GSVector4i*)src; - - GSVector4i v0 = (s[j * 2 + 0] >> 28).gather32_32<>(pal); - GSVector4i v1 = (s[j * 2 + 1] >> 28).gather32_32<>(pal); - - ((GSVector4i*)dst)[0] = v0.pu32(v1); - } + ExpandBlockH_16<28, 0xff>(src, dst, dstpitch, pal); } __forceinline static void UnpackAndWriteBlock24(const u8* RESTRICT src, int srcpitch, u8* RESTRICT dst) @@ -1392,63 +1257,134 @@ public: #endif } - __forceinline static void UnpackAndWriteBlock8H(const u8* RESTRICT src, int srcpitch, u8* RESTRICT dst) + template + __forceinline static void UnpackAndWriteBlockH(const u8* RESTRICT src, int srcpitch, u8* RESTRICT dst) { + GSVector4i v4, v5, v6, v7; #if _M_SSE >= 0x501 - GSVector4i v4, v5, v6, v7; - GSVector8i v0, v1, v2, v3; - GSVector8i mask = GSVector8i::xff000000(); + GSVector8i* d = reinterpret_cast(dst); - for (int i = 0; i < 2; i++, src += srcpitch * 4) + for (int i = 0; i < 2; i++, src += srcpitch * 4, d += 4) { - v4 = GSVector4i::loadl(&src[srcpitch * 0]); - v5 = GSVector4i::loadl(&src[srcpitch * 1]); - v6 = GSVector4i::loadl(&src[srcpitch * 2]); - v7 = GSVector4i::loadl(&src[srcpitch * 3]); + if (mask == 0xff000000) + { + v4 = GSVector4i::loadl(&src[srcpitch * 0]); + v5 = GSVector4i::loadl(&src[srcpitch * 1]); + v6 = GSVector4i::loadl(&src[srcpitch * 2]); + v7 = GSVector4i::loadl(&src[srcpitch * 3]); - v4 = v4.upl16(v5); - v5 = v6.upl16(v7); + v4 = v4.upl16(v5); + v5 = v6.upl16(v7); + } + else + { + v4 = GSVector4i::load(*(u32*)&src[srcpitch * 0]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 2])); + v5 = GSVector4i::load(*(u32*)&src[srcpitch * 1]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 3])); - v0 = GSVector8i::u8to32(v4) << 24; - v1 = GSVector8i::u8to32(v4.zwzw()) << 24; - v2 = GSVector8i::u8to32(v5) << 24; - v3 = GSVector8i::u8to32(v5.zwzw()) << 24; + if (mask == 0x0f000000) + { + v6 = v4.upl8(v4 >> 4); + v7 = v5.upl8(v5 >> 4); + } + else if (mask == 0xf0000000) + { + v6 = (v4 << 4).upl8(v4); + v7 = (v5 << 4).upl8(v5); + } + else + { + ASSERT(0); + } - ((GSVector8i*)dst)[i * 4 + 0] = ((GSVector8i*)dst)[i * 4 + 0].blend(v0, mask); - ((GSVector8i*)dst)[i * 4 + 1] = ((GSVector8i*)dst)[i * 4 + 1].blend(v1, mask); - ((GSVector8i*)dst)[i * 4 + 2] = ((GSVector8i*)dst)[i * 4 + 2].blend(v2, mask); - ((GSVector8i*)dst)[i * 4 + 3] = ((GSVector8i*)dst)[i * 4 + 3].blend(v3, mask); + v4 = v6.upl16(v7); + v5 = v6.uph16(v7); + } + + GSVector8i v0 = GSVector8i::u8to32(v4) << 24; + GSVector8i v1 = GSVector8i::u8to32(v4.zwzw()) << 24; + GSVector8i v2 = GSVector8i::u8to32(v5) << 24; + GSVector8i v3 = GSVector8i::u8to32(v5.zwzw()) << 24; + + d[0] = d[0].smartblend(v0); + d[1] = d[1].smartblend(v1); + d[2] = d[2].smartblend(v2); + d[3] = d[3].smartblend(v3); } #else - GSVector4i v0, v1, v2, v3, v4; - GSVector4i mask = GSVector4i::xff000000(); + GSVector4i v0, v1, v2, v3; + GSVector4i mask0 = m_uw8hmask0; GSVector4i mask1 = m_uw8hmask1; GSVector4i mask2 = m_uw8hmask2; GSVector4i mask3 = m_uw8hmask3; - for (int i = 0; i < 4; i++, src += srcpitch * 2) + GSVector4i* d = reinterpret_cast(dst); + + for (int i = 0; i < 2; i++, src += srcpitch * 4, d += 8) { - v4 = GSVector4i::load(src, src + srcpitch); + if (mask == 0xff000000) + { + v4 = GSVector4i::load(src + srcpitch * 0, src + srcpitch * 1); + v5 = GSVector4i::load(src + srcpitch * 2, src + srcpitch * 3); + } + else + { + v6 = GSVector4i::load(*(u32*)&src[srcpitch * 0]); + v7 = GSVector4i::load(*(u32*)&src[srcpitch * 1]); + v4 = v6.upl32(v7); + v6 = GSVector4i::load(*(u32*)&src[srcpitch * 2]); + v7 = GSVector4i::load(*(u32*)&src[srcpitch * 3]); + v5 = v6.upl32(v7); + + if (mask == 0x0f000000) + { + v4 = v4.upl8(v4 >> 4); + v5 = v5.upl8(v5 >> 4); + } + else if (mask == 0xf0000000) + { + v4 = (v4 << 4).upl8(v4); + v5 = (v5 << 4).upl8(v5); + } + else + { + ASSERT(0); + } + } v0 = v4.shuffle8(mask0); v1 = v4.shuffle8(mask1); v2 = v4.shuffle8(mask2); v3 = v4.shuffle8(mask3); - ((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend8(v0, mask); - ((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend8(v1, mask); - ((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend8(v2, mask); - ((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend8(v3, mask); + d[0] = d[0].smartblend(v0); + d[1] = d[1].smartblend(v1); + d[2] = d[2].smartblend(v2); + d[3] = d[3].smartblend(v3); + + v0 = v5.shuffle8(mask0); + v1 = v5.shuffle8(mask1); + v2 = v5.shuffle8(mask2); + v3 = v5.shuffle8(mask3); + + d[4] = d[4].smartblend(v0); + d[5] = d[5].smartblend(v1); + d[6] = d[6].smartblend(v2); + d[7] = d[7].smartblend(v3); } #endif } + __forceinline static void UnpackAndWriteBlock8H(const u8* RESTRICT src, int srcpitch, u8* RESTRICT dst) + { + UnpackAndWriteBlockH<0xff000000>(src, srcpitch, dst); + } + __forceinline static void UnpackAndWriteBlock4HL(const u8* RESTRICT src, int srcpitch, u8* RESTRICT dst) { //printf("4HL\n"); @@ -1461,144 +1397,12 @@ public: s[i] = (columnTable32[j][i * 2] & 0x0f) | (columnTable32[j][i * 2 + 1] << 4); } -#if _M_SSE >= 0x501 - - GSVector4i v4, v5, v6, v7; - GSVector8i v0, v1, v2, v3; - GSVector8i mask(0x0f000000); - - for (int i = 0; i < 2; i++, src += srcpitch * 4) - { - v4 = GSVector4i::load(*(u32*)&src[srcpitch * 0]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 2])); - v5 = GSVector4i::load(*(u32*)&src[srcpitch * 1]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 3])); - - v6 = v4.upl8(v4 >> 4); - v7 = v5.upl8(v5 >> 4); - - v4 = v6.upl16(v7); - v5 = v6.uph16(v7); - - v0 = GSVector8i::u8to32(v4) << 24; - v1 = GSVector8i::u8to32(v4.zwzw()) << 24; - v2 = GSVector8i::u8to32(v5) << 24; - v3 = GSVector8i::u8to32(v5.zwzw()) << 24; - - ((GSVector8i*)dst)[i * 4 + 0] = ((GSVector8i*)dst)[i * 4 + 0].blend(v0, mask); - ((GSVector8i*)dst)[i * 4 + 1] = ((GSVector8i*)dst)[i * 4 + 1].blend(v1, mask); - ((GSVector8i*)dst)[i * 4 + 2] = ((GSVector8i*)dst)[i * 4 + 2].blend(v2, mask); - ((GSVector8i*)dst)[i * 4 + 3] = ((GSVector8i*)dst)[i * 4 + 3].blend(v3, mask); - } - -#else - - GSVector4i v0, v1, v2, v3, v4, v5; - GSVector4i mask = GSVector4i(0x0f000000); - GSVector4i mask0 = m_uw8hmask0; - GSVector4i mask1 = m_uw8hmask1; - GSVector4i mask2 = m_uw8hmask2; - GSVector4i mask3 = m_uw8hmask3; - - for (int i = 0; i < 2; i++, src += srcpitch * 4) - { - v4 = GSVector4i::load(*(u32*)&src[srcpitch * 0]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 1])); - v5 = GSVector4i::load(*(u32*)&src[srcpitch * 2]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 3])); - - v4 = v4.upl8(v4 >> 4); - v5 = v5.upl8(v5 >> 4); - - v0 = v4.shuffle8(mask0); - v1 = v4.shuffle8(mask1); - v2 = v4.shuffle8(mask2); - v3 = v4.shuffle8(mask3); - - ((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask); - ((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask); - ((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask); - ((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask); - - v0 = v5.shuffle8(mask0); - v1 = v5.shuffle8(mask1); - v2 = v5.shuffle8(mask2); - v3 = v5.shuffle8(mask3); - - ((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask); - ((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask); - ((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask); - ((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask); - } - -#endif + UnpackAndWriteBlockH<0x0f000000>(src, srcpitch, dst); } __forceinline static void UnpackAndWriteBlock4HH(const u8* RESTRICT src, int srcpitch, u8* RESTRICT dst) { -#if _M_SSE >= 0x501 - - GSVector4i v4, v5, v6, v7; - GSVector8i v0, v1, v2, v3; - GSVector8i mask = GSVector8i::xf0000000(); - - for (int i = 0; i < 2; i++, src += srcpitch * 4) - { - v4 = GSVector4i::load(*(u32*)&src[srcpitch * 0]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 2])); - v5 = GSVector4i::load(*(u32*)&src[srcpitch * 1]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 3])); - - v6 = (v4 << 4).upl8(v4); - v7 = (v5 << 4).upl8(v5); - - v4 = v6.upl16(v7); - v5 = v6.uph16(v7); - - v0 = GSVector8i::u8to32(v4) << 24; - v1 = GSVector8i::u8to32(v4.zwzw()) << 24; - v2 = GSVector8i::u8to32(v5) << 24; - v3 = GSVector8i::u8to32(v5.zwzw()) << 24; - - ((GSVector8i*)dst)[i * 4 + 0] = ((GSVector8i*)dst)[i * 4 + 0].blend(v0, mask); - ((GSVector8i*)dst)[i * 4 + 1] = ((GSVector8i*)dst)[i * 4 + 1].blend(v1, mask); - ((GSVector8i*)dst)[i * 4 + 2] = ((GSVector8i*)dst)[i * 4 + 2].blend(v2, mask); - ((GSVector8i*)dst)[i * 4 + 3] = ((GSVector8i*)dst)[i * 4 + 3].blend(v3, mask); - } - -#else - - GSVector4i v0, v1, v2, v3, v4, v5; - GSVector4i mask = GSVector4i::xf0000000(); - GSVector4i mask0 = m_uw8hmask0; - GSVector4i mask1 = m_uw8hmask1; - GSVector4i mask2 = m_uw8hmask2; - GSVector4i mask3 = m_uw8hmask3; - - for (int i = 0; i < 2; i++, src += srcpitch * 4) - { - v4 = GSVector4i::load(*(u32*)&src[srcpitch * 0]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 1])); - v5 = GSVector4i::load(*(u32*)&src[srcpitch * 2]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 3])); - - v4 = (v4 << 4).upl8(v4); - v5 = (v5 << 4).upl8(v5); - - v0 = v4.shuffle8(mask0); - v1 = v4.shuffle8(mask1); - v2 = v4.shuffle8(mask2); - v3 = v4.shuffle8(mask3); - - ((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask); - ((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask); - ((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask); - ((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask); - - v0 = v5.shuffle8(mask0); - v1 = v5.shuffle8(mask1); - v2 = v5.shuffle8(mask2); - v3 = v5.shuffle8(mask3); - - ((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask); - ((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask); - ((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask); - ((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask); - } - -#endif + UnpackAndWriteBlockH<0xf0000000>(src, srcpitch, dst); } template @@ -2078,102 +1882,30 @@ public: // TODO: ReadAndExpandBlock8H_16 - __forceinline static void ReadAndExpandBlock4HL_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) + template + __forceinline static void ReadAndExpandBlock4H_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) { - //printf("ReadAndExpandBlock4HL_32\n"); - #if _M_SSE >= 0x501 const GSVector8i* s = (const GSVector8i*)src; GSVector8i p0, p1, p2, p3; LoadPalVecs(pal, p0, p1, p2, p3); - GSVector8i mask(0x0f0f0f0f); + GSVector8i maskvec(mask); GSVector8i v0, v1, v2, v3; - for (int i = 0; i < 2; i++) + for(int i = 0; i < 2; i++) { GSVector8i* d0 = reinterpret_cast(dst); GSVector8i* d1 = reinterpret_cast(dst + dstpitch); GSVector8i* d2 = reinterpret_cast(dst + dstpitch * 2); GSVector8i* d3 = reinterpret_cast(dst + dstpitch * 3); - v0 = s[i * 4 + 0] >> 24; - v1 = s[i * 4 + 1] >> 24; - v2 = s[i * 4 + 2] >> 24; - v3 = s[i * 4 + 3] >> 24; - - GSVector8i::sw128(v0, v1); - GSVector8i::sw64(v0, v1); - GSVector8i::sw128(v2, v3); - GSVector8i::sw64(v2, v3); - - GSVector8i all = v0.ps32(v1).pu16(v2.ps32(v3)) & mask; - - ReadClut4(p0, p1, p2, p3, all, *d0, *d1, *d2, *d3); - - dst += dstpitch * 4; - } - -#else - - const GSVector4i* s = (const GSVector4i*)src; - - GSVector4i p0, p1, p2, p3; - LoadPalVecs(pal, p0, p1, p2, p3); - GSVector4i mask(0x0f0f0f0f); - - GSVector4i v0, v1, v2, v3; - - for (int i = 0; i < 4; i++) - { - GSVector4i* d0 = reinterpret_cast(dst); - GSVector4i* d1 = reinterpret_cast(dst + dstpitch); - - v0 = s[i * 4 + 0] >> 24; - v1 = s[i * 4 + 1] >> 24; - v2 = s[i * 4 + 2] >> 24; - v3 = s[i * 4 + 3] >> 24; - - GSVector4i::sw64(v0, v1, v2, v3); - - GSVector4i all = v0.ps32(v1).pu16(v2.ps32(v3)) & mask; - - ReadClut4(p0, p1, p2, p3, all, d0[0], d0[1], d1[0], d1[1]); - - dst += dstpitch * 2; - } - -#endif - } - - // TODO: ReadAndExpandBlock4HL_16 - - __forceinline static void ReadAndExpandBlock4HH_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) - { - //printf("ReadAndExpandBlock4HH_32\n"); - -#if _M_SSE >= 0x501 - - const GSVector8i* s = (const GSVector8i*)src; - - GSVector8i p0, p1, p2, p3; - LoadPalVecs(pal, p0, p1, p2, p3); - - GSVector8i v0, v1, v2, v3; - - for (int i = 0; i < 2; i++) - { - GSVector8i* d0 = reinterpret_cast(dst); - GSVector8i* d1 = reinterpret_cast(dst + dstpitch); - GSVector8i* d2 = reinterpret_cast(dst + dstpitch * 2); - GSVector8i* d3 = reinterpret_cast(dst + dstpitch * 3); - - v0 = s[i * 4 + 0] >> 28; - v1 = s[i * 4 + 1] >> 28; - v2 = s[i * 4 + 2] >> 28; - v3 = s[i * 4 + 3] >> 28; + v0 = s[i * 4 + 0] >> shift; + v1 = s[i * 4 + 1] >> shift; + v2 = s[i * 4 + 2] >> shift; + v3 = s[i * 4 + 3] >> shift; GSVector8i::sw128(v0, v1); GSVector8i::sw64(v0, v1); @@ -2181,6 +1913,8 @@ public: GSVector8i::sw64(v2, v3); GSVector8i all = v0.ps32(v1).pu16(v2.ps32(v3)); + if (mask != 0xffffffff) + all = all & mask; ReadClut4(p0, p1, p2, p3, all, *d0, *d1, *d2, *d3); @@ -2193,6 +1927,7 @@ public: GSVector4i p0, p1, p2, p3; LoadPalVecs(pal, p0, p1, p2, p3); + GSVector4i maskvec(mask); GSVector4i v0, v1, v2, v3; @@ -2201,14 +1936,16 @@ public: GSVector4i* d0 = reinterpret_cast(dst); GSVector4i* d1 = reinterpret_cast(dst + dstpitch); - v0 = s[i * 4 + 0] >> 28; - v1 = s[i * 4 + 1] >> 28; - v2 = s[i * 4 + 2] >> 28; - v3 = s[i * 4 + 3] >> 28; + v0 = s[i * 4 + 0] >> shift; + v1 = s[i * 4 + 1] >> shift; + v2 = s[i * 4 + 2] >> shift; + v3 = s[i * 4 + 3] >> shift; GSVector4i::sw64(v0, v1, v2, v3); GSVector4i all = v0.ps32(v1).pu16(v2.ps32(v3)); + if (mask != 0xffffffff) + all = all & mask; ReadClut4(p0, p1, p2, p3, all, d0[0], d0[1], d1[0], d1[1]); @@ -2218,5 +1955,19 @@ public: #endif } + __forceinline static void ReadAndExpandBlock4HL_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) + { + //printf("ReadAndExpandBlock4HL_32\n"); + ReadAndExpandBlock4H_32<24, 0x0f0f0f0f>(src, dst, dstpitch, pal); + } + + // TODO: ReadAndExpandBlock4HL_16 + + __forceinline static void ReadAndExpandBlock4HH_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) + { + //printf("ReadAndExpandBlock4HH_32\n"); + ReadAndExpandBlock4H_32<28, 0xffffffff>(src, dst, dstpitch, pal); + } + // TODO: ReadAndExpandBlock4HH_16 }; diff --git a/pcsx2/GS/GSVector4i.h b/pcsx2/GS/GSVector4i.h index b8c59e576a..568bb348ca 100644 --- a/pcsx2/GS/GSVector4i.h +++ b/pcsx2/GS/GSVector4i.h @@ -449,6 +449,31 @@ public: #endif } + /// Equivalent to blend with the given mask broadcasted across the vector + /// May be faster than blend in some cases + template + __forceinline GSVector4i smartblend(const GSVector4i& a) const + { + if (mask == 0) + return *this; + if (mask == 0xffffffff) + return a; + + if (mask == 0x0000ffff) + return blend16<0x55>(a); + if (mask == 0xffff0000) + return blend16<0xaa>(a); + + for (int i = 0; i < 32; i += 8) + { + u8 byte = (mask >> i) & 0xff; + if (byte != 0xff && byte != 0) + return blend(a, GSVector4i(mask)); + } + + return blend8(a, GSVector4i(mask)); + } + __forceinline GSVector4i blend(const GSVector4i& a, const GSVector4i& mask) const { return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, a))); diff --git a/pcsx2/GS/GSVector8i.h b/pcsx2/GS/GSVector8i.h index 7616222fc7..784f7ad1f3 100644 --- a/pcsx2/GS/GSVector8i.h +++ b/pcsx2/GS/GSVector8i.h @@ -327,6 +327,31 @@ public: return GSVector8i(_mm256_or_si256(_mm256_andnot_si256(mask, m), _mm256_and_si256(mask, a))); } + /// Equivalent to blend with the given mask broadcasted across the vector + /// May be faster than blend in some cases + template + __forceinline GSVector8i smartblend(const GSVector8i& a) const + { + if (mask == 0) + return *this; + if (mask == 0xffffffff) + return a; + + if (mask == 0x0000ffff) + return blend16<0x55>(a); + if (mask == 0xffff0000) + return blend16<0xaa>(a); + + for (int i = 0; i < 32; i += 8) + { + u8 byte = (mask >> i) & 0xff; + if (byte != 0xff && byte != 0) + return blend(a, GSVector8i(mask)); + } + + return blend8(a, GSVector8i(mask)); + } + __forceinline GSVector8i mix16(const GSVector8i& a) const { return blend16<0xaa>(a);