From 127433628df436853a76c12b4fc76858e734b4ed Mon Sep 17 00:00:00 2001
From: TellowKrinkle
Date: Tue, 6 Apr 2021 23:15:40 -0500
Subject: [PATCH] GS: AVX2 ReadAndExpandBlock4(HH|HL)_32

---
Review notes (written after the three-dash separator, so they are not part of
the commit message):

  * Two function bodies are touched; their commented-out printf lines name
    them ReadAndExpandBlock4HL_32 and ReadAndExpandBlock4HH_32. Each gains an
    `#if _M_SSE >= 0x501` branch, and the pre-existing GSVector4i code is
    left as-is under `#else`.
  * The new branch runs two iterations. Each iteration reads four GSVector8i
    values from src, shifts every one right by 24 (4HL) or 28 (4HH), reorders
    them with sw128/sw64, packs them with ps32 followed by pu16, and passes
    the result to ReadClut4 together with four destination rows (dst,
    dst + dstpitch, dst + dstpitch * 2, dst + dstpitch * 3). dst then
    advances by dstpitch * 4.
  * Only the 4HL variant masks the packed value with 0x0f0f0f0f.
  * 0x501 is presumably the AVX2 level, going by the subject line; confirm
    against the definition of _M_SSE.
  * The destination row pointers are formed with
    reinterpret_cast<GSVector8i*>; the cast does not compile without its
    target type.

 pcsx2/GS/GSBlock.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/pcsx2/GS/GSBlock.h b/pcsx2/GS/GSBlock.h
index b1f1786574..94adafee73 100644
--- a/pcsx2/GS/GSBlock.h
+++ b/pcsx2/GS/GSBlock.h
@@ -2054,6 +2054,42 @@ public:
 	{
 		//printf("ReadAndExpandBlock4HL_32\n");
 
+#if _M_SSE >= 0x501
+
+		const GSVector8i* s = (const GSVector8i*)src;
+
+		GSVector8i p0, p1, p2, p3;
+		LoadPalVecs(pal, p0, p1, p2, p3);
+		GSVector8i mask(0x0f0f0f0f);
+
+		GSVector8i v0, v1, v2, v3;
+
+		for (int i = 0; i < 2; i++)
+		{
+			GSVector8i* d0 = reinterpret_cast<GSVector8i*>(dst);
+			GSVector8i* d1 = reinterpret_cast<GSVector8i*>(dst + dstpitch);
+			GSVector8i* d2 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 2);
+			GSVector8i* d3 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 3);
+
+			v0 = s[i * 4 + 0] >> 24;
+			v1 = s[i * 4 + 1] >> 24;
+			v2 = s[i * 4 + 2] >> 24;
+			v3 = s[i * 4 + 3] >> 24;
+
+			GSVector8i::sw128(v0, v1);
+			GSVector8i::sw64(v0, v1);
+			GSVector8i::sw128(v2, v3);
+			GSVector8i::sw64(v2, v3);
+
+			GSVector8i all = v0.ps32(v1).pu16(v2.ps32(v3)) & mask;
+
+			ReadClut4(p0, p1, p2, p3, all, *d0, *d1, *d2, *d3);
+
+			dst += dstpitch * 4;
+		}
+
+#else
+
 		const GSVector4i* s = (const GSVector4i*)src;
 
 		GSVector4i p0, p1, p2, p3;
@@ -2080,6 +2116,8 @@ public:
 
 			dst += dstpitch * 2;
 		}
+
+#endif
 	}
 
 	// TODO: ReadAndExpandBlock4HL_16
@@ -2088,6 +2126,41 @@ public:
 	{
 		//printf("ReadAndExpandBlock4HH_32\n");
 
+#if _M_SSE >= 0x501
+
+		const GSVector8i* s = (const GSVector8i*)src;
+
+		GSVector8i p0, p1, p2, p3;
+		LoadPalVecs(pal, p0, p1, p2, p3);
+
+		GSVector8i v0, v1, v2, v3;
+
+		for (int i = 0; i < 2; i++)
+		{
+			GSVector8i* d0 = reinterpret_cast<GSVector8i*>(dst);
+			GSVector8i* d1 = reinterpret_cast<GSVector8i*>(dst + dstpitch);
+			GSVector8i* d2 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 2);
+			GSVector8i* d3 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 3);
+
+			v0 = s[i * 4 + 0] >> 28;
+			v1 = s[i * 4 + 1] >> 28;
+			v2 = s[i * 4 + 2] >> 28;
+			v3 = s[i * 4 + 3] >> 28;
+
+			GSVector8i::sw128(v0, v1);
+			GSVector8i::sw64(v0, v1);
+			GSVector8i::sw128(v2, v3);
+			GSVector8i::sw64(v2, v3);
+
+			GSVector8i all = v0.ps32(v1).pu16(v2.ps32(v3));
+
+			ReadClut4(p0, p1, p2, p3, all, *d0, *d1, *d2, *d3);
+
+			dst += dstpitch * 4;
+		}
+
+#else
+
 		const GSVector4i* s = (const GSVector4i*)src;
 
 		GSVector4i p0, p1, p2, p3;
@@ -2113,6 +2186,8 @@ public:
 
 			dst += dstpitch * 2;
 		}
+
+#endif
 	}
 
 	// TODO: ReadAndExpandBlock4HH_16