From 244a4da28ad5bc4d109fc56cc4cb745b32780f11 Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Tue, 6 Apr 2021 00:27:49 -0500 Subject: [PATCH] GS: ReadBlock16 performance improvements --- pcsx2/GS/GSBlock.cpp | 14 +----- pcsx2/GS/GSBlock.h | 101 ++++++++++++++++++++++++++++--------------- 2 files changed, 67 insertions(+), 48 deletions(-) diff --git a/pcsx2/GS/GSBlock.cpp b/pcsx2/GS/GSBlock.cpp index 51349c23cd..ccb25fd7fc 100644 --- a/pcsx2/GS/GSBlock.cpp +++ b/pcsx2/GS/GSBlock.cpp @@ -16,22 +16,10 @@ #include "PrecompiledHeader.h" #include "GSBlock.h" -CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15); +CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15); CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); -#if _M_SSE >= 0x501 -CONSTINIT const GSVector8i GSBlock::m_xxxa = GSVector8i::cxpr(0x00008000); -CONSTINIT const GSVector8i GSBlock::m_xxbx = GSVector8i::cxpr(0x00007c00); -CONSTINIT const GSVector8i GSBlock::m_xgxx = GSVector8i::cxpr(0x000003e0); -CONSTINIT const GSVector8i GSBlock::m_rxxx = GSVector8i::cxpr(0x0000001f); -#else -CONSTINIT const GSVector4i GSBlock::m_xxxa = GSVector4i::cxpr(0x00008000); -CONSTINIT const GSVector4i GSBlock::m_xxbx = GSVector4i::cxpr(0x00007c00); -CONSTINIT const GSVector4i GSBlock::m_xgxx = GSVector4i::cxpr(0x000003e0); -CONSTINIT const GSVector4i GSBlock::m_rxxx = GSVector4i::cxpr(0x0000001f); -#endif - CONSTINIT const GSVector4i GSBlock::m_uw8hmask0(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9); CONSTINIT const GSVector4i GSBlock::m_uw8hmask1(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11); CONSTINIT const GSVector4i GSBlock::m_uw8hmask2(4, 4, 4, 4, 5, 5, 5, 5, 12, 12, 12, 12, 13, 13, 13, 13); diff --git a/pcsx2/GS/GSBlock.h b/pcsx2/GS/GSBlock.h index e51763bbcb..3d3f7aa8a6 100644 --- a/pcsx2/GS/GSBlock.h +++ b/pcsx2/GS/GSBlock.h @@ -25,18 +25,6 @@ class GSBlock static const GSVector4i m_r8mask; static const GSVector4i m_r4mask; -#if _M_SSE >= 0x501 - static const GSVector8i m_xxxa; - static const GSVector8i m_xxbx; - static const GSVector8i m_xgxx; - static const GSVector8i m_rxxx; -#else - static const GSVector4i m_xxxa; - static const GSVector4i m_xxbx; - static const GSVector4i m_xgxx; - static const GSVector4i m_rxxx; -#endif - static const GSVector4i m_uw8hmask0; static const GSVector4i m_uw8hmask1; static const GSVector4i m_uw8hmask2; @@ -488,17 +476,11 @@ public: GSVector8i mask = GSVector8i::broadcast128(m_r16mask); - GSVector8i v0 = s[i * 2 + 0].shuffle8(mask); - GSVector8i v1 = s[i * 2 + 1].shuffle8(mask); + GSVector8 v0 = GSVector8::cast(s[i * 2 + 0].shuffle8(mask).acbd()); + GSVector8 v1 = GSVector8::cast(s[i * 2 + 1].shuffle8(mask).acbd()); - GSVector8i::sw128(v0, v1); - GSVector8i::sw32(v0, v1); - - v0 = v0.acbd(); - v1 = v1.acbd(); - - GSVector8i::store(&dst[dstpitch * 0], v0); - GSVector8i::store(&dst[dstpitch * 1], v1); + GSVector8::store(&dst[dstpitch * 0], v0.xzxz(v1)); + GSVector8::store(&dst[dstpitch * 1], v0.ywyw(v1)); #else @@ -1005,10 +987,38 @@ public: return c | (AEM ? TA0.andnot(c == V::zero()) : TA0); // TA0 & (c != GSVector4i::zero()) } + /// Expands the 16bpp pixel duplicated across both halves of each dword to a 32bpp pixel template __forceinline static V Expand16to32(const V& c, const V& TA0, const V& TA1) { - return ((c & m_rxxx) << 3) | ((c & m_xgxx) << 6) | ((c & m_xxbx) << 9) | (AEM ? TA0.blend8(TA1, c.sra16(15)).andnot(c == V::zero()) : TA0.blend(TA1, c.sra16(15))); + V rmask = V(0x000000f8); + V gmask = V(0x0000f800); + V bmask = V(0x00f80000); + return ((c << 3) & rmask) | ((c << 6) & gmask) | ((c << 9) & bmask) | (AEM ? TA0.blend8(TA1, c).andnot(c == V::zero()) : TA0.blend8(TA1, c)); + } + + /// Expands the 16bpp pixel in the low half of each dword to a 32bpp pixel + template + __forceinline static V Expand16Lto32(const V& c, const V& TA0, const V& TA1) + { + V rmask = V(0x000000f8); + V gmask = V(0x0000f800); + V bmask = V(0x00f80000); + V o = ((c << 3) & rmask) | ((c << 6) & gmask) | ((c << 9) & bmask); + V ta0 = AEM ? TA0.andnot(o == V::zero()) : TA0; + return o | ta0.blend8(TA1, c << 16); + } + + /// Expands the 16bpp pixel in the high half of each dword to a 32bpp pixel + template + __forceinline static V Expand16Hto32(const V& c, const V& TA0, const V& TA1) + { + V rmask = V(0x000000f8); + V gmask = V(0x0000f800); + V bmask = V(0x00f80000); + V o = ((c >> 13) & rmask) | ((c >> 10) & gmask) | ((c >> 7) & bmask); + V ta0 = AEM ? TA0.andnot(o == V::zero()) : TA0; + return o | ta0.blend8(TA1, c); } template @@ -1636,32 +1646,53 @@ public: GSVector8i TA0(TEXA.TA0 << 24); GSVector8i TA1(TEXA.TA1 << 24); - GSVector8i mask = GSVector8i::broadcast128(m_r16mask); - for (int i = 0; i < 4; i++, dst += dstpitch * 2) { - GSVector8i v0 = s[i * 2 + 0].shuffle8(mask); - GSVector8i v1 = s[i * 2 + 1].shuffle8(mask); + GSVector8i v0 = s[i * 2 + 0]; + GSVector8i v1 = s[i * 2 + 1]; GSVector8i::sw128(v0, v1); - GSVector8i::sw32(v0, v1); + GSVector8i::sw64(v0, v1); GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0]; GSVector8i* d1 = (GSVector8i*)&dst[dstpitch * 1]; - d0[0] = Expand16to32(v0.upl16(v0), TA0, TA1); - d0[1] = Expand16to32(v0.uph16(v0), TA0, TA1); - d1[0] = Expand16to32(v1.upl16(v1), TA0, TA1); - d1[1] = Expand16to32(v1.uph16(v1), TA0, TA1); + d0[0] = Expand16Lto32(v0, TA0, TA1); + d0[1] = Expand16Hto32(v0, TA0, TA1); + d1[0] = Expand16Lto32(v1, TA0, TA1); + d1[1] = Expand16Hto32(v1, TA0, TA1); } #else - alignas(32) u16 block[16 * 8]; + const GSVector4i* s = (const GSVector4i*)src; - ReadBlock16(src, (u8*)block, sizeof(block) / 8); + GSVector4i TA0(TEXA.TA0 << 24); + GSVector4i TA1(TEXA.TA1 << 24); - ExpandBlock16(block, dst, dstpitch, TEXA); + for (int i = 0; i < 4; i++, dst += dstpitch * 2) + { + GSVector4i v0 = s[i * 4 + 0]; + GSVector4i v1 = s[i * 4 + 1]; + GSVector4i v2 = s[i * 4 + 2]; + GSVector4i v3 = s[i * 4 + 3]; + + GSVector4i::sw64(v0, v1, v2, v3); + + GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; + + d0[0] = Expand16Lto32(v0, TA0, TA1); + d0[1] = Expand16Lto32(v1, TA0, TA1); + d0[2] = Expand16Hto32(v0, TA0, TA1); + d0[3] = Expand16Hto32(v1, TA0, TA1); + + GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; + + d1[0] = Expand16Lto32(v2, TA0, TA1); + d1[1] = Expand16Lto32(v3, TA0, TA1); + d1[2] = Expand16Hto32(v2, TA0, TA1); + d1[3] = Expand16Hto32(v3, TA0, TA1); + } #endif }