From 244a4da28ad5bc4d109fc56cc4cb745b32780f11 Mon Sep 17 00:00:00 2001
From: TellowKrinkle <tellowkrinkle@gmail.com>
Date: Tue, 6 Apr 2021 00:27:49 -0500
Subject: [PATCH] GS: ReadBlock16 performance improvements

---
 pcsx2/GS/GSBlock.cpp |  14 +-----
 pcsx2/GS/GSBlock.h   | 101 ++++++++++++++++++++++++++++---------------
 2 files changed, 67 insertions(+), 48 deletions(-)
diff --git a/pcsx2/GS/GSBlock.cpp b/pcsx2/GS/GSBlock.cpp
index 51349c23cd..ccb25fd7fc 100644
--- a/pcsx2/GS/GSBlock.cpp
+++ b/pcsx2/GS/GSBlock.cpp
@@ -16,22 +16,10 @@
 #include "PrecompiledHeader.h"
 #include "GSBlock.h"
 
-CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
+CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); // shuffle8 control: byte pairs 0,2,4,6 first, then pairs 1,3,5,7 (same values as m_r4mask)
 CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15);
 CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
 
-#if _M_SSE >= 0x501
-CONSTINIT const GSVector8i GSBlock::m_xxxa = GSVector8i::cxpr(0x00008000);
-CONSTINIT const GSVector8i GSBlock::m_xxbx = GSVector8i::cxpr(0x00007c00);
-CONSTINIT const GSVector8i GSBlock::m_xgxx = GSVector8i::cxpr(0x000003e0);
-CONSTINIT const GSVector8i GSBlock::m_rxxx = GSVector8i::cxpr(0x0000001f);
-#else
-CONSTINIT const GSVector4i GSBlock::m_xxxa = GSVector4i::cxpr(0x00008000);
-CONSTINIT const GSVector4i GSBlock::m_xxbx = GSVector4i::cxpr(0x00007c00);
-CONSTINIT const GSVector4i GSBlock::m_xgxx = GSVector4i::cxpr(0x000003e0);
-CONSTINIT const GSVector4i GSBlock::m_rxxx = GSVector4i::cxpr(0x0000001f);
-#endif
-
 CONSTINIT const GSVector4i GSBlock::m_uw8hmask0(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9);
 CONSTINIT const GSVector4i GSBlock::m_uw8hmask1(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11);
 CONSTINIT const GSVector4i GSBlock::m_uw8hmask2(4, 4, 4, 4, 5, 5, 5, 5, 12, 12, 12, 12, 13, 13, 13, 13);
diff --git a/pcsx2/GS/GSBlock.h b/pcsx2/GS/GSBlock.h
index e51763bbcb..3d3f7aa8a6 100644
--- a/pcsx2/GS/GSBlock.h
+++ b/pcsx2/GS/GSBlock.h
@@ -25,18 +25,6 @@ class GSBlock
 	static const GSVector4i m_r8mask;
 	static const GSVector4i m_r4mask;
 
-#if _M_SSE >= 0x501
-	static const GSVector8i m_xxxa;
-	static const GSVector8i m_xxbx;
-	static const GSVector8i m_xgxx;
-	static const GSVector8i m_rxxx;
-#else
-	static const GSVector4i m_xxxa;
-	static const GSVector4i m_xxbx;
-	static const GSVector4i m_xgxx;
-	static const GSVector4i m_rxxx;
-#endif
-
 	static const GSVector4i m_uw8hmask0;
 	static const GSVector4i m_uw8hmask1;
 	static const GSVector4i m_uw8hmask2;
@@ -488,17 +476,11 @@ public:
 
 		GSVector8i mask = GSVector8i::broadcast128(m_r16mask);
 
-		GSVector8i v0 = s[i * 2 + 0].shuffle8(mask);
-		GSVector8i v1 = s[i * 2 + 1].shuffle8(mask);
+		GSVector8 v0 = GSVector8::cast(s[i * 2 + 0].shuffle8(mask).acbd());
+		GSVector8 v1 = GSVector8::cast(s[i * 2 + 1].shuffle8(mask).acbd());
 
-		GSVector8i::sw128(v0, v1);
-		GSVector8i::sw32(v0, v1);
-
-		v0 = v0.acbd();
-		v1 = v1.acbd();
-
-		GSVector8i::store<true>(&dst[dstpitch * 0], v0);
-		GSVector8i::store<true>(&dst[dstpitch * 1], v1);
+		GSVector8::store<true>(&dst[dstpitch * 0], v0.xzxz(v1));
+		GSVector8::store<true>(&dst[dstpitch * 1], v0.ywyw(v1));
 
 #else
 
@@ -1005,10 +987,38 @@ public:
 		return c | (AEM ? TA0.andnot(c == V::zero()) : TA0); // TA0 & (c != GSVector4i::zero())
 	}
 
+	/// Expands the 16bpp pixel duplicated across both halves of each dword to a 32bpp pixel: the three 5-bit colour fields go to the top of bytes 0/1/2, the rest comes from TA0/TA1
 	template <bool AEM, class V>
 	__forceinline static V Expand16to32(const V& c, const V& TA0, const V& TA1)
 	{
-		return ((c & m_rxxx) << 3) | ((c & m_xgxx) << 6) | ((c & m_xxbx) << 9) | (AEM ? TA0.blend8(TA1, c.sra16(15)).andnot(c == V::zero()) : TA0.blend(TA1, c.sra16(15)));
+		V rmask = V(0x000000f8); // source bits 0..4 end up in bits 3..7 after << 3
+		V gmask = V(0x0000f800); // source bits 5..9 end up in bits 11..15 after << 6
+		V bmask = V(0x00f80000); // source bits 10..14 end up in bits 19..23 after << 9
+		return ((c << 3) & rmask) | ((c << 6) & gmask) | ((c << 9) & bmask) | (AEM ? TA0.blend8(TA1, c).andnot(c == V::zero()) : TA0.blend8(TA1, c)); // c itself is the blend mask (no sra16 as before); with AEM the blended TA value is cleared where c == 0. NOTE(review): presumably blend8 selects each byte by the mask byte's top bit, so the high-half copy supplies bit 31 - confirm
+	}
+
+	/// Expands the 16bpp pixel in the low half of each dword to a 32bpp pixel; no bit of the high half reaches the result (assuming << shifts 32-bit lanes)
+	template <bool AEM, class V>
+	__forceinline static V Expand16Lto32(const V& c, const V& TA0, const V& TA1)
+	{
+		V rmask = V(0x000000f8); // keeps source bits 0..4 at bits 3..7
+		V gmask = V(0x0000f800); // keeps source bits 5..9 at bits 11..15
+		V bmask = V(0x00f80000); // keeps source bits 10..14 at bits 19..23
+		V o = ((c << 3) & rmask) | ((c << 6) & gmask) | ((c << 9) & bmask); // colour bits only; bit 15 is handled by the blend below
+		V ta0 = AEM ? TA0.andnot(o == V::zero()) : TA0; // AEM: clear TA0 where all colour bits are zero; TA1 is left untouched, so a pixel with bit 15 set can still take TA1
+		return o | ta0.blend8(TA1, c << 16); // << 16 moves the low half's bit 15 to bit 31. NOTE(review): presumably blend8 keys on each mask byte's top bit - confirm
+	}
+
+	/// Expands the 16bpp pixel in the high half of each dword to a 32bpp pixel (right-shift counterpart of Expand16Lto32)
+	template <bool AEM, class V>
+	__forceinline static V Expand16Hto32(const V& c, const V& TA0, const V& TA1)
+	{
+		V rmask = V(0x000000f8); // keeps source bits 16..20 at bits 3..7
+		V gmask = V(0x0000f800); // keeps source bits 21..25 at bits 11..15
+		V bmask = V(0x00f80000); // keeps source bits 26..30 at bits 19..23
+		V o = ((c >> 13) & rmask) | ((c >> 10) & gmask) | ((c >> 7) & bmask); // masks sit below the top bits, so logical vs arithmetic >> gives the same result (assuming 32-bit lanes)
+		V ta0 = AEM ? TA0.andnot(o == V::zero()) : TA0; // AEM: clear TA0 where all colour bits are zero; TA1 is left untouched
+		return o | ta0.blend8(TA1, c); // bit 31 of c is already the high pixel's top bit, so no shift is needed. NOTE(review): low-half bits of c could steer bytes 0..2 of the blend; harmless only if TA0/TA1 are zero there (callers in this patch pass value << 24) - confirm
 	}
 
 	template <bool AEM>
@@ -1636,32 +1646,53 @@ public:
 		GSVector8i TA0(TEXA.TA0 << 24);
 		GSVector8i TA1(TEXA.TA1 << 24);
 
-		GSVector8i mask = GSVector8i::broadcast128(m_r16mask);
-
 		for (int i = 0; i < 4; i++, dst += dstpitch * 2)
 		{
-			GSVector8i v0 = s[i * 2 + 0].shuffle8(mask);
-			GSVector8i v1 = s[i * 2 + 1].shuffle8(mask);
+			GSVector8i v0 = s[i * 2 + 0];
+			GSVector8i v1 = s[i * 2 + 1];
 
 			GSVector8i::sw128(v0, v1);
-			GSVector8i::sw32(v0, v1);
+			GSVector8i::sw64(v0, v1);
 
 			GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0];
 			GSVector8i* d1 = (GSVector8i*)&dst[dstpitch * 1];
 
-			d0[0] = Expand16to32<AEM>(v0.upl16(v0), TA0, TA1);
-			d0[1] = Expand16to32<AEM>(v0.uph16(v0), TA0, TA1);
-			d1[0] = Expand16to32<AEM>(v1.upl16(v1), TA0, TA1);
-			d1[1] = Expand16to32<AEM>(v1.uph16(v1), TA0, TA1);
+			d0[0] = Expand16Lto32<AEM>(v0, TA0, TA1);
+			d0[1] = Expand16Hto32<AEM>(v0, TA0, TA1);
+			d1[0] = Expand16Lto32<AEM>(v1, TA0, TA1);
+			d1[1] = Expand16Hto32<AEM>(v1, TA0, TA1);
 		}
 
 #else
 
-		alignas(32) u16 block[16 * 8];
+		const GSVector4i* s = (const GSVector4i*)src;
 
-		ReadBlock16(src, (u8*)block, sizeof(block) / 8);
+		GSVector4i TA0(TEXA.TA0 << 24);
+		GSVector4i TA1(TEXA.TA1 << 24);
 
-		ExpandBlock16<AEM>(block, dst, dstpitch, TEXA);
+		for (int i = 0; i < 4; i++, dst += dstpitch * 2)
+		{
+			GSVector4i v0 = s[i * 4 + 0];
+			GSVector4i v1 = s[i * 4 + 1];
+			GSVector4i v2 = s[i * 4 + 2];
+			GSVector4i v3 = s[i * 4 + 3];
+
+			GSVector4i::sw64(v0, v1, v2, v3);
+
+			GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0];
+
+			d0[0] = Expand16Lto32<AEM>(v0, TA0, TA1);
+			d0[1] = Expand16Lto32<AEM>(v1, TA0, TA1);
+			d0[2] = Expand16Hto32<AEM>(v0, TA0, TA1);
+			d0[3] = Expand16Hto32<AEM>(v1, TA0, TA1);
+
+			GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1];
+
+			d1[0] = Expand16Lto32<AEM>(v2, TA0, TA1);
+			d1[1] = Expand16Lto32<AEM>(v3, TA0, TA1);
+			d1[2] = Expand16Hto32<AEM>(v2, TA0, TA1);
+			d1[3] = Expand16Hto32<AEM>(v3, TA0, TA1);
+		}
 
 #endif
 	}