GS: pshufb-based ReadAndExpandBlock4_32

2021-04-06 05:51:13 -05:00 · 2021-04-06 05:51:13 -05:00 · 679cc7af38
parent c4b3239e33
commit 679cc7af38
5 changed files with 85 additions and 42 deletions
--- a/pcsx2/GS/GSBlock.cpp
+++ b/pcsx2/GS/GSBlock.cpp
@ -19,6 +19,7 @@
 CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
 CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15);
 CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15);
+CONSTINIT const GSVector4i GSBlock::m_palvec_mask(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

 CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask1(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
 CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask2(1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15, 2, 6, 10, 14);
--- a/pcsx2/GS/GSBlock.h
+++ b/pcsx2/GS/GSBlock.h
@ -24,6 +24,7 @@ class GSBlock
 	static const GSVector4i m_r16mask;
 	static const GSVector4i m_r8mask;
 	static const GSVector4i m_r4mask;
+	static const GSVector4i m_palvec_mask;

 	static const GSVector4i m_avx2_r8mask1;
 	static const GSVector4i m_avx2_r8mask2;
@ -1759,72 +1760,106 @@ public:

 	// TODO: ReadAndExpandBlock8_16

-	__forceinline static void ReadAndExpandBlock4_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u64* RESTRICT pal)
+	/// Load 16-element palette into four vectors, with each u32 split across the four vectors
+	template <typename V>
+	__forceinline static void LoadPalVecs(const u32* RESTRICT pal, V& p0, V& p1, V& p2, V& p3)
+	{
+		const GSVector4i* p = (const GSVector4i*)pal;
+		p0 = V::broadcast128(p[0]).shuffle8(V::broadcast128(m_palvec_mask));
+		p1 = V::broadcast128(p[1]).shuffle8(V::broadcast128(m_palvec_mask));
+		p2 = V::broadcast128(p[2]).shuffle8(V::broadcast128(m_palvec_mask));
+		p3 = V::broadcast128(p[3]).shuffle8(V::broadcast128(m_palvec_mask));
+		V::sw32(p0, p1, p2, p3);
+		V::sw64(p0, p1, p2, p3);
+		std::swap(p1, p2);
+	}
+
+	template <typename V>
+	__forceinline static void ReadClut4AndWrite(const V& p0, const V& p1, const V& p2, const V& p3, const V& src, V* dst, int dstride)
+	{
+		V r0 = p0.shuffle8(src);
+		V r1 = p1.shuffle8(src);
+		V r2 = p2.shuffle8(src);
+		V r3 = p3.shuffle8(src);
+
+		V::sw8(r0, r1, r2, r3);
+		V::sw16(r0, r1, r2, r3);
+
+		dst[dstride * 0] = r0;
+		dst[dstride * 1] = r2;
+		dst[dstride * 2] = r1;
+		dst[dstride * 3] = r3;
+	}
+
+	__forceinline static void ReadAndExpandBlock4_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
 	{
 		//printf("ReadAndExpandBlock4_32\n");

 		const GSVector4i* s = (const GSVector4i*)src;

+		GSVector4i p0, p1, p2, p3;
+		LoadPalVecs(pal, p0, p1, p2, p3);
+		GSVector4i mask(0x0f0f0f0f);
+
 		GSVector4i v0, v1, v2, v3;
-		GSVector4 v0f, v1f, v2f, v3f;

 		for (int i = 0; i < 2; i++)
 		{
+			GSVector4i* d0 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 0);
+			GSVector4i* d1 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 1);
+			GSVector4i* d2 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 2);
+			GSVector4i* d3 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 3);
+
 			v0 = s[i * 8 + 0];
 			v1 = s[i * 8 + 1];
 			v2 = s[i * 8 + 2];
 			v3 = s[i * 8 + 3];

-			GSVector4i::sw32_inv(v0, v1, v2, v3);
-			GSVector4i::mix4(v0, v1);
-			GSVector4i::mix4(v2, v3);
+			GSVector4i::sw64(v0, v1, v2, v3);

-			v0f = GSVector4::cast(v0);
-			v1f = GSVector4::cast(v1);
-			v2f = GSVector4::cast(v2);
-			v3f = GSVector4::cast(v3);
+			v0 = v0.shuffle8(m_palvec_mask);
+			v1 = v1.shuffle8(m_palvec_mask);
+			v2 = v2.shuffle8(m_palvec_mask);
+			v3 = v3.shuffle8(m_palvec_mask);

-			v0 = GSVector4i::cast(v0f.xzxz(v2f)).shuffle8(m_r4mask);
-			v1 = GSVector4i::cast(v0f.ywyw(v2f)).shuffle8(m_r4mask);
-			v2 = GSVector4i::cast(v1f.zxzx(v3f)).shuffle8(m_r4mask);
-			v3 = GSVector4i::cast(v1f.wywy(v3f)).shuffle8(m_r4mask);
+			ReadClut4AndWrite(p0, p1, p2, p3,  v0       & mask, d0 + 0, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3, (v0 >> 4) & mask, d2 + 1, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3,  v1       & mask, d0 + 1, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3, (v1 >> 4) & mask, d2 + 0, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3,  v2       & mask, d1 + 0, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3, (v2 >> 4) & mask, d3 + 1, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3,  v3       & mask, d1 + 1, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3, (v3 >> 4) & mask, d3 + 0, 2);

-			v0.gather64_8<>(pal, (GSVector4i*)dst);
-			dst += dstpitch;
-			v1.gather64_8<>(pal, (GSVector4i*)dst);
-			dst += dstpitch;
-			v2.gather64_8<>(pal, (GSVector4i*)dst);
-			dst += dstpitch;
-			v3.gather64_8<>(pal, (GSVector4i*)dst);
-			dst += dstpitch;
+			dst += dstpitch * 4;
+
+			d0 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 0);
+			d1 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 1);
+			d2 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 2);
+			d3 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 3);

 			v0 = s[i * 8 + 4];
 			v1 = s[i * 8 + 5];
 			v2 = s[i * 8 + 6];
 			v3 = s[i * 8 + 7];

-			GSVector4i::sw32_inv(v0, v1, v2, v3);
-			GSVector4i::mix4(v0, v1);
-			GSVector4i::mix4(v2, v3);
+			GSVector4i::sw64(v0, v1, v2, v3);

-			v0f = GSVector4::cast(v0);
-			v1f = GSVector4::cast(v1);
-			v2f = GSVector4::cast(v2);
-			v3f = GSVector4::cast(v3);
+			v0 = v0.shuffle8(m_palvec_mask);
+			v1 = v1.shuffle8(m_palvec_mask);
+			v2 = v2.shuffle8(m_palvec_mask);
+			v3 = v3.shuffle8(m_palvec_mask);

-			v0 = GSVector4i::cast(v0f.zxzx(v2f)).shuffle8(m_r4mask);
-			v1 = GSVector4i::cast(v0f.wywy(v2f)).shuffle8(m_r4mask);
-			v2 = GSVector4i::cast(v1f.xzxz(v3f)).shuffle8(m_r4mask);
-			v3 = GSVector4i::cast(v1f.ywyw(v3f)).shuffle8(m_r4mask);
+			ReadClut4AndWrite(p0, p1, p2, p3,  v0       & mask, d0 + 1, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3, (v0 >> 4) & mask, d2 + 0, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3,  v1       & mask, d0 + 0, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3, (v1 >> 4) & mask, d2 + 1, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3,  v2       & mask, d1 + 1, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3, (v2 >> 4) & mask, d3 + 0, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3,  v3       & mask, d1 + 0, 2);
+			ReadClut4AndWrite(p0, p1, p2, p3, (v3 >> 4) & mask, d3 + 1, 2);

-			v0.gather64_8<>(pal, (GSVector4i*)dst);
-			dst += dstpitch;
-			v1.gather64_8<>(pal, (GSVector4i*)dst);
-			dst += dstpitch;
-			v2.gather64_8<>(pal, (GSVector4i*)dst);
-			dst += dstpitch;
-			v3.gather64_8<>(pal, (GSVector4i*)dst);
-			dst += dstpitch;
+			dst += dstpitch * 4;
 		}
 	}

--- a/pcsx2/GS/GSLocalMemory.cpp
+++ b/pcsx2/GS/GSLocalMemory.cpp
@ -1442,7 +1442,7 @@ void GSLocalMemory::ReadTexture8(const GSOffset& off, const GSVector4i& r, u8* d

 void GSLocalMemory::ReadTexture4(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA)
 {
-	const u64* pal = m_clut;
+	const u32* pal = m_clut;

 	foreachBlock(off.assertSizesMatch(swizzle4), this, r, dst, dstpitch, 32, [&](u8* read_dst, const u8* src)
 	{
--- a/pcsx2/GS/GSVector4i.h
+++ b/pcsx2/GS/GSVector4i.h
@ -2005,6 +2005,13 @@ public:

 	// clang-format on

+	/// Noop, here so broadcast128 can be used generically over all vectors
+	__forceinline static GSVector4i broadcast128(const GSVector4i& v)
+	{
+		return v;
+	}
+
+
 	__forceinline static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); }

 	__forceinline static GSVector4i xffffffff() { return zero() == zero(); }
--- a/tests/ctest/GS/swizzle_test_main.cpp
+++ b/tests/ctest/GS/swizzle_test_main.cpp
@ -432,7 +432,7 @@ TEST(ReadAndExpandTest, Read4)
 	{
 		TestData expected = swizzle4(&columnTable4[0][0], data, true);
 		expected = expand4(expected.prepareExpand());
-		GSBlock::ReadAndExpandBlock4_32(data.block, data.output, 128, data.clut64);
+		GSBlock::ReadAndExpandBlock4_32(data.block, data.output, 128, data.clut32);
 		assertEqual(expected, data, "ReadAndExpand4", 16, 32, 32);
 	});
 }