GS: Faster palette lookup for AVX2 platforms with slow VPGATHERDD

2021-04-20 05:25:50 -05:00 · 2021-04-20 05:25:50 -05:00 · a6887715c7
parent e2169bc1da
commit a6887715c7
4 changed files with 111 additions and 34 deletions
--- a/pcsx2/GS/GSBlock.h
+++ b/pcsx2/GS/GSBlock.h
@@ -1072,7 +1072,12 @@ public:
 	{
 		for (int j = 0; j < 16; j++, dst += dstpitch)
 		{
-			((const GSVector4i*)src)[j].gather32_8(pal, (GSVector4i*)dst);
+			for (int k = 0; k < 4; k++)
+			{
+				const u8* s = src + j * 16 + k * 4;
+				GSVector4i v = GSVector4i(pal[s[0]], pal[s[1]], pal[s[2]], pal[s[3]]);
+				reinterpret_cast<GSVector4i*>(dst)[k] = v;
+			}
 		}
 	}

@@ -1543,6 +1548,15 @@ public:
 #endif
 	}

+	/// ReadAndExpandBlock8 for AVX2 platforms with slow VPGATHERDD (Haswell, Zen, Zen2, Zen3)
+	/// This is faster than the one in ReadAndExpandBlock8_32 on HSW+ due to a port 5 traffic jam; it should be about the same on Zen
+	__forceinline static void ReadAndExpandBlock8_32HSW(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
+	{
+		alignas(32) u8 block[16 * 16];
+		ReadBlock8(src, (u8*)block, sizeof(block) / 16);
+		ExpandBlock8_32(block, dst, dstpitch, pal);
+	}
+
 	__forceinline static void ReadAndExpandBlock8_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
 	{
 		//printf("ReadAndExpandBlock8_32\n");
@@ -1829,6 +1843,25 @@ public:

 	// TODO: ReadAndExpandBlock4_16

+	// ReadAndExpandBlock8H for AVX2 platforms with slow VPGATHERDD (Haswell, Zen, Zen2, Zen3)
+	// Also serves as the implementation for AVX / SSE
+	__forceinline static void ReadAndExpandBlock8H_32HSW(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
+	{
+		for (int i = 0; i < 4; i++)
+		{
+			const u8* s = src + i * 64;
+			GSVector4i* d0 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 0);
+			GSVector4i* d1 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 1);
+
+			d0[0] = GSVector4i(pal[s[ 3]], pal[s[ 7]], pal[s[19]], pal[s[23]]);
+			d0[1] = GSVector4i(pal[s[35]], pal[s[39]], pal[s[51]], pal[s[55]]);
+			d1[0] = GSVector4i(pal[s[11]], pal[s[15]], pal[s[27]], pal[s[31]]);
+			d1[1] = GSVector4i(pal[s[43]], pal[s[47]], pal[s[59]], pal[s[63]]);
+
+			dst += dstpitch * 2;
+		}
+	}
+
 	__forceinline static void ReadAndExpandBlock8H_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
 	{
 		//printf("ReadAndExpandBlock8H_32\n");
@@ -1853,29 +1886,7 @@ public:

 #else

-		const GSVector4i* s = (const GSVector4i*)src;
-
-		GSVector4i v0, v1, v2, v3;
-
-		for (int i = 0; i < 4; i++)
-		{
-			v0 = s[i * 4 + 0];
-			v1 = s[i * 4 + 1];
-			v2 = s[i * 4 + 2];
-			v3 = s[i * 4 + 3];
-
-			GSVector4i::sw64(v0, v1, v2, v3);
-
-			(v0 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[0]);
-			(v1 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[16]);
-
-			dst += dstpitch;
-
-			(v2 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[0]);
-			(v3 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[16]);
-
-			dst += dstpitch;
-		}
+		ReadAndExpandBlock8H_32HSW(src, dst, dstpitch, pal);

 #endif
 	}
--- a/pcsx2/GS/GSLocalMemory.cpp
+++ b/pcsx2/GS/GSLocalMemory.cpp
@@ -17,6 +17,7 @@
 #include "GSLocalMemory.h"
 #include "GS.h"
 #include "GSExtra.h"
+#include <xbyak/xbyak_util.h>
 #include <unordered_set>

 template <typename Fn>
@@ -269,6 +270,35 @@ GSLocalMemory::GSLocalMemory()
 	m_psm[PSM_PSMZ16].rtxbP = &GSLocalMemory::ReadTextureBlock16;
 	m_psm[PSM_PSMZ16S].rtxbP = &GSLocalMemory::ReadTextureBlock16;

+#if _M_SSE == 0x501
+	Xbyak::util::Cpu cpu;
+	bool slowVPGATHERDD;
+	if (cpu.has(Xbyak::util::Cpu::tINTEL))
+	{
+		// Slow on Haswell
+		// CPUID data from https://en.wikichip.org/wiki/intel/cpuid
+		slowVPGATHERDD = cpu.displayModel == 0x46 || cpu.displayModel == 0x45 || cpu.displayModel == 0x3c;
+	}
+	else
+	{
+		// Currently no Zen CPUs with fast VPGATHERDD
+		// As new CPUs come out, check https://uops.info/table.html for one that doesn't split it into roughly 40 µops
+		// Doing it manually is about 28 µops (8x xmm -> gpr, 6x extr, 8x load, 6x insr)
+		slowVPGATHERDD = true;
+	}
+	if (const char* over = getenv("SLOW_VPGATHERDD_OVERRIDE")) // Easy override for comparing on vs off
+	{
+		slowVPGATHERDD = over[0] == 'Y' || over[0] == 'y' || over[0] == '1';
+	}
+	if (slowVPGATHERDD)
+	{
+		m_psm[PSM_PSMT8].rtx = &GSLocalMemory::ReadTexture8HSW;
+		m_psm[PSM_PSMT8H].rtx = &GSLocalMemory::ReadTexture8HHSW;
+		m_psm[PSM_PSMT8].rtxb = &GSLocalMemory::ReadTextureBlock8HSW;
+		m_psm[PSM_PSMT8H].rtxb = &GSLocalMemory::ReadTextureBlock8HHSW;
+	}
+#endif
+
 	m_psm[PSM_PSGPU24].bpp = 16;
 	m_psm[PSM_PSMCT16].bpp = m_psm[PSM_PSMCT16S].bpp = 16;
 	m_psm[PSM_PSMT8].bpp = 8;
@@ -1460,6 +1490,28 @@ void GSLocalMemory::ReadTexture8H(const GSOffset& off, const GSVector4i& r, u8*
 	});
 }

+#if _M_SSE == 0x501
+void GSLocalMemory::ReadTexture8HSW(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	const u32* pal = m_clut;
+
+	foreachBlock(off.assertSizesMatch(swizzle8), this, r, dst, dstpitch, 32, [&](u8* read_dst, const u8* src)
+	{
+		GSBlock::ReadAndExpandBlock8_32HSW(src, read_dst, dstpitch, pal);
+	});
+}
+
+void GSLocalMemory::ReadTexture8HHSW(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	const u32* pal = m_clut;
+
+	foreachBlock(off.assertSizesMatch(swizzle32), this, r, dst, dstpitch, 32, [&](u8* read_dst, const u8* src)
+	{
+		GSBlock::ReadAndExpandBlock8H_32HSW(src, read_dst, dstpitch, pal);
+	});
+}
+#endif
+
 void GSLocalMemory::ReadTexture4HL(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA)
 {
 	const u32* pal = m_clut;
@@ -1538,6 +1590,22 @@ void GSLocalMemory::ReadTextureBlock8H(u32 bp, u8* dst, int dstpitch, const GIFR
 	GSBlock::ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }

+#if _M_SSE == 0x501
+void GSLocalMemory::ReadTextureBlock8HSW(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	GSBlock::ReadAndExpandBlock8_32HSW(BlockPtr(bp), dst, dstpitch, m_clut);
+}
+
+void GSLocalMemory::ReadTextureBlock8HHSW(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	GSBlock::ReadAndExpandBlock8H_32HSW(BlockPtr(bp), dst, dstpitch, m_clut);
+}
+#endif
+
 void GSLocalMemory::ReadTextureBlock4HL(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
--- a/pcsx2/GS/GSLocalMemory.h
+++ b/pcsx2/GS/GSLocalMemory.h
@@ -1156,6 +1156,13 @@ public:
 	void ReadTextureBlock4HL(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
 	void ReadTextureBlock4HH(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;

+#if _M_SSE == 0x501
+	void ReadTexture8HSW(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	void ReadTexture8HHSW(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	void ReadTextureBlock8HSW(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+	void ReadTextureBlock8HHSW(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+#endif
+
 	// pal ? 8 : 32

 	void ReadTexture8P(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA);
--- a/pcsx2/GS/GSVector4i.h
+++ b/pcsx2/GS/GSVector4i.h
@@ -81,16 +81,7 @@ public:

 	__forceinline GSVector4i(int x, int y, int z, int w)
 	{
-		// 4 gprs
-
-		// m = _mm_set_epi32(w, z, y, x);
-
-		// 2 gprs
-
-		GSVector4i xz = load(x).upl32(load(z));
-		GSVector4i yw = load(y).upl32(load(w));
-
-		*this = xz.upl32(yw);
+		m = _mm_set_epi32(w, z, y, x);
 	}

 	__forceinline GSVector4i(int x, int y)