GS: Minor Zen optimizations

2021-04-21 01:41:03 -05:00 · 2021-04-21 01:41:03 -05:00 · cbf4a83b3f
parent a6887715c7
commit cbf4a83b3f
2 changed files with 27 additions and 29 deletions
--- a/pcsx2/GS/GSBlock.h
+++ b/pcsx2/GS/GSBlock.h
@ -37,6 +37,20 @@ class GSBlock
 	static const GSVector4i m_uw8hmask2;
 	static const GSVector4i m_uw8hmask3;

+#if _M_SSE >= 0x501
+	// Equivalent of `a = *s0; b = *s1; sw128(a, b);`
+	// Loads in two halves instead to reduce shuffle instructions
+	// Especially good for Zen/Zen+, as it's replacing a very expensive vperm2i128
+	template <bool Aligned = true>
+	static void LoadSW128(GSVector8i& a, GSVector8i& b, const void* s0, const void* s1)
+	{
+		// Treat each source as two consecutive GSVector4i halves: index 0 and index 1.
+		const GSVector4i* const halves0 = static_cast<const GSVector4i*>(s0);
+		const GSVector4i* const halves1 = static_cast<const GSVector4i*>(s1);
+
+		// `a` pairs the first half of s0 with the first half of s1.
+		// It is assigned before the second halves are read, matching the original order.
+		const GSVector4i first0 = GSVector4i::load<Aligned>(&halves0[0]);
+		const GSVector4i first1 = GSVector4i::load<Aligned>(&halves1[0]);
+		a = GSVector8i(first0, first1);
+
+		// `b` pairs the second half of s0 with the second half of s1.
+		const GSVector4i second0 = GSVector4i::load<Aligned>(&halves0[1]);
+		const GSVector4i second1 = GSVector4i::load<Aligned>(&halves1[1]);
+		b = GSVector8i(second0, second1);
+	}
+#endif
+
 public:
 	template <int i, int alignment, u32 mask>
 	__forceinline static void WriteColumn32(u8* RESTRICT dst, const u8* RESTRICT src, int srcpitch)
@ -146,10 +160,9 @@ public:

 #if _M_SSE >= 0x501

-		GSVector8i v0 = GSVector8i::load<false>(s0);
-		GSVector8i v1 = GSVector8i::load<false>(s1);
+		GSVector8i v0, v1;

-		GSVector8i::sw128(v0, v1);
+		LoadSW128<false>(v0, v1, s0, s1);
 		GSVector8i::sw16(v0, v1);

 		v0 = v0.acbd();
@ -435,10 +448,9 @@ public:

 		const GSVector8i* s = (const GSVector8i*)src;

-		GSVector8i v0 = s[i * 2 + 0];
-		GSVector8i v1 = s[i * 2 + 1];
+		GSVector8i v0, v1;

-		GSVector8i::sw128(v0, v1);
+		LoadSW128(v0, v1, &s[i * 2 + 0], &s[i * 2 + 1]);
 		GSVector8i::sw64(v0, v1);

 		GSVector8i::store<true>(&dst[dstpitch * 0], v0);
@ -1499,10 +1511,9 @@ public:

 		for (int i = 0; i < 4; i++, dst += dstpitch * 2)
 		{
-			GSVector8i v0 = s[i * 2 + 0];
-			GSVector8i v1 = s[i * 2 + 1];
+			GSVector8i v0, v1;

-			GSVector8i::sw128(v0, v1);
+			LoadSW128(v0, v1, &s[i * 2 + 0], &s[i * 2 + 1]);
 			GSVector8i::sw64(v0, v1);

 			GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0];
@ -1575,10 +1586,7 @@ public:
 			GSVector8i* d2 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 2);
 			GSVector8i* d3 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 3);

-			v0 = s[i * 4 + 0];
-			v1 = s[i * 4 + 1];
-
-			GSVector8i::sw128(v0, v1);
+			LoadSW128(v0, v1, &s[i * 4 + 0], &s[i * 4 + 1]);
 			GSVector8i::sw64(v0, v1);

 			d0[0] = ((v0      ) & mask).gather32_32(pal);
@ -1599,10 +1607,7 @@ public:
 			d2 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 2);
 			d3 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 3);

-			v1 = s[i * 4 + 2];
-			v0 = s[i * 4 + 3];
-
-			GSVector8i::sw128(v0, v1);
+			LoadSW128(v0, v1, &s[i * 4 + 3], &s[i * 4 + 2]);
 			GSVector8i::sw64(v0, v1);

 			d0[0] = ((v0      ) & mask).gather32_32(pal);
@ -1727,10 +1732,7 @@ public:
 			GSVector8i* d2 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 2);
 			GSVector8i* d3 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 3);

-			v0 = s[i * 4 + 0];
-			v1 = s[i * 4 + 1];
-
-			GSVector8i::sw128(v0, v1);
+			LoadSW128(v0, v1, &s[i * 4 + 0], &s[i * 4 + 1]);
 			GSVector8i::sw64(v0, v1);

 			v0 = v0.shuffle8(shuf);
@ -1750,10 +1752,7 @@ public:
 			d2 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 2);
 			d3 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 3);

-			v1 = s[i * 4 + 2];
-			v0 = s[i * 4 + 3];
-
-			GSVector8i::sw128(v0, v1);
+			LoadSW128(v0, v1, &s[i * 4 + 3], &s[i * 4 + 2]);
 			GSVector8i::sw64(v0, v1);

 			v0 = v0.shuffle8(shuf);
@ -1871,10 +1870,9 @@ public:
 		const GSVector8i* s = (const GSVector8i*)src;
 		for (int i = 0; i < 4; i++)
 		{
-			GSVector8i v0 = s[i * 2 + 0];
-			GSVector8i v1 = s[i * 2 + 1];
+			GSVector8i v0, v1;

-			GSVector8i::sw128(v0, v1);
+			LoadSW128(v0, v1, &s[i * 2 + 0], &s[i * 2 + 1]);
 			GSVector8i::sw64(v0, v1);

 			*reinterpret_cast<GSVector8i*>(dst) = (v0 >> 24).gather32_32(pal);
--- a/pcsx2/GS/GSVector8i.h
+++ b/pcsx2/GS/GSVector8i.h
@ -1333,7 +1333,7 @@ public:
 		GSVector8i c = a;
 		GSVector8i d = b;

-		a = c.ac(d);
+		a = c.insert<1>(d.extract<0>()); // Should become a single vinserti128, faster on Zen+
 		b = c.bd(d);
 	}