GS: ReadColumn8 AVX2 path

2021-04-06 01:17:40 -05:00 · 2021-04-06 01:17:40 -05:00 · bdc7dc2cd8
parent 244a4da28a
commit bdc7dc2cd8
3 changed files with 29 additions and 13 deletions
--- a/pcsx2/GS/GSBlock.cpp
+++ b/pcsx2/GS/GSBlock.cpp
@@ -20,6 +20,9 @@ CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6,
 CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15);
 CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);

+CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask1(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask2(1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15, 2, 6, 10, 14);
+
 CONSTINIT const GSVector4i GSBlock::m_uw8hmask0(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9);
 CONSTINIT const GSVector4i GSBlock::m_uw8hmask1(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11);
 CONSTINIT const GSVector4i GSBlock::m_uw8hmask2(4, 4, 4, 4, 5, 5, 5, 5, 12, 12, 12, 12, 13, 13, 13, 13);
--- a/pcsx2/GS/GSBlock.h
+++ b/pcsx2/GS/GSBlock.h
@@ -25,6 +25,9 @@ class GSBlock
 	static const GSVector4i m_r8mask;
 	static const GSVector4i m_r4mask;

+	static const GSVector4i m_avx2_r8mask1;
+	static const GSVector4i m_avx2_r8mask2;
+
 	static const GSVector4i m_uw8hmask0;
 	static const GSVector4i m_uw8hmask1;
 	static const GSVector4i m_uw8hmask2;
@@ -512,30 +515,34 @@ public:

 		//for(int j = 0; j < 64; j++) ((u8*)src)[j] = (u8)j;

-#if 0 //_M_SSE >= 0x501
+#if _M_SSE >= 0x501

 		const GSVector8i* s = (const GSVector8i*)src;

-		GSVector8i v0 = s[i * 2 + 0];
-		GSVector8i v1 = s[i * 2 + 1];
+		GSVector8i v0, v1;

-		GSVector8i::sw8(v0, v1);
-		GSVector8i::sw16(v0, v1);
-		GSVector8i::sw8(v0, v1);
-		GSVector8i::sw128(v0, v1);
-		GSVector8i::sw16(v0, v1);
+		if ((i & 1) == 0)
+		{
+			v0 = s[i * 2 + 0];
+			v1 = s[i * 2 + 1];
+		}
+		else
+		{
+			v1 = s[i * 2 + 0];
+			v0 = s[i * 2 + 1];
+		}

-		v0 = v0.acbd();
-		v1 = v1.acbd();
-		v1 = v1.yxwz();
+		GSVector8i v2 = v0.acbd().shuffle8(GSVector8i::broadcast128(m_avx2_r8mask1));
+		GSVector8i v3 = v1.acbd().shuffle8(GSVector8i::broadcast128(m_avx2_r8mask2));
+
+		v0 = v2.blend32<0xaa>(v3);
+		v1 = v3.blend32<0xaa>(v2);

 		GSVector8i::storel(&dst[dstpitch * 0], v0);
 		GSVector8i::storeh(&dst[dstpitch * 1], v0);
 		GSVector8i::storel(&dst[dstpitch * 2], v1);
 		GSVector8i::storeh(&dst[dstpitch * 3], v1);

-		// TODO: not sure if this is worth it, not in this form, there should be a shorter path
-
 #else

 		const GSVector4i* s = (const GSVector4i*)src;
--- a/pcsx2/GS/GSVector8i.h
+++ b/pcsx2/GS/GSVector8i.h
@@ -316,6 +316,12 @@ public:
 		return GSVector8i(_mm256_blend_epi16(m, a, mask));
 	}

+	template <int mask>
+	__forceinline GSVector8i blend32(const GSVector8i& a) const
+	{
+		return GSVector8i(_mm256_blend_epi32(m, a, mask));
+	}
+
 	__forceinline GSVector8i blend(const GSVector8i& a, const GSVector8i& mask) const
 	{
 		return GSVector8i(_mm256_or_si256(_mm256_andnot_si256(mask, m), _mm256_and_si256(mask, a)));