GS: AVX2 WriteBlock functions

2021-04-07 13:45:25 -05:00 · 2021-04-07 13:45:25 -05:00 · d64e838b6d
parent 263e495561
commit d64e838b6d
2 changed files with 42 additions and 11 deletions
--- a/pcsx2/GS/GSBlock.cpp
+++ b/pcsx2/GS/GSBlock.cpp
@@ -19,10 +19,13 @@
 CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
 CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15);
 CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15);
+CONSTINIT const GSVector4i GSBlock::m_w4mask(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15);
 CONSTINIT const GSVector4i GSBlock::m_palvec_mask(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

 CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask1(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
 CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask2(1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15, 2, 6, 10, 14);
+CONSTINIT const GSVector4i GSBlock::m_avx2_w8mask1(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+CONSTINIT const GSVector4i GSBlock::m_avx2_w8mask2(4, 0, 12, 8, 5, 1, 13, 9, 6, 2, 14, 10, 7, 3, 15, 11);

 CONSTINIT const GSVector4i GSBlock::m_uw8hmask0(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9);
 CONSTINIT const GSVector4i GSBlock::m_uw8hmask1(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11);
--- a/pcsx2/GS/GSBlock.h
+++ b/pcsx2/GS/GSBlock.h
@@ -24,10 +24,13 @@ class GSBlock
 	static const GSVector4i m_r16mask;
 	static const GSVector4i m_r8mask;
 	static const GSVector4i m_r4mask;
+	static const GSVector4i m_w4mask;
 	static const GSVector4i m_palvec_mask;

 	static const GSVector4i m_avx2_r8mask1;
 	static const GSVector4i m_avx2_r8mask2;
+	static const GSVector4i m_avx2_w8mask1;
+	static const GSVector4i m_avx2_w8mask2;

 	static const GSVector4i m_uw8hmask0;
 	static const GSVector4i m_uw8hmask1;
@@ -216,24 +219,20 @@ public:
 		GSVector8i v0(v4, v5);
 		GSVector8i v1(v6, v7);

+		GSVector8i v2 = v0.blend32<0xaa>(v1).shuffle8(GSVector8i::broadcast128(m_avx2_w8mask1)).acbd();
+		GSVector8i v3 = v1.blend32<0xaa>(v0).shuffle8(GSVector8i::broadcast128(m_avx2_w8mask2)).acbd();
+
 		if ((i & 1) == 0)
 		{
-			v1 = v1.yxwz();
+			((GSVector8i*)dst)[i * 2 + 0] = v2;
+			((GSVector8i*)dst)[i * 2 + 1] = v3;
 		}
 		else
 		{
-			v0 = v0.yxwz();
+			((GSVector8i*)dst)[i * 2 + 0] = v3;
+			((GSVector8i*)dst)[i * 2 + 1] = v2;
 		}

-		GSVector8i::sw8(v0, v1);
-		GSVector8i::sw16(v0, v1);
-
-		v0 = v0.acbd();
-		v1 = v1.acbd();
-
-		((GSVector8i*)dst)[i * 2 + 0] = v0;
-		((GSVector8i*)dst)[i * 2 + 1] = v1;
-
 #else

 		GSVector4i v0 = GSVector4i::load<alignment != 0>(&src[srcpitch * 0]);
@@ -273,6 +272,33 @@ public:

 		// TODO: pshufb

+#if _M_SSE >= 0x501
+
+		GSVector8i v0 = GSVector8i(GSVector4i::load<false>(&src[srcpitch * 0]), GSVector4i::load<false>(&src[srcpitch * 1]));
+		GSVector8i v1 = GSVector8i(GSVector4i::load<false>(&src[srcpitch * 2]), GSVector4i::load<false>(&src[srcpitch * 3]));
+
+		v0 = v0.shuffle8(GSVector8i::broadcast128(m_w4mask)).acbd();
+		v1 = v1.shuffle8(GSVector8i::broadcast128(m_w4mask)).acbd();
+
+		if ((i & 1) == 0)
+		{
+			v0 = v0.xzyw();
+			v1 = v1.ywxz();
+		}
+		else
+		{
+			v0 = v0.ywxz();
+			v1 = v1.xzyw();
+		}
+
+		GSVector8i::mix4(v0, v1);
+		GSVector8i::sw32(v0, v1);
+
+		((GSVector8i*)dst)[i * 2 + 0] = v0;
+		((GSVector8i*)dst)[i * 2 + 1] = v1;
+
+#else
+
 		GSVector4i v0 = GSVector4i::load<alignment != 0>(&src[srcpitch * 0]);
 		GSVector4i v1 = GSVector4i::load<alignment != 0>(&src[srcpitch * 1]);
 		GSVector4i v2 = GSVector4i::load<alignment != 0>(&src[srcpitch * 2]);
@@ -298,6 +324,8 @@ public:
 		((GSVector4i*)dst)[i * 4 + 1] = v1;
 		((GSVector4i*)dst)[i * 4 + 2] = v2;
 		((GSVector4i*)dst)[i * 4 + 3] = v3;
+
+#endif
 	}

 	template <int alignment, u32 mask>