GS: Use broadcast loads on AVX2

Broadcast loads are free on AVX2 processors, so we might as well use them.
2021-04-05 19:51:01 -05:00 · 2021-04-05 19:51:01 -05:00 · 1f6b2e629b
parent 793ba944d6
commit 1f6b2e629b
2 changed files with 8 additions and 12 deletions
--- a/pcsx2/GS/GSBlock.cpp
+++ b/pcsx2/GS/GSBlock.cpp
@@ -16,11 +16,7 @@
 #include "PrecompiledHeader.h"
 #include "GSBlock.h"
 #if _M_SSE >= 0x501
 CONSTINIT const GSVector8i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
 #else
 CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
 #endif
 CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15);
 CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
--- a/pcsx2/GS/GSBlock.h
+++ b/pcsx2/GS/GSBlock.h
@@ -21,11 +21,7 @@
 class GSBlock
 {
 #if _M_SSE >= 0x501
 	static const GSVector8i m_r16mask;
 #else
 	static const GSVector4i m_r16mask;
 #endif
 	static const GSVector4i m_r8mask;
 	static const GSVector4i m_r4mask;
@@ -490,8 +486,10 @@ public:
 		const GSVector8i* s = (const GSVector8i*)src;
-		GSVector8i v0 = s[i * 2 + 0].shuffle8(m_r16mask);
+		GSVector8i mask = GSVector8i::broadcast128(m_r16mask);
-		GSVector8i v1 = s[i * 2 + 1].shuffle8(m_r16mask);
+
 		GSVector8i v0 = s[i * 2 + 0].shuffle8(mask);
 		GSVector8i v1 = s[i * 2 + 1].shuffle8(mask);
 		GSVector8i::sw128(v0, v1);
 		GSVector8i::sw32(v0, v1);
@@ -1637,10 +1635,12 @@ public:
 		GSVector8i TA0(TEXA.TA0 << 24);
 		GSVector8i TA1(TEXA.TA1 << 24);
 		GSVector8i mask = GSVector8i::broadcast128(m_r16mask);
 		for (int i = 0; i < 4; i++, dst += dstpitch * 2)
 		{
-			GSVector8i v0 = s[i * 2 + 0].shuffle8(m_r16mask);
+			GSVector8i v0 = s[i * 2 + 0].shuffle8(mask);
-			GSVector8i v1 = s[i * 2 + 1].shuffle8(m_r16mask);
+			GSVector8i v1 = s[i * 2 + 1].shuffle8(mask);
 			GSVector8i::sw128(v0, v1);
 			GSVector8i::sw32(v0, v1);