From 1f6b2e629bc46bfa71d1896cf9be581bdc080f2d Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Mon, 5 Apr 2021 19:51:01 -0500 Subject: [PATCH] GS: Use broadcast loads on AVX2 Broadcast loads are free on AVX2 processors, might as well use them --- pcsx2/GS/GSBlock.cpp | 4 ---- pcsx2/GS/GSBlock.h | 16 ++++++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/pcsx2/GS/GSBlock.cpp b/pcsx2/GS/GSBlock.cpp index f40ef473aa..51349c23cd 100644 --- a/pcsx2/GS/GSBlock.cpp +++ b/pcsx2/GS/GSBlock.cpp @@ -16,11 +16,7 @@ #include "PrecompiledHeader.h" #include "GSBlock.h" -#if _M_SSE >= 0x501 -CONSTINIT const GSVector8i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15); -#else CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15); -#endif CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15); CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); diff --git a/pcsx2/GS/GSBlock.h b/pcsx2/GS/GSBlock.h index f9f548f2b7..c37c6e394b 100644 --- a/pcsx2/GS/GSBlock.h +++ b/pcsx2/GS/GSBlock.h @@ -21,11 +21,7 @@ class GSBlock { -#if _M_SSE >= 0x501 - static const GSVector8i m_r16mask; -#else static const GSVector4i m_r16mask; -#endif static const GSVector4i m_r8mask; static const GSVector4i m_r4mask; @@ -490,8 +486,10 @@ public: const GSVector8i* s = (const GSVector8i*)src; - GSVector8i v0 = s[i * 2 + 0].shuffle8(m_r16mask); - GSVector8i v1 = s[i * 2 + 1].shuffle8(m_r16mask); + GSVector8i mask = GSVector8i::broadcast128(m_r16mask); + + GSVector8i v0 = s[i * 2 + 0].shuffle8(mask); + GSVector8i v1 = s[i * 2 + 1].shuffle8(mask); GSVector8i::sw128(v0, v1); GSVector8i::sw32(v0, v1); @@ -1637,10 +1635,12 @@ public: GSVector8i TA0(TEXA.TA0 << 24); GSVector8i TA1(TEXA.TA1 << 24); + GSVector8i mask = GSVector8i::broadcast128(m_r16mask); + for (int i = 0; i < 4; i++, dst += dstpitch * 2) { - GSVector8i v0 = s[i * 2 + 0].shuffle8(m_r16mask); - GSVector8i v1 = s[i * 2 + 1].shuffle8(m_r16mask); + GSVector8i v0 = s[i * 2 + 0].shuffle8(mask); + GSVector8i v1 = s[i * 2 + 1].shuffle8(mask); GSVector8i::sw128(v0, v1); GSVector8i::sw32(v0, v1);