GS: Use broadcast loads on AVX2

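Broadcast loads are free on AVX2 processors, so we might as well use them: m_r16mask is now always a 128-bit GSVector4i, and the AVX2 code paths widen it with GSVector8i::broadcast128 instead of keeping a separate, pre-duplicated 256-bit GSVector8i copy.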
This commit is contained in:
TellowKrinkle 2021-04-05 19:51:01 -05:00 committed by refractionpcsx2
parent 793ba944d6
commit 1f6b2e629b
2 changed files with 8 additions and 12 deletions

View File

@@ -16,11 +16,7 @@
#include "PrecompiledHeader.h" #include "PrecompiledHeader.h"
#include "GSBlock.h" #include "GSBlock.h"
#if _M_SSE >= 0x501
CONSTINIT const GSVector8i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
#else
CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15); CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
#endif
CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15); CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15);
CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);

View File

@@ -21,11 +21,7 @@
class GSBlock class GSBlock
{ {
#if _M_SSE >= 0x501
static const GSVector8i m_r16mask;
#else
static const GSVector4i m_r16mask; static const GSVector4i m_r16mask;
#endif
static const GSVector4i m_r8mask; static const GSVector4i m_r8mask;
static const GSVector4i m_r4mask; static const GSVector4i m_r4mask;
@@ -490,8 +486,10 @@ public:
const GSVector8i* s = (const GSVector8i*)src; const GSVector8i* s = (const GSVector8i*)src;
GSVector8i v0 = s[i * 2 + 0].shuffle8(m_r16mask); GSVector8i mask = GSVector8i::broadcast128(m_r16mask);
GSVector8i v1 = s[i * 2 + 1].shuffle8(m_r16mask);
GSVector8i v0 = s[i * 2 + 0].shuffle8(mask);
GSVector8i v1 = s[i * 2 + 1].shuffle8(mask);
GSVector8i::sw128(v0, v1); GSVector8i::sw128(v0, v1);
GSVector8i::sw32(v0, v1); GSVector8i::sw32(v0, v1);
@@ -1637,10 +1635,12 @@ public:
GSVector8i TA0(TEXA.TA0 << 24); GSVector8i TA0(TEXA.TA0 << 24);
GSVector8i TA1(TEXA.TA1 << 24); GSVector8i TA1(TEXA.TA1 << 24);
GSVector8i mask = GSVector8i::broadcast128(m_r16mask);
for (int i = 0; i < 4; i++, dst += dstpitch * 2) for (int i = 0; i < 4; i++, dst += dstpitch * 2)
{ {
GSVector8i v0 = s[i * 2 + 0].shuffle8(m_r16mask); GSVector8i v0 = s[i * 2 + 0].shuffle8(mask);
GSVector8i v1 = s[i * 2 + 1].shuffle8(m_r16mask); GSVector8i v1 = s[i * 2 + 1].shuffle8(mask);
GSVector8i::sw128(v0, v1); GSVector8i::sw128(v0, v1);
GSVector8i::sw32(v0, v1); GSVector8i::sw32(v0, v1);