GS: ReadColumn8 AVX2 path

This commit is contained in:
TellowKrinkle 2021-04-06 01:17:40 -05:00 committed by refractionpcsx2
parent 244a4da28a
commit bdc7dc2cd8
3 changed files with 29 additions and 13 deletions

View File

@ -20,6 +20,9 @@ CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6,
// Constant byte-index tables of GSBlock. Each one lists 16 byte indices in the range 0..15.
CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15);
CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
// Shuffle table for the AVX2 ReadColumn8 path: it is broadcast to both 128-bit lanes
// (GSVector8i::broadcast128) and applied with shuffle8. The indices are listed as four
// groups {k, k+4, k+8, k+12} for k = 0, 1, 2, 3.
CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask1(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
// Companion table used the same way on the second input vector. It contains the same four
// groups as m_avx2_r8mask1, with each adjacent pair of groups exchanged (k = 1, 0, 3, 2).
CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask2(1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15, 2, 6, 10, 14);
// Tables in which every listed byte index is repeated four times; each successive table is
// the previous one with all indices increased by 2.
// NOTE(review): the code that consumes these tables is not visible here - confirm intended use.
CONSTINIT const GSVector4i GSBlock::m_uw8hmask0(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9);
CONSTINIT const GSVector4i GSBlock::m_uw8hmask1(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11);
CONSTINIT const GSVector4i GSBlock::m_uw8hmask2(4, 4, 4, 4, 5, 5, 5, 5, 12, 12, 12, 12, 13, 13, 13, 13);

View File

@ -25,6 +25,9 @@ class GSBlock
// Constant GSVector4i tables shared by the block routines of this class.
static const GSVector4i m_r8mask;
static const GSVector4i m_r4mask;
// Shuffle tables for the AVX2 ReadColumn8 path; each is passed through
// GSVector8i::broadcast128 and then applied with shuffle8 (mask1 on the first
// input vector, mask2 on the second).
static const GSVector4i m_avx2_r8mask1;
static const GSVector4i m_avx2_r8mask2;
// NOTE(review): the users of the m_uw8hmask* tables are not visible here - confirm intended use.
static const GSVector4i m_uw8hmask0;
static const GSVector4i m_uw8hmask1;
static const GSVector4i m_uw8hmask2;
@ -512,30 +515,34 @@ public:
//for(int j = 0; j < 64; j++) ((u8*)src)[j] = (u8)j;
#if 0 //_M_SSE >= 0x501
#if _M_SSE >= 0x501
const GSVector8i* s = (const GSVector8i*)src;
GSVector8i v0 = s[i * 2 + 0];
GSVector8i v1 = s[i * 2 + 1];
GSVector8i v0, v1;
GSVector8i::sw8(v0, v1);
GSVector8i::sw16(v0, v1);
GSVector8i::sw8(v0, v1);
GSVector8i::sw128(v0, v1);
GSVector8i::sw16(v0, v1);
if ((i & 1) == 0)
{
v0 = s[i * 2 + 0];
v1 = s[i * 2 + 1];
}
else
{
v1 = s[i * 2 + 0];
v0 = s[i * 2 + 1];
}
v0 = v0.acbd();
v1 = v1.acbd();
v1 = v1.yxwz();
GSVector8i v2 = v0.acbd().shuffle8(GSVector8i::broadcast128(m_avx2_r8mask1));
GSVector8i v3 = v1.acbd().shuffle8(GSVector8i::broadcast128(m_avx2_r8mask2));
v0 = v2.blend32<0xaa>(v3);
v1 = v3.blend32<0xaa>(v2);
GSVector8i::storel(&dst[dstpitch * 0], v0);
GSVector8i::storeh(&dst[dstpitch * 1], v0);
GSVector8i::storel(&dst[dstpitch * 2], v1);
GSVector8i::storeh(&dst[dstpitch * 3], v1);
// TODO: not sure if this is worth it, not in this form, there should be a shorter path
#else
const GSVector4i* s = (const GSVector4i*)src;

View File

@ -316,6 +316,12 @@ public:
return GSVector8i(_mm256_blend_epi16(m, a, mask));
}
// Blends this vector with `a` at 32-bit element granularity using the
// compile-time immediate `mask` (forwarded unchanged to _mm256_blend_epi32):
// element i of the result comes from `a` when bit i of `mask` is set,
// otherwise from this vector.
template <int mask>
__forceinline GSVector8i blend32(const GSVector8i& a) const
{
const __m256i blended = _mm256_blend_epi32(m, a, mask);
return GSVector8i(blended);
}
__forceinline GSVector8i blend(const GSVector8i& a, const GSVector8i& mask) const
{
return GSVector8i(_mm256_or_si256(_mm256_andnot_si256(mask, m), _mm256_and_si256(mask, a)));