GS: Improved ReadColumn4

This commit is contained in:
TellowKrinkle 2021-04-06 04:32:36 -05:00 committed by refractionpcsx2
parent 4139da82b9
commit c4b3239e33
5 changed files with 133 additions and 53 deletions

View File

@ -18,7 +18,7 @@
CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15); CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15);
CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15);
CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask1(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask1(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask2(1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15, 2, 6, 10, 14); CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask2(1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15, 2, 6, 10, 14);

View File

@ -552,35 +552,74 @@ public:
{ {
//printf("ReadColumn4\n"); //printf("ReadColumn4\n");
const GSVector4i* s = (const GSVector4i*)src; #if _M_SSE >= 0x501
GSVector4i v0 = s[i * 4 + 0].xzyw(); const GSVector8i* s = (const GSVector8i*)src;
GSVector4i v1 = s[i * 4 + 1].xzyw();
GSVector4i v2 = s[i * 4 + 2].xzyw();
GSVector4i v3 = s[i * 4 + 3].xzyw();
GSVector4i::sw64(v0, v1, v2, v3); GSVector8i v0 = s[i * 2 + 0];
GSVector4i::sw4(v0, v2, v1, v3); GSVector8i v1 = s[i * 2 + 1];
GSVector4i::sw8(v0, v1, v2, v3);
v0 = v0.shuffle8(m_r4mask); GSVector8i::sw32_inv(v0, v1);
v1 = v1.shuffle8(m_r4mask); GSVector8i::mix4(v0, v1);
v2 = v2.shuffle8(m_r4mask);
v3 = v3.shuffle8(m_r4mask);
if ((i & 1) == 0) if ((i & 1) == 0)
{ {
GSVector4i::sw16rh(v0, v1, v2, v3); v0 = v0.xzyw();
v1 = v1.zxwy();
} }
else else
{ {
GSVector4i::sw16rl(v0, v1, v2, v3); v0 = v0.zxwy();
v1 = v1.xzyw();
}
v0 = v0.acbd().shuffle8(GSVector8i::broadcast128(m_r4mask));
v1 = v1.acbd().shuffle8(GSVector8i::broadcast128(m_r4mask));
GSVector8i::storel(&dst[dstpitch * 0], v0);
GSVector8i::storeh(&dst[dstpitch * 1], v0);
GSVector8i::storel(&dst[dstpitch * 2], v1);
GSVector8i::storeh(&dst[dstpitch * 3], v1);
#else
const GSVector4i* s = (const GSVector4i*)src;
GSVector4i v0 = s[i * 4 + 0];
GSVector4i v1 = s[i * 4 + 1];
GSVector4i v2 = s[i * 4 + 2];
GSVector4i v3 = s[i * 4 + 3];
GSVector4i::sw32_inv(v0, v1, v2, v3);
GSVector4i::mix4(v0, v1);
GSVector4i::mix4(v2, v3);
GSVector4 v0f = GSVector4::cast(v0);
GSVector4 v1f = GSVector4::cast(v1);
GSVector4 v2f = GSVector4::cast(v2);
GSVector4 v3f = GSVector4::cast(v3);
if ((i & 1) == 0)
{
v0 = GSVector4i::cast(v0f.xzxz(v2f)).shuffle8(m_r4mask);
v1 = GSVector4i::cast(v0f.ywyw(v2f)).shuffle8(m_r4mask);
v2 = GSVector4i::cast(v1f.zxzx(v3f)).shuffle8(m_r4mask);
v3 = GSVector4i::cast(v1f.wywy(v3f)).shuffle8(m_r4mask);
}
else
{
v0 = GSVector4i::cast(v0f.zxzx(v2f)).shuffle8(m_r4mask);
v1 = GSVector4i::cast(v0f.wywy(v2f)).shuffle8(m_r4mask);
v2 = GSVector4i::cast(v1f.xzxz(v3f)).shuffle8(m_r4mask);
v3 = GSVector4i::cast(v1f.ywyw(v3f)).shuffle8(m_r4mask);
} }
GSVector4i::store<true>(&dst[dstpitch * 0], v0); GSVector4i::store<true>(&dst[dstpitch * 0], v0);
GSVector4i::store<true>(&dst[dstpitch * 1], v1); GSVector4i::store<true>(&dst[dstpitch * 1], v1);
GSVector4i::store<true>(&dst[dstpitch * 2], v2); GSVector4i::store<true>(&dst[dstpitch * 2], v2);
GSVector4i::store<true>(&dst[dstpitch * 3], v3); GSVector4i::store<true>(&dst[dstpitch * 3], v3);
#endif
} }
static void ReadColumn32(int y, const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch) static void ReadColumn32(int y, const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch)
@ -1727,25 +1766,28 @@ public:
const GSVector4i* s = (const GSVector4i*)src; const GSVector4i* s = (const GSVector4i*)src;
GSVector4i v0, v1, v2, v3; GSVector4i v0, v1, v2, v3;
GSVector4i mask = m_r4mask; GSVector4 v0f, v1f, v2f, v3f;
for (int i = 0; i < 2; i++) for (int i = 0; i < 2; i++)
{ {
v0 = s[i * 8 + 0].xzyw(); v0 = s[i * 8 + 0];
v1 = s[i * 8 + 1].xzyw(); v1 = s[i * 8 + 1];
v2 = s[i * 8 + 2].xzyw(); v2 = s[i * 8 + 2];
v3 = s[i * 8 + 3].xzyw(); v3 = s[i * 8 + 3];
GSVector4i::sw64(v0, v1, v2, v3); GSVector4i::sw32_inv(v0, v1, v2, v3);
GSVector4i::sw4(v0, v2, v1, v3); GSVector4i::mix4(v0, v1);
GSVector4i::sw8(v0, v1, v2, v3); GSVector4i::mix4(v2, v3);
v0 = v0.shuffle8(mask); v0f = GSVector4::cast(v0);
v1 = v1.shuffle8(mask); v1f = GSVector4::cast(v1);
v2 = v2.shuffle8(mask); v2f = GSVector4::cast(v2);
v3 = v3.shuffle8(mask); v3f = GSVector4::cast(v3);
GSVector4i::sw16rh(v0, v1, v2, v3); v0 = GSVector4i::cast(v0f.xzxz(v2f)).shuffle8(m_r4mask);
v1 = GSVector4i::cast(v0f.ywyw(v2f)).shuffle8(m_r4mask);
v2 = GSVector4i::cast(v1f.zxzx(v3f)).shuffle8(m_r4mask);
v3 = GSVector4i::cast(v1f.wywy(v3f)).shuffle8(m_r4mask);
v0.gather64_8<>(pal, (GSVector4i*)dst); v0.gather64_8<>(pal, (GSVector4i*)dst);
dst += dstpitch; dst += dstpitch;
@ -1756,21 +1798,24 @@ public:
v3.gather64_8<>(pal, (GSVector4i*)dst); v3.gather64_8<>(pal, (GSVector4i*)dst);
dst += dstpitch; dst += dstpitch;
v0 = s[i * 8 + 4].xzyw(); v0 = s[i * 8 + 4];
v1 = s[i * 8 + 5].xzyw(); v1 = s[i * 8 + 5];
v2 = s[i * 8 + 6].xzyw(); v2 = s[i * 8 + 6];
v3 = s[i * 8 + 7].xzyw(); v3 = s[i * 8 + 7];
GSVector4i::sw64(v0, v1, v2, v3); GSVector4i::sw32_inv(v0, v1, v2, v3);
GSVector4i::sw4(v0, v2, v1, v3); GSVector4i::mix4(v0, v1);
GSVector4i::sw8(v0, v1, v2, v3); GSVector4i::mix4(v2, v3);
v0 = v0.shuffle8(mask); v0f = GSVector4::cast(v0);
v1 = v1.shuffle8(mask); v1f = GSVector4::cast(v1);
v2 = v2.shuffle8(mask); v2f = GSVector4::cast(v2);
v3 = v3.shuffle8(mask); v3f = GSVector4::cast(v3);
GSVector4i::sw16rl(v0, v1, v2, v3); v0 = GSVector4i::cast(v0f.zxzx(v2f)).shuffle8(m_r4mask);
v1 = GSVector4i::cast(v0f.wywy(v2f)).shuffle8(m_r4mask);
v2 = GSVector4i::cast(v1f.xzxz(v3f)).shuffle8(m_r4mask);
v3 = GSVector4i::cast(v1f.ywyw(v3f)).shuffle8(m_r4mask);
v0.gather64_8<>(pal, (GSVector4i*)dst); v0.gather64_8<>(pal, (GSVector4i*)dst);
dst += dstpitch; dst += dstpitch;

View File

@ -128,6 +128,19 @@ gsforceinline GSVector4::GSVector4(const GSVector4i& v)
m = _mm_cvtepi32_ps(v); m = _mm_cvtepi32_ps(v);
} }
gsforceinline void GSVector4i::sw32_inv(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
{
GSVector4 af = GSVector4::cast(a);
GSVector4 bf = GSVector4::cast(b);
GSVector4 cf = GSVector4::cast(c);
GSVector4 df = GSVector4::cast(d);
a = GSVector4i::cast(af.xzxz(cf));
b = GSVector4i::cast(af.ywyw(cf));
c = GSVector4i::cast(bf.xzxz(df));
d = GSVector4i::cast(bf.ywyw(df));
}
#if _M_SSE >= 0x501 #if _M_SSE >= 0x501
gsforceinline GSVector8i::GSVector8i(const GSVector8& v, bool truncate) gsforceinline GSVector8i::GSVector8i(const GSVector8& v, bool truncate)
@ -140,6 +153,14 @@ gsforceinline GSVector8::GSVector8(const GSVector8i& v)
m = _mm256_cvtepi32_ps(v); m = _mm256_cvtepi32_ps(v);
} }
gsforceinline void GSVector8i::sw32_inv(GSVector8i& a, GSVector8i& b)
{
GSVector8 af = GSVector8::cast(a);
GSVector8 bf = GSVector8::cast(b);
a = GSVector8i::cast(af.xzxz(bf));
b = GSVector8i::cast(af.ywyw(bf));
}
#endif #endif
// casting // casting

View File

@ -1678,21 +1678,21 @@ public:
d = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0xDD)); d = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0xDD));
} }
__forceinline static void mix4(GSVector4i& a, GSVector4i& b)
{
GSVector4i mask(_mm_set1_epi32(0x0f0f0f0f));
GSVector4i c = (b << 4).blend(a, mask);
GSVector4i d = b.blend(a >> 4, mask);
a = c;
b = d;
}
__forceinline static void sw4(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d) __forceinline static void sw4(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
{ {
const __m128i epi32_0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f); mix4(a, b);
mix4(c, d);
GSVector4i mask(epi32_0f0f0f0f); sw8(a, b, c, d);
GSVector4i e = (b << 4).blend(a, mask);
GSVector4i f = b.blend(a >> 4, mask);
GSVector4i g = (d << 4).blend(c, mask);
GSVector4i h = d.blend(c >> 4, mask);
a = e.upl8(f);
c = e.uph8(f);
b = g.upl8(h);
d = g.uph8(h);
} }
__forceinline static void sw8(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d) __forceinline static void sw8(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
@ -1750,6 +1750,8 @@ public:
d = f.uph32(d); d = f.uph32(d);
} }
__forceinline static void sw32_inv(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d);
__forceinline static void sw64(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d) __forceinline static void sw64(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
{ {
GSVector4i e = a; GSVector4i e = a;

View File

@ -1255,6 +1255,16 @@ public:
// TODO: swizzling // TODO: swizzling
__forceinline static void mix4(GSVector8i& a, GSVector8i& b)
{
GSVector8i mask(_mm256_set1_epi32(0x0f0f0f0f));
GSVector8i c = (b << 4).blend(a, mask);
GSVector8i d = b.blend(a >> 4, mask);
a = c;
b = d;
}
__forceinline static void sw8(GSVector8i& a, GSVector8i& b) __forceinline static void sw8(GSVector8i& a, GSVector8i& b)
{ {
GSVector8i c = a; GSVector8i c = a;
@ -1282,6 +1292,8 @@ public:
b = c.uph32(d); b = c.uph32(d);
} }
__forceinline static void sw32_inv(GSVector8i& a, GSVector8i& b);
__forceinline static void sw64(GSVector8i& a, GSVector8i& b) __forceinline static void sw64(GSVector8i& a, GSVector8i& b)
{ {
GSVector8i c = a; GSVector8i c = a;