mirror of https://github.com/PCSX2/pcsx2.git
GS: Improved ReadColumn4
This commit is contained in:
parent
4139da82b9
commit
c4b3239e33
|
@ -18,7 +18,7 @@
|
||||||
|
|
||||||
CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
|
CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
|
||||||
CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15);
|
CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15);
|
||||||
CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
|
CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15);
|
||||||
|
|
||||||
CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask1(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
|
CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask1(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
|
||||||
CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask2(1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15, 2, 6, 10, 14);
|
CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask2(1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15, 2, 6, 10, 14);
|
||||||
|
|
|
@ -552,35 +552,74 @@ public:
|
||||||
{
|
{
|
||||||
//printf("ReadColumn4\n");
|
//printf("ReadColumn4\n");
|
||||||
|
|
||||||
const GSVector4i* s = (const GSVector4i*)src;
|
#if _M_SSE >= 0x501
|
||||||
|
|
||||||
GSVector4i v0 = s[i * 4 + 0].xzyw();
|
const GSVector8i* s = (const GSVector8i*)src;
|
||||||
GSVector4i v1 = s[i * 4 + 1].xzyw();
|
|
||||||
GSVector4i v2 = s[i * 4 + 2].xzyw();
|
|
||||||
GSVector4i v3 = s[i * 4 + 3].xzyw();
|
|
||||||
|
|
||||||
GSVector4i::sw64(v0, v1, v2, v3);
|
GSVector8i v0 = s[i * 2 + 0];
|
||||||
GSVector4i::sw4(v0, v2, v1, v3);
|
GSVector8i v1 = s[i * 2 + 1];
|
||||||
GSVector4i::sw8(v0, v1, v2, v3);
|
|
||||||
|
|
||||||
v0 = v0.shuffle8(m_r4mask);
|
GSVector8i::sw32_inv(v0, v1);
|
||||||
v1 = v1.shuffle8(m_r4mask);
|
GSVector8i::mix4(v0, v1);
|
||||||
v2 = v2.shuffle8(m_r4mask);
|
|
||||||
v3 = v3.shuffle8(m_r4mask);
|
|
||||||
|
|
||||||
if ((i & 1) == 0)
|
if ((i & 1) == 0)
|
||||||
{
|
{
|
||||||
GSVector4i::sw16rh(v0, v1, v2, v3);
|
v0 = v0.xzyw();
|
||||||
|
v1 = v1.zxwy();
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
GSVector4i::sw16rl(v0, v1, v2, v3);
|
v0 = v0.zxwy();
|
||||||
|
v1 = v1.xzyw();
|
||||||
|
}
|
||||||
|
|
||||||
|
v0 = v0.acbd().shuffle8(GSVector8i::broadcast128(m_r4mask));
|
||||||
|
v1 = v1.acbd().shuffle8(GSVector8i::broadcast128(m_r4mask));
|
||||||
|
|
||||||
|
GSVector8i::storel(&dst[dstpitch * 0], v0);
|
||||||
|
GSVector8i::storeh(&dst[dstpitch * 1], v0);
|
||||||
|
GSVector8i::storel(&dst[dstpitch * 2], v1);
|
||||||
|
GSVector8i::storeh(&dst[dstpitch * 3], v1);
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
const GSVector4i* s = (const GSVector4i*)src;
|
||||||
|
|
||||||
|
GSVector4i v0 = s[i * 4 + 0];
|
||||||
|
GSVector4i v1 = s[i * 4 + 1];
|
||||||
|
GSVector4i v2 = s[i * 4 + 2];
|
||||||
|
GSVector4i v3 = s[i * 4 + 3];
|
||||||
|
|
||||||
|
GSVector4i::sw32_inv(v0, v1, v2, v3);
|
||||||
|
GSVector4i::mix4(v0, v1);
|
||||||
|
GSVector4i::mix4(v2, v3);
|
||||||
|
|
||||||
|
GSVector4 v0f = GSVector4::cast(v0);
|
||||||
|
GSVector4 v1f = GSVector4::cast(v1);
|
||||||
|
GSVector4 v2f = GSVector4::cast(v2);
|
||||||
|
GSVector4 v3f = GSVector4::cast(v3);
|
||||||
|
|
||||||
|
if ((i & 1) == 0)
|
||||||
|
{
|
||||||
|
v0 = GSVector4i::cast(v0f.xzxz(v2f)).shuffle8(m_r4mask);
|
||||||
|
v1 = GSVector4i::cast(v0f.ywyw(v2f)).shuffle8(m_r4mask);
|
||||||
|
v2 = GSVector4i::cast(v1f.zxzx(v3f)).shuffle8(m_r4mask);
|
||||||
|
v3 = GSVector4i::cast(v1f.wywy(v3f)).shuffle8(m_r4mask);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
v0 = GSVector4i::cast(v0f.zxzx(v2f)).shuffle8(m_r4mask);
|
||||||
|
v1 = GSVector4i::cast(v0f.wywy(v2f)).shuffle8(m_r4mask);
|
||||||
|
v2 = GSVector4i::cast(v1f.xzxz(v3f)).shuffle8(m_r4mask);
|
||||||
|
v3 = GSVector4i::cast(v1f.ywyw(v3f)).shuffle8(m_r4mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
GSVector4i::store<true>(&dst[dstpitch * 0], v0);
|
GSVector4i::store<true>(&dst[dstpitch * 0], v0);
|
||||||
GSVector4i::store<true>(&dst[dstpitch * 1], v1);
|
GSVector4i::store<true>(&dst[dstpitch * 1], v1);
|
||||||
GSVector4i::store<true>(&dst[dstpitch * 2], v2);
|
GSVector4i::store<true>(&dst[dstpitch * 2], v2);
|
||||||
GSVector4i::store<true>(&dst[dstpitch * 3], v3);
|
GSVector4i::store<true>(&dst[dstpitch * 3], v3);
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ReadColumn32(int y, const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch)
|
static void ReadColumn32(int y, const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch)
|
||||||
|
@ -1727,25 +1766,28 @@ public:
|
||||||
const GSVector4i* s = (const GSVector4i*)src;
|
const GSVector4i* s = (const GSVector4i*)src;
|
||||||
|
|
||||||
GSVector4i v0, v1, v2, v3;
|
GSVector4i v0, v1, v2, v3;
|
||||||
GSVector4i mask = m_r4mask;
|
GSVector4 v0f, v1f, v2f, v3f;
|
||||||
|
|
||||||
for (int i = 0; i < 2; i++)
|
for (int i = 0; i < 2; i++)
|
||||||
{
|
{
|
||||||
v0 = s[i * 8 + 0].xzyw();
|
v0 = s[i * 8 + 0];
|
||||||
v1 = s[i * 8 + 1].xzyw();
|
v1 = s[i * 8 + 1];
|
||||||
v2 = s[i * 8 + 2].xzyw();
|
v2 = s[i * 8 + 2];
|
||||||
v3 = s[i * 8 + 3].xzyw();
|
v3 = s[i * 8 + 3];
|
||||||
|
|
||||||
GSVector4i::sw64(v0, v1, v2, v3);
|
GSVector4i::sw32_inv(v0, v1, v2, v3);
|
||||||
GSVector4i::sw4(v0, v2, v1, v3);
|
GSVector4i::mix4(v0, v1);
|
||||||
GSVector4i::sw8(v0, v1, v2, v3);
|
GSVector4i::mix4(v2, v3);
|
||||||
|
|
||||||
v0 = v0.shuffle8(mask);
|
v0f = GSVector4::cast(v0);
|
||||||
v1 = v1.shuffle8(mask);
|
v1f = GSVector4::cast(v1);
|
||||||
v2 = v2.shuffle8(mask);
|
v2f = GSVector4::cast(v2);
|
||||||
v3 = v3.shuffle8(mask);
|
v3f = GSVector4::cast(v3);
|
||||||
|
|
||||||
GSVector4i::sw16rh(v0, v1, v2, v3);
|
v0 = GSVector4i::cast(v0f.xzxz(v2f)).shuffle8(m_r4mask);
|
||||||
|
v1 = GSVector4i::cast(v0f.ywyw(v2f)).shuffle8(m_r4mask);
|
||||||
|
v2 = GSVector4i::cast(v1f.zxzx(v3f)).shuffle8(m_r4mask);
|
||||||
|
v3 = GSVector4i::cast(v1f.wywy(v3f)).shuffle8(m_r4mask);
|
||||||
|
|
||||||
v0.gather64_8<>(pal, (GSVector4i*)dst);
|
v0.gather64_8<>(pal, (GSVector4i*)dst);
|
||||||
dst += dstpitch;
|
dst += dstpitch;
|
||||||
|
@ -1756,21 +1798,24 @@ public:
|
||||||
v3.gather64_8<>(pal, (GSVector4i*)dst);
|
v3.gather64_8<>(pal, (GSVector4i*)dst);
|
||||||
dst += dstpitch;
|
dst += dstpitch;
|
||||||
|
|
||||||
v0 = s[i * 8 + 4].xzyw();
|
v0 = s[i * 8 + 4];
|
||||||
v1 = s[i * 8 + 5].xzyw();
|
v1 = s[i * 8 + 5];
|
||||||
v2 = s[i * 8 + 6].xzyw();
|
v2 = s[i * 8 + 6];
|
||||||
v3 = s[i * 8 + 7].xzyw();
|
v3 = s[i * 8 + 7];
|
||||||
|
|
||||||
GSVector4i::sw64(v0, v1, v2, v3);
|
GSVector4i::sw32_inv(v0, v1, v2, v3);
|
||||||
GSVector4i::sw4(v0, v2, v1, v3);
|
GSVector4i::mix4(v0, v1);
|
||||||
GSVector4i::sw8(v0, v1, v2, v3);
|
GSVector4i::mix4(v2, v3);
|
||||||
|
|
||||||
v0 = v0.shuffle8(mask);
|
v0f = GSVector4::cast(v0);
|
||||||
v1 = v1.shuffle8(mask);
|
v1f = GSVector4::cast(v1);
|
||||||
v2 = v2.shuffle8(mask);
|
v2f = GSVector4::cast(v2);
|
||||||
v3 = v3.shuffle8(mask);
|
v3f = GSVector4::cast(v3);
|
||||||
|
|
||||||
GSVector4i::sw16rl(v0, v1, v2, v3);
|
v0 = GSVector4i::cast(v0f.zxzx(v2f)).shuffle8(m_r4mask);
|
||||||
|
v1 = GSVector4i::cast(v0f.wywy(v2f)).shuffle8(m_r4mask);
|
||||||
|
v2 = GSVector4i::cast(v1f.xzxz(v3f)).shuffle8(m_r4mask);
|
||||||
|
v3 = GSVector4i::cast(v1f.ywyw(v3f)).shuffle8(m_r4mask);
|
||||||
|
|
||||||
v0.gather64_8<>(pal, (GSVector4i*)dst);
|
v0.gather64_8<>(pal, (GSVector4i*)dst);
|
||||||
dst += dstpitch;
|
dst += dstpitch;
|
||||||
|
|
|
@ -128,6 +128,19 @@ gsforceinline GSVector4::GSVector4(const GSVector4i& v)
|
||||||
m = _mm_cvtepi32_ps(v);
|
m = _mm_cvtepi32_ps(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gsforceinline void GSVector4i::sw32_inv(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
|
||||||
|
{
|
||||||
|
GSVector4 af = GSVector4::cast(a);
|
||||||
|
GSVector4 bf = GSVector4::cast(b);
|
||||||
|
GSVector4 cf = GSVector4::cast(c);
|
||||||
|
GSVector4 df = GSVector4::cast(d);
|
||||||
|
|
||||||
|
a = GSVector4i::cast(af.xzxz(cf));
|
||||||
|
b = GSVector4i::cast(af.ywyw(cf));
|
||||||
|
c = GSVector4i::cast(bf.xzxz(df));
|
||||||
|
d = GSVector4i::cast(bf.ywyw(df));
|
||||||
|
}
|
||||||
|
|
||||||
#if _M_SSE >= 0x501
|
#if _M_SSE >= 0x501
|
||||||
|
|
||||||
gsforceinline GSVector8i::GSVector8i(const GSVector8& v, bool truncate)
|
gsforceinline GSVector8i::GSVector8i(const GSVector8& v, bool truncate)
|
||||||
|
@ -140,6 +153,14 @@ gsforceinline GSVector8::GSVector8(const GSVector8i& v)
|
||||||
m = _mm256_cvtepi32_ps(v);
|
m = _mm256_cvtepi32_ps(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gsforceinline void GSVector8i::sw32_inv(GSVector8i& a, GSVector8i& b)
|
||||||
|
{
|
||||||
|
GSVector8 af = GSVector8::cast(a);
|
||||||
|
GSVector8 bf = GSVector8::cast(b);
|
||||||
|
a = GSVector8i::cast(af.xzxz(bf));
|
||||||
|
b = GSVector8i::cast(af.ywyw(bf));
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// casting
|
// casting
|
||||||
|
|
|
@ -1678,21 +1678,21 @@ public:
|
||||||
d = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0xDD));
|
d = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0xDD));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__forceinline static void mix4(GSVector4i& a, GSVector4i& b)
|
||||||
|
{
|
||||||
|
GSVector4i mask(_mm_set1_epi32(0x0f0f0f0f));
|
||||||
|
|
||||||
|
GSVector4i c = (b << 4).blend(a, mask);
|
||||||
|
GSVector4i d = b.blend(a >> 4, mask);
|
||||||
|
a = c;
|
||||||
|
b = d;
|
||||||
|
}
|
||||||
|
|
||||||
__forceinline static void sw4(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
|
__forceinline static void sw4(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
|
||||||
{
|
{
|
||||||
const __m128i epi32_0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
|
mix4(a, b);
|
||||||
|
mix4(c, d);
|
||||||
GSVector4i mask(epi32_0f0f0f0f);
|
sw8(a, b, c, d);
|
||||||
|
|
||||||
GSVector4i e = (b << 4).blend(a, mask);
|
|
||||||
GSVector4i f = b.blend(a >> 4, mask);
|
|
||||||
GSVector4i g = (d << 4).blend(c, mask);
|
|
||||||
GSVector4i h = d.blend(c >> 4, mask);
|
|
||||||
|
|
||||||
a = e.upl8(f);
|
|
||||||
c = e.uph8(f);
|
|
||||||
b = g.upl8(h);
|
|
||||||
d = g.uph8(h);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__forceinline static void sw8(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
|
__forceinline static void sw8(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
|
||||||
|
@ -1750,6 +1750,8 @@ public:
|
||||||
d = f.uph32(d);
|
d = f.uph32(d);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__forceinline static void sw32_inv(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d);
|
||||||
|
|
||||||
__forceinline static void sw64(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
|
__forceinline static void sw64(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
|
||||||
{
|
{
|
||||||
GSVector4i e = a;
|
GSVector4i e = a;
|
||||||
|
|
|
@ -1255,6 +1255,16 @@ public:
|
||||||
|
|
||||||
// TODO: swizzling
|
// TODO: swizzling
|
||||||
|
|
||||||
|
__forceinline static void mix4(GSVector8i& a, GSVector8i& b)
|
||||||
|
{
|
||||||
|
GSVector8i mask(_mm256_set1_epi32(0x0f0f0f0f));
|
||||||
|
|
||||||
|
GSVector8i c = (b << 4).blend(a, mask);
|
||||||
|
GSVector8i d = b.blend(a >> 4, mask);
|
||||||
|
a = c;
|
||||||
|
b = d;
|
||||||
|
}
|
||||||
|
|
||||||
__forceinline static void sw8(GSVector8i& a, GSVector8i& b)
|
__forceinline static void sw8(GSVector8i& a, GSVector8i& b)
|
||||||
{
|
{
|
||||||
GSVector8i c = a;
|
GSVector8i c = a;
|
||||||
|
@ -1282,6 +1292,8 @@ public:
|
||||||
b = c.uph32(d);
|
b = c.uph32(d);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__forceinline static void sw32_inv(GSVector8i& a, GSVector8i& b);
|
||||||
|
|
||||||
__forceinline static void sw64(GSVector8i& a, GSVector8i& b)
|
__forceinline static void sw64(GSVector8i& a, GSVector8i& b)
|
||||||
{
|
{
|
||||||
GSVector8i c = a;
|
GSVector8i c = a;
|
||||||
|
|
Loading…
Reference in New Issue