GS: Minor Zen optimizations

This commit is contained in:
TellowKrinkle 2021-04-21 01:41:03 -05:00 committed by refractionpcsx2
parent a6887715c7
commit cbf4a83b3f
2 changed files with 27 additions and 29 deletions

View File

@ -37,6 +37,20 @@ class GSBlock
static const GSVector4i m_uw8hmask2;
static const GSVector4i m_uw8hmask3;
#if _M_SSE >= 0x501
// Equivalent of `a = *s0; b = *s1; sw128(a, b);`
// Performs four 128-bit loads and assembles the results directly, so no
// cross-lane shuffle is needed afterwards:
//   a = { first 128 bits of s0, first 128 bits of s1 }
//   b = { second 128 bits of s0, second 128 bits of s1 }
// Especially good for Zen/Zen+, as it's replacing a very expensive vperm2i128
// The `Aligned` template argument is forwarded unchanged to every GSVector4i::load
template <bool Aligned = true>
static void LoadSW128(GSVector8i& a, GSVector8i& b, const void* s0, const void* s1)
{
	// View each 256-bit source as two consecutive 128-bit halves
	const GSVector4i* const half0 = static_cast<const GSVector4i*>(s0);
	const GSVector4i* const half1 = static_cast<const GSVector4i*>(s1);

	// First halves of both sources make up `a`
	GSVector4i first0 = GSVector4i::load<Aligned>(&half0[0]);
	GSVector4i first1 = GSVector4i::load<Aligned>(&half1[0]);
	a = GSVector8i(first0, first1);

	// Second halves of both sources make up `b`
	GSVector4i second0 = GSVector4i::load<Aligned>(&half0[1]);
	GSVector4i second1 = GSVector4i::load<Aligned>(&half1[1]);
	b = GSVector8i(second0, second1);
}
#endif
public:
template <int i, int alignment, u32 mask>
__forceinline static void WriteColumn32(u8* RESTRICT dst, const u8* RESTRICT src, int srcpitch)
@ -146,10 +160,9 @@ public:
#if _M_SSE >= 0x501
GSVector8i v0 = GSVector8i::load<false>(s0);
GSVector8i v1 = GSVector8i::load<false>(s1);
GSVector8i v0, v1;
GSVector8i::sw128(v0, v1);
LoadSW128<false>(v0, v1, s0, s1);
GSVector8i::sw16(v0, v1);
v0 = v0.acbd();
@ -435,10 +448,9 @@ public:
const GSVector8i* s = (const GSVector8i*)src;
GSVector8i v0 = s[i * 2 + 0];
GSVector8i v1 = s[i * 2 + 1];
GSVector8i v0, v1;
GSVector8i::sw128(v0, v1);
LoadSW128(v0, v1, &s[i * 2 + 0], &s[i * 2 + 1]);
GSVector8i::sw64(v0, v1);
GSVector8i::store<true>(&dst[dstpitch * 0], v0);
@ -1499,10 +1511,9 @@ public:
for (int i = 0; i < 4; i++, dst += dstpitch * 2)
{
GSVector8i v0 = s[i * 2 + 0];
GSVector8i v1 = s[i * 2 + 1];
GSVector8i v0, v1;
GSVector8i::sw128(v0, v1);
LoadSW128(v0, v1, &s[i * 2 + 0], &s[i * 2 + 1]);
GSVector8i::sw64(v0, v1);
GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0];
@ -1575,10 +1586,7 @@ public:
GSVector8i* d2 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 2);
GSVector8i* d3 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 3);
v0 = s[i * 4 + 0];
v1 = s[i * 4 + 1];
GSVector8i::sw128(v0, v1);
LoadSW128(v0, v1, &s[i * 4 + 0], &s[i * 4 + 1]);
GSVector8i::sw64(v0, v1);
d0[0] = ((v0 ) & mask).gather32_32(pal);
@ -1599,10 +1607,7 @@ public:
d2 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 2);
d3 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 3);
v1 = s[i * 4 + 2];
v0 = s[i * 4 + 3];
GSVector8i::sw128(v0, v1);
LoadSW128(v0, v1, &s[i * 4 + 3], &s[i * 4 + 2]);
GSVector8i::sw64(v0, v1);
d0[0] = ((v0 ) & mask).gather32_32(pal);
@ -1727,10 +1732,7 @@ public:
GSVector8i* d2 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 2);
GSVector8i* d3 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 3);
v0 = s[i * 4 + 0];
v1 = s[i * 4 + 1];
GSVector8i::sw128(v0, v1);
LoadSW128(v0, v1, &s[i * 4 + 0], &s[i * 4 + 1]);
GSVector8i::sw64(v0, v1);
v0 = v0.shuffle8(shuf);
@ -1750,10 +1752,7 @@ public:
d2 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 2);
d3 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 3);
v1 = s[i * 4 + 2];
v0 = s[i * 4 + 3];
GSVector8i::sw128(v0, v1);
LoadSW128(v0, v1, &s[i * 4 + 3], &s[i * 4 + 2]);
GSVector8i::sw64(v0, v1);
v0 = v0.shuffle8(shuf);
@ -1871,10 +1870,9 @@ public:
const GSVector8i* s = (const GSVector8i*)src;
for (int i = 0; i < 4; i++)
{
GSVector8i v0 = s[i * 2 + 0];
GSVector8i v1 = s[i * 2 + 1];
GSVector8i v0, v1;
GSVector8i::sw128(v0, v1);
LoadSW128(v0, v1, &s[i * 2 + 0], &s[i * 2 + 1]);
GSVector8i::sw64(v0, v1);
*reinterpret_cast<GSVector8i*>(dst) = (v0 >> 24).gather32_32(pal);

View File

@ -1333,7 +1333,7 @@ public:
GSVector8i c = a;
GSVector8i d = b;
a = c.ac(d);
a = c.insert<1>(d.extract<0>()); // Should become a single vinserti128, faster on Zen+
b = c.bd(d);
}