diff --git a/pcsx2/GS/GSBlock.h b/pcsx2/GS/GSBlock.h index c921dcdde9..33272fac86 100644 --- a/pcsx2/GS/GSBlock.h +++ b/pcsx2/GS/GSBlock.h @@ -37,6 +37,20 @@ class GSBlock static const GSVector4i m_uw8hmask2; static const GSVector4i m_uw8hmask3; +#if _M_SSE >= 0x501 + // Equivalent of `a = *s0; b = *s1; sw128(a, b);` + // Loads in two halves instead to reduce shuffle instructions + // Especially good for Zen/Zen+, as it's replacing a very expensive vperm2i128 + template + static void LoadSW128(GSVector8i& a, GSVector8i& b, const void* s0, const void* s1) + { + const GSVector4i* src0 = static_cast(s0); + const GSVector4i* src1 = static_cast(s1); + a = GSVector8i(GSVector4i::load(src0), GSVector4i::load(src1)); + b = GSVector8i(GSVector4i::load(src0 + 1), GSVector4i::load(src1 + 1)); + } +#endif + public: template __forceinline static void WriteColumn32(u8* RESTRICT dst, const u8* RESTRICT src, int srcpitch) @@ -146,10 +160,9 @@ public: #if _M_SSE >= 0x501 - GSVector8i v0 = GSVector8i::load(s0); - GSVector8i v1 = GSVector8i::load(s1); + GSVector8i v0, v1; - GSVector8i::sw128(v0, v1); + LoadSW128(v0, v1, s0, s1); GSVector8i::sw16(v0, v1); v0 = v0.acbd(); @@ -435,10 +448,9 @@ public: const GSVector8i* s = (const GSVector8i*)src; - GSVector8i v0 = s[i * 2 + 0]; - GSVector8i v1 = s[i * 2 + 1]; + GSVector8i v0, v1; - GSVector8i::sw128(v0, v1); + LoadSW128(v0, v1, &s[i * 2 + 0], &s[i * 2 + 1]); GSVector8i::sw64(v0, v1); GSVector8i::store(&dst[dstpitch * 0], v0); @@ -1499,10 +1511,9 @@ public: for (int i = 0; i < 4; i++, dst += dstpitch * 2) { - GSVector8i v0 = s[i * 2 + 0]; - GSVector8i v1 = s[i * 2 + 1]; + GSVector8i v0, v1; - GSVector8i::sw128(v0, v1); + LoadSW128(v0, v1, &s[i * 2 + 0], &s[i * 2 + 1]); GSVector8i::sw64(v0, v1); GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0]; @@ -1575,10 +1586,7 @@ public: GSVector8i* d2 = reinterpret_cast(dst + dstpitch * 2); GSVector8i* d3 = reinterpret_cast(dst + dstpitch * 3); - v0 = s[i * 4 + 0]; - v1 = s[i * 4 + 1]; - - 
GSVector8i::sw128(v0, v1); + LoadSW128(v0, v1, &s[i * 4 + 0], &s[i * 4 + 1]); GSVector8i::sw64(v0, v1); d0[0] = ((v0 ) & mask).gather32_32(pal); @@ -1599,10 +1607,7 @@ public: d2 = reinterpret_cast(dst + dstpitch * 2); d3 = reinterpret_cast(dst + dstpitch * 3); - v1 = s[i * 4 + 2]; - v0 = s[i * 4 + 3]; - - GSVector8i::sw128(v0, v1); + LoadSW128(v0, v1, &s[i * 4 + 3], &s[i * 4 + 2]); GSVector8i::sw64(v0, v1); d0[0] = ((v0 ) & mask).gather32_32(pal); @@ -1727,10 +1732,7 @@ public: GSVector8i* d2 = reinterpret_cast(dst + dstpitch * 2); GSVector8i* d3 = reinterpret_cast(dst + dstpitch * 3); - v0 = s[i * 4 + 0]; - v1 = s[i * 4 + 1]; - - GSVector8i::sw128(v0, v1); + LoadSW128(v0, v1, &s[i * 4 + 0], &s[i * 4 + 1]); GSVector8i::sw64(v0, v1); v0 = v0.shuffle8(shuf); @@ -1750,10 +1752,7 @@ public: d2 = reinterpret_cast(dst + dstpitch * 2); d3 = reinterpret_cast(dst + dstpitch * 3); - v1 = s[i * 4 + 2]; - v0 = s[i * 4 + 3]; - - GSVector8i::sw128(v0, v1); + LoadSW128(v0, v1, &s[i * 4 + 3], &s[i * 4 + 2]); GSVector8i::sw64(v0, v1); v0 = v0.shuffle8(shuf); @@ -1871,10 +1870,9 @@ public: const GSVector8i* s = (const GSVector8i*)src; for (int i = 0; i < 4; i++) { - GSVector8i v0 = s[i * 2 + 0]; - GSVector8i v1 = s[i * 2 + 1]; + GSVector8i v0, v1; - GSVector8i::sw128(v0, v1); + LoadSW128(v0, v1, &s[i * 2 + 0], &s[i * 2 + 1]); GSVector8i::sw64(v0, v1); *reinterpret_cast(dst) = (v0 >> 24).gather32_32(pal); diff --git a/pcsx2/GS/GSVector8i.h b/pcsx2/GS/GSVector8i.h index 784f7ad1f3..9442e29b7e 100644 --- a/pcsx2/GS/GSVector8i.h +++ b/pcsx2/GS/GSVector8i.h @@ -1333,7 +1333,7 @@ public: GSVector8i c = a; GSVector8i d = b; - a = c.ac(d); + a = c.insert<1>(d.extract<0>()); // Should become a single vinserti128, faster on Zen+ b = c.bd(d); }