From fd0c82c04ae1a1e9cf0e3cd3fa4519883248085f Mon Sep 17 00:00:00 2001 From: Stenzek Date: Tue, 30 Apr 2024 01:38:00 +1000 Subject: [PATCH] GS: Use insertps/blendps instead of shuffles Faster by one cycle on any CPU we care about these days. --- pcsx2/GS/GSVector4.h | 83 +++----------------------------------------- 1 file changed, 4 insertions(+), 79 deletions(-) diff --git a/pcsx2/GS/GSVector4.h b/pcsx2/GS/GSVector4.h index b5c61fbbaf..518c76ba68 100644 --- a/pcsx2/GS/GSVector4.h +++ b/pcsx2/GS/GSVector4.h @@ -515,92 +515,17 @@ public: template __forceinline GSVector4 insert32(const GSVector4& v) const { - // TODO: use blendps when src == dst - -#if 0 // _M_SSE >= 0x401 - - // NOTE: it's faster with shuffles... - - return GSVector4(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0))); - -#else - - switch (dst) - { - case 0: - switch (src) - { - case 0: return yyxx(v).zxzw(*this); - case 1: return yyyy(v).zxzw(*this); - case 2: return yyzz(v).zxzw(*this); - case 3: return yyww(v).zxzw(*this); - default: ASSUME(0); - } - break; - case 1: - switch (src) - { - case 0: return xxxx(v).xzzw(*this); - case 1: return xxyy(v).xzzw(*this); - case 2: return xxzz(v).xzzw(*this); - case 3: return xxww(v).xzzw(*this); - default: ASSUME(0); - } - break; - case 2: - switch (src) - { - case 0: return xyzx(wwxx(v)); - case 1: return xyzx(wwyy(v)); - case 2: return xyzx(wwzz(v)); - case 3: return xyzx(wwww(v)); - default: ASSUME(0); - } - break; - case 3: - switch (src) - { - case 0: return xyxz(zzxx(v)); - case 1: return xyxz(zzyy(v)); - case 2: return xyxz(zzzz(v)); - case 3: return xyxz(zzww(v)); - default: ASSUME(0); - } - break; - default: - ASSUME(0); - } - -#endif + if constexpr (src == dst) + return GSVector4(_mm_blend_ps(m, v.m, 1 << src)); + else + return GSVector4(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0))); } -#ifdef __linux__ -#if 0 - // Debug build error, _mm_extract_ps is actually a macro that use an anonymous union - // that contains i. I decide to rename the template on linux but it makes windows unhappy - // Hence the nice ifdef - // - // Code extract: - // union { int i; float f; } __tmp; - -GSVector.h:2977:40: error: declaration of 'int GSVector4::extract32() const::::i' - return _mm_extract_ps(m, i); -GSVector.h:2973:15: error: shadows template parm 'int i' - template __forceinline int extract32() const -#endif - - template - __forceinline int extract32() const - { - return _mm_extract_ps(m, index); - } -#else template __forceinline int extract32() const { return _mm_extract_ps(m, i); } -#endif __forceinline static GSVector4 zero() {