From 31755bc13a3ebf4a55b3962fa50699db8c9767f4 Mon Sep 17 00:00:00 2001
From: Sintendo
Date: Wed, 29 Jul 2020 17:28:48 +0200
Subject: [PATCH] Jit64: fselx - Optimize SSE4.1 packed

Pretty much the same optimization we did for AVX, although slightly more
constrained because we're stuck with the two-operand instruction where
destination and source have to match.

We could also specialize the case where registers b, c, and d are all
distinct, but I decided against it since I couldn't find any game that
does this.

Before:
66 0F 57 C0          xorpd       xmm0,xmm0
66 41 0F C2 C1 06    cmpnlepd    xmm0,xmm9
41 0F 28 CE          movaps      xmm1,xmm14
66 41 0F 38 15 CC    blendvpd    xmm1,xmm12,xmm0
44 0F 28 F1          movaps      xmm14,xmm1

After:
66 0F 57 C0          xorpd       xmm0,xmm0
66 41 0F C2 C1 06    cmpnlepd    xmm0,xmm9
66 45 0F 38 15 F4    blendvpd    xmm14,xmm12,xmm0
---
 Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index a0895b47c9..f520520601 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -459,6 +459,12 @@ void Jit64::fselx(UGeckoInstruction inst)
   }
   else if (cpu_info.bSSE4_1)
   {
+    if (packed && d == c)
+    {
+      BLENDVPD(Rd, Rb);
+      return;
+    }
+
     MOVAPD(XMM1, Rc);
     BLENDVPD(XMM1, Rb);
   }