From 31755bc13a3ebf4a55b3962fa50699db8c9767f4 Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Wed, 29 Jul 2020 17:28:48 +0200
Subject: [PATCH] Jit64: fselx - Optimize SSE4.1 packed

Pretty much the same optimization we did for AVX, although slightly more
constrained: the SSE4.1 BLENDVPD is a two-operand instruction whose
destination is also its first source, so we can only skip the temporary
register when registers d and c are the same.

We could also specialize the case where registers b, c, and d are all
distinct, but I decided against it since I couldn't find any game that
does this.

Before:
66 0F 57 C0          xorpd       xmm0,xmm0
66 41 0F C2 C1 06    cmpnlepd    xmm0,xmm9
41 0F 28 CE          movaps      xmm1,xmm14
66 41 0F 38 15 CC    blendvpd    xmm1,xmm12,xmm0
44 0F 28 F1          movaps      xmm14,xmm1

After:
66 0F 57 C0          xorpd       xmm0,xmm0
66 41 0F C2 C1 06    cmpnlepd    xmm0,xmm9
66 45 0F 38 15 F4    blendvpd    xmm14,xmm12,xmm0
---
 Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index a0895b47c9..f520520601 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -459,6 +459,12 @@ void Jit64::fselx(UGeckoInstruction inst)
   }
   else if (cpu_info.bSSE4_1)
   {
+    if (packed && d == c)
+    {
+      BLENDVPD(Rd, Rb);
+      return;
+    }
+
     MOVAPD(XMM1, Rc);
     BLENDVPD(XMM1, Rb);
   }