From 8274dcbfe40269473332f22e0fcdce45d9fbf870 Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Wed, 9 Aug 2023 19:46:27 +0200
Subject: [PATCH 1/2] JitArm64: Skip checking last input for NaN for non-SIMD
 operations

AArch64's handling of NaNs in arithmetic instructions matches PowerPC's
as long as no more than one of the operands is NaN. If we know that all
inputs except the last input are non-NaN, we can therefore skip checking
the last input. This is an optimization that in principle only works for
non-SIMD operations, but ps_sumX effectively is non-SIMD as far as the
arithmetic part of it is concerned, so we can use it there too.
---
 .../JitArm64/JitArm64_FloatingPoint.cpp       | 47 +++++---------
 .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 62 ++++++++-----------
 2 files changed, 43 insertions(+), 66 deletions(-)
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index 0191cbd846..35904a8184 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -80,9 +80,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   const bool fma = use_b && use_c;
   const bool negate_result = (op5 & ~0x1) == 30;
 
-  // Addition and subtraction can't generate new NaNs, they can only take NaNs from inputs
-  const bool can_generate_nan = (op5 & ~0x1) != 20;
-
   const bool output_is_single = inst.OPCD == 59;
   const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
   const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC];
@@ -203,45 +200,35 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     if (use_c && VA != VC && (!use_b || VB != VC))
       inputs.push_back(VC);
 
-    // If any inputs are NaNs, pick the first NaN of them and set its quiet bit
-    for (size_t i = 0; i < inputs.size(); ++i)
+    // If any inputs are NaNs, pick the first NaN of them and set its quiet bit.
+    // However, we can skip checking the last input, because if exactly one input is NaN, AArch64
+    // arithmetic instructions automatically pick that NaN and make it quiet, just like we want.
+    for (size_t i = 0; i < inputs.size() - 1; ++i)
     {
-      // Skip checking if the input is a NaN if it's the last input and we're guaranteed to have at
-      // least one NaN input
-      const bool check_input = can_generate_nan || i != inputs.size() - 1;
-
       const ARM64Reg input = inputs[i];
-      FixupBranch skip;
-      if (check_input)
-      {
-        m_float_emit.FCMP(input);
-        skip = B(CCFlags::CC_VC);
-      }
+
+      m_float_emit.FCMP(input);
+      FixupBranch skip = B(CCFlags::CC_VC);
 
       // Make the NaN quiet
       m_float_emit.FADD(VD, input, input);
 
       nan_fixups.push_back(B());
 
-      if (check_input)
-        SetJumpTarget(skip);
+      SetJumpTarget(skip);
     }
 
     std::optional<FixupBranch> nan_early_fixup;
-    if (can_generate_nan)
+    if (negate_result)
     {
-      // There was no NaN in any of the inputs, so the NaN must have been generated by the
-      // arithmetic instruction. In this case, the result is already correct.
-      if (negate_result)
-      {
-        if (result_reg != VD)
-          m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
-        nan_fixups.push_back(B());
-      }
-      else
-      {
-        nan_early_fixup = B();
-      }
+      // If we have a NaN, we must not execute FNEG.
+      if (result_reg != VD)
+        m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
+      nan_fixups.push_back(B());
+    }
+    else
+    {
+      nan_early_fixup = B();
     }
 
     SwitchToNearCode();
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index 5b876649cc..6f211b1078 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -384,45 +384,38 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
 
   m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);
 
-  FixupBranch a_nan_done, b_nan_done;
+  FixupBranch a_nan_done;
   if (m_accurate_nans)
   {
-    const auto check_nan = [&](ARM64Reg input) {
-      m_float_emit.FCMP(scalar_reg_encoder(input));
-      FixupBranch not_nan = B(CCFlags::CC_VC);
-      FixupBranch nan = B();
-      SetJumpTarget(not_nan);
+    m_float_emit.FCMP(scalar_reg_encoder(VA));
+    FixupBranch a_not_nan = B(CCFlags::CC_VC);
+    FixupBranch a_nan = B();
+    SetJumpTarget(a_not_nan);
 
-      SwitchToFarCode();
-      SetJumpTarget(nan);
+    SwitchToFarCode();
+    SetJumpTarget(a_nan);
 
-      if (upper)
-      {
-        m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
-                          scalar_reg_encoder(input));
-        m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
-      }
-      else if (d != c)
-      {
-        m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(input),
-                          scalar_reg_encoder(input));
-        m_float_emit.INS(size, VD, 1, VC, 1);
-      }
-      else
-      {
-        m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
-                          scalar_reg_encoder(input));
-        m_float_emit.INS(size, VD, 0, V0, 0);
-      }
+    if (upper)
+    {
+      m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(VA), scalar_reg_encoder(VA));
+      m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
+    }
+    else if (d != c)
+    {
+      m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(VA), scalar_reg_encoder(VA));
+      m_float_emit.INS(size, VD, 1, VC, 1);
+    }
+    else
+    {
+      m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(VA), scalar_reg_encoder(VA));
+      m_float_emit.INS(size, VD, 0, V0, 0);
+    }
 
-      FixupBranch nan_done = B();
-      SwitchToNearCode();
+    FixupBranch a_nan_done = B();
+    SwitchToNearCode();
 
-      return nan_done;
-    };
-
-    a_nan_done = check_nan(VA);
-    b_nan_done = check_nan(V0);
+    // If exactly one input is NaN, AArch64 arithmetic instructions automatically pick that NaN
+    // and make it quiet, just like we want. So if rA isn't NaN, we can skip checking rB.
   }
 
   if (upper)
@@ -442,10 +435,7 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
   }
 
   if (m_accurate_nans)
-  {
     SetJumpTarget(a_nan_done);
-    SetJumpTarget(b_nan_done);
-  }
 
   fpr.Unlock(V0);
   if (temp_gpr != ARM64Reg::INVALID_REG)

From fc95d598059c5f36de44ae72f94655acad319c37 Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Thu, 10 Aug 2023 21:34:20 +0200
Subject: [PATCH 2/2] JitArm64: Further optimize NaN handling in ps_sumX

The NaN path in ps_sumX is now a single instruction, so emitting it as
farcode is pointless. Emit it inline in near code instead.
---
 .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 40 ++++---------------
 1 file changed, 7 insertions(+), 33 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index 6f211b1078..398afc8d69 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -380,42 +380,21 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
   const ARM64Reg VC = fpr.R(c, type);
   const ARM64Reg VD = fpr.RW(d, type);
   const ARM64Reg V0 = fpr.GetReg();
-  const ARM64Reg temp_gpr = m_accurate_nans && !singles ? gpr.GetReg() : ARM64Reg::INVALID_REG;
 
   m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);
 
-  FixupBranch a_nan_done;
   if (m_accurate_nans)
   {
+    // If the first input is NaN, set the temp register for the second input to 0. This is because:
+    //
+    // - If the second input is also NaN, setting it to 0 ensures that the first NaN will be picked.
+    // - If only the first input is NaN, setting the second input to 0 has no effect on the result.
+    //
+    // Either way, we can then do an FADD as usual, and the FADD will make the NaN quiet.
     m_float_emit.FCMP(scalar_reg_encoder(VA));
     FixupBranch a_not_nan = B(CCFlags::CC_VC);
-    FixupBranch a_nan = B();
+    m_float_emit.MOVI(64, scalar_reg_encoder(V0), 0);
     SetJumpTarget(a_not_nan);
-
-    SwitchToFarCode();
-    SetJumpTarget(a_nan);
-
-    if (upper)
-    {
-      m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(VA), scalar_reg_encoder(VA));
-      m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
-    }
-    else if (d != c)
-    {
-      m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(VA), scalar_reg_encoder(VA));
-      m_float_emit.INS(size, VD, 1, VC, 1);
-    }
-    else
-    {
-      m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(VA), scalar_reg_encoder(VA));
-      m_float_emit.INS(size, VD, 0, V0, 0);
-    }
-
-    FixupBranch a_nan_done = B();
-    SwitchToNearCode();
-
-    // If exactly one input is NaN, AArch64 arithmetic instructions automatically pick that NaN
-    // and make it quiet, just like we want. So if rA isn't NaN, we can skip checking rB.
   }
 
   if (upper)
@@ -434,12 +413,7 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
     m_float_emit.INS(size, VD, 0, V0, 0);
   }
 
-  if (m_accurate_nans)
-    SetJumpTarget(a_nan_done);
-
   fpr.Unlock(V0);
-  if (temp_gpr != ARM64Reg::INVALID_REG)
-    gpr.Unlock(temp_gpr);
 
   ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
              "Register allocation turned singles into doubles in the middle of ps_sumX");