Merge pull request #12092 from JosJuice/jitarm64-last-nan

JitArm64: Skip checking last input for NaN for non-SIMD operations
2023-11-28 22:30:50 +01:00 · 2023-11-28 22:30:50 +01:00 · 934418a289
parent 95f06ef231 fc95d59805
commit 934418a289
2 changed files with 27 additions and 76 deletions
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@ -80,9 +80,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
  const bool fma = use_b && use_c;
  const bool negate_result = (op5 & ~0x1) == 30;

-  // Addition and subtraction can't generate new NaNs, they can only take NaNs from inputs
-  const bool can_generate_nan = (op5 & ~0x1) != 20;
-
  const bool output_is_single = inst.OPCD == 59;
  const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
  const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC];
@ -203,45 +200,35 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
    if (use_c && VA != VC && (!use_b || VB != VC))
      inputs.push_back(VC);

-    // If any inputs are NaNs, pick the first NaN of them and set its quiet bit
-    for (size_t i = 0; i < inputs.size(); ++i)
+    // If any inputs are NaNs, pick the first NaN of them and set its quiet bit.
+    // However, we can skip checking the last input, because if exactly one input is NaN, AArch64
+    // arithmetic instructions automatically pick that NaN and make it quiet, just like we want.
+    for (size_t i = 0; i < inputs.size() - 1; ++i)
    {
-      // Skip checking if the input is a NaN if it's the last input and we're guaranteed to have at
-      // least one NaN input
-      const bool check_input = can_generate_nan || i != inputs.size() - 1;
-
      const ARM64Reg input = inputs[i];
-      FixupBranch skip;
-      if (check_input)
-      {
-        m_float_emit.FCMP(input);
-        skip = B(CCFlags::CC_VC);
-      }
+
+      m_float_emit.FCMP(input);
+      FixupBranch skip = B(CCFlags::CC_VC);

      // Make the NaN quiet
      m_float_emit.FADD(VD, input, input);

      nan_fixups.push_back(B());

-      if (check_input)
-        SetJumpTarget(skip);
+      SetJumpTarget(skip);
    }

    std::optional<FixupBranch> nan_early_fixup;
-    if (can_generate_nan)
+    if (negate_result)
    {
-      // There was no NaN in any of the inputs, so the NaN must have been generated by the
-      // arithmetic instruction. In this case, the result is already correct.
-      if (negate_result)
-      {
-        if (result_reg != VD)
-          m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
-        nan_fixups.push_back(B());
-      }
-      else
-      {
-        nan_early_fixup = B();
-      }
+      // If we have a NaN, we must not execute FNEG.
+      if (result_reg != VD)
+        m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
+      nan_fixups.push_back(B());
+    }
+    else
+    {
+      nan_early_fixup = B();
    }

    SwitchToNearCode();
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@ -380,49 +380,21 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
  const ARM64Reg VC = fpr.R(c, type);
  const ARM64Reg VD = fpr.RW(d, type);
  const ARM64Reg V0 = fpr.GetReg();
-  const ARM64Reg temp_gpr = m_accurate_nans && !singles ? gpr.GetReg() : ARM64Reg::INVALID_REG;

  m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);

-  FixupBranch a_nan_done, b_nan_done;
  if (m_accurate_nans)
  {
-    const auto check_nan = [&](ARM64Reg input) {
-      m_float_emit.FCMP(scalar_reg_encoder(input));
-      FixupBranch not_nan = B(CCFlags::CC_VC);
-      FixupBranch nan = B();
-      SetJumpTarget(not_nan);
-
-      SwitchToFarCode();
-      SetJumpTarget(nan);
-
-      if (upper)
-      {
-        m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
-                          scalar_reg_encoder(input));
-        m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
-      }
-      else if (d != c)
-      {
-        m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(input),
-                          scalar_reg_encoder(input));
-        m_float_emit.INS(size, VD, 1, VC, 1);
-      }
-      else
-      {
-        m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
-                          scalar_reg_encoder(input));
-        m_float_emit.INS(size, VD, 0, V0, 0);
-      }
-
-      FixupBranch nan_done = B();
-      SwitchToNearCode();
-
-      return nan_done;
-    };
-
-    a_nan_done = check_nan(VA);
-    b_nan_done = check_nan(V0);
+    // If the first input is NaN, set the temp register for the second input to 0. This is because:
+    //
+    // - If the second input is also NaN, setting it to 0 ensures that the first NaN will be picked.
+    // - If only the first input is NaN, setting the second input to 0 has no effect on the result.
+    //
+    // Either way, we can then do an FADD as usual, and the FADD will make the NaN quiet.
+    m_float_emit.FCMP(scalar_reg_encoder(VA));
+    FixupBranch a_not_nan = B(CCFlags::CC_VC);
+    m_float_emit.MOVI(64, scalar_reg_encoder(V0), 0);
+    SetJumpTarget(a_not_nan);
  }

  if (upper)
@ -441,15 +413,7 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
    m_float_emit.INS(size, VD, 0, V0, 0);
  }

-  if (m_accurate_nans)
-  {
-    SetJumpTarget(a_nan_done);
-    SetJumpTarget(b_nan_done);
-  }
-
  fpr.Unlock(V0);
-  if (temp_gpr != ARM64Reg::INVALID_REG)
-    gpr.Unlock(temp_gpr);

  ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
             "Register allocation turned singles into doubles in the middle of ps_sumX");