Merge pull request #12092 from JosJuice/jitarm64-last-nan

JitArm64: Skip checking last input for NaN for non-SIMD operations
2023-11-28 22:30:50 +01:00 · 2023-11-28 22:30:50 +01:00 · 934418a289
parent 95f06ef231 fc95d59805
commit 934418a289
2 changed files with 27 additions and 76 deletions
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -80,9 +80,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
  const bool fma = use_b && use_c;
  const bool negate_result = (op5 & ~0x1) == 30;
  // Addition and subtraction can't generate new NaNs, they can only take NaNs from inputs
  const bool can_generate_nan = (op5 & ~0x1) != 20;
  const bool output_is_single = inst.OPCD == 59;
  const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
  const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC];
@@ -203,45 +200,35 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
    if (use_c && VA != VC && (!use_b || VB != VC))
      inputs.push_back(VC);
-    // If any inputs are NaNs, pick the first NaN of them and set its quiet bit
+    // If any inputs are NaNs, pick the first NaN of them and set its quiet bit.
-    for (size_t i = 0; i < inputs.size(); ++i)
+    // However, we can skip checking the last input, because if exactly one input is NaN, AArch64
    // arithmetic instructions automatically pick that NaN and make it quiet, just like we want.
    for (size_t i = 0; i < inputs.size() - 1; ++i)
    {
      // Skip checking if the input is a NaN if it's the last input and we're guaranteed to have at
      // least one NaN input
      const bool check_input = can_generate_nan || i != inputs.size() - 1;
      const ARM64Reg input = inputs[i];
-      FixupBranch skip;
+
-      if (check_input)
+      m_float_emit.FCMP(input);
-      {
+      FixupBranch skip = B(CCFlags::CC_VC);
        m_float_emit.FCMP(input);
        skip = B(CCFlags::CC_VC);
      }
      // Make the NaN quiet
      m_float_emit.FADD(VD, input, input);
      nan_fixups.push_back(B());
-      if (check_input)
+      SetJumpTarget(skip);
        SetJumpTarget(skip);
    }
    std::optional<FixupBranch> nan_early_fixup;
-    if (can_generate_nan)
+    if (negate_result)
    {
-      // There was no NaN in any of the inputs, so the NaN must have been generated by the
+      // If we have a NaN, we must not execute FNEG.
-      // arithmetic instruction. In this case, the result is already correct.
+      if (result_reg != VD)
-      if (negate_result)
+        m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
-      {
+      nan_fixups.push_back(B());
-        if (result_reg != VD)
+    }
-          m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
+    else
-        nan_fixups.push_back(B());
+    {
-      }
+      nan_early_fixup = B();
      else
      {
        nan_early_fixup = B();
      }
    }
    SwitchToNearCode();
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -380,49 +380,21 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
  const ARM64Reg VC = fpr.R(c, type);
  const ARM64Reg VD = fpr.RW(d, type);
  const ARM64Reg V0 = fpr.GetReg();
  const ARM64Reg temp_gpr = m_accurate_nans && !singles ? gpr.GetReg() : ARM64Reg::INVALID_REG;
  m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);
  FixupBranch a_nan_done, b_nan_done;
  if (m_accurate_nans)
  {
-    const auto check_nan = [&](ARM64Reg input) {
+    // If the first input is NaN, set the temp register for the second input to 0. This is because:
-      m_float_emit.FCMP(scalar_reg_encoder(input));
+    //
-      FixupBranch not_nan = B(CCFlags::CC_VC);
+    // - If the second input is also NaN, setting it to 0 ensures that the first NaN will be picked.
-      FixupBranch nan = B();
+    // - If only the first input is NaN, setting the second input to 0 has no effect on the result.
-      SetJumpTarget(not_nan);
+    //
-
+    // Either way, we can then do an FADD as usual, and the FADD will make the NaN quiet.
-      SwitchToFarCode();
+    m_float_emit.FCMP(scalar_reg_encoder(VA));
-      SetJumpTarget(nan);
+    FixupBranch a_not_nan = B(CCFlags::CC_VC);
-
+    m_float_emit.MOVI(64, scalar_reg_encoder(V0), 0);
-      if (upper)
+    SetJumpTarget(a_not_nan);
      {
        m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
                          scalar_reg_encoder(input));
        m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
      }
      else if (d != c)
      {
        m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(input),
                          scalar_reg_encoder(input));
        m_float_emit.INS(size, VD, 1, VC, 1);
      }
      else
      {
        m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
                          scalar_reg_encoder(input));
        m_float_emit.INS(size, VD, 0, V0, 0);
      }
      FixupBranch nan_done = B();
      SwitchToNearCode();
      return nan_done;
    };
    a_nan_done = check_nan(VA);
    b_nan_done = check_nan(V0);
  }
  if (upper)
@@ -441,15 +413,7 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
    m_float_emit.INS(size, VD, 0, V0, 0);
  }
  if (m_accurate_nans)
  {
    SetJumpTarget(a_nan_done);
    SetJumpTarget(b_nan_done);
  }
  fpr.Unlock(V0);
  if (temp_gpr != ARM64Reg::INVALID_REG)
    gpr.Unlock(temp_gpr);
  ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
             "Register allocation turned singles into doubles in the middle of ps_sumX");