From 554a2fd33228503a5848cda4039e35993f7985c1 Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Sun, 9 Oct 2022 12:14:47 +0200
Subject: [PATCH 1/2] JitArm64: Merge ps_mulsX and ps_maddXX

ps_mulsX and ps_maddXX share a lot of code, most notably the code for
rounding the c operand to 25-bit precision.

No behavior change.
---
 Source/Core/Core/PowerPC/JitArm64/Jit.h       |  3 +-
 .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 72 +++++--------------
 .../Core/PowerPC/JitArm64/JitArm64_Tables.cpp | 16 ++---
 3 files changed, 27 insertions(+), 64 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index ec65997685..e9abdbbc4b 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -152,9 +152,8 @@ public:
   void frsqrtex(UGeckoInstruction inst);
 
   // Paired
-  void ps_maddXX(UGeckoInstruction inst);
   void ps_mergeXX(UGeckoInstruction inst);
-  void ps_mulsX(UGeckoInstruction inst);
+  void ps_arith(UGeckoInstruction inst);
   void ps_sel(UGeckoInstruction inst);
   void ps_sumX(UGeckoInstruction inst);
   void ps_res(UGeckoInstruction inst);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index 0d786f80a0..1afb5e1683 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -73,55 +73,7 @@ void JitArm64::ps_mergeXX(UGeckoInstruction inst)
              "Register allocation turned singles into doubles in the middle of ps_mergeXX");
 }
 
-void JitArm64::ps_mulsX(UGeckoInstruction inst)
-{
-  INSTRUCTION_START
-  JITDISABLE(bJITPairedOff);
-  FALLBACK_IF(inst.Rc);
-  FALLBACK_IF(jo.fp_exceptions);
-
-  const u32 a = inst.FA;
-  const u32 c = inst.FC;
-  const u32 d = inst.FD;
-
-  const bool upper = inst.SUBOP5 == 13;
-
-  const bool singles = fpr.IsSingle(a) && fpr.IsSingle(c);
-  const bool round_c = !js.op->fprIsSingle[inst.FC];
-  const RegType type = singles ? RegType::Single : RegType::Register;
-  const u8 size = singles ? 32 : 64;
-  const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
-
-  const ARM64Reg VA = fpr.R(a, type);
-  ARM64Reg VC = fpr.R(c, type);
-  const ARM64Reg VD = fpr.RW(d, type);
-
-  ARM64Reg V0Q = ARM64Reg::INVALID_REG;
-
-  if (round_c)
-  {
-    ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
-
-    V0Q = fpr.GetReg();
-
-    Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC));
-    VC = reg_encoder(V0Q);
-  }
-
-  m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0);
-
-  if (V0Q != ARM64Reg::INVALID_REG)
-    fpr.Unlock(V0Q);
-
-  ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(c)),
-             "Register allocation turned singles into doubles in the middle of ps_mulsX");
-
-  fpr.FixSinglePrecision(d);
-
-  SetFPRFIfNeeded(true, VD);
-}
-
-void JitArm64::ps_maddXX(UGeckoInstruction inst)
+void JitArm64::ps_arith(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITPairedOff);
@@ -134,15 +86,21 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
   const u32 d = inst.FD;
   const u32 op5 = inst.SUBOP5;
 
+  const bool use_b = (op5 & ~0x1) != 12;   // muls uses no B
+
+  const auto singles_func = [&] {
+    return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && fpr.IsSingle(c);
+  };
+  const bool singles = singles_func();
+
   const bool inaccurate_fma = !Config::Get(Config::SESSION_USE_FMA);
-  const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
   const bool round_c = !js.op->fprIsSingle[inst.FC];
   const RegType type = singles ? RegType::Single : RegType::Register;
   const u8 size = singles ? 32 : 64;
   const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
 
   const ARM64Reg VA = reg_encoder(fpr.R(a, type));
-  const ARM64Reg VB = reg_encoder(fpr.R(b, type));
+  const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
   ARM64Reg VC = reg_encoder(fpr.R(c, type));
   const ARM64Reg VD = reg_encoder(fpr.RW(d, type));
 
@@ -178,6 +136,12 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
   ARM64Reg result_reg = VD;
   switch (op5)
   {
+  case 12:  // ps_muls0: d = a * c.ps0
+    m_float_emit.FMUL(size, VD, VA, VC, 0);
+    break;
+  case 13:  // ps_muls1: d = a * c.ps1
+    m_float_emit.FMUL(size, VD, VA, VC, 1);
+    break;
   case 14:  // ps_madds0: d = a * c.ps0 + b
     if (inaccurate_fma)
     {
@@ -269,7 +233,7 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     }
     break;
   default:
-    ASSERT_MSG(DYNA_REC, 0, "ps_madd - invalid op");
+    ASSERT_MSG(DYNA_REC, 0, "ps_arith - invalid op");
     break;
   }
 
@@ -292,8 +256,8 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
   if (V1Q != ARM64Reg::INVALID_REG)
     fpr.Unlock(V1Q);
 
-  ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
-             "Register allocation turned singles into doubles in the middle of ps_maddXX");
+  ASSERT_MSG(DYNA_REC, singles == singles_func(),
+             "Register allocation turned singles into doubles in the middle of ps_arith");
 
   fpr.FixSinglePrecision(d);
 
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
index ccac60efb3..c3f7a87fbb 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
@@ -108,10 +108,10 @@ constexpr std::array<GekkoOPTemplate, 13> table4{{
 constexpr std::array<GekkoOPTemplate, 17> table4_2{{
     {10, &JitArm64::ps_sumX},    // ps_sum0
     {11, &JitArm64::ps_sumX},    // ps_sum1
-    {12, &JitArm64::ps_mulsX},   // ps_muls0
-    {13, &JitArm64::ps_mulsX},   // ps_muls1
-    {14, &JitArm64::ps_maddXX},  // ps_madds0
-    {15, &JitArm64::ps_maddXX},  // ps_madds1
+    {12, &JitArm64::ps_arith},   // ps_muls0
+    {13, &JitArm64::ps_arith},   // ps_muls1
+    {14, &JitArm64::ps_arith},   // ps_madds0
+    {15, &JitArm64::ps_arith},   // ps_madds1
     {18, &JitArm64::fp_arith},   // ps_div
     {20, &JitArm64::fp_arith},   // ps_sub
     {21, &JitArm64::fp_arith},   // ps_add
@@ -119,10 +119,10 @@ constexpr std::array<GekkoOPTemplate, 17> table4_2{{
     {24, &JitArm64::ps_res},     // ps_res
     {25, &JitArm64::fp_arith},   // ps_mul
     {26, &JitArm64::ps_rsqrte},  // ps_rsqrte
-    {28, &JitArm64::ps_maddXX},  // ps_msub
-    {29, &JitArm64::ps_maddXX},  // ps_madd
-    {30, &JitArm64::ps_maddXX},  // ps_nmsub
-    {31, &JitArm64::ps_maddXX},  // ps_nmadd
+    {28, &JitArm64::ps_arith},   // ps_msub
+    {29, &JitArm64::ps_arith},   // ps_madd
+    {30, &JitArm64::ps_arith},   // ps_nmsub
+    {31, &JitArm64::ps_arith},   // ps_nmadd
 }};
 
 constexpr std::array<GekkoOPTemplate, 4> table4_3{{

From 812067ab7cdd244a9144ae59049b2fd6647e6606 Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Sun, 9 Oct 2022 12:31:29 +0200
Subject: [PATCH 2/2] JitArm64: Move ps instructions from fp_arith to ps_arith

Moving ps_div, ps_sub, ps_add and ps_mul from fp_arith to ps_arith lets
us drop the packed code path from fp_arith without making ps_arith much
more complicated.

No behavior change.
---
 .../JitArm64/JitArm64_FloatingPoint.cpp       | 198 +++++++-----------
 .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp |  21 +-
 .../Core/PowerPC/JitArm64/JitArm64_Tables.cpp |   8 +-
 3 files changed, 94 insertions(+), 133 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index 260da2d900..26c6dfd1b7 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -69,154 +69,102 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
   u32 op5 = inst.SUBOP5;
 
-  bool single = inst.OPCD == 59;
-  bool packed = inst.OPCD == 4;
-
   const bool use_c = op5 >= 25;  // fmul and all kind of fmaddXX
   const bool use_b = op5 != 25;  // fmul uses no B
 
-  const bool outputs_are_singles = single || packed;
-  const bool round_c = use_c && outputs_are_singles && !js.op->fprIsSingle[inst.FC];
+  const bool output_is_single = inst.OPCD == 59;
+  const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
+  const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC];
 
   const auto inputs_are_singles_func = [&] {
-    return fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) &&
-           (!use_c || fpr.IsSingle(c, !packed));
+    return fpr.IsSingle(a, true) && (!use_b || fpr.IsSingle(b, true)) &&
+           (!use_c || fpr.IsSingle(c, true));
   };
   const bool inputs_are_singles = inputs_are_singles_func();
 
-  ARM64Reg VA{}, VB{}, VC{}, VD{};
+  const RegType type =
+      (inputs_are_singles && output_is_single) ? RegType::LowerPairSingle : RegType::LowerPair;
+  const RegType type_out =
+      output_is_single ? (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) :
+                         RegType::LowerPair;
+  const auto reg_encoder =
+      (inputs_are_singles && output_is_single) ? EncodeRegToSingle : EncodeRegToDouble;
+
+  const ARM64Reg VA = reg_encoder(fpr.R(a, type));
+  const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
+  ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
+  const ARM64Reg VD = reg_encoder(fpr.RW(d, type_out));
 
   ARM64Reg V0Q = ARM64Reg::INVALID_REG;
   ARM64Reg V1Q = ARM64Reg::INVALID_REG;
 
-  if (packed)
+  if (round_c)
   {
-    const RegType type = inputs_are_singles ? RegType::Single : RegType::Register;
-    const u8 size = inputs_are_singles ? 32 : 64;
-    const auto reg_encoder = inputs_are_singles ? EncodeRegToDouble : EncodeRegToQuad;
+    ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
 
-    VA = reg_encoder(fpr.R(a, type));
-    if (use_b)
-      VB = reg_encoder(fpr.R(b, type));
-    if (use_c)
-      VC = reg_encoder(fpr.R(c, type));
-    VD = reg_encoder(fpr.RW(d, type));
+    V1Q = fpr.GetReg();
 
-    if (round_c)
-    {
-      ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
-
-      V0Q = fpr.GetReg();
-
-      Force25BitPrecision(reg_encoder(V0Q), VC);
-      VC = reg_encoder(V0Q);
-    }
-
-    switch (op5)
-    {
-    case 18:
-      m_float_emit.FDIV(size, VD, VA, VB);
-      break;
-    case 20:
-      m_float_emit.FSUB(size, VD, VA, VB);
-      break;
-    case 21:
-      m_float_emit.FADD(size, VD, VA, VB);
-      break;
-    case 25:
-      m_float_emit.FMUL(size, VD, VA, VC);
-      break;
-    default:
-      ASSERT_MSG(DYNA_REC, 0, "fp_arith");
-      break;
-    }
+    Force25BitPrecision(reg_encoder(V1Q), VC);
+    VC = reg_encoder(V1Q);
   }
-  else
+
+  ARM64Reg inaccurate_fma_temp_reg = VD;
+  if (inaccurate_fma && d == b)
   {
-    const RegType type =
-        (inputs_are_singles && single) ? RegType::LowerPairSingle : RegType::LowerPair;
-    const RegType type_out =
-        single ? (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) :
-                 RegType::LowerPair;
-    const auto reg_encoder = (inputs_are_singles && single) ? EncodeRegToSingle : EncodeRegToDouble;
+    V0Q = fpr.GetReg();
 
-    VA = reg_encoder(fpr.R(a, type));
-    if (use_b)
-      VB = reg_encoder(fpr.R(b, type));
-    if (use_c)
-      VC = reg_encoder(fpr.R(c, type));
-    VD = reg_encoder(fpr.RW(d, type_out));
+    inaccurate_fma_temp_reg = reg_encoder(V0Q);
+  }
 
-    const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
-
-    if (round_c)
+  switch (op5)
+  {
+  case 18:
+    m_float_emit.FDIV(VD, VA, VB);
+    break;
+  case 20:
+    m_float_emit.FSUB(VD, VA, VB);
+    break;
+  case 21:
+    m_float_emit.FADD(VD, VA, VB);
+    break;
+  case 25:
+    m_float_emit.FMUL(VD, VA, VC);
+    break;
+  // While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic],
+  // the subtly different definitions affect how signed zeroes are handled.
+  // Also, PowerPC's nmadd/nmsub perform rounding before the final negation.
+  // So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub.
+  case 28:  // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
+  case 30:  // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
+    if (inaccurate_fma)
     {
-      ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
-
-      V1Q = fpr.GetReg();
-
-      Force25BitPrecision(reg_encoder(V1Q), VC);
-      VC = reg_encoder(V1Q);
+      m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC);
+      m_float_emit.FSUB(VD, inaccurate_fma_temp_reg, VB);
     }
-
-    ARM64Reg inaccurate_fma_temp_reg = VD;
-    if (inaccurate_fma && d == b)
+    else
     {
-      V0Q = fpr.GetReg();
-
-      inaccurate_fma_temp_reg = reg_encoder(V0Q);
+      m_float_emit.FNMSUB(VD, VA, VC, VB);
     }
-
-    switch (op5)
+    if (op5 == 30)
+      m_float_emit.FNEG(VD, VD);
+    break;
+  case 29:  // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
+  case 31:  // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
+    if (inaccurate_fma)
     {
-    case 18:
-      m_float_emit.FDIV(VD, VA, VB);
-      break;
-    case 20:
-      m_float_emit.FSUB(VD, VA, VB);
-      break;
-    case 21:
-      m_float_emit.FADD(VD, VA, VB);
-      break;
-    case 25:
-      m_float_emit.FMUL(VD, VA, VC);
-      break;
-    // While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic],
-    // the subtly different definitions affect how signed zeroes are handled.
-    // Also, PowerPC's nmadd/nmsub perform rounding before the final negation.
-    // So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub.
-    case 28:  // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
-    case 30:  // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
-      if (inaccurate_fma)
-      {
-        m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC);
-        m_float_emit.FSUB(VD, inaccurate_fma_temp_reg, VB);
-      }
-      else
-      {
-        m_float_emit.FNMSUB(VD, VA, VC, VB);
-      }
-      if (op5 == 30)
-        m_float_emit.FNEG(VD, VD);
-      break;
-    case 29:  // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
-    case 31:  // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
-      if (inaccurate_fma)
-      {
-        m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC);
-        m_float_emit.FADD(VD, inaccurate_fma_temp_reg, VB);
-      }
-      else
-      {
-        m_float_emit.FMADD(VD, VA, VC, VB);
-      }
-      if (op5 == 31)
-        m_float_emit.FNEG(VD, VD);
-      break;
-    default:
-      ASSERT_MSG(DYNA_REC, 0, "fp_arith");
-      break;
+      m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC);
+      m_float_emit.FADD(VD, inaccurate_fma_temp_reg, VB);
     }
+    else
+    {
+      m_float_emit.FMADD(VD, VA, VC, VB);
+    }
+    if (op5 == 31)
+      m_float_emit.FNEG(VD, VD);
+    break;
+  default:
+    ASSERT_MSG(DYNA_REC, 0, "fp_arith");
+    break;
   }
 
   if (V0Q != ARM64Reg::INVALID_REG)
@@ -224,7 +172,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   if (V1Q != ARM64Reg::INVALID_REG)
     fpr.Unlock(V1Q);
 
-  if (outputs_are_singles)
+  if (output_is_single)
   {
     ASSERT_MSG(DYNA_REC, inputs_are_singles == inputs_are_singles_func(),
                "Register allocation turned singles into doubles in the middle of fp_arith");
@@ -232,7 +180,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     fpr.FixSinglePrecision(d);
   }
 
-  SetFPRFIfNeeded(outputs_are_singles, VD);
+  SetFPRFIfNeeded(output_is_single, VD);
 }
 
 void JitArm64::fp_logic(UGeckoInstruction inst)
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index 1afb5e1683..85d22f6183 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -86,22 +86,23 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
   const u32 d = inst.FD;
   const u32 op5 = inst.SUBOP5;
 
-  const bool use_b = (op5 & ~0x1) != 12;   // muls uses no B
+  const bool use_c = op5 == 25 || (op5 & ~0x13) == 12;  // mul, muls, and all kinds of maddXX
+  const bool use_b = op5 != 25 && (op5 & ~0x1) != 12;   // mul and muls don't use B
 
   const auto singles_func = [&] {
-    return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && fpr.IsSingle(c);
+    return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c));
   };
   const bool singles = singles_func();
 
   const bool inaccurate_fma = !Config::Get(Config::SESSION_USE_FMA);
-  const bool round_c = !js.op->fprIsSingle[inst.FC];
+  const bool round_c = use_c && !js.op->fprIsSingle[inst.FC];
   const RegType type = singles ? RegType::Single : RegType::Register;
   const u8 size = singles ? 32 : 64;
   const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
 
   const ARM64Reg VA = reg_encoder(fpr.R(a, type));
   const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
-  ARM64Reg VC = reg_encoder(fpr.R(c, type));
+  ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
   const ARM64Reg VD = reg_encoder(fpr.RW(d, type));
 
   ARM64Reg V0Q = ARM64Reg::INVALID_REG;
@@ -188,6 +189,18 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
       result_reg = V0;
     }
     break;
+  case 18:  // ps_div
+    m_float_emit.FDIV(size, VD, VA, VB);
+    break;
+  case 20:  // ps_sub
+    m_float_emit.FSUB(size, VD, VA, VB);
+    break;
+  case 21:  // ps_add
+    m_float_emit.FADD(size, VD, VA, VB);
+    break;
+  case 25:  // ps_mul
+    m_float_emit.FMUL(size, VD, VA, VC);
+    break;
   case 28:  // ps_msub:  d = a * c - b
   case 30:  // ps_nmsub: d = -(a * c - b)
     if (inaccurate_fma)
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
index c3f7a87fbb..2e4c72f4f6 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
@@ -112,12 +112,12 @@ constexpr std::array<GekkoOPTemplate, 17> table4_2{{
     {13, &JitArm64::ps_arith},   // ps_muls1
     {14, &JitArm64::ps_arith},   // ps_madds0
     {15, &JitArm64::ps_arith},   // ps_madds1
-    {18, &JitArm64::fp_arith},   // ps_div
-    {20, &JitArm64::fp_arith},   // ps_sub
-    {21, &JitArm64::fp_arith},   // ps_add
+    {18, &JitArm64::ps_arith},   // ps_div
+    {20, &JitArm64::ps_arith},   // ps_sub
+    {21, &JitArm64::ps_arith},   // ps_add
     {23, &JitArm64::ps_sel},     // ps_sel
     {24, &JitArm64::ps_res},     // ps_res
-    {25, &JitArm64::fp_arith},   // ps_mul
+    {25, &JitArm64::ps_arith},   // ps_mul
     {26, &JitArm64::ps_rsqrte},  // ps_rsqrte
     {28, &JitArm64::ps_arith},   // ps_msub
     {29, &JitArm64::ps_arith},   // ps_madd