[a64] Optimize `OPCODE_MUL_ADD`

Use `FMADD` and `FMLA`
Tests are the same, though it should now run a bit faster.
The tests that still fail appear to be primarily denormals and other
subtle precision issues.

Ex:
```
i> 00002358   - vmaddfp_7298_GEN
!> 00002358 Register v4 assert failed:
!> 00002358   Expected: v4 == [00000000, 00000000, 00000000, 00000000]
!> 00002358     Actual: v4 == [000D000E, 00138014, 000E4CDC, 0018B34D]
!> 00002358     TEST FAILED
```
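
The divergence is consistent with fusion: `FMADD` and `FMLA` round once, while the old `FMUL` + `FADD` pair rounded after each instruction. A standalone host-side sketch of that effect (not emulator code; build with FP contraction disabled, e.g. `-ffp-contract=off`, so the compiler does not fuse the expression itself):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const double a = 1.0 + std::ldexp(1.0, -27);  // 1 + 2^-27
  const double b = 1.0 - std::ldexp(1.0, -27);  // 1 - 2^-27
  // The exact product a * b is 1 - 2^-54, which rounds to exactly 1.0
  // in double precision.
  const double unfused = (a * b) - 1.0;       // two roundings: 0.0
  const double fused = std::fma(a, b, -1.0);  // one rounding: -2^-54
  std::printf("unfused = %g, fused = %g\n", unfused, fused);
  return 0;
}
```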

Host-To-Guest and Guest-To-Host thunks should probably save and restore
the FPCR so the guest's rounding mode and denormal behavior are
preserved across the transition.
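
A rough sketch of what that could look like, with hypothetical helper names (the real thunks would emit the `MRS`/`MSR` instructions directly rather than call into C++):

```cpp
#include <cstdint>
#include <utility>

// AArch64-only sketch: FPCR holds the rounding-mode and
// denormal-flushing bits and is accessed via MRS/MSR.
inline uint64_t ReadFpcr() {
  uint64_t value;
  asm volatile("mrs %0, fpcr" : "=r"(value));
  return value;
}

inline void WriteFpcr(uint64_t value) {
  asm volatile("msr fpcr, %0" : : "r"(value));
}

// Hypothetical host-to-guest transition: stash the host FPCR,
// install the guest configuration, restore on the way back out.
template <typename Fn>
void CallWithGuestFpcr(uint64_t guest_fpcr, Fn&& guest_body) {
  const uint64_t host_fpcr = ReadFpcr();
  WriteFpcr(guest_fpcr);
  std::forward<Fn>(guest_body)();
  WriteFpcr(host_fpcr);
}
```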
Wunkolo 2024-05-12 20:04:47 -07:00
parent 684904c487
commit b9d0752b40
1 changed file with 49 additions and 40 deletions

```diff
@@ -1717,77 +1717,86 @@ EMITTER_OPCODE_TABLE(OPCODE_DIV, DIV_I8, DIV_I16, DIV_I32, DIV_I64, DIV_F32,
 struct MUL_ADD_F32
     : Sequence<MUL_ADD_F32, I<OPCODE_MUL_ADD, F32Op, F32Op, F32Op, F32Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
-    SReg src3(1);
+    SReg src3 = S3;
     if (i.src3.is_constant) {
-      src3 = S1;
       e.LoadConstantV(src3.toQ(), i.src3.constant());
     } else {
-      // If i.dest == i.src3, back up i.src3 so we don't overwrite it.
       src3 = i.src3.reg();
-      if (i.dest.reg().index() == i.src3.reg().index()) {
-        e.FMOV(S1, i.src3);
-        src3 = S1;
-      }
     }
-    // Multiply operation is commutative.
-    EmitCommutativeBinaryVOp<SReg>(
-        e, i, [&i](A64Emitter& e, SReg dest, SReg src1, SReg src2) {
-          e.FMUL(dest, src1, src2);  // $0 = $1 * $2
-        });
+    SReg src2 = S2;
+    if (i.src2.is_constant) {
+      e.LoadConstantV(src2.toQ(), i.src2.constant());
+    } else {
+      src2 = i.src2.reg();
+    }
-    e.FADD(i.dest, i.dest, src3);  // $0 = $1 + $2
+    SReg src1 = S1;
+    if (i.src1.is_constant) {
+      e.LoadConstantV(src1.toQ(), i.src1.constant());
+    } else {
+      src1 = i.src1.reg();
+    }
+    e.FMADD(i.dest, src1, src2, src3);
   }
 };
 struct MUL_ADD_F64
     : Sequence<MUL_ADD_F64, I<OPCODE_MUL_ADD, F64Op, F64Op, F64Op, F64Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
-    DReg src3(1);
+    DReg src3 = D3;
     if (i.src3.is_constant) {
-      src3 = D1;
       e.LoadConstantV(src3.toQ(), i.src3.constant());
     } else {
-      // If i.dest == i.src3, back up i.src3 so we don't overwrite it.
       src3 = i.src3.reg();
-      if (i.dest.reg().index() == i.src3.reg().index()) {
-        e.FMOV(D1, i.src3);
-        src3 = D1;
-      }
     }
-    // Multiply operation is commutative.
-    EmitCommutativeBinaryVOp<DReg>(
-        e, i, [&i](A64Emitter& e, DReg dest, DReg src1, DReg src2) {
-          e.FMUL(dest, src1, src2);  // $0 = $1 * $2
-        });
+    DReg src2 = D2;
+    if (i.src2.is_constant) {
+      e.LoadConstantV(src2.toQ(), i.src2.constant());
+    } else {
+      src2 = i.src2.reg();
+    }
-    e.FADD(i.dest, i.dest, src3);  // $0 = $1 + $2
+    DReg src1 = D1;
+    if (i.src1.is_constant) {
+      e.LoadConstantV(src1.toQ(), i.src1.constant());
+    } else {
+      src1 = i.src1.reg();
+    }
+    e.FMADD(i.dest, src1, src2, src3);
   }
 };
 struct MUL_ADD_V128
     : Sequence<MUL_ADD_V128,
                I<OPCODE_MUL_ADD, V128Op, V128Op, V128Op, V128Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
-    QReg src3(1);
+    const QReg dest = i.dest.reg();
     if (i.src3.is_constant) {
-      src3 = Q1;
-      e.LoadConstantV(src3, i.src3.constant());
+      e.LoadConstantV(dest.toQ(), i.src3.constant());
     } else {
-      // If i.dest == i.src3, back up i.src3 so we don't overwrite it.
-      src3 = i.src3;
-      if (i.dest == i.src3) {
-        e.MOV(Q1.B16(), i.src3.reg().B16());
-        src3 = Q1;
+      // If i.dest != i.src3, move the addition-term into dest for FMLA
+      if (i.dest != i.src3) {
+        e.MOV(dest.B16(), i.src3.reg().B16());
       }
     }
-    // Multiply operation is commutative.
-    EmitCommutativeBinaryVOp(
-        e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) {
-          e.FMUL(dest.S4(), src1.S4(), src2.S4());  // $0 = $1 * $2
-        });
+    QReg src2 = Q2;
+    if (i.src2.is_constant) {
+      e.LoadConstantV(src2.toQ(), i.src2.constant());
+    } else {
+      src2 = i.src2.reg();
+    }
-    e.FADD(i.dest.reg().S4(), i.dest.reg().S4(), src3.S4());
+    QReg src1 = Q1;
+    if (i.src1.is_constant) {
+      e.LoadConstantV(src1.toQ(), i.src1.constant());
+    } else {
+      src1 = i.src1.reg();
+    }
+    e.FMLA(dest.S4(), src1.S4(), src2.S4());
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);
```
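
One detail worth calling out in the V128 path above: `FMLA` is destructive, computing `Vd = Vd + Vn * Vm`, which is why the addend `src3` is moved (or its constant loaded) into the destination register before the instruction is issued. A per-lane scalar model of the emitted sequence (illustrative sketch, not emitter code):

```cpp
#include <cmath>

// Models one 32-bit lane of the new V128 sequence.
float MulAddLane(float src1, float src2, float src3) {
  float dest = src3;                  // MOV dest, src3: addend in place
  dest = std::fma(src1, src2, dest);  // FMLA dest, src1, src2 (fused)
  return dest;
}
```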