From b9d0752b40ba05044b4549ddf69ad755b1f0b83f Mon Sep 17 00:00:00 2001 From: Wunkolo Date: Sun, 12 May 2024 20:04:47 -0700 Subject: [PATCH] [a64] Optimize `OPCODE_MUL_ADD` Use `FMADD` and `FMLA` in place of the separate multiply and add instructions. The test results are unchanged, though the generated code should now run a bit faster. The tests that still fail seem to primarily involve denormals and other subtle precision issues. Ex: ``` i> 00002358 - vmaddfp_7298_GEN !> 00002358 Register v4 assert failed: !> 00002358 Expected: v4 == [00000000, 00000000, 00000000, 00000000] !> 00002358 Actual: v4 == [000D000E, 00138014, 000E4CDC, 0018B34D] !> 00002358 TEST FAILED ``` The Host-To-Guest and Guest-To-Host thunks should probably preserve/restore the FPCR so that the expected rounding behavior is maintained. --- src/xenia/cpu/backend/a64/a64_sequences.cc | 89 ++++++++++++---------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/src/xenia/cpu/backend/a64/a64_sequences.cc b/src/xenia/cpu/backend/a64/a64_sequences.cc index b156b720b..9b58a8409 100644 --- a/src/xenia/cpu/backend/a64/a64_sequences.cc +++ b/src/xenia/cpu/backend/a64/a64_sequences.cc @@ -1717,77 +1717,86 @@ EMITTER_OPCODE_TABLE(OPCODE_DIV, DIV_I8, DIV_I16, DIV_I32, DIV_I64, DIV_F32, struct MUL_ADD_F32 : Sequence> { static void Emit(A64Emitter& e, const EmitArgType& i) { - SReg src3(1); + SReg src3 = S3; if (i.src3.is_constant) { - src3 = S1; e.LoadConstantV(src3.toQ(), i.src3.constant()); } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. src3 = i.src3.reg(); - if (i.dest.reg().index() == i.src3.reg().index()) { - e.FMOV(S1, i.src3); - src3 = S1; - } } - // Multiply operation is commutative. 
- EmitCommutativeBinaryVOp( - e, i, [&i](A64Emitter& e, SReg dest, SReg src1, SReg src2) { - e.FMUL(dest, src1, src2); // $0 = $1 * $2 - }); + SReg src2 = S2; + if (i.src2.is_constant) { + e.LoadConstantV(src2.toQ(), i.src2.constant()); + } else { + src2 = i.src2.reg(); + } - e.FADD(i.dest, i.dest, src3); // $0 = $1 + $2 + SReg src1 = S1; + if (i.src1.is_constant) { + e.LoadConstantV(src1.toQ(), i.src1.constant()); + } else { + src1 = i.src1.reg(); + } + + e.FMADD(i.dest, src1, src2, src3); } }; struct MUL_ADD_F64 : Sequence> { static void Emit(A64Emitter& e, const EmitArgType& i) { - DReg src3(1); + DReg src3 = D3; if (i.src3.is_constant) { - src3 = D1; e.LoadConstantV(src3.toQ(), i.src3.constant()); } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. src3 = i.src3.reg(); - if (i.dest.reg().index() == i.src3.reg().index()) { - e.FMOV(D1, i.src3); - src3 = D1; - } } - // Multiply operation is commutative. - EmitCommutativeBinaryVOp( - e, i, [&i](A64Emitter& e, DReg dest, DReg src1, DReg src2) { - e.FMUL(dest, src1, src2); // $0 = $1 * $2 - }); + DReg src2 = D2; + if (i.src2.is_constant) { + e.LoadConstantV(src2.toQ(), i.src2.constant()); + } else { + src2 = i.src2.reg(); + } - e.FADD(i.dest, i.dest, src3); // $0 = $1 + $2 + DReg src1 = D1; + if (i.src1.is_constant) { + e.LoadConstantV(src1.toQ(), i.src1.constant()); + } else { + src1 = i.src1.reg(); + } + + e.FMADD(i.dest, src1, src2, src3); } }; struct MUL_ADD_V128 : Sequence> { static void Emit(A64Emitter& e, const EmitArgType& i) { - QReg src3(1); + const QReg dest = i.dest.reg(); if (i.src3.is_constant) { - src3 = Q1; - e.LoadConstantV(src3, i.src3.constant()); + e.LoadConstantV(dest.toQ(), i.src3.constant()); } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. 
- src3 = i.src3; - if (i.dest == i.src3) { - e.MOV(Q1.B16(), i.src3.reg().B16()); - src3 = Q1; + // If i.dest != i.src3, move the addition-term into dest for FMLA + if (i.dest != i.src3) { + e.MOV(dest.B16(), i.src3.reg().B16()); } } - // Multiply operation is commutative. - EmitCommutativeBinaryVOp( - e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { - e.FMUL(dest.S4(), src1.S4(), src2.S4()); // $0 = $1 * $2 - }); + QReg src2 = Q2; + if (i.src2.is_constant) { + e.LoadConstantV(src2.toQ(), i.src2.constant()); + } else { + src2 = i.src2.reg(); + } - e.FADD(i.dest.reg().S4(), i.dest.reg().S4(), src3.S4()); + QReg src1 = Q1; + if (i.src1.is_constant) { + e.LoadConstantV(src1.toQ(), i.src1.constant()); + } else { + src1 = i.src1.reg(); + } + + e.FMLA(dest.S4(), src1.S4(), src2.S4()); } }; EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);