[a64] Optimize `OPCODE_MUL_ADD`

Use `FMADD` and `FMLA`
Tests are the same, though it should now run a bit faster.
The tests that still fail appear to be primarily denormals and other
subtle precision issues.

Ex:
```
i> 00002358   - vmaddfp_7298_GEN
!> 00002358 Register v4 assert failed:
!> 00002358   Expected: v4 == [00000000, 00000000, 00000000, 00000000]
!> 00002358     Actual: v4 == [000D000E, 00138014, 000E4CDC, 0018B34D]
!> 00002358     TEST FAILED
```
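
The divergence is consistent with fusion: `FMADD` and `FMLA` round once, while the old `FMUL` + `FADD` pair rounded after each instruction. A standalone host-side sketch of that effect (not emulator code; build with FP contraction disabled, e.g. `-ffp-contract=off`, so the compiler does not fuse the expression itself):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const double a = 1.0 + std::ldexp(1.0, -27);  // 1 + 2^-27
  const double b = 1.0 - std::ldexp(1.0, -27);  // 1 - 2^-27
  // The exact product a * b is 1 - 2^-54, which rounds to exactly 1.0
  // in double precision.
  const double unfused = (a * b) - 1.0;       // two roundings: 0.0
  const double fused = std::fma(a, b, -1.0);  // one rounding: -2^-54
  std::printf("unfused = %g, fused = %g\n", unfused, fused);
  return 0;
}
```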

Host-To-Guest and Guest-To-Host thunks should probably save and restore
the FPCR so the guest's rounding mode and denormal behavior are
preserved across the transition.
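
A rough sketch of what that could look like, with hypothetical helper names (the real thunks would emit the `MRS`/`MSR` instructions directly rather than call into C++):

```cpp
#include <cstdint>
#include <utility>

// AArch64-only sketch: FPCR holds the rounding-mode and
// denormal-flushing bits and is accessed via MRS/MSR.
inline uint64_t ReadFpcr() {
  uint64_t value;
  asm volatile("mrs %0, fpcr" : "=r"(value));
  return value;
}

inline void WriteFpcr(uint64_t value) {
  asm volatile("msr fpcr, %0" : : "r"(value));
}

// Hypothetical host-to-guest transition: stash the host FPCR,
// install the guest configuration, restore on the way back out.
template <typename Fn>
void CallWithGuestFpcr(uint64_t guest_fpcr, Fn&& guest_body) {
  const uint64_t host_fpcr = ReadFpcr();
  WriteFpcr(guest_fpcr);
  std::forward<Fn>(guest_body)();
  WriteFpcr(host_fpcr);
}
```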
Wunkolo 2024-05-12 20:04:47 -07:00
parent 684904c487
commit b9d0752b40
1 changed file with 49 additions and 40 deletions

```diff
@@ -1717,77 +1717,86 @@ EMITTER_OPCODE_TABLE(OPCODE_DIV, DIV_I8, DIV_I16, DIV_I32, DIV_I64, DIV_F32,
 struct MUL_ADD_F32
     : Sequence<MUL_ADD_F32, I<OPCODE_MUL_ADD, F32Op, F32Op, F32Op, F32Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
-    SReg src3(1);
+    SReg src3 = S3;
     if (i.src3.is_constant) {
-      src3 = S1;
       e.LoadConstantV(src3.toQ(), i.src3.constant());
     } else {
-      // If i.dest == i.src3, back up i.src3 so we don't overwrite it.
       src3 = i.src3.reg();
-      if (i.dest.reg().index() == i.src3.reg().index()) {
-        e.FMOV(S1, i.src3);
-        src3 = S1;
-      }
     }
-    // Multiply operation is commutative.
-    EmitCommutativeBinaryVOp<SReg>(
-        e, i, [&i](A64Emitter& e, SReg dest, SReg src1, SReg src2) {
-          e.FMUL(dest, src1, src2);  // $0 = $1 * $2
-        });
+    SReg src2 = S2;
+    if (i.src2.is_constant) {
+      e.LoadConstantV(src2.toQ(), i.src2.constant());
+    } else {
+      src2 = i.src2.reg();
+    }
-    e.FADD(i.dest, i.dest, src3);  // $0 = $1 + $2
+    SReg src1 = S1;
+    if (i.src1.is_constant) {
+      e.LoadConstantV(src1.toQ(), i.src1.constant());
+    } else {
+      src1 = i.src1.reg();
+    }
+    e.FMADD(i.dest, src1, src2, src3);
   }
 };
 struct MUL_ADD_F64
     : Sequence<MUL_ADD_F64, I<OPCODE_MUL_ADD, F64Op, F64Op, F64Op, F64Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
-    DReg src3(1);
+    DReg src3 = D3;
     if (i.src3.is_constant) {
-      src3 = D1;
       e.LoadConstantV(src3.toQ(), i.src3.constant());
     } else {
-      // If i.dest == i.src3, back up i.src3 so we don't overwrite it.
       src3 = i.src3.reg();
-      if (i.dest.reg().index() == i.src3.reg().index()) {
-        e.FMOV(D1, i.src3);
-        src3 = D1;
-      }
     }
-    // Multiply operation is commutative.
-    EmitCommutativeBinaryVOp<DReg>(
-        e, i, [&i](A64Emitter& e, DReg dest, DReg src1, DReg src2) {
-          e.FMUL(dest, src1, src2);  // $0 = $1 * $2
-        });
+    DReg src2 = D2;
+    if (i.src2.is_constant) {
+      e.LoadConstantV(src2.toQ(), i.src2.constant());
+    } else {
+      src2 = i.src2.reg();
+    }
-    e.FADD(i.dest, i.dest, src3);  // $0 = $1 + $2
+    DReg src1 = D1;
+    if (i.src1.is_constant) {
+      e.LoadConstantV(src1.toQ(), i.src1.constant());
+    } else {
+      src1 = i.src1.reg();
+    }
+    e.FMADD(i.dest, src1, src2, src3);
   }
 };
 struct MUL_ADD_V128
     : Sequence<MUL_ADD_V128,
                I<OPCODE_MUL_ADD, V128Op, V128Op, V128Op, V128Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
-    QReg src3(1);
+    const QReg dest = i.dest.reg();
     if (i.src3.is_constant) {
-      src3 = Q1;
-      e.LoadConstantV(src3, i.src3.constant());
+      e.LoadConstantV(dest.toQ(), i.src3.constant());
     } else {
-      // If i.dest == i.src3, back up i.src3 so we don't overwrite it.
-      src3 = i.src3;
-      if (i.dest == i.src3) {
-        e.MOV(Q1.B16(), i.src3.reg().B16());
-        src3 = Q1;
+      // If i.dest != i.src3, move the addition-term into dest for FMLA
+      if (i.dest != i.src3) {
+        e.MOV(dest.B16(), i.src3.reg().B16());
       }
     }
-    // Multiply operation is commutative.
-    EmitCommutativeBinaryVOp(
-        e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) {
-          e.FMUL(dest.S4(), src1.S4(), src2.S4());  // $0 = $1 * $2
-        });
+    QReg src2 = Q2;
+    if (i.src2.is_constant) {
+      e.LoadConstantV(src2.toQ(), i.src2.constant());
+    } else {
+      src2 = i.src2.reg();
+    }
-    e.FADD(i.dest.reg().S4(), i.dest.reg().S4(), src3.S4());
+    QReg src1 = Q1;
+    if (i.src1.is_constant) {
+      e.LoadConstantV(src1.toQ(), i.src1.constant());
+    } else {
+      src1 = i.src1.reg();
+    }
+    e.FMLA(dest.S4(), src1.S4(), src2.S4());
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);
```
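
One detail worth calling out in the V128 path above: `FMLA` is destructive, computing `Vd = Vd + Vn * Vm`, which is why the addend `src3` is moved (or its constant loaded) into the destination register before the instruction is issued. A per-lane scalar model of the emitted sequence (illustrative sketch, not emitter code):

```cpp
#include <cmath>

// Models one 32-bit lane of the new V128 sequence.
float MulAddLane(float src1, float src2, float src3) {
  float dest = src3;                  // MOV dest, src3: addend in place
  dest = std::fma(src1, src2, dest);  // FMLA dest, src1, src2 (fused)
  return dest;
}
```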