[a64] Optimize `OPCODE_MUL_ADD`

Use `FMADD` and `FMLA` to emit a fused multiply-add instead of a separate
`FMUL`/`FADD` pair. The tests are unchanged, though this should now run a bit
faster. The tests that still fail seem to be primarily denormal handling and
other subtle precision issues: a fused multiply-add rounds only once, so its
results can differ in the low bits from a separately rounded multiply and add.

Ex:
```
i> 00002358   - vmaddfp_7298_GEN
!> 00002358 Register v4 assert failed:
!> 00002358   Expected: v4 == [00000000, 00000000, 00000000, 00000000]
!> 00002358     Actual: v4 == [000D000E, 00138014, 000E4CDC, 0018B34D]
!> 00002358     TEST FAILED
```
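
For context on the kind of delta involved, here is a minimal, self-contained C++ sketch of single versus double rounding, using made-up values (not from the test suite):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const double a = 1.0 + 0x1p-30;  // exactly representable
  const double b = 1.0 - 0x1p-30;  // exactly representable
  const double c = -1.0;
  // a * b == 1 - 2^-60 exactly, which rounds to 1.0 in double precision,
  // so the separately rounded multiply-then-add loses the low bits:
  std::printf("separate: %a\n", a * b + c);           // 0x0p+0
  // std::fma rounds once, after the exact multiply-add:
  std::printf("fused:    %a\n", std::fma(a, b, c));   // -0x1p-60
  return 0;
}
```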

Host-to-guest and guest-to-host thunks should probably preserve and restore
the FPCR so the guest's rounding and denormal settings survive the transition.
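
A minimal sketch of that idea (hypothetical helper names, GCC/Clang inline assembly, AArch64 only; not the project's actual thunk code):

```cpp
#include <cstdint>

// Hypothetical helpers; on AArch64 the FPCR holds the rounding mode and
// flush-to-zero/denormal controls and is readable/writable from user mode.
inline uint64_t ReadFpcr() {
  uint64_t fpcr;
  asm volatile("mrs %0, fpcr" : "=r"(fpcr));
  return fpcr;
}

inline void WriteFpcr(uint64_t fpcr) {
  asm volatile("msr fpcr, %0" : : "r"(fpcr));
}

// Sketch of a guest-to-host transition: save the guest FPCR before the
// host call and restore it before resuming emitted code.
template <typename Fn>
void CallHostPreservingFpcr(Fn&& fn) {
  const uint64_t guest_fpcr = ReadFpcr();
  fn();                   // host code may change rounding/denormal modes
  WriteFpcr(guest_fpcr);  // restore the guest's settings
}
```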
Wunkolo 2024-05-12 20:04:47 -07:00
parent 684904c487
commit b9d0752b40
1 changed file with 49 additions and 40 deletions

```
@@ -1717,77 +1717,86 @@ EMITTER_OPCODE_TABLE(OPCODE_DIV, DIV_I8, DIV_I16, DIV_I32, DIV_I64, DIV_F32,
 struct MUL_ADD_F32
     : Sequence<MUL_ADD_F32, I<OPCODE_MUL_ADD, F32Op, F32Op, F32Op, F32Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
-    SReg src3(1);
+    SReg src3 = S3;
     if (i.src3.is_constant) {
-      src3 = S1;
       e.LoadConstantV(src3.toQ(), i.src3.constant());
     } else {
-      // If i.dest == i.src3, back up i.src3 so we don't overwrite it.
       src3 = i.src3.reg();
-      if (i.dest.reg().index() == i.src3.reg().index()) {
-        e.FMOV(S1, i.src3);
-        src3 = S1;
-      }
     }
-    // Multiply operation is commutative.
-    EmitCommutativeBinaryVOp<SReg>(
-        e, i, [&i](A64Emitter& e, SReg dest, SReg src1, SReg src2) {
-          e.FMUL(dest, src1, src2);  // $0 = $1 * $2
-        });
-    e.FADD(i.dest, i.dest, src3);  // $0 = $1 + $2
+    SReg src2 = S2;
+    if (i.src2.is_constant) {
+      e.LoadConstantV(src2.toQ(), i.src2.constant());
+    } else {
+      src2 = i.src2.reg();
+    }
+    SReg src1 = S1;
+    if (i.src1.is_constant) {
+      e.LoadConstantV(src1.toQ(), i.src1.constant());
+    } else {
+      src1 = i.src1.reg();
+    }
+
+    e.FMADD(i.dest, src1, src2, src3);
   }
 };
 struct MUL_ADD_F64
     : Sequence<MUL_ADD_F64, I<OPCODE_MUL_ADD, F64Op, F64Op, F64Op, F64Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
-    DReg src3(1);
+    DReg src3 = D3;
     if (i.src3.is_constant) {
-      src3 = D1;
       e.LoadConstantV(src3.toQ(), i.src3.constant());
     } else {
-      // If i.dest == i.src3, back up i.src3 so we don't overwrite it.
       src3 = i.src3.reg();
-      if (i.dest.reg().index() == i.src3.reg().index()) {
-        e.FMOV(D1, i.src3);
-        src3 = D1;
-      }
     }
-    // Multiply operation is commutative.
-    EmitCommutativeBinaryVOp<DReg>(
-        e, i, [&i](A64Emitter& e, DReg dest, DReg src1, DReg src2) {
-          e.FMUL(dest, src1, src2);  // $0 = $1 * $2
-        });
-    e.FADD(i.dest, i.dest, src3);  // $0 = $1 + $2
+    DReg src2 = D2;
+    if (i.src2.is_constant) {
+      e.LoadConstantV(src2.toQ(), i.src2.constant());
+    } else {
+      src2 = i.src2.reg();
+    }
+    DReg src1 = D1;
+    if (i.src1.is_constant) {
+      e.LoadConstantV(src1.toQ(), i.src1.constant());
+    } else {
+      src1 = i.src1.reg();
+    }
+
+    e.FMADD(i.dest, src1, src2, src3);
   }
 };
 struct MUL_ADD_V128
     : Sequence<MUL_ADD_V128,
                I<OPCODE_MUL_ADD, V128Op, V128Op, V128Op, V128Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
-    QReg src3(1);
+    const QReg dest = i.dest.reg();
     if (i.src3.is_constant) {
-      src3 = Q1;
-      e.LoadConstantV(src3, i.src3.constant());
+      e.LoadConstantV(dest.toQ(), i.src3.constant());
     } else {
-      // If i.dest == i.src3, back up i.src3 so we don't overwrite it.
-      src3 = i.src3;
-      if (i.dest == i.src3) {
-        e.MOV(Q1.B16(), i.src3.reg().B16());
-        src3 = Q1;
+      // If i.dest != i.src3, move the addition-term into dest for FMLA
+      if (i.dest != i.src3) {
+        e.MOV(dest.B16(), i.src3.reg().B16());
       }
     }
-    // Multiply operation is commutative.
-    EmitCommutativeBinaryVOp(
-        e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) {
-          e.FMUL(dest.S4(), src1.S4(), src2.S4());  // $0 = $1 * $2
-        });
-    e.FADD(i.dest.reg().S4(), i.dest.reg().S4(), src3.S4());
+    QReg src2 = Q2;
+    if (i.src2.is_constant) {
+      e.LoadConstantV(src2.toQ(), i.src2.constant());
+    } else {
+      src2 = i.src2.reg();
+    }
+    QReg src1 = Q1;
+    if (i.src1.is_constant) {
+      e.LoadConstantV(src1.toQ(), i.src1.constant());
+    } else {
+      src1 = i.src1.reg();
+    }
+
+    e.FMLA(dest.S4(), src1.S4(), src2.S4());
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);
```
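
For reference, `FMADD Sd, Sn, Sm, Sa` computes `Sd = Sa + Sn * Sm` with a single rounding, while `FMLA Vd.4S, Vn.4S, Vm.4S` accumulates into its destination lane-wise; that accumulate-in-place form is why `MUL_ADD_V128` moves `src3` into `dest` before issuing the multiply. A plain C++ sketch of the per-instruction semantics (assuming round-to-nearest and ignoring FPCR modes; not emitter code):

```cpp
#include <cmath>

// FMADD d, n, m, a : d = a + n * m, rounded once (scalar form).
float Fmadd(float n, float m, float a) { return std::fma(n, m, a); }

// FMLA vd, vn, vm : vd[i] = vd[i] + vn[i] * vm[i] per lane, rounded once.
// The accumulator is the destination register itself, so the addition
// term has to be placed in the destination before the instruction runs.
void Fmla(float vd[4], const float vn[4], const float vm[4]) {
  for (int i = 0; i < 4; ++i) {
    vd[i] = std::fma(vn[i], vm[i], vd[i]);
  }
}
```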