JitArm64: Prefer using FMOV when doing single/double conversion

FMOV is faster than INS and ties with UMOV. (On all CPUs I checked,
at least. It certainly shouldn't be slower, though.)
This commit is contained in:
JosJuice 2021-05-15 18:56:40 +02:00
parent 41befc21cd
commit 8c12068a03
2 changed files with 14 additions and 17 deletions

View File

@ -447,9 +447,9 @@ void JitArm64::ConvertDoubleToSingleLower(size_t guest_reg, ARM64Reg dest_reg, A
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 30};
ABI_PushRegisters(gpr_saved);
m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 0);
m_float_emit.FMOV(ARM64Reg::X0, EncodeRegToDouble(src_reg));
BL(cdts);
m_float_emit.INS(32, dest_reg, 0, ARM64Reg::W1);
m_float_emit.FMOV(EncodeRegToSingle(dest_reg), ARM64Reg::W1);
ABI_PopRegisters(gpr_saved);
}
@ -467,11 +467,10 @@ void JitArm64::ConvertDoubleToSinglePair(size_t guest_reg, ARM64Reg dest_reg, AR
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 30};
ABI_PushRegisters(gpr_saved);
m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 0);
m_float_emit.FMOV(ARM64Reg::X0, EncodeRegToDouble(src_reg));
BL(cdts);
m_float_emit.INS(32, dest_reg, 0, ARM64Reg::W1);
m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 1);
m_float_emit.FMOV(EncodeRegToSingle(dest_reg), ARM64Reg::W1);
BL(cdts);
m_float_emit.INS(32, dest_reg, 1, ARM64Reg::W1);
@ -517,9 +516,9 @@ void JitArm64::ConvertSingleToDoubleLower(size_t guest_reg, ARM64Reg dest_reg, A
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
ABI_PushRegisters(gpr_saved);
m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 0);
m_float_emit.FMOV(ARM64Reg::W0, EncodeRegToSingle(src_reg));
BL(cstd);
m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
m_float_emit.FMOV(EncodeRegToDouble(dest_reg), ARM64Reg::X1);
ABI_PopRegisters(gpr_saved);
@ -588,17 +587,15 @@ void JitArm64::ConvertSingleToDoublePair(size_t guest_reg, ARM64Reg dest_reg, AR
// If no (or if we don't have a scratch register), call the bit-exact routine
// Save X0-X4 and X30 if they're in use
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
ABI_PushRegisters(gpr_saved);
m_float_emit.FMOV(ARM64Reg::W0, EncodeRegToSingle(src_reg));
BL(cstd);
m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 1);
m_float_emit.FMOV(EncodeRegToDouble(dest_reg), ARM64Reg::X1);
BL(cstd);
m_float_emit.INS(64, dest_reg, 1, ARM64Reg::X0);
m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 0);
BL(cstd);
m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
m_float_emit.INS(64, dest_reg, 1, ARM64Reg::X1);
ABI_PopRegisters(gpr_saved);

View File

@ -239,7 +239,7 @@ void JitArm64::GenerateConvertDoubleToSingle()
RET();
}
// Input in W0, output in X0, clobbers X0-X4 and flags.
// Input in W0, output in X1, clobbers X0-X4 and flags.
void JitArm64::GenerateConvertSingleToDouble()
{
UBFX(ARM64Reg::W1, ARM64Reg::W0, 23, 8);
@ -249,7 +249,7 @@ void JitArm64::GenerateConvertSingleToDouble()
FixupBranch denormal = CBNZ(ARM64Reg::W1);
// Zero
LSL(ARM64Reg::X0, ARM64Reg::X0, 32);
LSL(ARM64Reg::X1, ARM64Reg::X0, 32);
RET();
SetJumpTarget(denormal);
@ -262,7 +262,7 @@ void JitArm64::GenerateConvertSingleToDouble()
LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X3);
BFI(ARM64Reg::X2, ARM64Reg::X1, 30, 22);
MOVI2R(ARM64Reg::X1, 0x3a90000000000000);
ADD(ARM64Reg::X0, ARM64Reg::X2, ARM64Reg::X1);
ADD(ARM64Reg::X1, ARM64Reg::X2, ARM64Reg::X1);
RET();
SetJumpTarget(normal_or_nan);
@ -277,7 +277,7 @@ void JitArm64::GenerateConvertSingleToDouble()
CMP(ARM64Reg::W2, 0);
CSEL(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::ZR, CCFlags::CC_NEQ);
BFI(ARM64Reg::X3, ARM64Reg::X4, 29, 30);
ORR(ARM64Reg::X0, ARM64Reg::X3, ARM64Reg::X1);
ORR(ARM64Reg::X1, ARM64Reg::X3, ARM64Reg::X1);
RET();
}