JitArm64: Optimize ConvertSingleToDouble, part 2

If we can prove that FCVT will provide a correct conversion,
we can use FCVT. This makes the common case a bit faster
and the less likely cases (unfortunately including zero,
which FCVT actually can convert correctly) a bit slower.
This commit is contained in:
JosJuice 2021-02-01 22:14:16 +01:00
parent 018e247624
commit 1d106ceaf5
6 changed files with 139 additions and 14 deletions

View File

@ -3601,6 +3601,14 @@ void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn);
}
// Emits FACGE (floating-point absolute compare greater than or equal) for
// the given element size through EmitThreeSame with opcode 0x1D.
// The size field passed on is size >> 6, i.e. 0 for size 32 and 1 for size 64.
void ARM64FloatEmitter::FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
  const int size_field = size >> 6;
  EmitThreeSame(1, size_field, 0x1D, Rd, Rn, Rm);
}
// Emits FACGT (floating-point absolute compare greater than) for the given
// element size through EmitThreeSame with opcode 0x1D.
// Differs from FACGE only in the size field, which additionally has bit 1 set:
// 2 | (size >> 6), i.e. 2 for size 32 and 3 for size 64.
void ARM64FloatEmitter::FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
  const int size_field = 2 | (size >> 6);
  EmitThreeSame(1, size_field, 0x1D, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
{

View File

@ -1094,6 +1094,8 @@ public:
void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
// Conditional select
void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);

View File

@ -154,8 +154,10 @@ public:
void ConvertDoubleToSingleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
void ConvertDoubleToSinglePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg,
Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg,
Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
private:
struct SlowmemHandler
@ -189,14 +191,18 @@ private:
nearcode = GetWritableCodePtr();
SetCodePtrUnsafe(farcode.GetWritableCodePtr());
AlignCode16();
m_in_farcode = true;
}
void SwitchToNearCode()
{
farcode.SetCodePtrUnsafe(GetWritableCodePtr());
SetCodePtrUnsafe(nearcode);
m_in_farcode = false;
}
// Reports the m_in_farcode flag, which SwitchToNearCode() clears.
bool IsInFarCode() const
{
  return m_in_farcode;
}
// Dump a memory range of code
void DumpCode(const u8* start, const u8* end);
@ -262,6 +268,7 @@ private:
Arm64Gen::ARM64CodeBlock farcode;
u8* nearcode; // Backed up when we switch to far code.
bool m_in_farcode = false;
bool m_enable_blr_optimization;
bool m_cleanup_after_stackfault = false;

View File

@ -421,10 +421,35 @@ void JitArm64::ConvertDoubleToSinglePair(ARM64Reg dest_reg, ARM64Reg src_reg)
ABI_PopRegisters(gpr_saved);
}
void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg)
void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg)
{
ASSERT(scratch_reg != src_reg);
const bool switch_to_farcode = !IsInFarCode();
FlushCarry();
// Do we know that the input isn't NaN, and that the input isn't denormal or FPCR.FZ is not set?
// (This check unfortunately also catches zeroes)
FixupBranch fast;
if (scratch_reg != ARM64Reg::INVALID_REG)
{
m_float_emit.FABS(EncodeRegToSingle(scratch_reg), EncodeRegToSingle(src_reg));
m_float_emit.FCMP(EncodeRegToSingle(scratch_reg));
fast = B(CCFlags::CC_GT);
if (switch_to_farcode)
{
FixupBranch slow = B();
SwitchToFarCode();
SetJumpTarget(slow);
}
}
// If no (or if we don't have a scratch register), call the bit-exact routine
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
ABI_PushRegisters(gpr_saved);
@ -433,12 +458,65 @@ void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg)
m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
ABI_PopRegisters(gpr_saved);
// If yes, do a fast conversion with FCVT
if (scratch_reg != ARM64Reg::INVALID_REG)
{
FixupBranch continue1 = B();
if (switch_to_farcode)
SwitchToNearCode();
SetJumpTarget(fast);
m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
SetJumpTarget(continue1);
}
}
void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg)
void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg)
{
ASSERT(scratch_reg != src_reg);
const bool switch_to_farcode = !IsInFarCode();
FlushCarry();
// Do we know that neither input is NaN, and that neither input is denormal or FPCR.FZ is not set?
// (This check unfortunately also catches zeroes)
FixupBranch fast;
if (scratch_reg != ARM64Reg::INVALID_REG)
{
// Set each 32-bit element of scratch_reg to 0x0000'0000 or 0xFFFF'FFFF depending on whether
// the absolute value of the corresponding element in src_reg compares greater than 0
m_float_emit.MOVI(8, EncodeRegToDouble(scratch_reg), 0);
m_float_emit.FACGT(32, EncodeRegToDouble(scratch_reg), EncodeRegToDouble(src_reg),
EncodeRegToDouble(scratch_reg));
// 0x0000'0000'0000'0000 (zero) -> 0x0000'0000'0000'0000 (zero)
// 0x0000'0000'FFFF'FFFF (denormal) -> 0xFF00'0000'FFFF'FFFF (normal)
// 0xFFFF'FFFF'0000'0000 (NaN) -> 0x00FF'FFFF'0000'0000 (normal)
// 0xFFFF'FFFF'FFFF'FFFF (NaN) -> 0xFFFF'FFFF'FFFF'FFFF (NaN)
m_float_emit.INS(8, EncodeRegToDouble(scratch_reg), 7, EncodeRegToDouble(scratch_reg), 0);
// Is scratch_reg a NaN (0xFFFF'FFFF'FFFF'FFFF)?
m_float_emit.FCMP(EncodeRegToDouble(scratch_reg));
fast = B(CCFlags::CC_VS);
if (switch_to_farcode)
{
FixupBranch slow = B();
SwitchToFarCode();
SetJumpTarget(slow);
}
}
// If no (or if we don't have a scratch register), call the bit-exact routine
// Save X0-X4 and X30 if they're in use
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
ABI_PushRegisters(gpr_saved);
@ -452,4 +530,19 @@ void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg)
m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
ABI_PopRegisters(gpr_saved);
// If yes, do a fast conversion with FCVTL
if (scratch_reg != ARM64Reg::INVALID_REG)
{
FixupBranch continue1 = B();
if (switch_to_farcode)
SwitchToNearCode();
SetJumpTarget(fast);
m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
SetJumpTarget(continue1);
}
}

View File

@ -468,7 +468,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
return host_reg;
// Else convert this register back to doubles.
m_jit->ConvertSingleToDoublePair(host_reg, host_reg);
const ARM64Reg tmp_reg = GetReg();
m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg);
UnlockRegister(tmp_reg);
reg.Load(host_reg, RegType::Register);
[[fallthrough]];
}
@ -483,7 +486,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
return host_reg;
// Else convert this register back to a double.
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg);
const ARM64Reg tmp_reg = GetReg();
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg);
UnlockRegister(tmp_reg);
reg.Load(host_reg, RegType::LowerPair);
[[fallthrough]];
}
@ -517,7 +523,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
return host_reg;
}
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg);
const ARM64Reg tmp_reg = GetReg();
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg);
UnlockRegister(tmp_reg);
reg.Load(host_reg, RegType::Duplicated);
[[fallthrough]];
}
@ -594,7 +603,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
{
case RegType::Single:
flush_reg = GetReg();
m_jit->ConvertSingleToDoublePair(flush_reg, host_reg);
m_jit->ConvertSingleToDoublePair(flush_reg, host_reg, flush_reg);
[[fallthrough]];
case RegType::Register:
// We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit
@ -605,7 +614,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
break;
case RegType::DuplicatedSingle:
flush_reg = GetReg();
m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg);
m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg, flush_reg);
[[fallthrough]];
case RegType::Duplicated:
// Store PSR1 (which is equal to PSR0) in memory.
@ -709,17 +718,20 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
const bool dirty = reg.IsDirty();
RegType type = reg.GetType();
// If FlushRegister calls GetReg with all registers locked, we can get infinite recursion
const ARM64Reg tmp_reg = GetUnlockedRegisterCount() > 0 ? GetReg() : ARM64Reg::INVALID_REG;
// If we're in single mode, just convert it back to a double.
if (type == RegType::Single)
{
if (dirty)
m_jit->ConvertSingleToDoublePair(host_reg, host_reg);
m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg);
type = RegType::Register;
}
if (type == RegType::DuplicatedSingle || type == RegType::LowerPairSingle)
{
if (dirty)
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg);
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg);
if (type == RegType::DuplicatedSingle)
type = RegType::Duplicated;
@ -771,6 +783,9 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
reg.Flush();
}
}
if (tmp_reg != ARM64Reg::INVALID_REG)
UnlockRegister(tmp_reg);
}
void Arm64FPRCache::FlushRegisters(BitSet32 regs, bool maintain_state)

View File

@ -168,6 +168,9 @@ public:
void UpdateLastUsed(BitSet32 regs_used);
// Get available host registers
u32 GetUnlockedRegisterCount() const;
// Locks a register so a cache cannot use it
// Useful for function calls
template <typename T = Arm64Gen::ARM64Reg, typename... Args>
@ -211,9 +214,6 @@ protected:
void DiscardRegister(size_t preg);
virtual void FlushRegister(size_t preg, bool maintain_state) = 0;
// Get available host registers
u32 GetUnlockedRegisterCount() const;
void IncrementAllUsed()
{
for (auto& reg : m_guest_registers)