JitArm64: Optimize ConvertSingleToDouble, part 2

If we can prove that FCVT will provide a correct conversion,
we can use FCVT. This makes the common case a bit faster
and the less likely cases (unfortunately including zero,
which FCVT actually can convert correctly) a bit slower.
This commit is contained in:
JosJuice 2021-02-01 22:14:16 +01:00
parent 018e247624
commit 1d106ceaf5
6 changed files with 139 additions and 14 deletions

View File

@ -3601,6 +3601,14 @@ void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{ {
Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn); Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn);
} }
// Emits the vector FACGE instruction (by its A64 mnemonic, a floating-point
// absolute-value compare, greater than or equal) for Rd, Rn and Rm.
// `size` is the element width in bits - presumably 32 or 64; confirm against callers.
void ARM64FloatEmitter::FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
  // 32 >> 6 == 0 and 64 >> 6 == 1, so only the low bit of the size field is
  // ever set here; that is what distinguishes this encoding from FACGT.
  const auto size_field = size >> 6;
  EmitThreeSame(1, size_field, 0x1D, Rd, Rn, Rm);
}
// Emits the vector FACGT instruction (by its A64 mnemonic, a floating-point
// absolute-value compare, strictly greater than) for Rd, Rn and Rm.
// `size` is the element width in bits - presumably 32 or 64; confirm against callers.
void ARM64FloatEmitter::FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
  // Same call as FACGE except that bit 1 of the size field is always set;
  // bit 0 is size >> 6 (0 for 32, 1 for 64).
  const auto size_field = 2 | (size >> 6);
  EmitThreeSame(1, size_field, 0x1D, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
{ {

View File

@ -1094,6 +1094,8 @@ public:
void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
// Conditional select // Conditional select
void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);

View File

@ -154,8 +154,10 @@ public:
void ConvertDoubleToSingleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); void ConvertDoubleToSingleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
void ConvertDoubleToSinglePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); void ConvertDoubleToSinglePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg,
void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg,
Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
private: private:
struct SlowmemHandler struct SlowmemHandler
@ -189,14 +191,18 @@ private:
nearcode = GetWritableCodePtr(); nearcode = GetWritableCodePtr();
SetCodePtrUnsafe(farcode.GetWritableCodePtr()); SetCodePtrUnsafe(farcode.GetWritableCodePtr());
AlignCode16(); AlignCode16();
m_in_farcode = true;
} }
void SwitchToNearCode() void SwitchToNearCode()
{ {
farcode.SetCodePtrUnsafe(GetWritableCodePtr()); farcode.SetCodePtrUnsafe(GetWritableCodePtr());
SetCodePtrUnsafe(nearcode); SetCodePtrUnsafe(nearcode);
m_in_farcode = false;
} }
// Returns the m_in_farcode flag: true while emission has been redirected to the
// far code buffer, false once SwitchToNearCode() has restored the near pointer.
bool IsInFarCode() const
{
  return m_in_farcode;
}
// Dump a memory range of code // Dump a memory range of code
void DumpCode(const u8* start, const u8* end); void DumpCode(const u8* start, const u8* end);
@ -262,6 +268,7 @@ private:
Arm64Gen::ARM64CodeBlock farcode; Arm64Gen::ARM64CodeBlock farcode;
u8* nearcode; // Backed up when we switch to far code. u8* nearcode; // Backed up when we switch to far code.
bool m_in_farcode = false;
bool m_enable_blr_optimization; bool m_enable_blr_optimization;
bool m_cleanup_after_stackfault = false; bool m_cleanup_after_stackfault = false;

View File

@ -421,10 +421,35 @@ void JitArm64::ConvertDoubleToSinglePair(ARM64Reg dest_reg, ARM64Reg src_reg)
ABI_PopRegisters(gpr_saved); ABI_PopRegisters(gpr_saved);
} }
void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg) void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg)
{ {
ASSERT(scratch_reg != src_reg);
const bool switch_to_farcode = !IsInFarCode();
FlushCarry(); FlushCarry();
// Do we know that the input isn't NaN, and that the input isn't denormal or FPCR.FZ is not set?
// (This check unfortunately also catches zeroes)
FixupBranch fast;
if (scratch_reg != ARM64Reg::INVALID_REG)
{
m_float_emit.FABS(EncodeRegToSingle(scratch_reg), EncodeRegToSingle(src_reg));
m_float_emit.FCMP(EncodeRegToSingle(scratch_reg));
fast = B(CCFlags::CC_GT);
if (switch_to_farcode)
{
FixupBranch slow = B();
SwitchToFarCode();
SetJumpTarget(slow);
}
}
// If no (or if we don't have a scratch register), call the bit-exact routine
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30}; const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
ABI_PushRegisters(gpr_saved); ABI_PushRegisters(gpr_saved);
@ -433,12 +458,65 @@ void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg)
m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0); m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
ABI_PopRegisters(gpr_saved); ABI_PopRegisters(gpr_saved);
// If yes, do a fast conversion with FCVT
if (scratch_reg != ARM64Reg::INVALID_REG)
{
FixupBranch continue1 = B();
if (switch_to_farcode)
SwitchToNearCode();
SetJumpTarget(fast);
m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
SetJumpTarget(continue1);
}
} }
void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg) void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg)
{ {
ASSERT(scratch_reg != src_reg);
const bool switch_to_farcode = !IsInFarCode();
FlushCarry(); FlushCarry();
// Do we know that neither input is NaN, and that neither input is denormal or FPCR.FZ is not set?
// (This check unfortunately also catches zeroes)
FixupBranch fast;
if (scratch_reg != ARM64Reg::INVALID_REG)
{
// Set each 32-bit element of scratch_reg to 0x0000'0000 or 0xFFFF'FFFF depending on whether
// the absolute value of the corresponding element in src_reg compares greater than 0
m_float_emit.MOVI(8, EncodeRegToDouble(scratch_reg), 0);
m_float_emit.FACGT(32, EncodeRegToDouble(scratch_reg), EncodeRegToDouble(src_reg),
EncodeRegToDouble(scratch_reg));
// 0x0000'0000'0000'0000 (zero) -> 0x0000'0000'0000'0000 (zero)
// 0x0000'0000'FFFF'FFFF (denormal) -> 0xFF00'0000'FFFF'FFFF (normal)
// 0xFFFF'FFFF'0000'0000 (NaN) -> 0x00FF'FFFF'0000'0000 (normal)
// 0xFFFF'FFFF'FFFF'FFFF (NaN) -> 0xFFFF'FFFF'FFFF'FFFF (NaN)
m_float_emit.INS(8, EncodeRegToDouble(scratch_reg), 7, EncodeRegToDouble(scratch_reg), 0);
// Is scratch_reg a NaN (0xFFFF'FFFF'FFFF'FFFF)?
m_float_emit.FCMP(EncodeRegToDouble(scratch_reg));
fast = B(CCFlags::CC_VS);
if (switch_to_farcode)
{
FixupBranch slow = B();
SwitchToFarCode();
SetJumpTarget(slow);
}
}
// If no (or if we don't have a scratch register), call the bit-exact routine
// Save X0-X4 and X30 if they're in use // Save X0-X4 and X30 if they're in use
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30}; const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
ABI_PushRegisters(gpr_saved); ABI_PushRegisters(gpr_saved);
@ -452,4 +530,19 @@ void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg)
m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0); m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
ABI_PopRegisters(gpr_saved); ABI_PopRegisters(gpr_saved);
// If yes, do a fast conversion with FCVTL
if (scratch_reg != ARM64Reg::INVALID_REG)
{
FixupBranch continue1 = B();
if (switch_to_farcode)
SwitchToNearCode();
SetJumpTarget(fast);
m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
SetJumpTarget(continue1);
}
} }

View File

@ -468,7 +468,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
return host_reg; return host_reg;
// Else convert this register back to doubles. // Else convert this register back to doubles.
m_jit->ConvertSingleToDoublePair(host_reg, host_reg); const ARM64Reg tmp_reg = GetReg();
m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg);
UnlockRegister(tmp_reg);
reg.Load(host_reg, RegType::Register); reg.Load(host_reg, RegType::Register);
[[fallthrough]]; [[fallthrough]];
} }
@ -483,7 +486,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
return host_reg; return host_reg;
// Else convert this register back to a double. // Else convert this register back to a double.
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg); const ARM64Reg tmp_reg = GetReg();
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg);
UnlockRegister(tmp_reg);
reg.Load(host_reg, RegType::LowerPair); reg.Load(host_reg, RegType::LowerPair);
[[fallthrough]]; [[fallthrough]];
} }
@ -517,7 +523,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
return host_reg; return host_reg;
} }
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg); const ARM64Reg tmp_reg = GetReg();
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg);
UnlockRegister(tmp_reg);
reg.Load(host_reg, RegType::Duplicated); reg.Load(host_reg, RegType::Duplicated);
[[fallthrough]]; [[fallthrough]];
} }
@ -594,7 +603,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
{ {
case RegType::Single: case RegType::Single:
flush_reg = GetReg(); flush_reg = GetReg();
m_jit->ConvertSingleToDoublePair(flush_reg, host_reg); m_jit->ConvertSingleToDoublePair(flush_reg, host_reg, flush_reg);
[[fallthrough]]; [[fallthrough]];
case RegType::Register: case RegType::Register:
// We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit // We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit
@ -605,7 +614,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
break; break;
case RegType::DuplicatedSingle: case RegType::DuplicatedSingle:
flush_reg = GetReg(); flush_reg = GetReg();
m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg); m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg, flush_reg);
[[fallthrough]]; [[fallthrough]];
case RegType::Duplicated: case RegType::Duplicated:
// Store PSR1 (which is equal to PSR0) in memory. // Store PSR1 (which is equal to PSR0) in memory.
@ -709,17 +718,20 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
const bool dirty = reg.IsDirty(); const bool dirty = reg.IsDirty();
RegType type = reg.GetType(); RegType type = reg.GetType();
// If FlushRegister calls GetReg with all registers locked, we can get infinite recursion
const ARM64Reg tmp_reg = GetUnlockedRegisterCount() > 0 ? GetReg() : ARM64Reg::INVALID_REG;
// If we're in single mode, just convert it back to a double. // If we're in single mode, just convert it back to a double.
if (type == RegType::Single) if (type == RegType::Single)
{ {
if (dirty) if (dirty)
m_jit->ConvertSingleToDoublePair(host_reg, host_reg); m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg);
type = RegType::Register; type = RegType::Register;
} }
if (type == RegType::DuplicatedSingle || type == RegType::LowerPairSingle) if (type == RegType::DuplicatedSingle || type == RegType::LowerPairSingle)
{ {
if (dirty) if (dirty)
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg); m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg);
if (type == RegType::DuplicatedSingle) if (type == RegType::DuplicatedSingle)
type = RegType::Duplicated; type = RegType::Duplicated;
@ -771,6 +783,9 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
reg.Flush(); reg.Flush();
} }
} }
if (tmp_reg != ARM64Reg::INVALID_REG)
UnlockRegister(tmp_reg);
} }
void Arm64FPRCache::FlushRegisters(BitSet32 regs, bool maintain_state) void Arm64FPRCache::FlushRegisters(BitSet32 regs, bool maintain_state)

View File

@ -168,6 +168,9 @@ public:
void UpdateLastUsed(BitSet32 regs_used); void UpdateLastUsed(BitSet32 regs_used);
// Get available host registers
u32 GetUnlockedRegisterCount() const;
// Locks a register so a cache cannot use it // Locks a register so a cache cannot use it
// Useful for function calls // Useful for function calls
template <typename T = Arm64Gen::ARM64Reg, typename... Args> template <typename T = Arm64Gen::ARM64Reg, typename... Args>
@ -211,9 +214,6 @@ protected:
void DiscardRegister(size_t preg); void DiscardRegister(size_t preg);
virtual void FlushRegister(size_t preg, bool maintain_state) = 0; virtual void FlushRegister(size_t preg, bool maintain_state) = 0;
// Get available host registers
u32 GetUnlockedRegisterCount() const;
void IncrementAllUsed() void IncrementAllUsed()
{ {
for (auto& reg : m_guest_registers) for (auto& reg : m_guest_registers)