JitArm64: Optimize ConvertSingleToDouble, part 2
If we can prove that FCVT will provide a correct conversion, we use FCVT instead of calling the bit-exact conversion routine. This makes the common case a bit faster and the less likely cases (unfortunately including zero, which FCVT actually can convert correctly) a bit slower.
parent 018e247624
commit 1d106ceaf5

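To make the idea concrete, here is a rough host-side sketch of the dispatch (not the emitted ARM64 code; SafeForPlainCast and ConvertToDoubleBitExact are illustrative names, the latter standing in for the existing bit-exact routine):

#include <cmath>

// Models the check the JIT emits with FABS + FCMP + B.GT: NaNs compare
// unordered and zeroes (or denormals flushed by FPCR.FZ) compare equal to
// zero, so all of those fall through to the slow path.
static bool SafeForPlainCast(float x)
{
  return std::fabs(x) > 0.0f;
}

// ConvertToDoubleBitExact stands in for the bit-exact routine the JIT calls
// on the slow path; it is assumed here rather than defined by this commit.
double ConvertSingleToDouble(float x, double (*ConvertToDoubleBitExact)(float))
{
  if (SafeForPlainCast(x))
    return static_cast<double>(x);  // the emitted fast path is a single FCVT
  return ConvertToDoubleBitExact(x);
}

The fast path is only taken when a scratch register is available; without one, the helpers keep calling the bit-exact routine unconditionally, as before.
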
@@ -3601,6 +3601,14 @@ void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
 {
   Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn);
 }
+void ARM64FloatEmitter::FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EmitThreeSame(1, size >> 6, 0x1D, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EmitThreeSame(1, 2 | (size >> 6), 0x1D, Rd, Rn, Rm);
+}
 
 void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
 {

@@ -1094,6 +1094,8 @@ public:
   void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
   void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
   void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+  void FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+  void FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 
   // Conditional select
   void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);

@@ -154,8 +154,10 @@ public:
 
   void ConvertDoubleToSingleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
   void ConvertDoubleToSinglePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
-  void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
-  void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
+  void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg,
+                                  Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
+  void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg,
+                                 Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
 
 private:
   struct SlowmemHandler

@@ -189,14 +191,18 @@ private:
     nearcode = GetWritableCodePtr();
     SetCodePtrUnsafe(farcode.GetWritableCodePtr());
     AlignCode16();
+    m_in_farcode = true;
   }
 
   void SwitchToNearCode()
   {
     farcode.SetCodePtrUnsafe(GetWritableCodePtr());
     SetCodePtrUnsafe(nearcode);
+    m_in_farcode = false;
   }
 
+  bool IsInFarCode() const { return m_in_farcode; }
+
   // Dump a memory range of code
   void DumpCode(const u8* start, const u8* end);
 

@@ -262,6 +268,7 @@ private:
 
   Arm64Gen::ARM64CodeBlock farcode;
   u8* nearcode; // Backed up when we switch to far code.
+  bool m_in_farcode = false;
 
   bool m_enable_blr_optimization;
   bool m_cleanup_after_stackfault = false;

@@ -421,10 +421,35 @@ void JitArm64::ConvertDoubleToSinglePair(ARM64Reg dest_reg, ARM64Reg src_reg)
   ABI_PopRegisters(gpr_saved);
 }
 
-void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg)
+void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg)
 {
+  ASSERT(scratch_reg != src_reg);
+
+  const bool switch_to_farcode = !IsInFarCode();
+
   FlushCarry();
 
+  // Do we know that the input isn't NaN, and that the input isn't denormal or FPCR.FZ is not set?
+  // (This check unfortunately also catches zeroes)
+
+  FixupBranch fast;
+  if (scratch_reg != ARM64Reg::INVALID_REG)
+  {
+    m_float_emit.FABS(EncodeRegToSingle(scratch_reg), EncodeRegToSingle(src_reg));
+    m_float_emit.FCMP(EncodeRegToSingle(scratch_reg));
+    fast = B(CCFlags::CC_GT);
+
+    if (switch_to_farcode)
+    {
+      FixupBranch slow = B();
+
+      SwitchToFarCode();
+      SetJumpTarget(slow);
+    }
+  }
+
+  // If no (or if we don't have a scratch register), call the bit-exact routine
+
   const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
   ABI_PushRegisters(gpr_saved);

@@ -433,12 +458,65 @@ void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg)
   m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
 
   ABI_PopRegisters(gpr_saved);
+
+  // If yes, do a fast conversion with FCVT
+
+  if (scratch_reg != ARM64Reg::INVALID_REG)
+  {
+    FixupBranch continue1 = B();
+
+    if (switch_to_farcode)
+      SwitchToNearCode();
+
+    SetJumpTarget(fast);
+
+    m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
+
+    SetJumpTarget(continue1);
+  }
 }
 
-void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg)
+void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg)
 {
+  ASSERT(scratch_reg != src_reg);
+
+  const bool switch_to_farcode = !IsInFarCode();
+
   FlushCarry();
 
+  // Do we know that neither input is NaN, and that neither input is denormal or FPCR.FZ is not set?
+  // (This check unfortunately also catches zeroes)
+
+  FixupBranch fast;
+  if (scratch_reg != ARM64Reg::INVALID_REG)
+  {
+    // Set each 32-bit element of scratch_reg to 0x0000'0000 or 0xFFFF'FFFF depending on whether
+    // the absolute value of the corresponding element in src_reg compares greater than 0
+    m_float_emit.MOVI(8, EncodeRegToDouble(scratch_reg), 0);
+    m_float_emit.FACGT(32, EncodeRegToDouble(scratch_reg), EncodeRegToDouble(src_reg),
+                       EncodeRegToDouble(scratch_reg));
+
+    // 0x0000'0000'0000'0000 (zero) -> 0x0000'0000'0000'0000 (zero)
+    // 0x0000'0000'FFFF'FFFF (denormal) -> 0xFF00'0000'FFFF'FFFF (normal)
+    // 0xFFFF'FFFF'0000'0000 (NaN) -> 0x00FF'FFFF'0000'0000 (normal)
+    // 0xFFFF'FFFF'FFFF'FFFF (NaN) -> 0xFFFF'FFFF'FFFF'FFFF (NaN)
+    m_float_emit.INS(8, EncodeRegToDouble(scratch_reg), 7, EncodeRegToDouble(scratch_reg), 0);
+
+    // Is scratch_reg a NaN (0xFFFF'FFFF'FFFF'FFFF)?
+    m_float_emit.FCMP(EncodeRegToDouble(scratch_reg));
+    fast = B(CCFlags::CC_VS);
+
+    if (switch_to_farcode)
+    {
+      FixupBranch slow = B();
+
+      SwitchToFarCode();
+      SetJumpTarget(slow);
+    }
+  }
+
+  // If no (or if we don't have a scratch register), call the bit-exact routine
+
   // Save X0-X4 and X30 if they're in use
   const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
   ABI_PushRegisters(gpr_saved);

@@ -452,4 +530,19 @@ void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg)
   m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
 
   ABI_PopRegisters(gpr_saved);
+
+  // If yes, do a fast conversion with FCVTL
+
+  if (scratch_reg != ARM64Reg::INVALID_REG)
+  {
+    FixupBranch continue1 = B();
+
+    if (switch_to_farcode)
+      SwitchToNearCode();
+
+    SetJumpTarget(fast);
+    m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
+
+    SetJumpTarget(continue1);
+  }
 }

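The pair version cannot simply FABS-and-compare each lane, so it folds the two per-lane FACGT masks into one 64-bit value whose bit pattern is a double NaN exactly when both lanes passed the check, letting a single scalar FCMP (branching on the overflow flag, i.e. the unordered result) test both inputs at once. A host-side model of the trick (illustrative names, assuming C++20 for std::bit_cast; not Dolphin code):

#include <bit>
#include <cmath>
#include <cstdint>

// FACGT(32) sets each 32-bit lane to all-ones when fabs(element) > 0.0f and
// to all-zeroes otherwise (NaNs compare unordered, so they also give zeroes).
static uint32_t LaneMask(float x)
{
  return std::fabs(x) > 0.0f ? 0xFFFF'FFFFu : 0u;
}

// INS(8, reg, 7, reg, 0) copies byte 0 into byte 7; the result is a double
// NaN pattern exactly when both lane masks were all-ones (see the table in
// the diff above), which is what FCMP followed by B.VS detects.
static bool BothLanesSafeForFCVTL(float lane0, float lane1)
{
  const uint64_t masks = (uint64_t{LaneMask(lane1)} << 32) | LaneMask(lane0);
  const uint64_t combined = (masks & 0x00FF'FFFF'FFFF'FFFFULL) | ((masks & 0xFFULL) << 56);
  return std::isnan(std::bit_cast<double>(combined));
}
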
@@ -468,7 +468,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
       return host_reg;
 
     // Else convert this register back to doubles.
-    m_jit->ConvertSingleToDoublePair(host_reg, host_reg);
+    const ARM64Reg tmp_reg = GetReg();
+    m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg);
+    UnlockRegister(tmp_reg);
 
     reg.Load(host_reg, RegType::Register);
     [[fallthrough]];
   }

@@ -483,7 +486,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
       return host_reg;
 
     // Else convert this register back to a double.
-    m_jit->ConvertSingleToDoubleLower(host_reg, host_reg);
+    const ARM64Reg tmp_reg = GetReg();
+    m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg);
+    UnlockRegister(tmp_reg);
 
     reg.Load(host_reg, RegType::LowerPair);
     [[fallthrough]];
   }

@@ -517,7 +523,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
       return host_reg;
     }
 
-    m_jit->ConvertSingleToDoubleLower(host_reg, host_reg);
+    const ARM64Reg tmp_reg = GetReg();
+    m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg);
+    UnlockRegister(tmp_reg);
 
     reg.Load(host_reg, RegType::Duplicated);
     [[fallthrough]];
   }

@@ -594,7 +603,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
   {
   case RegType::Single:
     flush_reg = GetReg();
-    m_jit->ConvertSingleToDoublePair(flush_reg, host_reg);
+    m_jit->ConvertSingleToDoublePair(flush_reg, host_reg, flush_reg);
     [[fallthrough]];
   case RegType::Register:
     // We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit

@@ -605,7 +614,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
     break;
   case RegType::DuplicatedSingle:
     flush_reg = GetReg();
-    m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg);
+    m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg, flush_reg);
     [[fallthrough]];
   case RegType::Duplicated:
     // Store PSR1 (which is equal to PSR0) in memory.

@@ -709,17 +718,20 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
   const bool dirty = reg.IsDirty();
   RegType type = reg.GetType();
 
+  // If FlushRegister calls GetReg with all registers locked, we can get infinite recursion
+  const ARM64Reg tmp_reg = GetUnlockedRegisterCount() > 0 ? GetReg() : ARM64Reg::INVALID_REG;
+
   // If we're in single mode, just convert it back to a double.
   if (type == RegType::Single)
   {
     if (dirty)
-      m_jit->ConvertSingleToDoublePair(host_reg, host_reg);
+      m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg);
     type = RegType::Register;
   }
   if (type == RegType::DuplicatedSingle || type == RegType::LowerPairSingle)
   {
     if (dirty)
-      m_jit->ConvertSingleToDoubleLower(host_reg, host_reg);
+      m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg);
 
     if (type == RegType::DuplicatedSingle)
       type = RegType::Duplicated;

@@ -771,6 +783,9 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
       reg.Flush();
     }
   }
+
+  if (tmp_reg != ARM64Reg::INVALID_REG)
+    UnlockRegister(tmp_reg);
 }
 
 void Arm64FPRCache::FlushRegisters(BitSet32 regs, bool maintain_state)

@@ -168,6 +168,9 @@ public:
 
   void UpdateLastUsed(BitSet32 regs_used);
 
+  // Get available host registers
+  u32 GetUnlockedRegisterCount() const;
+
   // Locks a register so a cache cannot use it
   // Useful for function calls
   template <typename T = Arm64Gen::ARM64Reg, typename... Args>

@@ -211,9 +214,6 @@ protected:
   void DiscardRegister(size_t preg);
   virtual void FlushRegister(size_t preg, bool maintain_state) = 0;
 
-  // Get available host registers
-  u32 GetUnlockedRegisterCount() const;
-
   void IncrementAllUsed()
   {
     for (auto& reg : m_guest_registers)