JitArm64: Optimize ConvertSingleToDouble, part 2
If we can prove that FCVT will provide a correct conversion, we can use FCVT. This makes the common case a bit faster and the less likely cases (unfortunately including zero, which FCVT actually can convert correctly) a bit slower.
This commit is contained in:
parent
018e247624
commit
1d106ceaf5
|
@ -3601,6 +3601,14 @@ void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
|
||||||
{
|
{
|
||||||
Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn);
|
Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn);
|
||||||
}
|
}
|
||||||
|
// Emits the three-register vector instruction FACGE (per the Arm ISA: floating-point
// absolute compare greater than or equal) through EmitThreeSame.
// size is used only via `size >> 6`, which evaluates to 0 for 32 and 1 for 64; this is
// presumably the element-width selector expected by EmitThreeSame - confirm there.
// Shares the 0x1D opcode argument with FACGT, which additionally ORs 2 into that selector.
// NOTE(review): Rd/Rn/Rm are assumed to be destination / first source / second source.
void ARM64FloatEmitter::FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||||
|
{
|
||||||
|
EmitThreeSame(1, size >> 6, 0x1D, Rd, Rn, Rm);
|
||||||
|
}
|
||||||
|
// Emits the three-register vector instruction FACGT (per the Arm ISA: floating-point
// absolute compare greater than) through EmitThreeSame.
// size is used only via `2 | (size >> 6)`, which evaluates to 2 for 32 and 3 for 64; the
// extra 2 is the only difference from the FACGE encoding, which passes the same 0x1D
// opcode argument.
// NOTE(review): Rd/Rn/Rm are assumed to be destination / first source / second source.
void ARM64FloatEmitter::FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||||
|
{
|
||||||
|
EmitThreeSame(1, 2 | (size >> 6), 0x1D, Rd, Rn, Rm);
|
||||||
|
}
|
||||||
|
|
||||||
void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
|
void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
|
||||||
{
|
{
|
||||||
|
|
|
@ -1094,6 +1094,8 @@ public:
|
||||||
void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
||||||
void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
||||||
void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
||||||
|
void FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
|
void FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
|
|
||||||
// Conditional select
|
// Conditional select
|
||||||
void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
|
void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
|
||||||
|
|
|
@ -154,8 +154,10 @@ public:
|
||||||
|
|
||||||
void ConvertDoubleToSingleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
|
void ConvertDoubleToSingleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
|
||||||
void ConvertDoubleToSinglePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
|
void ConvertDoubleToSinglePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
|
||||||
void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
|
void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg,
|
||||||
void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg);
|
Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
|
||||||
|
void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg,
|
||||||
|
Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
struct SlowmemHandler
|
struct SlowmemHandler
|
||||||
|
@ -189,14 +191,18 @@ private:
|
||||||
nearcode = GetWritableCodePtr();
|
nearcode = GetWritableCodePtr();
|
||||||
SetCodePtrUnsafe(farcode.GetWritableCodePtr());
|
SetCodePtrUnsafe(farcode.GetWritableCodePtr());
|
||||||
AlignCode16();
|
AlignCode16();
|
||||||
|
m_in_farcode = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void SwitchToNearCode()
|
void SwitchToNearCode()
|
||||||
{
|
{
|
||||||
farcode.SetCodePtrUnsafe(GetWritableCodePtr());
|
farcode.SetCodePtrUnsafe(GetWritableCodePtr());
|
||||||
SetCodePtrUnsafe(nearcode);
|
SetCodePtrUnsafe(nearcode);
|
||||||
|
m_in_farcode = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports m_in_farcode: cleared by SwitchToNearCode() and set by the routine that
// redirects emission into farcode (presumably SwitchToFarCode() - its header is not
// visible here). Lets callers avoid switching to far code when already emitting there.
bool IsInFarCode() const { return m_in_farcode; }
|
||||||
|
|
||||||
// Dump a memory range of code
|
// Dump a memory range of code
|
||||||
void DumpCode(const u8* start, const u8* end);
|
void DumpCode(const u8* start, const u8* end);
|
||||||
|
|
||||||
|
@ -262,6 +268,7 @@ private:
|
||||||
|
|
||||||
Arm64Gen::ARM64CodeBlock farcode;
|
Arm64Gen::ARM64CodeBlock farcode;
|
||||||
u8* nearcode; // Backed up when we switch to far code.
|
u8* nearcode; // Backed up when we switch to far code.
|
||||||
|
bool m_in_farcode = false;
|
||||||
|
|
||||||
bool m_enable_blr_optimization;
|
bool m_enable_blr_optimization;
|
||||||
bool m_cleanup_after_stackfault = false;
|
bool m_cleanup_after_stackfault = false;
|
||||||
|
|
|
@ -421,10 +421,35 @@ void JitArm64::ConvertDoubleToSinglePair(ARM64Reg dest_reg, ARM64Reg src_reg)
|
||||||
ABI_PopRegisters(gpr_saved);
|
ABI_PopRegisters(gpr_saved);
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg)
|
void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg)
|
||||||
{
|
{
|
||||||
|
ASSERT(scratch_reg != src_reg);
|
||||||
|
|
||||||
|
const bool switch_to_farcode = !IsInFarCode();
|
||||||
|
|
||||||
FlushCarry();
|
FlushCarry();
|
||||||
|
|
||||||
|
// Do we know that the input isn't NaN, and that the input isn't denormal or FPCR.FZ is not set?
|
||||||
|
// (This check unfortunately also catches zeroes)
|
||||||
|
|
||||||
|
FixupBranch fast;
|
||||||
|
if (scratch_reg != ARM64Reg::INVALID_REG)
|
||||||
|
{
|
||||||
|
m_float_emit.FABS(EncodeRegToSingle(scratch_reg), EncodeRegToSingle(src_reg));
|
||||||
|
m_float_emit.FCMP(EncodeRegToSingle(scratch_reg));
|
||||||
|
fast = B(CCFlags::CC_GT);
|
||||||
|
|
||||||
|
if (switch_to_farcode)
|
||||||
|
{
|
||||||
|
FixupBranch slow = B();
|
||||||
|
|
||||||
|
SwitchToFarCode();
|
||||||
|
SetJumpTarget(slow);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no (or if we don't have a scratch register), call the bit-exact routine
|
||||||
|
|
||||||
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
|
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
|
||||||
ABI_PushRegisters(gpr_saved);
|
ABI_PushRegisters(gpr_saved);
|
||||||
|
|
||||||
|
@ -433,12 +458,65 @@ void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg)
|
||||||
m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
|
m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
|
||||||
|
|
||||||
ABI_PopRegisters(gpr_saved);
|
ABI_PopRegisters(gpr_saved);
|
||||||
|
|
||||||
|
// If yes, do a fast conversion with FCVT
|
||||||
|
|
||||||
|
if (scratch_reg != ARM64Reg::INVALID_REG)
|
||||||
|
{
|
||||||
|
FixupBranch continue1 = B();
|
||||||
|
|
||||||
|
if (switch_to_farcode)
|
||||||
|
SwitchToNearCode();
|
||||||
|
|
||||||
|
SetJumpTarget(fast);
|
||||||
|
|
||||||
|
m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
|
||||||
|
|
||||||
|
SetJumpTarget(continue1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg)
|
void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg)
|
||||||
{
|
{
|
||||||
|
ASSERT(scratch_reg != src_reg);
|
||||||
|
|
||||||
|
const bool switch_to_farcode = !IsInFarCode();
|
||||||
|
|
||||||
FlushCarry();
|
FlushCarry();
|
||||||
|
|
||||||
|
// Do we know that neither input is NaN, and that neither input is denormal or FPCR.FZ is not set?
|
||||||
|
// (This check unfortunately also catches zeroes)
|
||||||
|
|
||||||
|
FixupBranch fast;
|
||||||
|
if (scratch_reg != ARM64Reg::INVALID_REG)
|
||||||
|
{
|
||||||
|
// Set each 32-bit element of scratch_reg to 0x0000'0000 or 0xFFFF'FFFF depending on whether
|
||||||
|
// the absolute value of the corresponding element in src_reg compares greater than 0
|
||||||
|
m_float_emit.MOVI(8, EncodeRegToDouble(scratch_reg), 0);
|
||||||
|
m_float_emit.FACGT(32, EncodeRegToDouble(scratch_reg), EncodeRegToDouble(src_reg),
|
||||||
|
EncodeRegToDouble(scratch_reg));
|
||||||
|
|
||||||
|
// 0x0000'0000'0000'0000 (zero) -> 0x0000'0000'0000'0000 (zero)
|
||||||
|
// 0x0000'0000'FFFF'FFFF (denormal) -> 0xFF00'0000'FFFF'FFFF (normal)
|
||||||
|
// 0xFFFF'FFFF'0000'0000 (NaN) -> 0x00FF'FFFF'0000'0000 (normal)
|
||||||
|
// 0xFFFF'FFFF'FFFF'FFFF (NaN) -> 0xFFFF'FFFF'FFFF'FFFF (NaN)
|
||||||
|
m_float_emit.INS(8, EncodeRegToDouble(scratch_reg), 7, EncodeRegToDouble(scratch_reg), 0);
|
||||||
|
|
||||||
|
// Is scratch_reg a NaN (0xFFFF'FFFF'FFFF'FFFF)?
|
||||||
|
m_float_emit.FCMP(EncodeRegToDouble(scratch_reg));
|
||||||
|
fast = B(CCFlags::CC_VS);
|
||||||
|
|
||||||
|
if (switch_to_farcode)
|
||||||
|
{
|
||||||
|
FixupBranch slow = B();
|
||||||
|
|
||||||
|
SwitchToFarCode();
|
||||||
|
SetJumpTarget(slow);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no (or if we don't have a scratch register), call the bit-exact routine
|
||||||
|
|
||||||
// Save X0-X4 and X30 if they're in use
|
// Save X0-X4 and X30 if they're in use
|
||||||
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
|
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
|
||||||
ABI_PushRegisters(gpr_saved);
|
ABI_PushRegisters(gpr_saved);
|
||||||
|
@ -452,4 +530,19 @@ void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg)
|
||||||
m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
|
m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
|
||||||
|
|
||||||
ABI_PopRegisters(gpr_saved);
|
ABI_PopRegisters(gpr_saved);
|
||||||
|
|
||||||
|
// If yes, do a fast conversion with FCVTL
|
||||||
|
|
||||||
|
if (scratch_reg != ARM64Reg::INVALID_REG)
|
||||||
|
{
|
||||||
|
FixupBranch continue1 = B();
|
||||||
|
|
||||||
|
if (switch_to_farcode)
|
||||||
|
SwitchToNearCode();
|
||||||
|
|
||||||
|
SetJumpTarget(fast);
|
||||||
|
m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
|
||||||
|
|
||||||
|
SetJumpTarget(continue1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -468,7 +468,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
|
||||||
return host_reg;
|
return host_reg;
|
||||||
|
|
||||||
// Else convert this register back to doubles.
|
// Else convert this register back to doubles.
|
||||||
m_jit->ConvertSingleToDoublePair(host_reg, host_reg);
|
const ARM64Reg tmp_reg = GetReg();
|
||||||
|
m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg);
|
||||||
|
UnlockRegister(tmp_reg);
|
||||||
|
|
||||||
reg.Load(host_reg, RegType::Register);
|
reg.Load(host_reg, RegType::Register);
|
||||||
[[fallthrough]];
|
[[fallthrough]];
|
||||||
}
|
}
|
||||||
|
@ -483,7 +486,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
|
||||||
return host_reg;
|
return host_reg;
|
||||||
|
|
||||||
// Else convert this register back to a double.
|
// Else convert this register back to a double.
|
||||||
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg);
|
const ARM64Reg tmp_reg = GetReg();
|
||||||
|
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg);
|
||||||
|
UnlockRegister(tmp_reg);
|
||||||
|
|
||||||
reg.Load(host_reg, RegType::LowerPair);
|
reg.Load(host_reg, RegType::LowerPair);
|
||||||
[[fallthrough]];
|
[[fallthrough]];
|
||||||
}
|
}
|
||||||
|
@ -517,7 +523,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
|
||||||
return host_reg;
|
return host_reg;
|
||||||
}
|
}
|
||||||
|
|
||||||
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg);
|
const ARM64Reg tmp_reg = GetReg();
|
||||||
|
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg);
|
||||||
|
UnlockRegister(tmp_reg);
|
||||||
|
|
||||||
reg.Load(host_reg, RegType::Duplicated);
|
reg.Load(host_reg, RegType::Duplicated);
|
||||||
[[fallthrough]];
|
[[fallthrough]];
|
||||||
}
|
}
|
||||||
|
@ -594,7 +603,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
|
||||||
{
|
{
|
||||||
case RegType::Single:
|
case RegType::Single:
|
||||||
flush_reg = GetReg();
|
flush_reg = GetReg();
|
||||||
m_jit->ConvertSingleToDoublePair(flush_reg, host_reg);
|
m_jit->ConvertSingleToDoublePair(flush_reg, host_reg, flush_reg);
|
||||||
[[fallthrough]];
|
[[fallthrough]];
|
||||||
case RegType::Register:
|
case RegType::Register:
|
||||||
// We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit
|
// We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit
|
||||||
|
@ -605,7 +614,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
|
||||||
break;
|
break;
|
||||||
case RegType::DuplicatedSingle:
|
case RegType::DuplicatedSingle:
|
||||||
flush_reg = GetReg();
|
flush_reg = GetReg();
|
||||||
m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg);
|
m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg, flush_reg);
|
||||||
[[fallthrough]];
|
[[fallthrough]];
|
||||||
case RegType::Duplicated:
|
case RegType::Duplicated:
|
||||||
// Store PSR1 (which is equal to PSR0) in memory.
|
// Store PSR1 (which is equal to PSR0) in memory.
|
||||||
|
@ -709,17 +718,20 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
|
||||||
const bool dirty = reg.IsDirty();
|
const bool dirty = reg.IsDirty();
|
||||||
RegType type = reg.GetType();
|
RegType type = reg.GetType();
|
||||||
|
|
||||||
|
// If FlushRegister calls GetReg with all registers locked, we can get infinite recursion
|
||||||
|
const ARM64Reg tmp_reg = GetUnlockedRegisterCount() > 0 ? GetReg() : ARM64Reg::INVALID_REG;
|
||||||
|
|
||||||
// If we're in single mode, just convert it back to a double.
|
// If we're in single mode, just convert it back to a double.
|
||||||
if (type == RegType::Single)
|
if (type == RegType::Single)
|
||||||
{
|
{
|
||||||
if (dirty)
|
if (dirty)
|
||||||
m_jit->ConvertSingleToDoublePair(host_reg, host_reg);
|
m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg);
|
||||||
type = RegType::Register;
|
type = RegType::Register;
|
||||||
}
|
}
|
||||||
if (type == RegType::DuplicatedSingle || type == RegType::LowerPairSingle)
|
if (type == RegType::DuplicatedSingle || type == RegType::LowerPairSingle)
|
||||||
{
|
{
|
||||||
if (dirty)
|
if (dirty)
|
||||||
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg);
|
m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg);
|
||||||
|
|
||||||
if (type == RegType::DuplicatedSingle)
|
if (type == RegType::DuplicatedSingle)
|
||||||
type = RegType::Duplicated;
|
type = RegType::Duplicated;
|
||||||
|
@ -771,6 +783,9 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
|
||||||
reg.Flush();
|
reg.Flush();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (tmp_reg != ARM64Reg::INVALID_REG)
|
||||||
|
UnlockRegister(tmp_reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Arm64FPRCache::FlushRegisters(BitSet32 regs, bool maintain_state)
|
void Arm64FPRCache::FlushRegisters(BitSet32 regs, bool maintain_state)
|
||||||
|
|
|
@ -168,6 +168,9 @@ public:
|
||||||
|
|
||||||
void UpdateLastUsed(BitSet32 regs_used);
|
void UpdateLastUsed(BitSet32 regs_used);
|
||||||
|
|
||||||
|
// Get available host registers
|
||||||
|
u32 GetUnlockedRegisterCount() const;
|
||||||
|
|
||||||
// Locks a register so a cache cannot use it
|
// Locks a register so a cache cannot use it
|
||||||
// Useful for function calls
|
// Useful for function calls
|
||||||
template <typename T = Arm64Gen::ARM64Reg, typename... Args>
|
template <typename T = Arm64Gen::ARM64Reg, typename... Args>
|
||||||
|
@ -211,9 +214,6 @@ protected:
|
||||||
void DiscardRegister(size_t preg);
|
void DiscardRegister(size_t preg);
|
||||||
virtual void FlushRegister(size_t preg, bool maintain_state) = 0;
|
virtual void FlushRegister(size_t preg, bool maintain_state) = 0;
|
||||||
|
|
||||||
// Get available host registers
|
|
||||||
u32 GetUnlockedRegisterCount() const;
|
|
||||||
|
|
||||||
void IncrementAllUsed()
|
void IncrementAllUsed()
|
||||||
{
|
{
|
||||||
for (auto& reg : m_guest_registers)
|
for (auto& reg : m_guest_registers)
|
||||||
|
|
Loading…
Reference in New Issue