Merge pull request #12352 from JosJuice/jitarm64-flush-in-long-inst
JitArm64: Flush in the middle of lmw/stmw/mfcr
This commit is contained in:
commit
e2472e4f50
|
@ -3,6 +3,8 @@
|
||||||
|
|
||||||
#include "Core/PowerPC/JitArm64/Jit.h"
|
#include "Core/PowerPC/JitArm64/Jit.h"
|
||||||
|
|
||||||
|
#include <bit>
|
||||||
|
|
||||||
#include "Common/Arm64Emitter.h"
|
#include "Common/Arm64Emitter.h"
|
||||||
#include "Common/BitSet.h"
|
#include "Common/BitSet.h"
|
||||||
#include "Common/CommonTypes.h"
|
#include "Common/CommonTypes.h"
|
||||||
|
@ -539,6 +541,27 @@ void JitArm64::lmw(UGeckoInstruction inst)
|
||||||
if (!a_is_addr_base_reg)
|
if (!a_is_addr_base_reg)
|
||||||
MOV(addr_base_reg, addr_reg);
|
MOV(addr_base_reg, addr_reg);
|
||||||
|
|
||||||
|
BitSet32 gprs_to_flush = ~js.op->gprInUse & BitSet32(0xFFFFFFFFU << d);
|
||||||
|
if (!js.op->gprInUse[a])
|
||||||
|
{
|
||||||
|
if (!a_is_addr_base_reg)
|
||||||
|
{
|
||||||
|
gprs_to_flush[a] = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
gprs_to_flush[a] = false;
|
||||||
|
|
||||||
|
if (a + 1 == d && (std::countr_one((~js.op->gprInUse).m_val >> a) & 1) == 0)
|
||||||
|
{
|
||||||
|
// In this situation, we can save one store instruction by flushing GPR d together with GPR
|
||||||
|
// a, but we shouldn't flush GPR a until the end of the PPC instruction. Therefore, let's
|
||||||
|
// also wait with flushing GPR d until the end of the PPC instruction.
|
||||||
|
gprs_to_flush[d] = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: This doesn't handle rollback on DSI correctly
|
// TODO: This doesn't handle rollback on DSI correctly
|
||||||
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_SIZE_32;
|
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_SIZE_32;
|
||||||
for (u32 i = d; i < 32; i++)
|
for (u32 i = d; i < 32; i++)
|
||||||
|
@ -564,6 +587,28 @@ void JitArm64::lmw(UGeckoInstruction inst)
|
||||||
|
|
||||||
gpr.BindToRegister(i, false, true);
|
gpr.BindToRegister(i, false, true);
|
||||||
ASSERT(dest_reg == gpr.R(i));
|
ASSERT(dest_reg == gpr.R(i));
|
||||||
|
|
||||||
|
// To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
|
||||||
|
// after this instruction, flush registers that would be flushed after this instruction anyway.
|
||||||
|
//
|
||||||
|
// We try to store two registers at a time when possible to let the register cache use STP.
|
||||||
|
if (!jo.memcheck && js.op->gprDiscardable[i])
|
||||||
|
{
|
||||||
|
gpr.DiscardRegisters(BitSet32{int(i)});
|
||||||
|
}
|
||||||
|
else if (gprs_to_flush[i])
|
||||||
|
{
|
||||||
|
BitSet32 gprs_to_flush_this_time{};
|
||||||
|
if (i != 0 && gprs_to_flush[i - 1])
|
||||||
|
gprs_to_flush_this_time = BitSet32{int(i - 1), int(i)};
|
||||||
|
else if (i == 31 || !gprs_to_flush[i + 1])
|
||||||
|
gprs_to_flush_this_time = BitSet32{int(i)};
|
||||||
|
else
|
||||||
|
continue;
|
||||||
|
|
||||||
|
gpr.StoreRegisters(gprs_to_flush_this_time);
|
||||||
|
gprs_to_flush &= ~gprs_to_flush_this_time;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30);
|
gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30);
|
||||||
|
@ -600,6 +645,28 @@ void JitArm64::stmw(UGeckoInstruction inst)
|
||||||
if (!a_is_addr_base_reg)
|
if (!a_is_addr_base_reg)
|
||||||
MOV(addr_base_reg, addr_reg);
|
MOV(addr_base_reg, addr_reg);
|
||||||
|
|
||||||
|
const BitSet32 dirty_gprs_to_flush_unmasked = ~js.op->gprInUse & gpr.GetDirtyGPRs();
|
||||||
|
BitSet32 dirty_gprs_to_flush = dirty_gprs_to_flush_unmasked & BitSet32(0xFFFFFFFFU << s);
|
||||||
|
if (dirty_gprs_to_flush_unmasked[a])
|
||||||
|
{
|
||||||
|
if (!a_is_addr_base_reg)
|
||||||
|
{
|
||||||
|
dirty_gprs_to_flush[a] = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
dirty_gprs_to_flush[a] = false;
|
||||||
|
|
||||||
|
if (a + 1 == s && (std::countr_one((~js.op->gprInUse).m_val >> a) & 1) == 0)
|
||||||
|
{
|
||||||
|
// In this situation, we can save one store instruction by flushing GPR s together with GPR
|
||||||
|
// a, but we shouldn't flush GPR a until the end of the PPC instruction. Therefore, let's
|
||||||
|
// also wait with flushing GPR s until the end of the PPC instruction.
|
||||||
|
dirty_gprs_to_flush[s] = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: This doesn't handle rollback on DSI correctly
|
// TODO: This doesn't handle rollback on DSI correctly
|
||||||
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_SIZE_32;
|
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_SIZE_32;
|
||||||
for (u32 i = s; i < 32; i++)
|
for (u32 i = s; i < 32; i++)
|
||||||
|
@ -620,6 +687,35 @@ void JitArm64::stmw(UGeckoInstruction inst)
|
||||||
|
|
||||||
EmitBackpatchRoutine(flags, MemAccessMode::Auto, src_reg, EncodeRegTo64(addr_reg), regs_in_use,
|
EmitBackpatchRoutine(flags, MemAccessMode::Auto, src_reg, EncodeRegTo64(addr_reg), regs_in_use,
|
||||||
fprs_in_use);
|
fprs_in_use);
|
||||||
|
|
||||||
|
// To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
|
||||||
|
// after this instruction, flush registers that would be flushed after this instruction anyway.
|
||||||
|
//
|
||||||
|
// We try to store two registers at a time when possible to let the register cache use STP.
|
||||||
|
if (!jo.memcheck && js.op->gprDiscardable[i])
|
||||||
|
{
|
||||||
|
gpr.DiscardRegisters(BitSet32{int(i)});
|
||||||
|
}
|
||||||
|
else if (dirty_gprs_to_flush[i])
|
||||||
|
{
|
||||||
|
BitSet32 gprs_to_flush_this_time{};
|
||||||
|
if (i != 0 && dirty_gprs_to_flush[i - 1])
|
||||||
|
gprs_to_flush_this_time = BitSet32{int(i - 1), int(i)};
|
||||||
|
else if (i == 31 || !dirty_gprs_to_flush[i + 1])
|
||||||
|
gprs_to_flush_this_time = BitSet32{int(i)};
|
||||||
|
else
|
||||||
|
continue;
|
||||||
|
|
||||||
|
gpr.StoreRegisters(gprs_to_flush_this_time);
|
||||||
|
dirty_gprs_to_flush &= ~gprs_to_flush_this_time;
|
||||||
|
}
|
||||||
|
else if (!js.op->gprInUse[i])
|
||||||
|
{
|
||||||
|
// If this register can be flushed but it isn't dirty, no store instruction will be emitted
|
||||||
|
// when flushing it, so it doesn't matter if we flush it together with another register or
|
||||||
|
// not. Let's just flush it in the simplest way possible.
|
||||||
|
gpr.StoreRegisters(BitSet32{int(i)});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
|
gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
|
||||||
|
|
|
@ -458,6 +458,17 @@ BitSet32 Arm64GPRCache::GetCallerSavedUsed() const
|
||||||
return registers;
|
return registers;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Builds a bitmask over the guest GPRs: bit N is set when the cache entry for
// guest GPR N has a type other than RegType::NotLoaded and reports itself as
// dirty. All other bits are cleared.
BitSet32 Arm64GPRCache::GetDirtyGPRs() const
{
  BitSet32 dirty_mask(0);

  for (size_t gpr_index = 0; gpr_index < GUEST_GPR_COUNT; ++gpr_index)
  {
    const OpArg& guest_reg = m_guest_registers[GUEST_GPR_OFFSET + gpr_index];

    // Only query the dirty flag for entries that are actually loaded; the
    // short-circuit keeps IsDirty() from being called on NotLoaded entries.
    const bool is_loaded = guest_reg.GetType() != RegType::NotLoaded;
    dirty_mask[gpr_index] = is_loaded && guest_reg.IsDirty();
  }

  return dirty_mask;
}
|
||||||
|
|
||||||
void Arm64GPRCache::FlushByHost(ARM64Reg host_reg, ARM64Reg tmp_reg)
|
void Arm64GPRCache::FlushByHost(ARM64Reg host_reg, ARM64Reg tmp_reg)
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < m_guest_registers.size(); ++i)
|
for (size_t i = 0; i < m_guest_registers.size(); ++i)
|
||||||
|
|
|
@ -325,6 +325,8 @@ public:
|
||||||
|
|
||||||
BitSet32 GetCallerSavedUsed() const override;
|
BitSet32 GetCallerSavedUsed() const override;
|
||||||
|
|
||||||
|
BitSet32 GetDirtyGPRs() const;
|
||||||
|
|
||||||
void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG)
|
void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG)
|
||||||
{
|
{
|
||||||
FlushRegisters(regs, false, tmp_reg);
|
FlushRegisters(regs, false, tmp_reg);
|
||||||
|
|
|
@ -704,6 +704,16 @@ void JitArm64::mfcr(UGeckoInstruction inst)
|
||||||
ORR(WC, WA, LogicalImm(1 << PowerPC::CR_GT_BIT, 32));
|
ORR(WC, WA, LogicalImm(1 << PowerPC::CR_GT_BIT, 32));
|
||||||
CMP(CR, ARM64Reg::ZR);
|
CMP(CR, ARM64Reg::ZR);
|
||||||
CSEL(WA, WC, WA, CC_GT);
|
CSEL(WA, WC, WA, CC_GT);
|
||||||
|
|
||||||
|
// To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
|
||||||
|
// after this instruction, flush registers that would be flushed after this instruction anyway.
|
||||||
|
//
|
||||||
|
// There's no point in ensuring we flush two registers at the same time, because the offset in
|
||||||
|
// ppcState for CRs is too large to be encoded into an STP instruction.
|
||||||
|
if (js.op->crDiscardable[i])
|
||||||
|
gpr.DiscardCRRegisters(BitSet8{i});
|
||||||
|
else if (!js.op->crInUse[i])
|
||||||
|
gpr.StoreCRRegisters(BitSet8{i}, WC);
|
||||||
}
|
}
|
||||||
|
|
||||||
gpr.Unlock(WB, WC);
|
gpr.Unlock(WB, WC);
|
||||||
|
|
Loading…
Reference in New Issue