JitArm64: Flush in the middle of lmw/stmw/mfcr

Normally we only flush registers right at the end of each PPC
instruction. However, for PPC instructions that use a lot of registers
one at a time, it's beneficial to do this flushing work in the middle
of the instruction instead, reducing the risk of register starvation
and improving pipelining.
Author: JosJuice
Date:   2023-11-29 20:14:16 +01:00
Parent: 4f3f208fe4
Commit: 8368a397ee
4 changed files with 119 additions and 0 deletions
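
To illustrate the approach described in the commit message (hypothetical helper names, not Dolphin's actual register-cache API), the flushing work moves from one block after the loop into the loop body itself:

// Minimal sketch of the idea with a made-up register cache; the real diff
// below operates on Dolphin's Arm64GPRCache and BitSet32 liveness data.
struct SketchRegCache
{
  void EmitLoad(int guest_reg) {}  // hypothetical: emit the load for one GPR
  void Flush(int guest_reg) {}     // hypothetical: store the GPR back to ppcState
};

static void LmwSketch(SketchRegCache& cache, int first_reg, unsigned unused_afterwards)
{
  for (int i = first_reg; i < 32; ++i)
  {
    cache.EmitLoad(i);

    // Instead of keeping every loaded GPR live until the end of the PPC
    // instruction, flush it right away if nothing later reads it. This frees
    // a host register each iteration and avoids one long, pipeline-unfriendly
    // run of stores at the end of the emitted block.
    if (unused_afterwards & (1u << i))
      cache.Flush(i);
  }
}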


@@ -3,6 +3,8 @@
#include "Core/PowerPC/JitArm64/Jit.h"
#include <bit>
#include "Common/Arm64Emitter.h"
#include "Common/BitSet.h"
#include "Common/CommonTypes.h"
@@ -539,6 +541,27 @@ void JitArm64::lmw(UGeckoInstruction inst)
if (!a_is_addr_base_reg)
MOV(addr_base_reg, addr_reg);
BitSet32 gprs_to_flush = ~js.op->gprInUse & BitSet32(0xFFFFFFFFU << d);
if (!js.op->gprInUse[a])
{
if (!a_is_addr_base_reg)
{
gprs_to_flush[a] = true;
}
else
{
gprs_to_flush[a] = false;
if (a + 1 == d && (std::countr_one((~js.op->gprInUse).m_val >> a) & 1) == 0)
{
// In this situation, we can save one store instruction by flushing GPR d together with GPR
// a, but we shouldn't flush GPR a until the end of the PPC instruction. Therefore, let's
// also wait with flushing GPR d until the end of the PPC instruction.
gprs_to_flush[d] = false;
}
}
}
// TODO: This doesn't handle rollback on DSI correctly
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_SIZE_32;
for (u32 i = d; i < 32; i++)
@@ -564,6 +587,28 @@ void JitArm64::lmw(UGeckoInstruction inst)
gpr.BindToRegister(i, false, true);
ASSERT(dest_reg == gpr.R(i));
// To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
// after this instruction, flush registers that would be flushed after this instruction anyway.
//
// We try to store two registers at a time when possible to let the register cache use STP.
if (!jo.memcheck && js.op->gprDiscardable[i])
{
gpr.DiscardRegisters(BitSet32{int(i)});
}
else if (gprs_to_flush[i])
{
BitSet32 gprs_to_flush_this_time{};
if (i != 0 && gprs_to_flush[i - 1])
gprs_to_flush_this_time = BitSet32{int(i - 1), int(i)};
else if (i == 31 || !gprs_to_flush[i + 1])
gprs_to_flush_this_time = BitSet32{int(i)};
else
continue;
gpr.StoreRegisters(gprs_to_flush_this_time);
gprs_to_flush &= ~gprs_to_flush_this_time;
}
}
gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30);
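
As a concrete worked example of the gprs_to_flush mask computed in the lmw hunk above (with made-up liveness data): for lmw with d = 29, assuming only r3 and r30 are read after the instruction, only r29 and r31 get flushed inside the loop, while r30 stays bound to a host register:

#include <cstdint>

// Hypothetical liveness; js.op->gprInUse in the real code comes from the PPC
// analyzer. This also ignores the special handling of GPR a shown above.
int main()
{
  const uint32_t d = 29;
  const uint32_t gpr_in_use = (1u << 3) | (1u << 30);
  const uint32_t gprs_to_flush = ~gpr_in_use & (0xFFFFFFFFu << d);
  // Bits 29 and 31 are set: r29 and r31 can be stored back to ppcState as soon
  // as they have been loaded; r30 is kept because later code still reads it.
  return gprs_to_flush == ((1u << 29) | (1u << 31)) ? 0 : 1;
}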
@@ -600,6 +645,28 @@ void JitArm64::stmw(UGeckoInstruction inst)
if (!a_is_addr_base_reg)
MOV(addr_base_reg, addr_reg);
const BitSet32 dirty_gprs_to_flush_unmasked = ~js.op->gprInUse & gpr.GetDirtyGPRs();
BitSet32 dirty_gprs_to_flush = dirty_gprs_to_flush_unmasked & BitSet32(0xFFFFFFFFU << s);
if (dirty_gprs_to_flush_unmasked[a])
{
if (!a_is_addr_base_reg)
{
dirty_gprs_to_flush[a] = true;
}
else
{
dirty_gprs_to_flush[a] = false;
if (a + 1 == s && (std::countr_one((~js.op->gprInUse).m_val >> a) & 1) == 0)
{
// In this situation, we can save one store instruction by flushing GPR s together with GPR
// a, but we shouldn't flush GPR a until the end of the PPC instruction. Therefore, let's
// also wait with flushing GPR s until the end of the PPC instruction.
dirty_gprs_to_flush[s] = false;
}
}
}
// TODO: This doesn't handle rollback on DSI correctly
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_SIZE_32;
for (u32 i = s; i < 32; i++)
@@ -620,6 +687,35 @@ void JitArm64::stmw(UGeckoInstruction inst)
EmitBackpatchRoutine(flags, MemAccessMode::Auto, src_reg, EncodeRegTo64(addr_reg), regs_in_use,
fprs_in_use);
// To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
// after this instruction, flush registers that would be flushed after this instruction anyway.
//
// We try to store two registers at a time when possible to let the register cache use STP.
if (!jo.memcheck && js.op->gprDiscardable[i])
{
gpr.DiscardRegisters(BitSet32{int(i)});
}
else if (dirty_gprs_to_flush[i])
{
BitSet32 gprs_to_flush_this_time{};
if (i != 0 && dirty_gprs_to_flush[i - 1])
gprs_to_flush_this_time = BitSet32{int(i - 1), int(i)};
else if (i == 31 || !dirty_gprs_to_flush[i + 1])
gprs_to_flush_this_time = BitSet32{int(i)};
else
continue;
gpr.StoreRegisters(gprs_to_flush_this_time);
dirty_gprs_to_flush &= ~gprs_to_flush_this_time;
}
else if (!js.op->gprInUse[i])
{
// If this register can be flushed but it isn't dirty, no store instruction will be emitted
// when flushing it, so it doesn't matter if we flush it together with another register or
// not. Let's just flush it in the simplest way possible.
gpr.StoreRegisters(BitSet32{int(i)});
}
}
gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
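
The pair-selection logic shared by the lmw and stmw loops above can be isolated into a small routine. The following is a self-contained sketch (plain uint32_t masks instead of BitSet32, hypothetical function name) of the "store two registers at a time so the register cache can use STP" rule: a register is flushed together with its lower neighbour when both are flushable, flushed alone when no partner exists, and deferred by one iteration otherwise. The caller is expected to clear the returned bits from the mask, mirroring gprs_to_flush &= ~gprs_to_flush_this_time above.

#include <cstdint>
#include <optional>

// Sketch of the pairing rule, not Dolphin code. Returns the set of registers
// to flush when the loop reaches register i, or nullopt if flushing should be
// deferred so that i can later be paired with i + 1.
std::optional<uint32_t> RegsToFlushNow(uint32_t flushable, uint32_t i)
{
  if ((flushable & (1u << i)) == 0)
    return uint32_t{0};  // nothing to flush for this register

  if (i != 0 && (flushable & (1u << (i - 1))) != 0)
    return (1u << (i - 1)) | (1u << i);  // pair with the previous register -> one STP

  if (i == 31 || (flushable & (1u << (i + 1))) == 0)
    return uint32_t{1u << i};  // no partner available -> single STR

  return std::nullopt;  // wait one iteration and pair with i + 1 instead
}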


@@ -458,6 +458,17 @@ BitSet32 Arm64GPRCache::GetCallerSavedUsed() const
return registers;
}
BitSet32 Arm64GPRCache::GetDirtyGPRs() const
{
BitSet32 registers(0);
for (size_t i = 0; i < GUEST_GPR_COUNT; ++i)
{
const OpArg& arg = m_guest_registers[GUEST_GPR_OFFSET + i];
registers[i] = arg.GetType() != RegType::NotLoaded && arg.IsDirty();
}
return registers;
}
void Arm64GPRCache::FlushByHost(ARM64Reg host_reg, ARM64Reg tmp_reg)
{
for (size_t i = 0; i < m_guest_registers.size(); ++i)


@@ -325,6 +325,8 @@ public:
BitSet32 GetCallerSavedUsed() const override;
BitSet32 GetDirtyGPRs() const;
void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG)
{
FlushRegisters(regs, false, tmp_reg);


@@ -704,6 +704,16 @@ void JitArm64::mfcr(UGeckoInstruction inst)
ORR(WC, WA, LogicalImm(1 << PowerPC::CR_GT_BIT, 32));
CMP(CR, ARM64Reg::ZR);
CSEL(WA, WC, WA, CC_GT);
// To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
// after this instruction, flush registers that would be flushed after this instruction anyway.
//
// There's no point in ensuring we flush two registers at the same time, because the offset in
// ppcState for CRs is too large to be encoded into an STP instruction.
if (js.op->crDiscardable[i])
gpr.DiscardCRRegisters(BitSet8{i});
else if (!js.op->crInUse[i])
gpr.StoreCRRegisters(BitSet8{i}, WC);
}
gpr.Unlock(WB, WC);
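
Regarding the STP remark in the mfcr hunk: on AArch64, STP of two 64-bit registers encodes its offset as a signed 7-bit immediate scaled by 8, so only multiples of 8 in the range [-512, 504] fit in a single instruction (a general architecture fact, not Dolphin-specific code). Per the comment above, the CR offsets within ppcState fall outside that range, so pairing the stores would not help here:

#include <cstdint>

// General AArch64 encoding rule, not Dolphin code: STP/LDP of two X registers
// use a signed 7-bit immediate scaled by the register size (8 bytes).
constexpr bool FitsInStp64Offset(int64_t offset)
{
  return offset % 8 == 0 && offset >= -512 && offset <= 504;
}

static_assert(FitsInStp64Offset(504), "largest encodable positive offset");
static_assert(!FitsInStp64Offset(512), "one step past the limit does not fit");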