JitArm64: Flush in the middle of lmw/stmw/mfcr

Normally we only flush registers right at the end of each PPC
instruction. However, for PPC instructions that use a lot of registers
one at a time, it's beneficial to do this flushing work in the middle
of the instruction instead, reducing the risk of register starvation
and improving pipelining.
Author: JosJuice
Date:   2023-11-29 20:14:16 +01:00
Parent: 4f3f208fe4
Commit: 8368a397ee
4 changed files with 119 additions and 0 deletions
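
To illustrate the approach described in the commit message (hypothetical helper names, not Dolphin's actual register-cache API), the flushing work moves from one block after the loop into the loop body itself:

// Minimal sketch of the idea with a made-up register cache; the real diff
// below operates on Dolphin's Arm64GPRCache and BitSet32 liveness data.
struct SketchRegCache
{
  void EmitLoad(int guest_reg) {}  // hypothetical: emit the load for one GPR
  void Flush(int guest_reg) {}     // hypothetical: store the GPR back to ppcState
};

static void LmwSketch(SketchRegCache& cache, int first_reg, unsigned unused_afterwards)
{
  for (int i = first_reg; i < 32; ++i)
  {
    cache.EmitLoad(i);

    // Instead of keeping every loaded GPR live until the end of the PPC
    // instruction, flush it right away if nothing later reads it. This frees
    // a host register each iteration and avoids one long, pipeline-unfriendly
    // run of stores at the end of the emitted block.
    if (unused_afterwards & (1u << i))
      cache.Flush(i);
  }
}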


@@ -3,6 +3,8 @@
#include "Core/PowerPC/JitArm64/Jit.h"
#include <bit>
#include "Common/Arm64Emitter.h"
#include "Common/BitSet.h"
#include "Common/CommonTypes.h"
@@ -539,6 +541,27 @@ void JitArm64::lmw(UGeckoInstruction inst)
if (!a_is_addr_base_reg)
MOV(addr_base_reg, addr_reg);
BitSet32 gprs_to_flush = ~js.op->gprInUse & BitSet32(0xFFFFFFFFU << d);
if (!js.op->gprInUse[a])
{
if (!a_is_addr_base_reg)
{
gprs_to_flush[a] = true;
}
else
{
gprs_to_flush[a] = false;
if (a + 1 == d && (std::countr_one((~js.op->gprInUse).m_val >> a) & 1) == 0)
{
// In this situation, we can save one store instruction by flushing GPR d together with GPR
// a, but we shouldn't flush GPR a until the end of the PPC instruction. Therefore, let's
// also wait with flushing GPR d until the end of the PPC instruction.
gprs_to_flush[d] = false;
}
}
}
// TODO: This doesn't handle rollback on DSI correctly
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_SIZE_32;
for (u32 i = d; i < 32; i++)
@@ -564,6 +587,28 @@ void JitArm64::lmw(UGeckoInstruction inst)
gpr.BindToRegister(i, false, true);
ASSERT(dest_reg == gpr.R(i));
// To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
// after this instruction, flush registers that would be flushed after this instruction anyway.
//
// We try to store two registers at a time when possible to let the register cache use STP.
if (!jo.memcheck && js.op->gprDiscardable[i])
{
gpr.DiscardRegisters(BitSet32{int(i)});
}
else if (gprs_to_flush[i])
{
BitSet32 gprs_to_flush_this_time{};
if (i != 0 && gprs_to_flush[i - 1])
gprs_to_flush_this_time = BitSet32{int(i - 1), int(i)};
else if (i == 31 || !gprs_to_flush[i + 1])
gprs_to_flush_this_time = BitSet32{int(i)};
else
continue;
gpr.StoreRegisters(gprs_to_flush_this_time);
gprs_to_flush &= ~gprs_to_flush_this_time;
}
}
gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30);
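
As a concrete worked example of the gprs_to_flush mask computed in the lmw hunk above (with made-up liveness data): for lmw with d = 29, assuming only r3 and r30 are read after the instruction, only r29 and r31 get flushed inside the loop, while r30 stays bound to a host register:

#include <cstdint>

// Hypothetical liveness; js.op->gprInUse in the real code comes from the PPC
// analyzer. This also ignores the special handling of GPR a shown above.
int main()
{
  const uint32_t d = 29;
  const uint32_t gpr_in_use = (1u << 3) | (1u << 30);
  const uint32_t gprs_to_flush = ~gpr_in_use & (0xFFFFFFFFu << d);
  // Bits 29 and 31 are set: r29 and r31 can be stored back to ppcState as soon
  // as they have been loaded; r30 is kept because later code still reads it.
  return gprs_to_flush == ((1u << 29) | (1u << 31)) ? 0 : 1;
}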
@@ -600,6 +645,28 @@ void JitArm64::stmw(UGeckoInstruction inst)
if (!a_is_addr_base_reg)
MOV(addr_base_reg, addr_reg);
const BitSet32 dirty_gprs_to_flush_unmasked = ~js.op->gprInUse & gpr.GetDirtyGPRs();
BitSet32 dirty_gprs_to_flush = dirty_gprs_to_flush_unmasked & BitSet32(0xFFFFFFFFU << s);
if (dirty_gprs_to_flush_unmasked[a])
{
if (!a_is_addr_base_reg)
{
dirty_gprs_to_flush[a] = true;
}
else
{
dirty_gprs_to_flush[a] = false;
if (a + 1 == s && (std::countr_one((~js.op->gprInUse).m_val >> a) & 1) == 0)
{
// In this situation, we can save one store instruction by flushing GPR s together with GPR
// a, but we shouldn't flush GPR a until the end of the PPC instruction. Therefore, let's
// also wait with flushing GPR s until the end of the PPC instruction.
dirty_gprs_to_flush[s] = false;
}
}
}
// TODO: This doesn't handle rollback on DSI correctly
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_SIZE_32;
for (u32 i = s; i < 32; i++)
@@ -620,6 +687,35 @@ void JitArm64::stmw(UGeckoInstruction inst)
EmitBackpatchRoutine(flags, MemAccessMode::Auto, src_reg, EncodeRegTo64(addr_reg), regs_in_use,
fprs_in_use);
// To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
// after this instruction, flush registers that would be flushed after this instruction anyway.
//
// We try to store two registers at a time when possible to let the register cache use STP.
if (!jo.memcheck && js.op->gprDiscardable[i])
{
gpr.DiscardRegisters(BitSet32{int(i)});
}
else if (dirty_gprs_to_flush[i])
{
BitSet32 gprs_to_flush_this_time{};
if (i != 0 && dirty_gprs_to_flush[i - 1])
gprs_to_flush_this_time = BitSet32{int(i - 1), int(i)};
else if (i == 31 || !dirty_gprs_to_flush[i + 1])
gprs_to_flush_this_time = BitSet32{int(i)};
else
continue;
gpr.StoreRegisters(gprs_to_flush_this_time);
dirty_gprs_to_flush &= ~gprs_to_flush_this_time;
}
else if (!js.op->gprInUse[i])
{
// If this register can be flushed but it isn't dirty, no store instruction will be emitted
// when flushing it, so it doesn't matter if we flush it together with another register or
// not. Let's just flush it in the simplest way possible.
gpr.StoreRegisters(BitSet32{int(i)});
}
}
gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
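
The pair-selection logic shared by the lmw and stmw loops above can be isolated into a small routine. The following is a self-contained sketch (plain uint32_t masks instead of BitSet32, hypothetical function name) of the "store two registers at a time so the register cache can use STP" rule: a register is flushed together with its lower neighbour when both are flushable, flushed alone when no partner exists, and deferred by one iteration otherwise. The caller is expected to clear the returned bits from the mask, mirroring gprs_to_flush &= ~gprs_to_flush_this_time above.

#include <cstdint>
#include <optional>

// Sketch of the pairing rule, not Dolphin code. Returns the set of registers
// to flush when the loop reaches register i, or nullopt if flushing should be
// deferred so that i can later be paired with i + 1.
std::optional<uint32_t> RegsToFlushNow(uint32_t flushable, uint32_t i)
{
  if ((flushable & (1u << i)) == 0)
    return uint32_t{0};  // nothing to flush for this register

  if (i != 0 && (flushable & (1u << (i - 1))) != 0)
    return (1u << (i - 1)) | (1u << i);  // pair with the previous register -> one STP

  if (i == 31 || (flushable & (1u << (i + 1))) == 0)
    return uint32_t{1u << i};  // no partner available -> single STR

  return std::nullopt;  // wait one iteration and pair with i + 1 instead
}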


@@ -458,6 +458,17 @@ BitSet32 Arm64GPRCache::GetCallerSavedUsed() const
return registers;
}
BitSet32 Arm64GPRCache::GetDirtyGPRs() const
{
BitSet32 registers(0);
for (size_t i = 0; i < GUEST_GPR_COUNT; ++i)
{
const OpArg& arg = m_guest_registers[GUEST_GPR_OFFSET + i];
registers[i] = arg.GetType() != RegType::NotLoaded && arg.IsDirty();
}
return registers;
}
void Arm64GPRCache::FlushByHost(ARM64Reg host_reg, ARM64Reg tmp_reg)
{
for (size_t i = 0; i < m_guest_registers.size(); ++i)


@@ -325,6 +325,8 @@ public:
BitSet32 GetCallerSavedUsed() const override;
BitSet32 GetDirtyGPRs() const;
void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG)
{
FlushRegisters(regs, false, tmp_reg);


@@ -704,6 +704,16 @@ void JitArm64::mfcr(UGeckoInstruction inst)
ORR(WC, WA, LogicalImm(1 << PowerPC::CR_GT_BIT, 32));
CMP(CR, ARM64Reg::ZR);
CSEL(WA, WC, WA, CC_GT);
// To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
// after this instruction, flush registers that would be flushed after this instruction anyway.
//
// There's no point in ensuring we flush two registers at the same time, because the offset in
// ppcState for CRs is too large to be encoded into an STP instruction.
if (js.op->crDiscardable[i])
gpr.DiscardCRRegisters(BitSet8{i});
else if (!js.op->crInUse[i])
gpr.StoreCRRegisters(BitSet8{i}, WC);
}
gpr.Unlock(WB, WC);
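
Regarding the STP remark in the mfcr hunk: on AArch64, STP of two 64-bit registers encodes its offset as a signed 7-bit immediate scaled by 8, so only multiples of 8 in the range [-512, 504] fit in a single instruction (a general architecture fact, not Dolphin-specific code). Per the comment above, the CR offsets within ppcState fall outside that range, so pairing the stores would not help here:

#include <cstdint>

// General AArch64 encoding rule, not Dolphin code: STP/LDP of two X registers
// use a signed 7-bit immediate scaled by the register size (8 bytes).
constexpr bool FitsInStp64Offset(int64_t offset)
{
  return offset % 8 == 0 && offset >= -512 && offset <= 504;
}

static_assert(FitsInStp64Offset(504), "largest encodable positive offset");
static_assert(!FitsInStp64Offset(512), "one step past the limit does not fit");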