diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
index 1ecfea4e4f..d42fb5b251 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
@@ -3,6 +3,8 @@
 
 #include "Core/PowerPC/JitArm64/Jit.h"
 
+#include <bit>
+
 #include "Common/Arm64Emitter.h"
 #include "Common/BitSet.h"
 #include "Common/CommonTypes.h"
@@ -539,6 +541,27 @@ void JitArm64::lmw(UGeckoInstruction inst)
   if (!a_is_addr_base_reg)
     MOV(addr_base_reg, addr_reg);
 
+  BitSet32 gprs_to_flush = ~js.op->gprInUse & BitSet32(0xFFFFFFFFU << d);
+  if (!js.op->gprInUse[a])
+  {
+    if (!a_is_addr_base_reg)
+    {
+      gprs_to_flush[a] = true;
+    }
+    else
+    {
+      gprs_to_flush[a] = false;
+
+      if (a + 1 == d && (std::countr_one((~js.op->gprInUse).m_val >> a) & 1) == 0)
+      {
+        // In this situation, we can save one store instruction by flushing GPR d together with GPR
+        // a, but we shouldn't flush GPR a until the end of the PPC instruction. Therefore, let's
+        // also wait with flushing GPR d until the end of the PPC instruction.
+        gprs_to_flush[d] = false;
+      }
+    }
+  }
+
   // TODO: This doesn't handle rollback on DSI correctly
   constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_SIZE_32;
   for (u32 i = d; i < 32; i++)
@@ -564,6 +587,28 @@ void JitArm64::lmw(UGeckoInstruction inst)
 
     gpr.BindToRegister(i, false, true);
     ASSERT(dest_reg == gpr.R(i));
+
+    // To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
+    // after this instruction, flush registers that would be flushed after this instruction anyway.
+    //
+    // We try to store two registers at a time when possible to let the register cache use STP.
+    if (!jo.memcheck && js.op->gprDiscardable[i])
+    {
+      gpr.DiscardRegisters(BitSet32{int(i)});
+    }
+    else if (gprs_to_flush[i])
+    {
+      BitSet32 gprs_to_flush_this_time{};
+      if (i != 0 && gprs_to_flush[i - 1])
+        gprs_to_flush_this_time = BitSet32{int(i - 1), int(i)};
+      else if (i == 31 || !gprs_to_flush[i + 1])
+        gprs_to_flush_this_time = BitSet32{int(i)};
+      else
+        continue;
+
+      gpr.StoreRegisters(gprs_to_flush_this_time);
+      gprs_to_flush &= ~gprs_to_flush_this_time;
+    }
   }
 
   gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30);
@@ -600,6 +645,28 @@ void JitArm64::stmw(UGeckoInstruction inst)
   if (!a_is_addr_base_reg)
     MOV(addr_base_reg, addr_reg);
 
+  const BitSet32 dirty_gprs_to_flush_unmasked = ~js.op->gprInUse & gpr.GetDirtyGPRs();
+  BitSet32 dirty_gprs_to_flush = dirty_gprs_to_flush_unmasked & BitSet32(0xFFFFFFFFU << s);
+  if (dirty_gprs_to_flush_unmasked[a])
+  {
+    if (!a_is_addr_base_reg)
+    {
+      dirty_gprs_to_flush[a] = true;
+    }
+    else
+    {
+      dirty_gprs_to_flush[a] = false;
+
+      if (a + 1 == s && (std::countr_one((~js.op->gprInUse).m_val >> a) & 1) == 0)
+      {
+        // In this situation, we can save one store instruction by flushing GPR s together with GPR
+        // a, but we shouldn't flush GPR a until the end of the PPC instruction. Therefore, let's
+        // also wait with flushing GPR s until the end of the PPC instruction.
+        dirty_gprs_to_flush[s] = false;
+      }
+    }
+  }
+
   // TODO: This doesn't handle rollback on DSI correctly
   constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_SIZE_32;
   for (u32 i = s; i < 32; i++)
@@ -620,6 +687,35 @@
 
     EmitBackpatchRoutine(flags, MemAccessMode::Auto, src_reg, EncodeRegTo64(addr_reg), regs_in_use,
                          fprs_in_use);
+
+    // To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
+    // after this instruction, flush registers that would be flushed after this instruction anyway.
+    //
+    // We try to store two registers at a time when possible to let the register cache use STP.
+    if (!jo.memcheck && js.op->gprDiscardable[i])
+    {
+      gpr.DiscardRegisters(BitSet32{int(i)});
+    }
+    else if (dirty_gprs_to_flush[i])
+    {
+      BitSet32 gprs_to_flush_this_time{};
+      if (i != 0 && dirty_gprs_to_flush[i - 1])
+        gprs_to_flush_this_time = BitSet32{int(i - 1), int(i)};
+      else if (i == 31 || !dirty_gprs_to_flush[i + 1])
+        gprs_to_flush_this_time = BitSet32{int(i)};
+      else
+        continue;
+
+      gpr.StoreRegisters(gprs_to_flush_this_time);
+      dirty_gprs_to_flush &= ~gprs_to_flush_this_time;
+    }
+    else if (!js.op->gprInUse[i])
+    {
+      // If this register can be flushed but it isn't dirty, no store instruction will be emitted
+      // when flushing it, so it doesn't matter if we flush it together with another register or
+      // not. Let's just flush it in the simplest way possible.
+      gpr.StoreRegisters(BitSet32{int(i)});
+    }
   }
 
   gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
index 5da455cb61..c5484da30b 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
@@ -458,6 +458,17 @@ BitSet32 Arm64GPRCache::GetCallerSavedUsed() const
   return registers;
 }
 
+BitSet32 Arm64GPRCache::GetDirtyGPRs() const
+{
+  BitSet32 registers(0);
+  for (size_t i = 0; i < GUEST_GPR_COUNT; ++i)
+  {
+    const OpArg& arg = m_guest_registers[GUEST_GPR_OFFSET + i];
+    registers[i] = arg.GetType() != RegType::NotLoaded && arg.IsDirty();
+  }
+  return registers;
+}
+
 void Arm64GPRCache::FlushByHost(ARM64Reg host_reg, ARM64Reg tmp_reg)
 {
   for (size_t i = 0; i < m_guest_registers.size(); ++i)
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
index dad43ce9ef..c17c30f8a0 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
@@ -325,6 +325,8 @@ public:
 
   BitSet32 GetCallerSavedUsed() const override;
 
+  BitSet32 GetDirtyGPRs() const;
+
   void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG)
   {
     FlushRegisters(regs, false, tmp_reg);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp
index e9dcc4f3fb..fda306984f 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp
@@ -704,6 +704,16 @@ void JitArm64::mfcr(UGeckoInstruction inst)
     ORR(WC, WA, LogicalImm(1 << PowerPC::CR_GT_BIT, 32));
     CMP(CR, ARM64Reg::ZR);
     CSEL(WA, WC, WA, CC_GT);
+
+    // To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
+    // after this instruction, flush registers that would be flushed after this instruction anyway.
+    //
+    // There's no point in ensuring we flush two registers at the same time, because the offset in
+    // ppcState for CRs is too large to be encoded into an STP instruction.
+    if (js.op->crDiscardable[i])
+      gpr.DiscardCRRegisters(BitSet8{i});
+    else if (!js.op->crInUse[i])
+      gpr.StoreCRRegisters(BitSet8{i}, WC);
   }
 
   gpr.Unlock(WB, WC);
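
A minimal standalone sketch (not part of the patch) of the std::countr_one parity check used above in lmw/stmw, assuming the hypothetical values a = 3, d = 4 with r3..r6 dead after the instruction:

// Illustration only: shows how the parity of the run of dead GPRs starting at GPR a
// decides whether GPR d should also wait for the end-of-instruction flush.
#include <bit>
#include <cstdint>
#include <cstdio>

int main()
{
  const std::uint32_t dead_gprs = 0b0111'1000u;  // r3..r6 not in use afterwards
  const std::uint32_t gpr_in_use = ~dead_gprs;   // stand-in for js.op->gprInUse
  const unsigned a = 3;                          // address base GPR, kept live as addr_base_reg
  const unsigned d = 4;                          // first GPR loaded/stored by lmw/stmw

  // Length of the consecutive run of dead GPRs starting at GPR a (4 in this example).
  const int run_length = std::countr_one(~gpr_in_use >> a);

  // Even run length with d directly after a: defer flushing GPR d too, so that at the end
  // of the instruction the register cache can store GPR a and GPR d with a single STP.
  const bool defer_d = (a + 1 == d) && (run_length & 1) == 0;

  std::printf("run_length = %d, defer_d = %s\n", run_length, defer_d ? "true" : "false");
}

In this example the run of dead GPRs starting at r3 has even length, so deferring r4 lets the loop store r5/r6 with one STP and the end-of-instruction flush store r3/r4 with another STP (two stores), instead of STP r4/r5 plus lone STRs for r6 and r3 (three stores).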