From c1d4f63929f399cee2b4caba04316371fd0bf51e Mon Sep 17 00:00:00 2001 From: JosJuice Date: Fri, 27 Dec 2024 16:01:08 +0100 Subject: [PATCH] Jit: Flush registers used in memory breakpoint conditions Aims to fix https://bugs.dolphin-emu.org/issues/13686. I'm using a less efficient approach for Jit64 than for JitArm64. In JitArm64, I'm flushing in the slow access code, but in Jit64, I'm flushing before the split between the slow access code and the fast access code, because Jit64 doesn't keep register mappings around for when it's time to emit the slow access code. But the flushing code is only emitted when there are memory breakpoints with conditions, so I'd say that this is a performance loss we can live with. --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 26 ++++++++++++++++ Source/Core/Core/PowerPC/Jit64/Jit.h | 3 ++ .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 14 ++++++++- .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 6 ++++ .../PowerPC/Jit64/Jit_LoadStorePaired.cpp | 16 ++++++---- .../Core/PowerPC/Jit64Common/EmuCodeBlock.cpp | 28 +++++------------ Source/Core/Core/PowerPC/JitArm64/Jit.h | 3 ++ .../PowerPC/JitArm64/JitArm64_BackPatch.cpp | 31 +++++++++++++------ .../JitArm64/JitArm64_LoadStorePaired.cpp | 8 ++--- .../Core/PowerPC/JitArm64/JitArm64_RegCache.h | 15 +++++---- 10 files changed, 101 insertions(+), 49 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index b09279d458..d76854409f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -1294,6 +1294,32 @@ void Jit64::IntializeSpeculativeConstants() } } +void Jit64::FlushPCBeforeSlowAccess() +{ + // PC is used by memory watchpoints (if enabled), profiling where to insert gather pipe + // interrupt checks, and printing accurate PC locations in debug logs. + MOV(32, PPCSTATE(pc), Imm32(js.compilerPC)); +} + +void Jit64::FlushRegistersBeforeSlowAccess() +{ + // Register values can be used by memory watchpoint conditions. + MemChecks& mem_checks = m_system.GetPowerPC().GetMemChecks(); + if (mem_checks.HasAny()) + { + BitSet32 gprs = mem_checks.GetGPRsUsedInConditions(); + BitSet32 fprs = mem_checks.GetFPRsUsedInConditions(); + if (gprs || fprs) + { + RCForkGuard gpr_guard = gpr.Fork(); + RCForkGuard fpr_guard = fpr.Fork(); + + gpr.Flush(gprs); + fpr.Flush(fprs); + } + } +} + bool Jit64::HandleFunctionHooking(u32 address) { const auto result = HLE::TryReplaceFunction(m_ppc_symbol_db, address, PowerPC::CoreMode::JIT); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 0e98981a46..c23d7e1491 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -81,6 +81,9 @@ public: void IntializeSpeculativeConstants(); + void FlushPCBeforeSlowAccess(); + void FlushRegistersBeforeSlowAccess(); + JitBlockCache* GetBlockCache() override { return &blocks; } void Trace(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index 05bd690694..a200689f35 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -110,6 +110,8 @@ void Jit64::lXXx(UGeckoInstruction inst) PanicAlertFmt("Invalid instruction"); } + FlushRegistersBeforeSlowAccess(); + // PowerPC has no 8-bit sign extended load, but x86 does, so merge extsb with the load if we find // it. if (CanMergeNextInstructions(1) && accessSize == 8 && js.op[1].inst.OPCD == 31 && @@ -439,6 +441,8 @@ void Jit64::dcbz(UGeckoInstruction inst) int a = inst.RA; int b = inst.RB; + FlushRegistersBeforeSlowAccess(); + { RCOpArg Ra = a ? gpr.Use(a, RCMode::Read) : RCOpArg::Imm32(0); RCOpArg Rb = gpr.Use(b, RCMode::Read); @@ -477,7 +481,7 @@ void Jit64::dcbz(UGeckoInstruction inst) SwitchToFarCode(); SetJumpTarget(slow); } - MOV(32, PPCSTATE(pc), Imm32(js.compilerPC)); + FlushPCBeforeSlowAccess(); BitSet32 registersInUse = CallerSavedRegistersInUse(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_CallFunctionPR(PowerPC::ClearDCacheLineFromJit, &m_mmu, RSCRATCH); @@ -524,6 +528,8 @@ void Jit64::stX(UGeckoInstruction inst) return; } + FlushRegistersBeforeSlowAccess(); + // If we already know the address of the write if (!a || gpr.IsImm(a)) { @@ -602,6 +608,8 @@ void Jit64::stXx(UGeckoInstruction inst) break; } + FlushRegistersBeforeSlowAccess(); + const bool does_clobber = WriteClobbersRegValue(accessSize, /* swap */ !byte_reverse); RCOpArg Ra = update ? gpr.Bind(a, RCMode::ReadWrite) : gpr.Use(a, RCMode::Read); @@ -634,6 +642,8 @@ void Jit64::lmw(UGeckoInstruction inst) int a = inst.RA, d = inst.RD; + FlushRegistersBeforeSlowAccess(); + // TODO: This doesn't handle rollback on DSI correctly { RCOpArg Ra = a ? gpr.Use(a, RCMode::Read) : RCOpArg::Imm32(0); @@ -657,6 +667,8 @@ void Jit64::stmw(UGeckoInstruction inst) int a = inst.RA, d = inst.RD; + FlushRegistersBeforeSlowAccess(); + // TODO: This doesn't handle rollback on DSI correctly for (int i = d; i < 32; i++) { diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index 7ac34f5d6e..7b347a3742 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -30,6 +30,8 @@ void Jit64::lfXXX(UGeckoInstruction inst) FALLBACK_IF(!indexed && !a); + FlushRegistersBeforeSlowAccess(); + s32 offset = 0; RCOpArg addr = gpr.Bind(a, update ? RCMode::ReadWrite : RCMode::Read); RegCache::Realize(addr); @@ -103,6 +105,8 @@ void Jit64::stfXXX(UGeckoInstruction inst) FALLBACK_IF(update && jo.memcheck && indexed && a == b); + FlushRegistersBeforeSlowAccess(); + if (single) { if (js.fpr_is_store_safe[s] && js.op->fprIsSingle[s]) @@ -196,6 +200,8 @@ void Jit64::stfiwx(UGeckoInstruction inst) int a = inst.RA; int b = inst.RB; + FlushRegistersBeforeSlowAccess(); + RCOpArg Ra = a ? gpr.Use(a, RCMode::Read) : RCOpArg::Imm32(0); RCOpArg Rb = gpr.Use(b, RCMode::Read); RCOpArg Rs = fpr.Use(s, RCMode::Read); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp index 91865b61a0..78b00b99f0 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp @@ -35,6 +35,8 @@ void Jit64::psq_stXX(UGeckoInstruction inst) int w = indexed ? inst.Wx : inst.W; FALLBACK_IF(!a); + FlushRegistersBeforeSlowAccess(); + RCX64Reg scratch_guard = gpr.Scratch(RSCRATCH_EXTRA); RCOpArg Ra = update ? gpr.Bind(a, RCMode::ReadWrite) : gpr.Use(a, RCMode::Read); RCOpArg Rb = indexed ? gpr.Use(b, RCMode::Read) : RCOpArg::Imm32((u32)offset); @@ -69,8 +71,8 @@ void Jit64::psq_stXX(UGeckoInstruction inst) } else { - // Stash PC in case asm routine needs to call into C++ - MOV(32, PPCSTATE(pc), Imm32(js.compilerPC)); + FlushPCBeforeSlowAccess(); + // We know what GQR is here, so we can load RSCRATCH2 and call into the store method directly // with just the scale bits. MOV(32, R(RSCRATCH2), Imm32(gqrValue & 0x3F00)); @@ -83,8 +85,8 @@ void Jit64::psq_stXX(UGeckoInstruction inst) } else { - // Stash PC in case asm routine needs to call into C++ - MOV(32, PPCSTATE(pc), Imm32(js.compilerPC)); + FlushPCBeforeSlowAccess(); + // Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code. // Hence, we need to mask out the unused bits. The layout of the GQR register is // UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with @@ -124,6 +126,8 @@ void Jit64::psq_lXX(UGeckoInstruction inst) int w = indexed ? inst.Wx : inst.W; FALLBACK_IF(!a); + FlushRegistersBeforeSlowAccess(); + RCX64Reg scratch_guard = gpr.Scratch(RSCRATCH_EXTRA); RCX64Reg Ra = gpr.Bind(a, update ? RCMode::ReadWrite : RCMode::Read); RCOpArg Rb = indexed ? gpr.Use(b, RCMode::Read) : RCOpArg::Imm32((u32)offset); @@ -144,8 +148,8 @@ void Jit64::psq_lXX(UGeckoInstruction inst) } else { - // Stash PC in case asm routine needs to call into C++ - MOV(32, PPCSTATE(pc), Imm32(js.compilerPC)); + FlushPCBeforeSlowAccess(); + // Get the high part of the GQR register OpArg gqr = PPCSTATE_SPR(SPR_GQR0 + i); gqr.AddMemOffset(2); diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp index 02c8ba1893..040e983b08 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp @@ -386,15 +386,10 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg& opAddress, SetJumpTarget(slow); } - // PC is used by memory watchpoints (if enabled), profiling where to insert gather pipe - // interrupt checks, and printing accurate PC locations in debug logs. - // - // In the case of Jit64AsmCommon routines, we don't know the PC here, - // so the caller has to store the PC themselves. + // In the case of Jit64AsmCommon routines, the state we want to store here + // isn't known at JIT time, so the caller has to store it themselves. if (!(flags & SAFE_LOADSTORE_NO_UPDATE_PC)) - { - MOV(32, PPCSTATE(pc), Imm32(js.compilerPC)); - } + m_jit.FlushPCBeforeSlowAccess(); size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 8 : 0; ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment); @@ -457,8 +452,7 @@ void EmuCodeBlock::SafeLoadToRegImmediate(X64Reg reg_value, u32 address, int acc return; } - // Helps external systems know which instruction triggered the read. - MOV(32, PPCSTATE(pc), Imm32(m_jit.js.compilerPC)); + m_jit.FlushPCBeforeSlowAccess(); // Fall back to general-case code. ABI_PushRegistersAndAdjustStack(registersInUse, 0); @@ -560,15 +554,10 @@ void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acces SetJumpTarget(slow); } - // PC is used by memory watchpoints (if enabled), profiling where to insert gather pipe - // interrupt checks, and printing accurate PC locations in debug logs. - // - // In the case of Jit64AsmCommon routines, we don't know the PC here, - // so the caller has to store the PC themselves. + // In the case of Jit64AsmCommon routines, the state we want to store here + // isn't known at JIT time, so the caller has to store it themselves. if (!(flags & SAFE_LOADSTORE_NO_UPDATE_PC)) - { - MOV(32, PPCSTATE(pc), Imm32(js.compilerPC)); - } + m_jit.FlushPCBeforeSlowAccess(); size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 8 : 0; ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment); @@ -663,8 +652,7 @@ bool EmuCodeBlock::WriteToConstAddress(int accessSize, OpArg arg, u32 address, } else { - // Helps external systems know which instruction triggered the write - MOV(32, PPCSTATE(pc), Imm32(m_jit.js.compilerPC)); + m_jit.FlushPCBeforeSlowAccess(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); switch (accessSize) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 8ca685b1d3..e28b473235 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -282,6 +282,9 @@ protected: Arm64Gen::ARM64Reg addr, BitSet32 gprs_to_push = BitSet32(0), BitSet32 fprs_to_push = BitSet32(0), bool emitting_routine = false); + // temp_gpr must be a valid register, but temp_fpr can be INVALID_REG. + void FlushPPCStateBeforeSlowAccess(Arm64Gen::ARM64Reg temp_gpr, Arm64Gen::ARM64Reg temp_fpr); + // Loadstore routines void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update); void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset, bool update); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index 1c54c00ebe..04fd1afdb2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -21,6 +21,7 @@ #include "Core/PowerPC/JitArmCommon/BackPatch.h" #include "Core/PowerPC/MMU.h" #include "Core/PowerPC/PowerPC.h" +#include "Core/System.h" using namespace Arm64Gen; @@ -171,6 +172,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, const ARM64Reg temp_gpr = ARM64Reg::W1; const int temp_gpr_index = DecodeReg(temp_gpr); + const ARM64Reg temp_fpr = fprs_to_push[0] ? ARM64Reg::INVALID_REG : ARM64Reg::Q0; BitSet32 gprs_to_push_early = {}; if (memcheck) @@ -189,16 +191,10 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, ABI_PushRegisters(gprs_to_push & ~gprs_to_push_early); m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30); - // PC is used by memory watchpoints (if enabled), profiling where to insert gather pipe - // interrupt checks, and printing accurate PC locations in debug logs. - // - // In the case of JitAsm routines, we don't know the PC here, - // so the caller has to store the PC themselves. + // In the case of JitAsm routines, the state we want to store here + // isn't known at JIT time, so the caller has to store it themselves. if (!emitting_routine) - { - MOVI2R(ARM64Reg::W30, js.compilerPC); - STR(IndexType::Unsigned, ARM64Reg::W30, PPC_REG, PPCSTATE_OFF(pc)); - } + FlushPPCStateBeforeSlowAccess(ARM64Reg::W30, temp_fpr); if (flags & BackPatchInfo::FLAG_STORE) { @@ -265,7 +261,6 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, if (memcheck) { - const ARM64Reg temp_fpr = fprs_to_push[0] ? ARM64Reg::INVALID_REG : ARM64Reg::Q0; const u64 early_push_count = (gprs_to_push & gprs_to_push_early).Count(); const u64 early_push_size = Common::AlignUp(early_push_count, 2) * 8; @@ -316,6 +311,22 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, } } +void JitArm64::FlushPPCStateBeforeSlowAccess(ARM64Reg temp_gpr, ARM64Reg temp_fpr) +{ + // PC is used by memory watchpoints (if enabled), profiling where to insert gather pipe + // interrupt checks, and printing accurate PC locations in debug logs. + MOVI2R(temp_gpr, js.compilerPC); + STR(IndexType::Unsigned, temp_gpr, PPC_REG, PPCSTATE_OFF(pc)); + + // Register values can be used by memory watchpoint conditions. + MemChecks& mem_checks = m_system.GetPowerPC().GetMemChecks(); + if (mem_checks.HasAny()) + { + gpr.StoreRegisters(mem_checks.GetGPRsUsedInConditions(), temp_gpr, FlushMode::MaintainState); + fpr.StoreRegisters(mem_checks.GetFPRsUsedInConditions(), temp_fpr, FlushMode::MaintainState); + } +} + bool JitArm64::HandleFastmemFault(SContext* ctx) { const u8* pc = reinterpret_cast(ctx->CTX_PC); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index eb8b4d015c..8bf8a8cbe2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -102,9 +102,7 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) { LDR(IndexType::Unsigned, scale_reg, PPC_REG, PPCSTATE_OFF_SPR(SPR_GQR0 + i)); - // Stash PC in case asm routine needs to call into C++ - MOVI2R(ARM64Reg::W30, js.compilerPC); - STR(IndexType::Unsigned, ARM64Reg::W30, PPC_REG, PPCSTATE_OFF(pc)); + FlushPPCStateBeforeSlowAccess(ARM64Reg::W30, ARM64Reg::Q1); UBFM(type_reg, scale_reg, 16, 18); // Type UBFM(scale_reg, scale_reg, 24, 29); // Scale @@ -260,9 +258,7 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) { LDR(IndexType::Unsigned, scale_reg, PPC_REG, PPCSTATE_OFF_SPR(SPR_GQR0 + i)); - // Stash PC in case asm routine needs to call into C++ - MOVI2R(ARM64Reg::W30, js.compilerPC); - STR(IndexType::Unsigned, ARM64Reg::W30, PPC_REG, PPCSTATE_OFF(pc)); + FlushPPCStateBeforeSlowAccess(ARM64Reg::W30, ARM64Reg::Q1); UBFM(type_reg, scale_reg, 0, 2); // Type UBFM(scale_reg, scale_reg, 8, 13); // Scale diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index b98e170531..90b56115ac 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -388,14 +388,16 @@ public: BitSet32 GetDirtyGPRs() const; - void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG) + void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG, + FlushMode flush_mode = FlushMode::All) { - FlushRegisters(regs, FlushMode::All, tmp_reg, IgnoreDiscardedRegisters::No); + FlushRegisters(regs, flush_mode, tmp_reg, IgnoreDiscardedRegisters::No); } - void StoreCRRegisters(BitSet8 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG) + void StoreCRRegisters(BitSet8 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG, + FlushMode flush_mode = FlushMode::All) { - FlushCRRegisters(regs, FlushMode::All, tmp_reg, IgnoreDiscardedRegisters::No); + FlushCRRegisters(regs, flush_mode, tmp_reg, IgnoreDiscardedRegisters::No); } void DiscardCRRegisters(BitSet8 regs); @@ -459,9 +461,10 @@ public: void FixSinglePrecision(size_t preg); - void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG) + void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG, + FlushMode flush_mode = FlushMode::All) { - FlushRegisters(regs, FlushMode::All, tmp_reg); + FlushRegisters(regs, flush_mode, tmp_reg); } protected: