Jit: Flush registers used in memory breakpoint conditions

Aims to fix https://bugs.dolphin-emu.org/issues/13686.

I'm using a less efficient approach for Jit64 than for JitArm64. In
JitArm64, I'm flushing in the slow access code, but in Jit64, I'm
flushing before the split between the slow access code and the fast
access code, because Jit64 doesn't keep register mappings around for
when it's time to emit the slow access code. But the flushing code is
only emitted when there are memory breakpoints with conditions, so I'd
say that this is a performance loss we can live with.
JosJuice 2024-12-27 16:01:08 +01:00
parent 4ef14d3619
commit c1d4f63929
10 changed files with 101 additions and 49 deletions
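For context: a memory breakpoint condition can reference guest registers (for example `r3 == 0`), and the condition is evaluated by C++ code that reads those registers from ppcState, so any values the JIT is still holding in host registers have to be written back first. The following is a minimal sketch of the layout difference described above — not Dolphin's real emitter API; the emit_* names are hypothetical stand-ins:

#include <iostream>

// Hypothetical stand-ins for the code each backend emits.
void emit_flush_of_condition_registers()
{
  std::cout << "  write back GPRs/FPRs used in breakpoint conditions\n";
}
void emit_fast_access() { std::cout << "  fast access (fastmem)\n"; }
void emit_slow_access() { std::cout << "  slow access (far code, C++ fallback)\n"; }

// Jit64: register mappings are no longer available by the time the far
// (slow) code is emitted, so the flush is placed before the fast/slow
// split and executes even when the fast path succeeds.
void emit_jit64_memory_op(bool has_conditional_watchpoints)
{
  if (has_conditional_watchpoints)
    emit_flush_of_condition_registers();
  emit_fast_access();  // backpatched to jump to the far code on fault
  emit_slow_access();
}

// JitArm64: mappings are still known while emitting the slow path, so the
// flush lives there and the fast path is untouched.
void emit_jitarm64_memory_op(bool has_conditional_watchpoints)
{
  emit_fast_access();  // jumps into the slow path only when needed
  if (has_conditional_watchpoints)
    emit_flush_of_condition_registers();
  emit_slow_access();
}

int main()
{
  std::cout << "Jit64 layout:\n";
  emit_jit64_memory_op(true);
  std::cout << "JitArm64 layout:\n";
  emit_jitarm64_memory_op(true);
}

In both backends, when no memory breakpoints with conditions exist, the flush is not emitted at all, which is why the commit message considers the extra cost on the Jit64 fast path acceptable.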


@@ -1294,6 +1294,32 @@ void Jit64::IntializeSpeculativeConstants()
   }
 }
 
+void Jit64::FlushPCBeforeSlowAccess()
+{
+  // PC is used by memory watchpoints (if enabled), profiling where to insert gather pipe
+  // interrupt checks, and printing accurate PC locations in debug logs.
+  MOV(32, PPCSTATE(pc), Imm32(js.compilerPC));
+}
+
+void Jit64::FlushRegistersBeforeSlowAccess()
+{
+  // Register values can be used by memory watchpoint conditions.
+  MemChecks& mem_checks = m_system.GetPowerPC().GetMemChecks();
+  if (mem_checks.HasAny())
+  {
+    BitSet32 gprs = mem_checks.GetGPRsUsedInConditions();
+    BitSet32 fprs = mem_checks.GetFPRsUsedInConditions();
+    if (gprs || fprs)
+    {
+      RCForkGuard gpr_guard = gpr.Fork();
+      RCForkGuard fpr_guard = fpr.Fork();
+      gpr.Flush(gprs);
+      fpr.Flush(fprs);
+    }
+  }
+}
+
 bool Jit64::HandleFunctionHooking(u32 address)
 {
   const auto result = HLE::TryReplaceFunction(m_ppc_symbol_db, address, PowerPC::CoreMode::JIT);


@@ -81,6 +81,9 @@ public:
   void IntializeSpeculativeConstants();
 
+  void FlushPCBeforeSlowAccess();
+  void FlushRegistersBeforeSlowAccess();
+
   JitBlockCache* GetBlockCache() override { return &blocks; }
 
   void Trace();


@@ -110,6 +110,8 @@ void Jit64::lXXx(UGeckoInstruction inst)
     PanicAlertFmt("Invalid instruction");
   }
 
+  FlushRegistersBeforeSlowAccess();
+
   // PowerPC has no 8-bit sign extended load, but x86 does, so merge extsb with the load if we find
   // it.
   if (CanMergeNextInstructions(1) && accessSize == 8 && js.op[1].inst.OPCD == 31 &&
@@ -439,6 +441,8 @@ void Jit64::dcbz(UGeckoInstruction inst)
   int a = inst.RA;
   int b = inst.RB;
 
+  FlushRegistersBeforeSlowAccess();
+
   {
     RCOpArg Ra = a ? gpr.Use(a, RCMode::Read) : RCOpArg::Imm32(0);
     RCOpArg Rb = gpr.Use(b, RCMode::Read);
@@ -477,7 +481,7 @@ void Jit64::dcbz(UGeckoInstruction inst)
     SwitchToFarCode();
     SetJumpTarget(slow);
   }
-  MOV(32, PPCSTATE(pc), Imm32(js.compilerPC));
+  FlushPCBeforeSlowAccess();
   BitSet32 registersInUse = CallerSavedRegistersInUse();
   ABI_PushRegistersAndAdjustStack(registersInUse, 0);
   ABI_CallFunctionPR(PowerPC::ClearDCacheLineFromJit, &m_mmu, RSCRATCH);
@@ -524,6 +528,8 @@ void Jit64::stX(UGeckoInstruction inst)
     return;
   }
 
+  FlushRegistersBeforeSlowAccess();
+
   // If we already know the address of the write
   if (!a || gpr.IsImm(a))
   {
@@ -602,6 +608,8 @@ void Jit64::stXx(UGeckoInstruction inst)
     break;
   }
 
+  FlushRegistersBeforeSlowAccess();
+
   const bool does_clobber = WriteClobbersRegValue(accessSize, /* swap */ !byte_reverse);
 
   RCOpArg Ra = update ? gpr.Bind(a, RCMode::ReadWrite) : gpr.Use(a, RCMode::Read);
@@ -634,6 +642,8 @@ void Jit64::lmw(UGeckoInstruction inst)
   int a = inst.RA, d = inst.RD;
 
+  FlushRegistersBeforeSlowAccess();
+
   // TODO: This doesn't handle rollback on DSI correctly
   {
     RCOpArg Ra = a ? gpr.Use(a, RCMode::Read) : RCOpArg::Imm32(0);
@@ -657,6 +667,8 @@ void Jit64::stmw(UGeckoInstruction inst)
   int a = inst.RA, d = inst.RD;
 
+  FlushRegistersBeforeSlowAccess();
+
   // TODO: This doesn't handle rollback on DSI correctly
   for (int i = d; i < 32; i++)
   {


@@ -30,6 +30,8 @@ void Jit64::lfXXX(UGeckoInstruction inst)
   FALLBACK_IF(!indexed && !a);
 
+  FlushRegistersBeforeSlowAccess();
+
   s32 offset = 0;
   RCOpArg addr = gpr.Bind(a, update ? RCMode::ReadWrite : RCMode::Read);
   RegCache::Realize(addr);
@@ -103,6 +105,8 @@ void Jit64::stfXXX(UGeckoInstruction inst)
   FALLBACK_IF(update && jo.memcheck && indexed && a == b);
 
+  FlushRegistersBeforeSlowAccess();
+
   if (single)
   {
     if (js.fpr_is_store_safe[s] && js.op->fprIsSingle[s])
@@ -196,6 +200,8 @@ void Jit64::stfiwx(UGeckoInstruction inst)
   int a = inst.RA;
   int b = inst.RB;
 
+  FlushRegistersBeforeSlowAccess();
+
   RCOpArg Ra = a ? gpr.Use(a, RCMode::Read) : RCOpArg::Imm32(0);
   RCOpArg Rb = gpr.Use(b, RCMode::Read);
   RCOpArg Rs = fpr.Use(s, RCMode::Read);


@@ -35,6 +35,8 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
   int w = indexed ? inst.Wx : inst.W;
   FALLBACK_IF(!a);
 
+  FlushRegistersBeforeSlowAccess();
+
   RCX64Reg scratch_guard = gpr.Scratch(RSCRATCH_EXTRA);
   RCOpArg Ra = update ? gpr.Bind(a, RCMode::ReadWrite) : gpr.Use(a, RCMode::Read);
   RCOpArg Rb = indexed ? gpr.Use(b, RCMode::Read) : RCOpArg::Imm32((u32)offset);
@@ -69,8 +71,8 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
   }
   else
   {
-    // Stash PC in case asm routine needs to call into C++
-    MOV(32, PPCSTATE(pc), Imm32(js.compilerPC));
+    FlushPCBeforeSlowAccess();
+
     // We know what GQR is here, so we can load RSCRATCH2 and call into the store method directly
     // with just the scale bits.
     MOV(32, R(RSCRATCH2), Imm32(gqrValue & 0x3F00));
@@ -83,8 +85,8 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
   }
   else
   {
-    // Stash PC in case asm routine needs to call into C++
-    MOV(32, PPCSTATE(pc), Imm32(js.compilerPC));
+    FlushPCBeforeSlowAccess();
+
     // Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code.
     // Hence, we need to mask out the unused bits. The layout of the GQR register is
     // UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
@@ -124,6 +126,8 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
   int w = indexed ? inst.Wx : inst.W;
   FALLBACK_IF(!a);
 
+  FlushRegistersBeforeSlowAccess();
+
   RCX64Reg scratch_guard = gpr.Scratch(RSCRATCH_EXTRA);
   RCX64Reg Ra = gpr.Bind(a, update ? RCMode::ReadWrite : RCMode::Read);
   RCOpArg Rb = indexed ? gpr.Use(b, RCMode::Read) : RCOpArg::Imm32((u32)offset);
@@ -144,8 +148,8 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
   }
   else
   {
-    // Stash PC in case asm routine needs to call into C++
-    MOV(32, PPCSTATE(pc), Imm32(js.compilerPC));
+    FlushPCBeforeSlowAccess();
+
     // Get the high part of the GQR register
     OpArg gqr = PPCSTATE_SPR(SPR_GQR0 + i);
     gqr.AddMemOffset(2);


@@ -386,15 +386,10 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg& opAddress,
     SetJumpTarget(slow);
   }
 
-  // PC is used by memory watchpoints (if enabled), profiling where to insert gather pipe
-  // interrupt checks, and printing accurate PC locations in debug logs.
-  //
-  // In the case of Jit64AsmCommon routines, we don't know the PC here,
-  // so the caller has to store the PC themselves.
+  // In the case of Jit64AsmCommon routines, the state we want to store here
+  // isn't known at JIT time, so the caller has to store it themselves.
   if (!(flags & SAFE_LOADSTORE_NO_UPDATE_PC))
-  {
-    MOV(32, PPCSTATE(pc), Imm32(js.compilerPC));
-  }
+    m_jit.FlushPCBeforeSlowAccess();
 
   size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 8 : 0;
   ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment);
@@ -457,8 +452,7 @@ void EmuCodeBlock::SafeLoadToRegImmediate(X64Reg reg_value, u32 address, int acc
     return;
   }
 
-  // Helps external systems know which instruction triggered the read.
-  MOV(32, PPCSTATE(pc), Imm32(m_jit.js.compilerPC));
+  m_jit.FlushPCBeforeSlowAccess();
 
   // Fall back to general-case code.
   ABI_PushRegistersAndAdjustStack(registersInUse, 0);
@@ -560,15 +554,10 @@ void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acces
     SetJumpTarget(slow);
   }
 
-  // PC is used by memory watchpoints (if enabled), profiling where to insert gather pipe
-  // interrupt checks, and printing accurate PC locations in debug logs.
-  //
-  // In the case of Jit64AsmCommon routines, we don't know the PC here,
-  // so the caller has to store the PC themselves.
+  // In the case of Jit64AsmCommon routines, the state we want to store here
+  // isn't known at JIT time, so the caller has to store it themselves.
   if (!(flags & SAFE_LOADSTORE_NO_UPDATE_PC))
-  {
-    MOV(32, PPCSTATE(pc), Imm32(js.compilerPC));
-  }
+    m_jit.FlushPCBeforeSlowAccess();
 
   size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 8 : 0;
   ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment);
@@ -663,8 +652,7 @@ bool EmuCodeBlock::WriteToConstAddress(int accessSize, OpArg arg, u32 address,
   }
   else
   {
-    // Helps external systems know which instruction triggered the write
-    MOV(32, PPCSTATE(pc), Imm32(m_jit.js.compilerPC));
+    m_jit.FlushPCBeforeSlowAccess();
     ABI_PushRegistersAndAdjustStack(registersInUse, 0);
     switch (accessSize)


@@ -282,6 +282,9 @@ protected:
                            Arm64Gen::ARM64Reg addr, BitSet32 gprs_to_push = BitSet32(0),
                            BitSet32 fprs_to_push = BitSet32(0), bool emitting_routine = false);
 
+  // temp_gpr must be a valid register, but temp_fpr can be INVALID_REG.
+  void FlushPPCStateBeforeSlowAccess(Arm64Gen::ARM64Reg temp_gpr, Arm64Gen::ARM64Reg temp_fpr);
+
   // Loadstore routines
   void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update);
   void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset, bool update);


@@ -21,6 +21,7 @@
 #include "Core/PowerPC/JitArmCommon/BackPatch.h"
 #include "Core/PowerPC/MMU.h"
 #include "Core/PowerPC/PowerPC.h"
+#include "Core/System.h"
 
 using namespace Arm64Gen;
@@ -171,6 +172,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
     const ARM64Reg temp_gpr = ARM64Reg::W1;
     const int temp_gpr_index = DecodeReg(temp_gpr);
+    const ARM64Reg temp_fpr = fprs_to_push[0] ? ARM64Reg::INVALID_REG : ARM64Reg::Q0;
 
     BitSet32 gprs_to_push_early = {};
     if (memcheck)
@@ -189,16 +191,10 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
     ABI_PushRegisters(gprs_to_push & ~gprs_to_push_early);
     m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30);
 
-    // PC is used by memory watchpoints (if enabled), profiling where to insert gather pipe
-    // interrupt checks, and printing accurate PC locations in debug logs.
-    //
-    // In the case of JitAsm routines, we don't know the PC here,
-    // so the caller has to store the PC themselves.
+    // In the case of JitAsm routines, the state we want to store here
+    // isn't known at JIT time, so the caller has to store it themselves.
     if (!emitting_routine)
-    {
-      MOVI2R(ARM64Reg::W30, js.compilerPC);
-      STR(IndexType::Unsigned, ARM64Reg::W30, PPC_REG, PPCSTATE_OFF(pc));
-    }
+      FlushPPCStateBeforeSlowAccess(ARM64Reg::W30, temp_fpr);
 
     if (flags & BackPatchInfo::FLAG_STORE)
     {
@@ -265,7 +261,6 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
   if (memcheck)
   {
-    const ARM64Reg temp_fpr = fprs_to_push[0] ? ARM64Reg::INVALID_REG : ARM64Reg::Q0;
     const u64 early_push_count = (gprs_to_push & gprs_to_push_early).Count();
     const u64 early_push_size = Common::AlignUp(early_push_count, 2) * 8;
@@ -316,6 +311,22 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
   }
 }
 
+void JitArm64::FlushPPCStateBeforeSlowAccess(ARM64Reg temp_gpr, ARM64Reg temp_fpr)
+{
+  // PC is used by memory watchpoints (if enabled), profiling where to insert gather pipe
+  // interrupt checks, and printing accurate PC locations in debug logs.
+  MOVI2R(temp_gpr, js.compilerPC);
+  STR(IndexType::Unsigned, temp_gpr, PPC_REG, PPCSTATE_OFF(pc));
+
+  // Register values can be used by memory watchpoint conditions.
+  MemChecks& mem_checks = m_system.GetPowerPC().GetMemChecks();
+  if (mem_checks.HasAny())
+  {
+    gpr.StoreRegisters(mem_checks.GetGPRsUsedInConditions(), temp_gpr, FlushMode::MaintainState);
+    fpr.StoreRegisters(mem_checks.GetFPRsUsedInConditions(), temp_fpr, FlushMode::MaintainState);
+  }
+}
+
 bool JitArm64::HandleFastmemFault(SContext* ctx)
 {
   const u8* pc = reinterpret_cast<const u8*>(ctx->CTX_PC);


@@ -102,9 +102,7 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
   {
     LDR(IndexType::Unsigned, scale_reg, PPC_REG, PPCSTATE_OFF_SPR(SPR_GQR0 + i));
 
-    // Stash PC in case asm routine needs to call into C++
-    MOVI2R(ARM64Reg::W30, js.compilerPC);
-    STR(IndexType::Unsigned, ARM64Reg::W30, PPC_REG, PPCSTATE_OFF(pc));
+    FlushPPCStateBeforeSlowAccess(ARM64Reg::W30, ARM64Reg::Q1);
 
     UBFM(type_reg, scale_reg, 16, 18);   // Type
     UBFM(scale_reg, scale_reg, 24, 29);  // Scale
@@ -260,9 +258,7 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
   {
     LDR(IndexType::Unsigned, scale_reg, PPC_REG, PPCSTATE_OFF_SPR(SPR_GQR0 + i));
 
-    // Stash PC in case asm routine needs to call into C++
-    MOVI2R(ARM64Reg::W30, js.compilerPC);
-    STR(IndexType::Unsigned, ARM64Reg::W30, PPC_REG, PPCSTATE_OFF(pc));
+    FlushPPCStateBeforeSlowAccess(ARM64Reg::W30, ARM64Reg::Q1);
 
     UBFM(type_reg, scale_reg, 0, 2);    // Type
     UBFM(scale_reg, scale_reg, 8, 13);  // Scale


@@ -388,14 +388,16 @@ public:
   BitSet32 GetDirtyGPRs() const;
 
-  void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG)
+  void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG,
+                      FlushMode flush_mode = FlushMode::All)
   {
-    FlushRegisters(regs, FlushMode::All, tmp_reg, IgnoreDiscardedRegisters::No);
+    FlushRegisters(regs, flush_mode, tmp_reg, IgnoreDiscardedRegisters::No);
   }
 
-  void StoreCRRegisters(BitSet8 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG)
+  void StoreCRRegisters(BitSet8 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG,
+                        FlushMode flush_mode = FlushMode::All)
   {
-    FlushCRRegisters(regs, FlushMode::All, tmp_reg, IgnoreDiscardedRegisters::No);
+    FlushCRRegisters(regs, flush_mode, tmp_reg, IgnoreDiscardedRegisters::No);
   }
 
   void DiscardCRRegisters(BitSet8 regs);
@@ -459,9 +461,10 @@ public:
   void FixSinglePrecision(size_t preg);
 
-  void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG)
+  void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG,
+                      FlushMode flush_mode = FlushMode::All)
   {
-    FlushRegisters(regs, FlushMode::All, tmp_reg);
+    FlushRegisters(regs, flush_mode, tmp_reg);
   }
 
 protected: