JitArm64: Install BranchWatch

mitaclaw 2023-12-20 19:10:03 -08:00
parent 7cccedca1e
commit fd8f2c7822
4 changed files with 247 additions and 29 deletions
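BranchWatch is the debugger facility in Core/Debugger/BranchWatch.h; this commit wires it into the AArch64 JIT so that, when debugging is enabled, compiled branch sites report taken (and, for conditional branches, not-taken) events while recording is active. Two new emitter helpers, WriteBranchWatch<condition> and WriteBranchWatchDestInRegister, generate the instrumentation; they are used by DoJit and the branch handlers (bx, bcx, bcctrx, bclrx), and the same pattern is open-coded for dcbx's folded invalidation loop. The hit entry points the emitted code calls are not part of this diff; judging only from the ABI_CallFunction call sites below, their shapes are roughly as follows (a hedged sketch for orientation, not the actual declarations):

// Hypothetical shapes inferred from the call sites in this commit; the real
// declarations live in Core/Debugger/BranchWatch.h and may differ.
#include <cstddef>
#include <cstdint>

namespace Core
{
// Assumed layout: packs {origin, destination} so the "_fk" fast path can pass
// both addresses in a single argument.
struct FakeBranchWatchCollectionKey
{
  std::uint32_t origin_addr;
  std::uint32_t destin_addr;
};

class BranchWatch
{
public:
  // Destination known at JIT time (b, bc, the elided blr in DoJit):
  static void HitVirtualTrue_fk(BranchWatch*, FakeBranchWatchCollectionKey, std::uint32_t inst);
  static void HitVirtualFalse_fk(BranchWatch*, FakeBranchWatchCollectionKey, std::uint32_t inst);
  // Destination only known at run time (bcctrx, bclrx):
  static void HitVirtualTrue(BranchWatch*, std::uint32_t origin, std::uint32_t destination,
                             std::uint32_t inst);
  // Bulk variant for dcbx's folded loop: also carries an iteration count.
  static void HitVirtualTrue_fk_n(BranchWatch*, FakeBranchWatchCollectionKey, std::uint32_t inst,
                                  std::uint32_t count);
  // Each has a HitPhysical* counterpart, selected when MSR.IR (translation) is off.
  // The emitted fast path loads one byte at this offset to decide whether to
  // take the out-of-line call at all:
  static std::size_t GetOffsetOfRecordingActive();
};
}  // namespace Core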


@@ -1181,7 +1181,22 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
if (HandleFunctionHooking(op.address))
break;
if (!op.skip)
if (op.skip)
{
if (IsDebuggingEnabled())
{
// The only thing that currently sets op.skip is the BLR following optimization.
// If any non-branch instruction starts setting that too, this will need to be changed.
ASSERT(op.inst.hex == 0x4e800020);
const ARM64Reg bw_reg_a = gpr.GetReg(), bw_reg_b = gpr.GetReg();
const BitSet32 gpr_caller_save =
gpr.GetCallerSavedUsed() & ~BitSet32{DecodeReg(bw_reg_a), DecodeReg(bw_reg_b)};
WriteBranchWatch<true>(op.address, op.branchTo, op.inst, bw_reg_a, bw_reg_b,
gpr_caller_save, fpr.GetCallerSavedUsed());
gpr.Unlock(bw_reg_a, bw_reg_b);
}
}
else
{
if ((opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound)
{
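A note on the magic number in the ASSERT above: 0x4E800020 is the PowerPC blr encoding (bclr 20, 0), the instruction the BLR following optimization elides, which is why a skipped op is still reported to BranchWatch as a taken branch here. Decomposed purely for illustration:

#include <cstdint>

constexpr std::uint32_t blr = 0x4E800020;
static_assert(blr >> 26 == 19);             // primary opcode 19: the bclr/bcctr group
static_assert(((blr >> 21) & 0x1F) == 20);  // BO = 20: branch always
static_assert(((blr >> 16) & 0x1F) == 0);   // BI = 0 (ignored when BO says "always")
static_assert(((blr >> 1) & 0x3FF) == 16);  // XO = 16: bclr
static_assert((blr & 1) == 0);              // LK = 0: the link register is not updated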


@@ -315,6 +315,16 @@ protected:
void MSRUpdated(u32 msr);
void MSRUpdated(Arm64Gen::ARM64Reg msr);
// Branch Watch
template <bool condition>
void WriteBranchWatch(u32 origin, u32 destination, UGeckoInstruction inst,
Arm64Gen::ARM64Reg reg_a, Arm64Gen::ARM64Reg reg_b,
BitSet32 gpr_caller_save, BitSet32 fpr_caller_save);
void WriteBranchWatchDestInRegister(u32 origin, Arm64Gen::ARM64Reg destination,
UGeckoInstruction inst, Arm64Gen::ARM64Reg reg_a,
Arm64Gen::ARM64Reg reg_b, BitSet32 gpr_caller_save,
BitSet32 fpr_caller_save);
// Exits
void
WriteExit(u32 destination, bool LK = false, u32 exit_address_after_return = 0,
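The template parameter selects between the branch-taken (Hit…True…) and branch-not-taken (Hit…False…) reporting paths, while WriteBranchWatchDestInRegister covers branches whose target is only known at run time. As a rough usage summary, the call sites later in this commit look like this (copied from the handlers below, so the surrounding locals are theirs, not new API):

// Taken branch whose destination is known at JIT time (b, bc, the skipped blr):
WriteBranchWatch<true>(js.compilerPC, js.op->branchTo, inst, WB, WC, gpr_caller_save,
                       fpr.GetCallerSavedUsed());

// Conditional branch that was not taken: record the fall-through to the next instruction.
WriteBranchWatch<false>(js.compilerPC, js.compilerPC + 4, inst, WA, WB, {}, {});

// Destination only known at run time (bcctrx/bclrx): pass the register holding it instead.
WriteBranchWatchDestInRegister(js.compilerPC, WA, inst, WC, WD, gpr_caller_save, {});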


@@ -8,6 +8,7 @@
#include "Core/Core.h"
#include "Core/CoreTiming.h"
#include "Core/Debugger/BranchWatch.h"
#include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
#include "Core/PowerPC/PPCTables.h"
#include "Core/PowerPC/PowerPC.h"
@@ -74,6 +75,70 @@ void JitArm64::rfi(UGeckoInstruction inst)
gpr.Unlock(WA);
}
template <bool condition>
void JitArm64::WriteBranchWatch(u32 origin, u32 destination, UGeckoInstruction inst, ARM64Reg reg_a,
ARM64Reg reg_b, BitSet32 gpr_caller_save, BitSet32 fpr_caller_save)
{
const ARM64Reg branch_watch = EncodeRegTo64(reg_a);
MOVP2R(branch_watch, &m_branch_watch);
LDRB(IndexType::Unsigned, reg_b, branch_watch, Core::BranchWatch::GetOffsetOfRecordingActive());
FixupBranch branch_over = CBZ(reg_b);
FixupBranch branch_in = B();
SwitchToFarCode();
SetJumpTarget(branch_in);
const ARM64Reg float_emit_tmp = EncodeRegTo64(reg_b);
ABI_PushRegisters(gpr_caller_save);
m_float_emit.ABI_PushRegisters(fpr_caller_save, float_emit_tmp);
ABI_CallFunction(m_ppc_state.msr.IR ? (condition ? &Core::BranchWatch::HitVirtualTrue_fk :
&Core::BranchWatch::HitVirtualFalse_fk) :
(condition ? &Core::BranchWatch::HitPhysicalTrue_fk :
&Core::BranchWatch::HitPhysicalFalse_fk),
branch_watch, Core::FakeBranchWatchCollectionKey{origin, destination}, inst.hex);
m_float_emit.ABI_PopRegisters(fpr_caller_save, float_emit_tmp);
ABI_PopRegisters(gpr_caller_save);
FixupBranch branch_out = B();
SwitchToNearCode();
SetJumpTarget(branch_out);
SetJumpTarget(branch_over);
}
template void JitArm64::WriteBranchWatch<true>(u32, u32, UGeckoInstruction, ARM64Reg, ARM64Reg,
BitSet32, BitSet32);
template void JitArm64::WriteBranchWatch<false>(u32, u32, UGeckoInstruction, ARM64Reg, ARM64Reg,
BitSet32, BitSet32);
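The explicit instantiations above are needed because the template body is defined in this .cpp rather than in the header: DoJit in Jit.cpp calls the <true> specialization from another translation unit, and instantiating both keeps the helper usable from any file that sees the declaration. A minimal standalone illustration of the idiom, with made-up names:

// header: declared for any B, but not defined here
template <bool B>
void EmitThing(int x);

// source file: definition plus explicit instantiations of the specializations
// other translation units are allowed to link against
template <bool B>
void EmitThing(int x)
{
  // ... B-dependent code generation would go here ...
  (void)x;
}
template void EmitThing<true>(int);
template void EmitThing<false>(int);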
void JitArm64::WriteBranchWatchDestInRegister(u32 origin, ARM64Reg destination,
UGeckoInstruction inst, ARM64Reg reg_a,
ARM64Reg reg_b, BitSet32 gpr_caller_save,
BitSet32 fpr_caller_save)
{
const ARM64Reg branch_watch = EncodeRegTo64(reg_a);
MOVP2R(branch_watch, &m_branch_watch);
LDRB(IndexType::Unsigned, reg_b, branch_watch, Core::BranchWatch::GetOffsetOfRecordingActive());
FixupBranch branch_over = CBZ(reg_b);
FixupBranch branch_in = B();
SwitchToFarCode();
SetJumpTarget(branch_in);
const ARM64Reg float_emit_tmp = EncodeRegTo64(reg_b);
ABI_PushRegisters(gpr_caller_save);
m_float_emit.ABI_PushRegisters(fpr_caller_save, float_emit_tmp);
ABI_CallFunction(m_ppc_state.msr.IR ? &Core::BranchWatch::HitVirtualTrue :
&Core::BranchWatch::HitPhysicalTrue,
branch_watch, origin, destination, inst.hex);
m_float_emit.ABI_PopRegisters(fpr_caller_save, float_emit_tmp);
ABI_PopRegisters(gpr_caller_save);
FixupBranch branch_out = B();
SwitchToNearCode();
SetJumpTarget(branch_out);
SetJumpTarget(branch_over);
}
void JitArm64::bx(UGeckoInstruction inst)
{
INSTRUCTION_START
@@ -89,6 +154,16 @@ void JitArm64::bx(UGeckoInstruction inst)
if (!js.isLastInstruction)
{
if (IsDebuggingEnabled())
{
const ARM64Reg WB = gpr.GetReg(), WC = gpr.GetReg();
BitSet32 gpr_caller_save = gpr.GetCallerSavedUsed() & ~BitSet32{DecodeReg(WB), DecodeReg(WC)};
if (WA != ARM64Reg::INVALID_REG && js.op->skipLRStack)
gpr_caller_save[DecodeReg(WA)] = false;
WriteBranchWatch<true>(js.compilerPC, js.op->branchTo, inst, WB, WC, gpr_caller_save,
fpr.GetCallerSavedUsed());
gpr.Unlock(WB, WC);
}
if (inst.LK && !js.op->skipLRStack)
{
// We have to fake the stack as the RET instruction was not
@@ -108,22 +183,37 @@ void JitArm64::bx(UGeckoInstruction inst)
if (js.op->branchIsIdleLoop)
{
if (WA != ARM64Reg::INVALID_REG)
gpr.Unlock(WA);
if (WA == ARM64Reg::INVALID_REG)
WA = gpr.GetReg();
if (IsDebuggingEnabled())
{
const ARM64Reg WB = gpr.GetReg();
WriteBranchWatch<true>(js.compilerPC, js.op->branchTo, inst, WA, WB, {}, {});
gpr.Unlock(WB);
}
// make idle loops go faster
ARM64Reg WB = gpr.GetReg();
ARM64Reg XB = EncodeRegTo64(WB);
ARM64Reg XA = EncodeRegTo64(WA);
MOVP2R(XB, &CoreTiming::GlobalIdle);
BLR(XB);
gpr.Unlock(WB);
MOVP2R(XA, &CoreTiming::GlobalIdle);
BLR(XA);
gpr.Unlock(WA);
WriteExceptionExit(js.op->branchTo);
return;
}
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4, inst.LK ? WA : ARM64Reg::INVALID_REG);
if (IsDebuggingEnabled())
{
const ARM64Reg WB = gpr.GetReg(), WC = gpr.GetReg();
const BitSet32 gpr_caller_save =
WA != ARM64Reg::INVALID_REG ? BitSet32{DecodeReg(WA)} & CALLER_SAVED_GPRS : BitSet32{};
WriteBranchWatch<true>(js.compilerPC, js.op->branchTo, inst, WB, WC, gpr_caller_save, {});
gpr.Unlock(WB, WC);
}
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4, WA);
if (WA != ARM64Reg::INVALID_REG)
gpr.Unlock(WA);
}
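Two things to read together in the bx hunk above: the branch-watch record is emitted before WriteExit (which ends the block), and WA is folded into the caller-save mask when it is valid because WriteExit still needs its value after the out-of-line call. The WriteExit change itself is equivalent by construction, since in bx WA is only ever allocated when inst.LK is set:

// before
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4, inst.LK ? WA : ARM64Reg::INVALID_REG);
// after: WA is already ARM64Reg::INVALID_REG whenever LK is clear
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4, WA);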
@@ -134,7 +224,9 @@ void JitArm64::bcx(UGeckoInstruction inst)
JITDISABLE(bJITBranchOff);
ARM64Reg WA = gpr.GetReg();
ARM64Reg WB = inst.LK ? gpr.GetReg() : WA;
ARM64Reg WB = inst.LK || IsDebuggingEnabled() ? gpr.GetReg() : WA;
ARM64Reg WC = IsDebuggingEnabled() && inst.LK && !js.op->branchIsIdleLoop ? gpr.GetReg() :
ARM64Reg::INVALID_REG;
FixupBranch pCTRDontBranch;
if ((inst.BO & BO_DONT_DECREMENT_FLAG) == 0) // Decrement and test CTR
@@ -166,6 +258,19 @@ void JitArm64::bcx(UGeckoInstruction inst)
gpr.Flush(FlushMode::MaintainState, WB);
fpr.Flush(FlushMode::MaintainState, ARM64Reg::INVALID_REG);
if (IsDebuggingEnabled())
{
ARM64Reg bw_reg_a, bw_reg_b;
// WC is only allocated when WA is needed for WriteExit and cannot be clobbered.
if (WC == ARM64Reg::INVALID_REG)
bw_reg_a = WA, bw_reg_b = WB;
else
bw_reg_a = WB, bw_reg_b = WC;
const BitSet32 gpr_caller_save =
gpr.GetCallerSavedUsed() & ~BitSet32{DecodeReg(bw_reg_a), DecodeReg(bw_reg_b)};
WriteBranchWatch<true>(js.compilerPC, js.op->branchTo, inst, bw_reg_a, bw_reg_b,
gpr_caller_save, fpr.GetCallerSavedUsed());
}
if (js.op->branchIsIdleLoop)
{
// make idle loops go faster
@@ -178,7 +283,7 @@ void JitArm64::bcx(UGeckoInstruction inst)
}
else
{
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4, inst.LK ? WA : ARM64Reg::INVALID_REG);
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4, WA);
}
if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0)
@@ -186,12 +291,26 @@ void JitArm64::bcx(UGeckoInstruction inst)
if ((inst.BO & BO_DONT_DECREMENT_FLAG) == 0)
SetJumpTarget(pCTRDontBranch);
if (WC != ARM64Reg::INVALID_REG)
gpr.Unlock(WC);
if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE))
{
gpr.Flush(FlushMode::All, WA);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
if (IsDebuggingEnabled())
{
WriteBranchWatch<false>(js.compilerPC, js.compilerPC + 4, inst, WA, WB, {}, {});
}
WriteExit(js.compilerPC + 4);
}
else if (IsDebuggingEnabled())
{
const BitSet32 gpr_caller_save =
gpr.GetCallerSavedUsed() & ~BitSet32{DecodeReg(WA), DecodeReg(WB)};
WriteBranchWatch<false>(js.compilerPC, js.compilerPC + 4, inst, WA, WB, gpr_caller_save,
fpr.GetCallerSavedUsed());
}
gpr.Unlock(WA);
if (WB != WA)
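In the fall-through instrumentation above, the full-flush path passes empty BitSet32s because the register caches were flushed on the preceding lines, so no caller-saved host register still holds guest state; the conditional-continue path instead passes the live set minus the two scratch registers handed to the helper. The mask arithmetic is ordinary bit twiddling; a self-contained stand-in (plain integers rather than Dolphin's BitSet32, hypothetical register numbers):

#include <cstdint>

int main()
{
  // Pretend the register cache reports W8, W9 and W10 as live caller-saved
  // registers, and the scratch pair handed to WriteBranchWatch is W10 and W11.
  const std::uint32_t caller_saved_used = (1u << 8) | (1u << 9) | (1u << 10);
  const std::uint32_t scratch = (1u << 10) | (1u << 11);
  // Mirrors gpr.GetCallerSavedUsed() & ~BitSet32{DecodeReg(WA), DecodeReg(WB)}:
  const std::uint32_t to_preserve = caller_saved_used & ~scratch;
  return to_preserve == ((1u << 8) | (1u << 9)) ? 0 : 1;  // only W8 and W9 get pushed
}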
@@ -231,7 +350,17 @@ void JitArm64::bcctrx(UGeckoInstruction inst)
LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF_SPR(SPR_CTR));
AND(WA, WA, LogicalImm(~0x3, GPRSize::B32));
WriteExit(WA, inst.LK_3, js.compilerPC + 4, inst.LK_3 ? WB : ARM64Reg::INVALID_REG);
if (IsDebuggingEnabled())
{
const ARM64Reg WC = gpr.GetReg(), WD = gpr.GetReg();
BitSet32 gpr_caller_save = BitSet32{DecodeReg(WA)};
if (WB != ARM64Reg::INVALID_REG)
gpr_caller_save[DecodeReg(WB)] = true;
gpr_caller_save &= CALLER_SAVED_GPRS;
WriteBranchWatchDestInRegister(js.compilerPC, WA, inst, WC, WD, gpr_caller_save, {});
gpr.Unlock(WC, WD);
}
WriteExit(WA, inst.LK_3, js.compilerPC + 4, WB);
if (WB != ARM64Reg::INVALID_REG)
gpr.Unlock(WB);
@@ -247,7 +376,9 @@ void JitArm64::bclrx(UGeckoInstruction inst)
(inst.BO & BO_DONT_DECREMENT_FLAG) == 0 || (inst.BO & BO_DONT_CHECK_CONDITION) == 0;
ARM64Reg WA = gpr.GetReg();
ARM64Reg WB = conditional || inst.LK ? gpr.GetReg() : ARM64Reg::INVALID_REG;
ARM64Reg WB =
conditional || inst.LK || IsDebuggingEnabled() ? gpr.GetReg() : ARM64Reg::INVALID_REG;
ARM64Reg WC = IsDebuggingEnabled() ? gpr.GetReg() : ARM64Reg::INVALID_REG;
FixupBranch pCTRDontBranch;
if ((inst.BO & BO_DONT_DECREMENT_FLAG) == 0) // Decrement and test CTR
@@ -281,6 +412,26 @@ void JitArm64::bclrx(UGeckoInstruction inst)
gpr.Flush(conditional ? FlushMode::MaintainState : FlushMode::All, WB);
fpr.Flush(conditional ? FlushMode::MaintainState : FlushMode::All, ARM64Reg::INVALID_REG);
if (IsDebuggingEnabled())
{
BitSet32 gpr_caller_save;
BitSet32 fpr_caller_save;
if (conditional)
{
gpr_caller_save = gpr.GetCallerSavedUsed() & ~BitSet32{DecodeReg(WB), DecodeReg(WC)};
if (js.op->branchIsIdleLoop)
gpr_caller_save[DecodeReg(WA)] = false;
fpr_caller_save = fpr.GetCallerSavedUsed();
}
else
{
gpr_caller_save =
js.op->branchIsIdleLoop ? BitSet32{} : BitSet32{DecodeReg(WA)} & CALLER_SAVED_GPRS;
fpr_caller_save = {};
}
WriteBranchWatchDestInRegister(js.compilerPC, WA, inst, WB, WC, gpr_caller_save,
fpr_caller_save);
}
if (js.op->branchIsIdleLoop)
{
// make idle loops go faster
@@ -301,12 +452,26 @@ void JitArm64::bclrx(UGeckoInstruction inst)
if ((inst.BO & BO_DONT_DECREMENT_FLAG) == 0)
SetJumpTarget(pCTRDontBranch);
if (WC != ARM64Reg::INVALID_REG)
gpr.Unlock(WC);
if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE))
{
gpr.Flush(FlushMode::All, WA);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
if (IsDebuggingEnabled())
{
WriteBranchWatch<false>(js.compilerPC, js.compilerPC + 4, inst, WA, WB, {}, {});
}
WriteExit(js.compilerPC + 4);
}
else if (IsDebuggingEnabled())
{
const BitSet32 gpr_caller_save =
gpr.GetCallerSavedUsed() & ~BitSet32{DecodeReg(WA), DecodeReg(WB)};
WriteBranchWatch<false>(js.compilerPC, js.compilerPC + 4, inst, WA, WB, gpr_caller_save,
fpr.GetCallerSavedUsed());
}
gpr.Unlock(WA);
if (WB != ARM64Reg::INVALID_REG)


@@ -13,6 +13,7 @@
#include "Core/ConfigManager.h"
#include "Core/Core.h"
#include "Core/CoreTiming.h"
#include "Core/Debugger/BranchWatch.h"
#include "Core/HW/DSP.h"
#include "Core/HW/MMIO.h"
#include "Core/HW/Memmap.h"
@@ -769,18 +770,15 @@ void JitArm64::dcbx(UGeckoInstruction inst)
js.op[1].inst.RA_6 == b && js.op[1].inst.RD_2 == b &&
js.op[2].inst.hex == 0x4200fff8;
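make_loop recognizes the dcbx / index-update / bdnz-back idiom, and the final check above compares the third op against the raw encoding 0x4200FFF8, i.e. a bdnz that branches two instructions back to the dcbx. Decomposed as a quick compile-time check, for illustration:

#include <cstdint>

constexpr std::uint32_t bdnz = 0x4200FFF8;
static_assert(bdnz >> 26 == 16);             // primary opcode 16: bc
static_assert(((bdnz >> 21) & 0x1F) == 16);  // BO = 16: decrement CTR, branch if CTR != 0
static_assert(((bdnz >> 16) & 0x1F) == 0);   // BI = 0 (unused for this BO)
static_assert((bdnz & 0xFFFC) == 0xFFF8);    // BD = -8: back over the update op to the dcbx
static_assert((bdnz & 0x3) == 0);            // AA = 0, LK = 0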
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1);
if (make_loop)
gpr.Lock(ARM64Reg::W2);
constexpr ARM64Reg WA = ARM64Reg::W0, WB = ARM64Reg::W1, loop_counter = ARM64Reg::W2;
// Be careful, loop_counter is only locked when make_loop == true.
gpr.Lock(WA, WB);
ARM64Reg WA = ARM64Reg::W0;
if (make_loop)
gpr.BindToRegister(b, true);
ARM64Reg loop_counter = ARM64Reg::INVALID_REG;
if (make_loop)
{
gpr.Lock(loop_counter);
gpr.BindToRegister(b, true);
// We'll execute somewhere between one single cacheline invalidation and however many are needed
// to reduce the downcount to zero, never exceeding the amount requested by the game.
// To stay consistent with the rest of the code we adjust the involved registers (CTR and Rb)
@@ -788,10 +786,8 @@ void JitArm64::dcbx(UGeckoInstruction inst)
// bdnz afterwards! So if we invalidate a single cache line, we don't adjust the registers at
// all, if we invalidate 2 cachelines we adjust the registers by one step, and so on.
ARM64Reg reg_cycle_count = gpr.GetReg();
ARM64Reg reg_downcount = gpr.GetReg();
loop_counter = ARM64Reg::W2;
ARM64Reg WB = ARM64Reg::W1;
const ARM64Reg reg_cycle_count = gpr.GetReg();
const ARM64Reg reg_downcount = gpr.GetReg();
// Figure out how many loops we want to do.
const u8 cycle_count_per_loop =
@@ -828,11 +824,43 @@ void JitArm64::dcbx(UGeckoInstruction inst)
// Load the loop_counter register with the amount of invalidations to execute.
ADD(loop_counter, WA, 1);
if (IsDebuggingEnabled())
{
const ARM64Reg branch_watch = EncodeRegTo64(reg_cycle_count);
MOVP2R(branch_watch, &m_branch_watch);
LDRB(IndexType::Unsigned, WB, branch_watch, Core::BranchWatch::GetOffsetOfRecordingActive());
FixupBranch branch_over = CBZ(WB);
FixupBranch branch_in = B();
SwitchToFarCode();
SetJumpTarget(branch_in);
const BitSet32 gpr_caller_save =
gpr.GetCallerSavedUsed() &
~BitSet32{DecodeReg(WB), DecodeReg(reg_cycle_count), DecodeReg(reg_downcount)};
ABI_PushRegisters(gpr_caller_save);
const ARM64Reg float_emit_tmp = EncodeRegTo64(WB);
const BitSet32 fpr_caller_save = fpr.GetCallerSavedUsed();
m_float_emit.ABI_PushRegisters(fpr_caller_save, float_emit_tmp);
const PPCAnalyst::CodeOp& op = js.op[2];
ABI_CallFunction(m_ppc_state.msr.IR ? &Core::BranchWatch::HitVirtualTrue_fk_n :
&Core::BranchWatch::HitPhysicalTrue_fk_n,
branch_watch, Core::FakeBranchWatchCollectionKey{op.address, op.branchTo},
op.inst.hex, WA);
m_float_emit.ABI_PopRegisters(fpr_caller_save, float_emit_tmp);
ABI_PopRegisters(gpr_caller_save);
FixupBranch branch_out = B();
SwitchToNearCode();
SetJumpTarget(branch_out);
SetJumpTarget(branch_over);
}
gpr.Unlock(reg_cycle_count, reg_downcount);
}
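Because make_loop folds many guest iterations of that dcbx/update/bdnz loop into one pass, the loop's bdnz (js.op[2]) would otherwise go unrecorded. The block above open-codes the same guarded far-call skeleton as WriteBranchWatch, using reg_cycle_count (as the BranchWatch pointer) and WB as scratch, but calls the *_fk_n entry point and forwards WA, presumably so that one call can cover the whole folded pass instead of one call per cacheline. The count it forwards follows directly from the ADD just above (illustrative arithmetic only):

#include <cstdint>

// ADD(loop_counter, WA, 1) means loop_counter = WA + 1, so when this pass
// invalidates loop_counter cachelines, the value handed to the "_fk_n" hit
// function is:
constexpr std::uint32_t ForwardedCount(std::uint32_t loop_counter)
{
  return loop_counter - 1;
}
static_assert(ForwardedCount(1) == 0);  // a single invalidation forwards 0
static_assert(ForwardedCount(8) == 7);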
ARM64Reg effective_addr = ARM64Reg::W1;
ARM64Reg physical_addr = gpr.GetReg();
constexpr ARM64Reg effective_addr = WB;
const ARM64Reg physical_addr = gpr.GetReg();
if (a)
ADD(effective_addr, gpr.R(a), gpr.R(b));
@@ -911,7 +939,7 @@ void JitArm64::dcbx(UGeckoInstruction inst)
SwitchToNearCode();
SetJumpTarget(near_addr);
gpr.Unlock(effective_addr, physical_addr, WA);
gpr.Unlock(WA, WB, physical_addr);
if (make_loop)
gpr.Unlock(loop_counter);
}