CPU/Recompiler: Use register cache for managing pc

Reduces the number of loads/stores emitted after each instruction.
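
In rough terms, the win looks like this (a minimal sketch with stand-in types, not the actual RegisterCache/emitter code):

    // Each compiled instruction's address (cbi.pc) is known at compile time,
    // so once pc lives in the register cache it can be tracked as a constant
    // and materialized with a single store at block exits, branches, or
    // potentially-faulting instructions, instead of a load+store per step.
    #include <cstdint>

    struct Regs { uint32_t pc; };

    // Old scheme: one memory round-trip per emulated instruction.
    inline void AdvancePcOld(Regs& r) { r.pc += 4; }

    // New scheme (sketch): pc values inside the block are compile-time
    // constants; only the final value is written back.
    inline void FlushPc(Regs& r, uint32_t block_start_pc, uint32_t n_instrs) {
      r.pc = block_start_pc + n_instrs * 4;
    }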
Connor McLaughlin 2019-12-12 23:34:53 +10:00
parent 002d1cd4fd
commit aa52dbfeb8
7 changed files with 88 additions and 105 deletions

View File

@@ -120,6 +120,9 @@ void CodeCache::Execute()
}
}
}
// in case we switch to interpreter...
m_core->m_regs.npc = m_core->m_regs.pc;
}
void CodeCache::SetUseRecompiler(bool enable)
@@ -148,17 +151,17 @@ void CodeCache::Flush()
void CodeCache::LogCurrentState()
{
const auto& regs = m_core->m_regs;
WriteToExecutionLog(
"tick=%u pc=%08X npc=%08X zero=%08X at=%08X v0=%08X v1=%08X a0=%08X a1=%08X a2=%08X a3=%08X t0=%08X "
"t1=%08X t2=%08X t3=%08X t4=%08X t5=%08X t6=%08X t7=%08X s0=%08X s1=%08X s2=%08X s3=%08X s4=%08X "
"s5=%08X s6=%08X s7=%08X t8=%08X t9=%08X k0=%08X k1=%08X gp=%08X sp=%08X fp=%08X ra=%08X npc=%08X ldr=%s "
"ldv=%08X\n",
m_system->GetGlobalTickCounter() + m_core->GetPendingTicks(), regs.pc, regs.npc, regs.zero, regs.at, regs.v0,
regs.v1, regs.a0, regs.a1, regs.a2, regs.a3, regs.t0, regs.t1, regs.t2, regs.t3, regs.t4, regs.t5, regs.t6, regs.t7,
regs.s0, regs.s1, regs.s2, regs.s3, regs.s4, regs.s5, regs.s6, regs.s7, regs.t8, regs.t9, regs.k0, regs.k1, regs.gp,
regs.sp, regs.fp, regs.ra, regs.npc,
(m_core->m_next_load_delay_reg == Reg::count) ? "NONE" : GetRegName(m_core->m_next_load_delay_reg),
(m_core->m_next_load_delay_reg == Reg::count) ? 0 : m_core->m_next_load_delay_value);
WriteToExecutionLog("tick=%u pc=%08X zero=%08X at=%08X v0=%08X v1=%08X a0=%08X a1=%08X a2=%08X a3=%08X t0=%08X "
"t1=%08X t2=%08X t3=%08X t4=%08X t5=%08X t6=%08X t7=%08X s0=%08X s1=%08X s2=%08X s3=%08X s4=%08X "
"s5=%08X s6=%08X s7=%08X t8=%08X t9=%08X k0=%08X k1=%08X gp=%08X sp=%08X fp=%08X ra=%08X ldr=%s "
"ldv=%08X\n",
m_system->GetGlobalTickCounter() + m_core->GetPendingTicks(), regs.pc, regs.zero, regs.at,
regs.v0, regs.v1, regs.a0, regs.a1, regs.a2, regs.a3, regs.t0, regs.t1, regs.t2, regs.t3, regs.t4,
regs.t5, regs.t6, regs.t7, regs.s0, regs.s1, regs.s2, regs.s3, regs.s4, regs.s5, regs.s6, regs.s7,
regs.t8, regs.t9, regs.k0, regs.k1, regs.gp, regs.sp, regs.fp, regs.ra,
(m_core->m_next_load_delay_reg == Reg::count) ? "NONE" :
GetRegName(m_core->m_next_load_delay_reg),
(m_core->m_next_load_delay_reg == Reg::count) ? 0 : m_core->m_next_load_delay_value);
}
CodeBlockKey CodeCache::GetNextBlockKey() const
@@ -202,7 +205,8 @@ bool CodeCache::RevalidateBlock(CodeBlock* block)
for (const CodeBlockInstruction& cbi : block->instructions)
{
u32 new_code = 0;
m_bus->DispatchAccess<MemoryAccessType::Read, MemoryAccessSize::Word>(cbi.pc, new_code);
m_bus->DispatchAccess<MemoryAccessType::Read, MemoryAccessSize::Word>(cbi.pc & PHYSICAL_MEMORY_ADDRESS_MASK,
new_code);
if (cbi.instruction.bits != new_code)
{
Log_DebugPrintf("Block 0x%08X changed at PC 0x%08X - %08X to %08X - recompiling.", block->GetPC(), cbi.pc,
@@ -419,7 +423,9 @@ void CodeCache::UnlinkBlock(CodeBlock* block)
void CodeCache::InterpretCachedBlock(const CodeBlock& block)
{
// set up the state so we've already fetched the instruction
DebugAssert((m_core->m_regs.pc & PHYSICAL_MEMORY_ADDRESS_MASK) == block.GetPC());
DebugAssert(m_core->m_regs.pc == block.GetPC());
m_core->m_regs.npc = block.GetPC() + 4;
for (const CodeBlockInstruction& cbi : block.instructions)
{
@@ -427,14 +433,13 @@ void CodeCache::InterpretCachedBlock(const CodeBlock& block)
// now executing the instruction we previously fetched
m_core->m_current_instruction.bits = cbi.instruction.bits;
m_core->m_current_instruction_pc = m_core->m_regs.pc;
m_core->m_current_instruction_pc = cbi.pc;
m_core->m_current_instruction_in_branch_delay_slot = cbi.is_branch_delay_slot;
m_core->m_current_instruction_was_branch_taken = m_core->m_branch_was_taken;
m_core->m_branch_was_taken = false;
m_core->m_exception_raised = false;
// update pc
DebugAssert((m_core->m_regs.pc & PHYSICAL_MEMORY_ADDRESS_MASK) == cbi.pc);
m_core->m_regs.pc = m_core->m_regs.npc;
m_core->m_regs.npc += 4;
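
For reference, the loop above reduces to this model (stand-in types; the execute call is elided). pc always points past the current instruction while it runs, so a branch rewrites npc with pc already at its delay slot, and an exception observes a consistent pc/npc pair:

    #include <cstdint>
    #include <vector>

    struct Inst { uint32_t pc; };
    struct CoreModel { uint32_t pc, npc, current_instruction_pc; };

    void InterpretBlockModel(CoreModel& core, const std::vector<Inst>& block) {
      core.npc = block.front().pc + 4;        // block entry, as in the diff
      for (const Inst& i : block) {
        core.current_instruction_pc = i.pc;   // instruction being executed
        core.pc = core.npc;                   // advance past the fetch...
        core.npc += 4;                        // ...before the body runs
        // ExecuteInstruction(core, i);       // may overwrite npc (branches)
      }
    }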
@@ -454,6 +459,8 @@ void CodeCache::InterpretCachedBlock(const CodeBlock& block)
void CodeCache::InterpretUncachedBlock()
{
Panic("Fixme with regards to re-fetching PC");
// At this point, pc contains the last address executed (in the previous block). The instruction has not been fetched
// yet. pc shouldn't be updated until the fetch occurs, that way the exception occurs in the delay slot.
bool in_branch_delay_slot = false;

View File

@@ -28,6 +28,8 @@ union CodeBlockKey
ALWAYS_INLINE u32 GetPC() const { return aligned_pc << 2; }
ALWAYS_INLINE void SetPC(u32 pc) { aligned_pc = pc >> 2; }
ALWAYS_INLINE u32 GetPCPhysicalAddress() const { return (aligned_pc << 2) & PHYSICAL_MEMORY_ADDRESS_MASK; }
ALWAYS_INLINE CodeBlockKey& operator=(const CodeBlockKey& rhs)
{
bits = rhs.bits;
@@ -72,12 +74,15 @@ struct CodeBlock
const u32 GetPC() const { return key.GetPC(); }
const u32 GetSizeInBytes() const { return static_cast<u32>(instructions.size()) * sizeof(Instruction); }
const u32 GetStartPageIndex() const { return (key.GetPC() / CPU_CODE_CACHE_PAGE_SIZE); }
const u32 GetEndPageIndex() const { return ((key.GetPC() + GetSizeInBytes()) / CPU_CODE_CACHE_PAGE_SIZE); }
const u32 GetStartPageIndex() const { return (key.GetPCPhysicalAddress() / CPU_CODE_CACHE_PAGE_SIZE); }
const u32 GetEndPageIndex() const
{
return ((key.GetPCPhysicalAddress() + GetSizeInBytes()) / CPU_CODE_CACHE_PAGE_SIZE);
}
bool IsInRAM() const
{
// TODO: Constant
return key.GetPC() < 0x200000;
return key.GetPCPhysicalAddress() < 0x200000;
}
};
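
The switch to GetPCPhysicalAddress() matters because the PS1's KUSEG, KSEG0 and KSEG1 segments alias the same physical RAM, so page indices must come from the masked address. A small sketch, assuming PHYSICAL_MEMORY_ADDRESS_MASK is 0x1FFFFFFF:

    #include <cassert>
    #include <cstdint>

    constexpr uint32_t kPhysMask = 0x1FFFFFFFu;  // assumed mask value

    constexpr uint32_t ToPhysical(uint32_t vaddr) { return vaddr & kPhysMask; }

    int main() {
      // The same code at its KSEG0 and KSEG1 aliases lands on one cache page.
      assert(ToPhysical(0x80001000u) == ToPhysical(0xA0001000u));
      assert(ToPhysical(0x00001000u) == 0x00001000u);  // KUSEG low RAM
    }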

View File

@@ -50,6 +50,8 @@ bool CodeGenerator::CompileBlock(const CodeBlock* block, CodeBlock::HostCodePoin
EmitEndBlock();
FinalizeBlock(out_host_code, out_host_code_size);
Log_ProfilePrintf("JIT block 0x%08X: %zu instructions (%u bytes), %u host bytes", block->GetPC(),
block->instructions.size(), block->GetSizeInBytes(), *out_host_code_size);
DebugAssert(m_register_cache.GetUsedHostRegisters() == 0);
@@ -709,14 +711,6 @@ void CodeGenerator::BlockPrologue()
m_branch_was_taken_dirty = true;
m_current_instruction_was_branch_taken_dirty = false;
m_load_delay_dirty = true;
// sync m_current_instruction_pc so we can simply add to it
SyncCurrentInstructionPC();
// and the same for m_regs.pc
SyncPC();
EmitAddCPUStructField(offsetof(Core, m_regs.npc), Value::FromConstantU32(4));
}
void CodeGenerator::BlockEpilogue()
@@ -729,17 +723,7 @@ void CodeGenerator::BlockEpilogue()
if (m_register_cache.HasLoadDelay())
m_register_cache.WriteLoadDelayToCPU(true);
// if the last instruction wasn't a fallback, we need to add its fetch
if (m_delayed_pc_add > 0)
{
EmitAddCPUStructField(offsetof(Core, m_regs.npc), Value::FromConstantU32(m_delayed_pc_add));
m_delayed_pc_add = 0;
}
AddPendingCycles();
// TODO: correct value for is_branch_delay_slot - branches in branch delay slot.
EmitStoreCPUStructField(offsetof(Core, m_next_instruction_is_branch_delay_slot), Value::FromConstantU8(0));
}
void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCount cycles,
@@ -771,42 +755,29 @@ void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCou
m_current_instruction_in_branch_delay_slot_dirty = false;
}
if (cbi.is_branch_delay_slot)
// increment PC, except if we're in the branch delay slot where it was just changed
if (!cbi.is_branch_delay_slot)
{
// m_regs.pc should be synced for the next block, as the branch wrote to npc
SyncCurrentInstructionPC();
SyncPC();
// m_current_instruction_in_branch_delay_slot = true
EmitStoreCPUStructField(offsetof(Core, m_current_instruction_in_branch_delay_slot), Value::FromConstantU8(1));
m_current_instruction_in_branch_delay_slot_dirty = true;
Assert(!m_register_cache.IsGuestRegisterInHostRegister(Reg::pc));
m_register_cache.WriteGuestRegister(Reg::pc, Value::FromConstantU32(cbi.pc + 4));
}
if (!CanInstructionTrap(cbi.instruction, m_block->key.user_mode) && !force_sync)
{
// Defer updates for non-faulting instructions.
m_delayed_pc_add += INSTRUCTION_SIZE;
m_delayed_cycles_add += cycles;
return;
}
if (m_delayed_pc_add > 0)
if (cbi.is_branch_delay_slot)
{
// m_current_instruction_pc += m_delayed_pc_add
EmitAddCPUStructField(offsetof(Core, m_current_instruction_pc), Value::FromConstantU32(m_delayed_pc_add));
// m_regs.pc += m_delayed_pc_add
EmitAddCPUStructField(offsetof(Core, m_regs.pc), Value::FromConstantU32(m_delayed_pc_add));
// m_regs.npc += m_delayed_pc_add
// TODO: This can go once we recompile branch instructions and unconditionally set npc
EmitAddCPUStructField(offsetof(Core, m_regs.npc), Value::FromConstantU32(m_delayed_pc_add));
m_delayed_pc_add = 0;
// m_current_instruction_in_branch_delay_slot = true
EmitStoreCPUStructField(offsetof(Core, m_current_instruction_in_branch_delay_slot), Value::FromConstantU8(1));
m_current_instruction_in_branch_delay_slot_dirty = true;
}
if (!cbi.is_branch_instruction)
m_delayed_pc_add = INSTRUCTION_SIZE;
// Sync current instruction PC
EmitStoreCPUStructField(offsetof(Core, m_current_instruction_pc), Value::FromConstantU32(cbi.pc));
m_delayed_cycles_add += cycles;
AddPendingCycles();
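
The deleted m_delayed_pc_add accumulator is redundant here because the register cache batches the same way for free: writing a constant pc emits no host code until the value is flushed. A sketch of that behaviour (a stand-in for the real RegisterCache/Value machinery):

    #include <cstdint>
    #include <optional>

    struct PcCacheModel {
      std::optional<uint32_t> constant;  // pc known at compile time

      void Write(uint32_t pc) { constant = pc; }  // no host code emitted
      void Flush(uint32_t& cpu_pc) {              // one store, on demand
        if (constant) { cpu_pc = *constant; constant.reset(); }
      }
    };
    // Compiling N non-faulting instructions:
    //   cache.Write(pc0 + 4); cache.Write(pc0 + 8); ...; cache.Flush(cpu.pc);
    // emits exactly one pc store, just like the old m_delayed_pc_add path.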
@@ -835,22 +806,6 @@ void CodeGenerator::InstructionEpilogue(const CodeBlockInstruction& cbi)
}
}
void CodeGenerator::SyncCurrentInstructionPC()
{
// m_current_instruction_pc = m_regs.pc
Value pc_value = m_register_cache.AllocateScratch(RegSize_32);
EmitLoadCPUStructField(pc_value.host_reg, RegSize_32, offsetof(Core, m_regs.pc));
EmitStoreCPUStructField(offsetof(Core, m_current_instruction_pc), pc_value);
}
void CodeGenerator::SyncPC()
{
// m_regs.pc = m_regs.npc
Value npc_value = m_register_cache.AllocateScratch(RegSize_32);
EmitLoadCPUStructField(npc_value.host_reg, RegSize_32, offsetof(Core, m_regs.npc));
EmitStoreCPUStructField(offsetof(Core, m_regs.pc), npc_value);
}
void CodeGenerator::AddPendingCycles()
{
if (m_delayed_cycles_add == 0)
@@ -1246,8 +1201,7 @@ bool CodeGenerator::Compile_SetLess(const CodeBlockInstruction& cbi)
bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
{
// Force sync since branches are PC-relative.
InstructionPrologue(cbi, 1, true);
InstructionPrologue(cbi, 1);
// Compute the branch target.
// This depends on the form of the instruction.
@@ -1258,7 +1212,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
{
// npc = (pc & 0xF0000000) | (target << 2)
Value branch_target =
OrValues(AndValues(m_register_cache.ReadGuestRegister(Reg::pc, false), Value::FromConstantU32(0xF0000000)),
OrValues(AndValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(0xF0000000)),
Value::FromConstantU32(cbi.instruction.j.target << 2));
EmitBranch(Condition::Always, (cbi.instruction.op == InstructionOp::jal) ? Reg::ra : Reg::count,
@@ -1294,7 +1248,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
case InstructionOp::bne:
{
// npc = pc + (sext(imm) << 2)
Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc, false),
Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc),
Value::FromConstantU32(cbi.instruction.i.imm_sext32() << 2), false);
// branch <- rs op rt
@@ -1311,7 +1265,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
case InstructionOp::blez:
{
// npc = pc + (sext(imm) << 2)
Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc, false),
Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc),
Value::FromConstantU32(cbi.instruction.i.imm_sext32() << 2), false);
// branch <- rs op 0
@@ -1327,7 +1281,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
case InstructionOp::b:
{
// npc = pc + (sext(imm) << 2)
Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc, false),
Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc),
Value::FromConstantU32(cbi.instruction.i.imm_sext32() << 2), false);
const u8 rt = static_cast<u8>(cbi.instruction.i.rt.GetValue());
@@ -1344,7 +1298,8 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
if (link)
{
EmitCancelInterpreterLoadDelayForReg(Reg::ra);
m_register_cache.WriteGuestRegister(Reg::ra, m_register_cache.ReadGuestRegister(Reg::npc, false));
m_register_cache.WriteGuestRegister(
Reg::ra, AddValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(4), false));
}
EmitTest(lhs.host_reg, lhs);
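
This works because the prologue already wrote pc = cbi.pc + 4 into the cache, so adding 4 yields the MIPS return address, i.e. the instruction after the delay slot. A worked example with an assumed address:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t branch_pc = 0x80010000u;    // hypothetical jal address
      const uint32_t cached_pc = branch_pc + 4;  // written by the prologue
      const uint32_t ra = cached_pc + 4;         // AddValues(pc, 4) above
      assert(ra == branch_pc + 8);               // skips the delay slot
    }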

View File

@@ -165,8 +165,6 @@ private:
void BlockEpilogue();
void InstructionPrologue(const CodeBlockInstruction& cbi, TickCount cycles, bool force_sync = false);
void InstructionEpilogue(const CodeBlockInstruction& cbi);
void SyncCurrentInstructionPC();
void SyncPC();
void AddPendingCycles();
Value DoGTERegisterRead(u32 index);
@@ -202,7 +200,6 @@ private:
CodeEmitter m_far_emitter;
CodeEmitter* m_emit;
u32 m_delayed_pc_add = 0;
TickCount m_delayed_cycles_add = 0;
// whether various flags need to be reset.

View File

@@ -201,6 +201,9 @@ void CodeGenerator::EmitEndBlock()
void CodeGenerator::EmitExceptionExit()
{
// toss away our PC value since we're jumping to the exception handler
m_register_cache.InvalidateGuestRegister(Reg::pc);
// ensure all unflushed registers are written back
m_register_cache.FlushAllGuestRegisters(false, false);
@@ -1762,28 +1765,33 @@ static void EmitConditionalJump(Condition condition, bool invert, Xbyak::CodeGen
void CodeGenerator::EmitBranch(Condition condition, Reg lr_reg, Value&& branch_target)
{
// allocate scratch register for reading npc - we return to the main path, so this could cause a reg flush
Value old_npc = m_register_cache.AllocateScratch(RegSize_32);
// npc gets modified by the branch, so we can't trust it on returning. same for lr_reg, which might contain a dirty
// value
m_register_cache.FlushGuestRegister(Reg::npc, true, true);
if (lr_reg != Reg::count)
// ensure the lr register is flushed, since we want its correct value after the branch
if (lr_reg != Reg::count && lr_reg != Reg::zero)
m_register_cache.FlushGuestRegister(lr_reg, true, true);
// condition is inverted because we want the case for skipping it
// compute return address, which is also set as the new pc when the branch isn't taken
Value new_pc;
if (condition != Condition::Always || lr_reg != Reg::count)
{
new_pc = AddValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(4), false);
if (!new_pc.IsInHostRegister())
new_pc = GetValueInHostRegister(new_pc);
}
Xbyak::Label skip_branch;
if (condition != Condition::Always)
{
// condition is inverted because we want the case for skipping it
EmitConditionalJump(condition, true, m_emit, skip_branch);
}
// save the old PC if we want to
if (lr_reg != Reg::count)
if (lr_reg != Reg::count && lr_reg != Reg::zero)
{
// Can't cache because we have two branches. Load delay cancel is needed because of the immediate flush afterwards;
// if we don't cancel it, the value we write can be overwritten at the end of the instruction.
EmitCancelInterpreterLoadDelayForReg(lr_reg);
EmitLoadGuestRegister(old_npc.host_reg, Reg::npc);
EmitStoreGuestRegister(lr_reg, old_npc);
EmitStoreGuestRegister(lr_reg, new_pc);
}
// we don't need to test the address of constant branches unless they're definitely misaligned, which would be
@@ -1814,12 +1822,18 @@ void CodeGenerator::EmitBranch(Condition condition, Reg lr_reg, Value&& branch_t
m_register_cache.PopState();
}
// branch taken path - write new PC and flush it, since two branches
EmitStoreGuestRegister(Reg::npc, branch_target);
EmitStoreCPUStructField(offsetof(Core, m_current_instruction_was_branch_taken), Value::FromConstantU8(1));
// branch taken path - change the return address/new pc
if (condition != Condition::Always)
EmitCopyValue(new_pc.GetHostRegister(), branch_target);
// converge point
m_emit->L(skip_branch);
// update pc
if (condition != Condition::Always)
m_register_cache.WriteGuestRegister(Reg::pc, std::move(new_pc));
else
m_register_cache.WriteGuestRegister(Reg::pc, std::move(branch_target));
}
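
Condensed, the emitted control flow for a conditional branch now converges on a single host register which the cache then owns, replacing the old double flush of npc. A host-side model (a sketch, not actual emitter output):

    #include <cstdint>

    uint32_t BranchModel(uint32_t cached_pc, uint32_t branch_target,
                         bool taken, bool link, uint32_t& guest_lr) {
      uint32_t new_pc = cached_pc + 4;  // fall-through; cached_pc is already
                                        // instruction_pc + 4 from the prologue
      if (taken) {                      // the inverted EmitConditionalJump
        if (link)
          guest_lr = new_pc;            // return address on the taken path
        new_pc = branch_target;         // overwritten before the converge point
      }
      return new_pc;                    // register cache takes ownership
    }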
void CodeGenerator::EmitRaiseException(Exception excode, Condition condition /* = Condition::Always */)
@@ -1827,14 +1841,12 @@ void CodeGenerator::EmitRaiseException(Exception excode, Condition condition /*
if (condition == Condition::Always)
{
// no need to use far code if we're always raising the exception
EmitFunctionCall(nullptr, &Thunks::RaiseException, m_register_cache.GetCPUPtr(),
Value::FromConstantU8(static_cast<u8>(excode)));
m_register_cache.InvalidateGuestRegister(Reg::pc);
m_register_cache.FlushAllGuestRegisters(true, true);
m_register_cache.FlushLoadDelay(true);
// PC should be synced at this point. If we leave the 4 on here for this instruction, we mess up npc.
Assert(m_delayed_pc_add == 4);
m_delayed_pc_add = 0;
EmitFunctionCall(nullptr, &Thunks::RaiseException, m_register_cache.GetCPUPtr(),
Value::FromConstantU8(static_cast<u8>(excode)));
return;
}

View File

@@ -252,6 +252,13 @@ public:
return cache_value.IsConstant() || cache_value.IsInHostRegister();
}
/// Returns true if the specified guest register is cached and in a host register.
bool IsGuestRegisterInHostRegister(Reg guest_reg) const
{
const Value& cache_value = m_state.guest_reg_state[static_cast<u8>(guest_reg)];
return cache_value.IsInHostRegister();
}
/// Returns the host register if the guest register is cached.
std::optional<HostReg> GetHostRegisterForGuestRegister(Reg guest_reg) const
{

View File

@@ -189,7 +189,7 @@ bool CanInstructionTrap(const Instruction& instruction, bool in_user_mode)
case InstructionOp::bgtz:
case InstructionOp::blez:
case InstructionOp::bne:
return true;
return false;
case InstructionOp::funct:
{