From aa52dbfeb8e00fb385122cfe1f7a4525e5ca0cde Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Thu, 12 Dec 2019 23:34:53 +1000 Subject: [PATCH] CPU/Recompiler: Use register cache for managing pc Reduces the number of loadstores after each instruction. --- src/core/cpu_code_cache.cpp | 37 +++++---- src/core/cpu_code_cache.h | 11 ++- src/core/cpu_recompiler_code_generator.cpp | 83 +++++-------------- src/core/cpu_recompiler_code_generator.h | 3 - .../cpu_recompiler_code_generator_x64.cpp | 50 ++++++----- src/core/cpu_recompiler_register_cache.h | 7 ++ src/core/cpu_types.cpp | 2 +- 7 files changed, 88 insertions(+), 105 deletions(-) diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index cc7f6eaef..6c55ba21f 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -120,6 +120,9 @@ void CodeCache::Execute() } } } + + // in case we switch to interpreter... + m_core->m_regs.npc = m_core->m_regs.pc; } void CodeCache::SetUseRecompiler(bool enable) @@ -148,17 +151,17 @@ void CodeCache::Flush() void CodeCache::LogCurrentState() { const auto& regs = m_core->m_regs; - WriteToExecutionLog( - "tick=%u pc=%08X npc=%08X zero=%08X at=%08X v0=%08X v1=%08X a0=%08X a1=%08X a2=%08X a3=%08X t0=%08X " - "t1=%08X t2=%08X t3=%08X t4=%08X t5=%08X t6=%08X t7=%08X s0=%08X s1=%08X s2=%08X s3=%08X s4=%08X " - "s5=%08X s6=%08X s7=%08X t8=%08X t9=%08X k0=%08X k1=%08X gp=%08X sp=%08X fp=%08X ra=%08X npc=%08X ldr=%s " - "ldv=%08X\n", - m_system->GetGlobalTickCounter() + m_core->GetPendingTicks(), regs.pc, regs.npc, regs.zero, regs.at, regs.v0, - regs.v1, regs.a0, regs.a1, regs.a2, regs.a3, regs.t0, regs.t1, regs.t2, regs.t3, regs.t4, regs.t5, regs.t6, regs.t7, - regs.s0, regs.s1, regs.s2, regs.s3, regs.s4, regs.s5, regs.s6, regs.s7, regs.t8, regs.t9, regs.k0, regs.k1, regs.gp, - regs.sp, regs.fp, regs.ra, regs.npc, - (m_core->m_next_load_delay_reg == Reg::count) ? 
"NONE" : GetRegName(m_core->m_next_load_delay_reg), - (m_core->m_next_load_delay_reg == Reg::count) ? 0 : m_core->m_next_load_delay_value); + WriteToExecutionLog("tick=%u pc=%08X zero=%08X at=%08X v0=%08X v1=%08X a0=%08X a1=%08X a2=%08X a3=%08X t0=%08X " + "t1=%08X t2=%08X t3=%08X t4=%08X t5=%08X t6=%08X t7=%08X s0=%08X s1=%08X s2=%08X s3=%08X s4=%08X " + "s5=%08X s6=%08X s7=%08X t8=%08X t9=%08X k0=%08X k1=%08X gp=%08X sp=%08X fp=%08X ra=%08X ldr=%s " + "ldv=%08X\n", + m_system->GetGlobalTickCounter() + m_core->GetPendingTicks(), regs.pc, regs.zero, regs.at, + regs.v0, regs.v1, regs.a0, regs.a1, regs.a2, regs.a3, regs.t0, regs.t1, regs.t2, regs.t3, regs.t4, + regs.t5, regs.t6, regs.t7, regs.s0, regs.s1, regs.s2, regs.s3, regs.s4, regs.s5, regs.s6, regs.s7, + regs.t8, regs.t9, regs.k0, regs.k1, regs.gp, regs.sp, regs.fp, regs.ra, + (m_core->m_next_load_delay_reg == Reg::count) ? "NONE" : + GetRegName(m_core->m_next_load_delay_reg), + (m_core->m_next_load_delay_reg == Reg::count) ? 0 : m_core->m_next_load_delay_value); } CodeBlockKey CodeCache::GetNextBlockKey() const @@ -202,7 +205,8 @@ bool CodeCache::RevalidateBlock(CodeBlock* block) for (const CodeBlockInstruction& cbi : block->instructions) { u32 new_code = 0; - m_bus->DispatchAccess(cbi.pc, new_code); + m_bus->DispatchAccess(cbi.pc & PHYSICAL_MEMORY_ADDRESS_MASK, + new_code); if (cbi.instruction.bits != new_code) { Log_DebugPrintf("Block 0x%08X changed at PC 0x%08X - %08X to %08X - recompiling.", block->GetPC(), cbi.pc, @@ -419,7 +423,9 @@ void CodeCache::UnlinkBlock(CodeBlock* block) void CodeCache::InterpretCachedBlock(const CodeBlock& block) { // set up the state so we've already fetched the instruction - DebugAssert((m_core->m_regs.pc & PHYSICAL_MEMORY_ADDRESS_MASK) == block.GetPC()); + DebugAssert(m_core->m_regs.pc == block.GetPC()); + + m_core->m_regs.npc = block.GetPC() + 4; for (const CodeBlockInstruction& cbi : block.instructions) { @@ -427,14 +433,13 @@ void CodeCache::InterpretCachedBlock(const 
CodeBlock& block) // now executing the instruction we previously fetched m_core->m_current_instruction.bits = cbi.instruction.bits; - m_core->m_current_instruction_pc = m_core->m_regs.pc; + m_core->m_current_instruction_pc = cbi.pc; m_core->m_current_instruction_in_branch_delay_slot = cbi.is_branch_delay_slot; m_core->m_current_instruction_was_branch_taken = m_core->m_branch_was_taken; m_core->m_branch_was_taken = false; m_core->m_exception_raised = false; // update pc - DebugAssert((m_core->m_regs.pc & PHYSICAL_MEMORY_ADDRESS_MASK) == cbi.pc); m_core->m_regs.pc = m_core->m_regs.npc; m_core->m_regs.npc += 4; @@ -454,6 +459,8 @@ void CodeCache::InterpretCachedBlock(const CodeBlock& block) void CodeCache::InterpretUncachedBlock() { + Panic("Fixme with regards to re-fetching PC"); + // At this point, pc contains the last address executed (in the previous block). The instruction has not been fetched // yet. pc shouldn't be updated until the fetch occurs, that way the exception occurs in the delay slot. 
bool in_branch_delay_slot = false; diff --git a/src/core/cpu_code_cache.h b/src/core/cpu_code_cache.h index 9c00dd821..4dd264782 100644 --- a/src/core/cpu_code_cache.h +++ b/src/core/cpu_code_cache.h @@ -28,6 +28,8 @@ union CodeBlockKey ALWAYS_INLINE u32 GetPC() const { return aligned_pc << 2; } ALWAYS_INLINE void SetPC(u32 pc) { aligned_pc = pc >> 2; } + ALWAYS_INLINE u32 GetPCPhysicalAddress() const { return (aligned_pc << 2) & PHYSICAL_MEMORY_ADDRESS_MASK; } + ALWAYS_INLINE CodeBlockKey& operator=(const CodeBlockKey& rhs) { bits = rhs.bits; @@ -72,12 +74,15 @@ struct CodeBlock const u32 GetPC() const { return key.GetPC(); } const u32 GetSizeInBytes() const { return static_cast(instructions.size()) * sizeof(Instruction); } - const u32 GetStartPageIndex() const { return (key.GetPC() / CPU_CODE_CACHE_PAGE_SIZE); } - const u32 GetEndPageIndex() const { return ((key.GetPC() + GetSizeInBytes()) / CPU_CODE_CACHE_PAGE_SIZE); } + const u32 GetStartPageIndex() const { return (key.GetPCPhysicalAddress() / CPU_CODE_CACHE_PAGE_SIZE); } + const u32 GetEndPageIndex() const + { + return ((key.GetPCPhysicalAddress() + GetSizeInBytes()) / CPU_CODE_CACHE_PAGE_SIZE); + } bool IsInRAM() const { // TODO: Constant - return key.GetPC() < 0x200000; + return key.GetPCPhysicalAddress() < 0x200000; } }; diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp index cad54b6f4..08c94bfa8 100644 --- a/src/core/cpu_recompiler_code_generator.cpp +++ b/src/core/cpu_recompiler_code_generator.cpp @@ -50,6 +50,8 @@ bool CodeGenerator::CompileBlock(const CodeBlock* block, CodeBlock::HostCodePoin EmitEndBlock(); FinalizeBlock(out_host_code, out_host_code_size); + Log_ProfilePrintf("JIT block 0x%08X: %zu instructions (%u bytes), %u host bytes", block->GetPC(), + block->instructions.size(), block->GetSizeInBytes(), *out_host_code_size); DebugAssert(m_register_cache.GetUsedHostRegisters() == 0); @@ -709,14 +711,6 @@ void CodeGenerator::BlockPrologue() 
m_branch_was_taken_dirty = true; m_current_instruction_was_branch_taken_dirty = false; m_load_delay_dirty = true; - - // sync m_current_instruction_pc so we can simply add to it - SyncCurrentInstructionPC(); - - // and the same for m_regs.pc - SyncPC(); - - EmitAddCPUStructField(offsetof(Core, m_regs.npc), Value::FromConstantU32(4)); } void CodeGenerator::BlockEpilogue() @@ -729,17 +723,7 @@ void CodeGenerator::BlockEpilogue() if (m_register_cache.HasLoadDelay()) m_register_cache.WriteLoadDelayToCPU(true); - // if the last instruction wasn't a fallback, we need to add its fetch - if (m_delayed_pc_add > 0) - { - EmitAddCPUStructField(offsetof(Core, m_regs.npc), Value::FromConstantU32(m_delayed_pc_add)); - m_delayed_pc_add = 0; - } - AddPendingCycles(); - - // TODO: correct value for is_branch_delay_slot - branches in branch delay slot. - EmitStoreCPUStructField(offsetof(Core, m_next_instruction_is_branch_delay_slot), Value::FromConstantU8(0)); } void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCount cycles, @@ -771,42 +755,29 @@ void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCou m_current_instruction_in_branch_delay_slot_dirty = false; } - if (cbi.is_branch_delay_slot) + // increment PC, except if we're in the branch delay slot where it was just changed + if (!cbi.is_branch_delay_slot) { - // m_regs.pc should be synced for the next block, as the branch wrote to npc - SyncCurrentInstructionPC(); - SyncPC(); - - // m_current_instruction_in_branch_delay_slot = true - EmitStoreCPUStructField(offsetof(Core, m_current_instruction_in_branch_delay_slot), Value::FromConstantU8(1)); - m_current_instruction_in_branch_delay_slot_dirty = true; + Assert(!m_register_cache.IsGuestRegisterInHostRegister(Reg::pc)); + m_register_cache.WriteGuestRegister(Reg::pc, Value::FromConstantU32(cbi.pc + 4)); } if (!CanInstructionTrap(cbi.instruction, m_block->key.user_mode) && !force_sync) { // Defer updates for non-faulting 
instructions. - m_delayed_pc_add += INSTRUCTION_SIZE; m_delayed_cycles_add += cycles; return; } - if (m_delayed_pc_add > 0) + if (cbi.is_branch_delay_slot) { - // m_current_instruction_pc += m_delayed_pc_add - EmitAddCPUStructField(offsetof(Core, m_current_instruction_pc), Value::FromConstantU32(m_delayed_pc_add)); - - // m_regs.pc += m_delayed_pc_add - EmitAddCPUStructField(offsetof(Core, m_regs.pc), Value::FromConstantU32(m_delayed_pc_add)); - - // m_regs.npc += m_delayed_pc_add - // TODO: This can go once we recompile branch instructions and unconditionally set npc - EmitAddCPUStructField(offsetof(Core, m_regs.npc), Value::FromConstantU32(m_delayed_pc_add)); - - m_delayed_pc_add = 0; + // m_current_instruction_in_branch_delay_slot = true + EmitStoreCPUStructField(offsetof(Core, m_current_instruction_in_branch_delay_slot), Value::FromConstantU8(1)); + m_current_instruction_in_branch_delay_slot_dirty = true; } - if (!cbi.is_branch_instruction) - m_delayed_pc_add = INSTRUCTION_SIZE; + // Sync current instruction PC + EmitStoreCPUStructField(offsetof(Core, m_current_instruction_pc), Value::FromConstantU32(cbi.pc)); m_delayed_cycles_add += cycles; AddPendingCycles(); @@ -835,22 +806,6 @@ void CodeGenerator::InstructionEpilogue(const CodeBlockInstruction& cbi) } } -void CodeGenerator::SyncCurrentInstructionPC() -{ - // m_current_instruction_pc = m_regs.pc - Value pc_value = m_register_cache.AllocateScratch(RegSize_32); - EmitLoadCPUStructField(pc_value.host_reg, RegSize_32, offsetof(Core, m_regs.pc)); - EmitStoreCPUStructField(offsetof(Core, m_current_instruction_pc), pc_value); -} - -void CodeGenerator::SyncPC() -{ - // m_regs.pc = m_regs.npc - Value npc_value = m_register_cache.AllocateScratch(RegSize_32); - EmitLoadCPUStructField(npc_value.host_reg, RegSize_32, offsetof(Core, m_regs.npc)); - EmitStoreCPUStructField(offsetof(Core, m_regs.pc), npc_value); -} - void CodeGenerator::AddPendingCycles() { if (m_delayed_cycles_add == 0) @@ -1246,8 +1201,7 @@ bool 
CodeGenerator::Compile_SetLess(const CodeBlockInstruction& cbi) bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) { - // Force sync since we branches are PC-relative. - InstructionPrologue(cbi, 1, true); + InstructionPrologue(cbi, 1); // Compute the branch target. // This depends on the form of the instruction. @@ -1258,7 +1212,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) { // npc = (pc & 0xF0000000) | (target << 2) Value branch_target = - OrValues(AndValues(m_register_cache.ReadGuestRegister(Reg::pc, false), Value::FromConstantU32(0xF0000000)), + OrValues(AndValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(0xF0000000)), Value::FromConstantU32(cbi.instruction.j.target << 2)); EmitBranch(Condition::Always, (cbi.instruction.op == InstructionOp::jal) ? Reg::ra : Reg::count, @@ -1294,7 +1248,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) case InstructionOp::bne: { // npc = pc + (sext(imm) << 2) - Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc, false), + Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(cbi.instruction.i.imm_sext32() << 2), false); // branch <- rs op rt @@ -1311,7 +1265,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) case InstructionOp::blez: { // npc = pc + (sext(imm) << 2) - Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc, false), + Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(cbi.instruction.i.imm_sext32() << 2), false); // branch <- rs op 0 @@ -1327,7 +1281,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) case InstructionOp::b: { // npc = pc + (sext(imm) << 2) - Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc, false), + Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc), 
Value::FromConstantU32(cbi.instruction.i.imm_sext32() << 2), false); const u8 rt = static_cast(cbi.instruction.i.rt.GetValue()); @@ -1344,7 +1298,8 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) if (link) { EmitCancelInterpreterLoadDelayForReg(Reg::ra); - m_register_cache.WriteGuestRegister(Reg::ra, m_register_cache.ReadGuestRegister(Reg::npc, false)); + m_register_cache.WriteGuestRegister( + Reg::ra, AddValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(4), false)); } EmitTest(lhs.host_reg, lhs); diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h index b4ccf42e6..c401d7e07 100644 --- a/src/core/cpu_recompiler_code_generator.h +++ b/src/core/cpu_recompiler_code_generator.h @@ -165,8 +165,6 @@ private: void BlockEpilogue(); void InstructionPrologue(const CodeBlockInstruction& cbi, TickCount cycles, bool force_sync = false); void InstructionEpilogue(const CodeBlockInstruction& cbi); - void SyncCurrentInstructionPC(); - void SyncPC(); void AddPendingCycles(); Value DoGTERegisterRead(u32 index); @@ -202,7 +200,6 @@ private: CodeEmitter m_far_emitter; CodeEmitter* m_emit; - u32 m_delayed_pc_add = 0; TickCount m_delayed_cycles_add = 0; // whether various flags need to be reset. 
diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index 3c9a7ba83..f4bdaabf9 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -201,6 +201,9 @@ void CodeGenerator::EmitEndBlock() void CodeGenerator::EmitExceptionExit() { + // toss away our PC value since we're jumping to the exception handler + m_register_cache.InvalidateGuestRegister(Reg::pc); + // ensure all unflushed registers are written back m_register_cache.FlushAllGuestRegisters(false, false); @@ -1762,28 +1765,33 @@ static void EmitConditionalJump(Condition condition, bool invert, Xbyak::CodeGen void CodeGenerator::EmitBranch(Condition condition, Reg lr_reg, Value&& branch_target) { - // allocate scratch register for reading npc - we return to the main path, so this could cause a reg flush - Value old_npc = m_register_cache.AllocateScratch(RegSize_32); - - // npc gets modified by the branch, so we can't trust it on returning. 
same for lr_reg, which might contain a dirty - // value - m_register_cache.FlushGuestRegister(Reg::npc, true, true); - if (lr_reg != Reg::count) + // ensure the lr register is flushed, since we want its correct value after the branch + if (lr_reg != Reg::count && lr_reg != Reg::zero) m_register_cache.FlushGuestRegister(lr_reg, true, true); - // condition is inverted because we want the case for skipping it + // compute return address, which is also set as the new pc when the branch isn't taken + Value new_pc; + if (condition != Condition::Always || lr_reg != Reg::count) + { + new_pc = AddValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(4), false); + if (!new_pc.IsInHostRegister()) + new_pc = GetValueInHostRegister(new_pc); + } + Xbyak::Label skip_branch; if (condition != Condition::Always) + { + // condition is inverted because we want the case for skipping it EmitConditionalJump(condition, true, m_emit, skip_branch); + } // save the old PC if we want to - if (lr_reg != Reg::count) + if (lr_reg != Reg::count && lr_reg != Reg::zero) { // Can't cache because we have two branches. Load delay cancel is due to the immediate flush afterwards, // if we don't cancel it, at the end of the instruction the value we write can be overridden. 
EmitCancelInterpreterLoadDelayForReg(lr_reg); - EmitLoadGuestRegister(old_npc.host_reg, Reg::npc); - EmitStoreGuestRegister(lr_reg, old_npc); + EmitStoreGuestRegister(lr_reg, new_pc); } // we don't need to test the address of constant branches unless they're definitely misaligned, which would be @@ -1814,12 +1822,18 @@ void CodeGenerator::EmitBranch(Condition condition, Reg lr_reg, Value&& branch_t m_register_cache.PopState(); } - // branch taken path - write new PC and flush it, since two branches - EmitStoreGuestRegister(Reg::npc, branch_target); - EmitStoreCPUStructField(offsetof(Core, m_current_instruction_was_branch_taken), Value::FromConstantU8(1)); + // branch taken path - change the return address/new pc + if (condition != Condition::Always) + EmitCopyValue(new_pc.GetHostRegister(), branch_target); // converge point m_emit->L(skip_branch); + + // update pc + if (condition != Condition::Always) + m_register_cache.WriteGuestRegister(Reg::pc, std::move(new_pc)); + else + m_register_cache.WriteGuestRegister(Reg::pc, std::move(branch_target)); } void CodeGenerator::EmitRaiseException(Exception excode, Condition condition /* = Condition::Always */) @@ -1827,14 +1841,12 @@ void CodeGenerator::EmitRaiseException(Exception excode, Condition condition /* if (condition == Condition::Always) { // no need to use far code if we're always raising the exception - EmitFunctionCall(nullptr, &Thunks::RaiseException, m_register_cache.GetCPUPtr(), - Value::FromConstantU8(static_cast(excode))); + m_register_cache.InvalidateGuestRegister(Reg::pc); m_register_cache.FlushAllGuestRegisters(true, true); m_register_cache.FlushLoadDelay(true); - // PC should be synced at this point. If we leave the 4 on here for this instruction, we mess up npc. 
- Assert(m_delayed_pc_add == 4); - m_delayed_pc_add = 0; + EmitFunctionCall(nullptr, &Thunks::RaiseException, m_register_cache.GetCPUPtr(), + Value::FromConstantU8(static_cast(excode))); return; } diff --git a/src/core/cpu_recompiler_register_cache.h b/src/core/cpu_recompiler_register_cache.h index 67406cf8e..dfaa13857 100644 --- a/src/core/cpu_recompiler_register_cache.h +++ b/src/core/cpu_recompiler_register_cache.h @@ -252,6 +252,13 @@ public: return cache_value.IsConstant() || cache_value.IsInHostRegister(); } + /// Returns true if the specified guest register is cached and in a host register. + bool IsGuestRegisterInHostRegister(Reg guest_reg) const + { + const Value& cache_value = m_state.guest_reg_state[static_cast(guest_reg)]; + return cache_value.IsInHostRegister(); + } + /// Returns the host register if the guest register is cached. std::optional GetHostRegisterForGuestRegister(Reg guest_reg) const { diff --git a/src/core/cpu_types.cpp b/src/core/cpu_types.cpp index 5c52f9e08..c8fd684d3 100644 --- a/src/core/cpu_types.cpp +++ b/src/core/cpu_types.cpp @@ -189,7 +189,7 @@ bool CanInstructionTrap(const Instruction& instruction, bool in_user_mode) case InstructionOp::bgtz: case InstructionOp::blez: case InstructionOp::bne: - return true; + return false; case InstructionOp::funct: {