From e0598822b9e698f623bc9d78fb016fea6e557307 Mon Sep 17 00:00:00 2001
From: Connor McLaughlin
Date: Thu, 27 Aug 2020 23:39:08 +1000
Subject: [PATCH] WIP

---
 src/core/cpu_code_cache.cpp                |  41 +++-
 src/core/cpu_code_cache.h                  |   4 +
 src/core/cpu_recompiler_code_generator.cpp | 206 +++++++++++++++++----
 src/core/cpu_recompiler_code_generator.h   |   6 +
 src/core/cpu_recompiler_register_cache.cpp |  15 ++
 src/core/cpu_recompiler_register_cache.h   |   3 +
 src/core/cpu_recompiler_thunks.h           |   1 +
 src/core/cpu_types.cpp                     |  11 +-
 src/core/cpu_types.h                       |   2 +-
 9 files changed, 244 insertions(+), 45 deletions(-)

diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp
index cb2a23c19..1b1cf4ee8 100644
--- a/src/core/cpu_code_cache.cpp
+++ b/src/core/cpu_code_cache.cpp
@@ -321,8 +321,12 @@ void ExecuteRecompiler()
     {
       const u32 pc = g_state.regs.pc;
       g_state.current_instruction_pc = pc;
+#if 0
+      if (pc == 0xbfc0d444)
+        __debugbreak();
+#endif
       const u32 fast_map_index = GetFastMapIndex(pc);
-      s_single_block_asm_dispatcher[fast_map_index]();
+      s_single_block_asm_dispatcher(s_fast_map[fast_map_index]);
     }
     TimingEvents::RunEvents();
   }
@@ -520,6 +524,23 @@ bool CompileBlock(CodeBlock* block)
     cbi.is_store_instruction = IsMemoryStoreInstruction(cbi.instruction);
     cbi.has_load_delay = InstructionHasLoadDelay(cbi.instruction);
     cbi.can_trap = CanInstructionTrap(cbi.instruction, InUserMode());
+    cbi.is_direct_branch_instruction = IsDirectBranchInstruction(cbi.instruction);
+    if (cbi.is_direct_branch_instruction && true)
+    {
+      // backwards branch?
+      VirtualMemoryAddress branch_pc = GetDirectBranchTarget(cbi.instruction, cbi.pc);
+      for (CodeBlockInstruction& other_cbi : block->instructions)
+      {
+        if (other_cbi.pc == branch_pc)
+        {
+          other_cbi.is_direct_branch_target = true;
+          cbi.is_direct_branch_in_block = true;
+          block->has_in_block_branches = true;
+          Log_InfoPrintf("Found reverse branch from %08X to %08X", cbi.pc, branch_pc);
+          break;
+        }
+      }
+    }

     if (g_settings.cpu_recompiler_icache)
     {
@@ -552,7 +573,7 @@ bool CompileBlock(CodeBlock* block)

       // change the pc for the second branch's delay slot, it comes from the first branch
       const CodeBlockInstruction& prev_cbi = block->instructions.back();
-      pc = GetBranchInstructionTarget(prev_cbi.instruction, prev_cbi.pc);
+      pc = GetDirectBranchTarget(prev_cbi.instruction, prev_cbi.pc);
       Log_DevPrintf("Double branch at %08X, using delay slot from %08X -> %08X", cbi.pc, prev_cbi.pc, pc);
     }

@@ -590,6 +611,17 @@ bool CompileBlock(CodeBlock* block)
                       cbi.is_load_delay_slot ? "LD" : " ", cbi.pc, cbi.instruction.bits, disasm.GetCharArray());
     }
 #endif
+
+    if (block->instructions.size() >= 2)
+    {
+      Log_InfoPrintf("%08X -> %08X", block->instructions.front().pc, block->instructions.back().pc);
+
+      const auto& cbi = block->instructions[block->instructions.size() - 2];
+      SmallString disasm;
+      CPU::DisassembleInstruction(&disasm, cbi.pc, cbi.instruction.bits);
+      Log_InfoPrintf("[%s %s 0x%08X] %08X %s", cbi.is_branch_delay_slot ? "BD" : " ",
+                     cbi.is_load_delay_slot ? "LD" : " ", cbi.pc, cbi.instruction.bits, disasm.GetCharArray());
+    }
   }
   else
   {
@@ -899,3 +931,8 @@ Common::PageFaultHandler::HandlerResult LUTPageFaultHandler(void* exception_pc,
 #endif // WITH_RECOMPILER

 } // namespace CPU::CodeCache
+
+void CPU::Recompiler::Thunks::templog()
+{
+  // CPU::CodeCache::LogCurrentState();
+}
diff --git a/src/core/cpu_code_cache.h b/src/core/cpu_code_cache.h
index a984a0224..f2f91273d 100644
--- a/src/core/cpu_code_cache.h
+++ b/src/core/cpu_code_cache.h
@@ -54,6 +54,9 @@ struct CodeBlockInstruction
   bool is_branch_instruction : 1;
   bool is_unconditional_branch_instruction : 1;
   bool is_branch_delay_slot : 1;
+  bool is_direct_branch_instruction : 1;
+  bool is_direct_branch_target : 1;
+  bool is_direct_branch_in_block : 1;
   bool is_load_instruction : 1;
   bool is_store_instruction : 1;
   bool is_load_delay_slot : 1;
@@ -86,6 +89,7 @@ struct CodeBlock
   bool contains_loadstore_instructions = false;
   bool contains_double_branches = false;
   bool invalidated = false;
+  bool has_in_block_branches = false;

   const u32 GetPC() const { return key.GetPC(); }
   const u32 GetSizeInBytes() const { return static_cast<u32>(instructions.size()) * sizeof(Instruction); }
diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp
index ddeadbe0b..3d7c27a3b 100644
--- a/src/core/cpu_recompiler_code_generator.cpp
+++ b/src/core/cpu_recompiler_code_generator.cpp
@@ -30,17 +30,10 @@ bool CodeGenerator::CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* o
   EmitBeginBlock();
   BlockPrologue();

-  const CodeBlockInstruction* cbi = m_block_start;
-  while (cbi != m_block_end)
+  m_current_instruction = m_block_start;
+  while (m_current_instruction != m_block_end)
   {
-#ifdef _DEBUG
-    SmallString disasm;
-    DisassembleInstruction(&disasm, cbi->pc, cbi->instruction.bits);
-    Log_DebugPrintf("Compiling instruction '%s'", disasm.GetCharArray());
-#endif
-
-    m_current_instruction = cbi;
-    if (!CompileInstruction(*cbi))
+    if (!CompileInstruction(*m_current_instruction))
     {
       m_current_instruction = nullptr;
       m_block_end = nullptr;
@@ -49,7 +42,7 @@ bool CodeGenerator::CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* o
       return false;
     }

-    cbi++;
+    m_current_instruction++;
   }

   BlockEpilogue();
@@ -70,6 +63,12 @@ bool CodeGenerator::CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* o

 bool CodeGenerator::CompileInstruction(const CodeBlockInstruction& cbi)
 {
+#ifdef _DEBUG
+  SmallString disasm;
+  DisassembleInstruction(&disasm, cbi.pc, cbi.instruction.bits);
+  Log_DebugPrintf("Compiling instruction '%s'", disasm.GetCharArray());
+#endif
+
   bool result;
   switch (cbi.instruction.op)
   {
@@ -864,6 +863,17 @@ Value CodeGenerator::NotValue(const Value& val)
   return res;
 }

+LabelType* CodeGenerator::GetBranchTargetLabel(VirtualMemoryAddress pc)
+{
+  for (auto& it : m_branch_targets)
+  {
+    if (it.first == pc)
+      return &it.second;
+  }
+
+  return nullptr;
+}
+
 void CodeGenerator::GenerateExceptionExit(const CodeBlockInstruction& cbi, Exception excode,
                                           Condition condition /* = Condition::Always */)
 {
@@ -903,6 +913,8 @@ void CodeGenerator::BlockPrologue()
 {
   InitSpeculativeRegs();

+  // EmitFunctionCall(nullptr, &CPU::Recompiler::Thunks::templog);
+
   EmitStoreCPUStructField(offsetof(State, exception_raised), Value::FromConstantU8(0));

   if (m_block->uncached_fetch_ticks > 0)
@@ -940,6 +952,23 @@ void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCou
   m_emit->nop();
 #endif

+  // flush and reload registers on branch targets since we'll be coming back here
+  if (cbi.is_direct_branch_target)
+  {
+    if (&cbi != m_block_start)
+    {
+      m_register_cache.FlushAllGuestRegisters(true, true);
+      if (m_register_cache.HasLoadDelay())
+        m_register_cache.WriteLoadDelayToCPU(true);
+      AddPendingCycles(true);
+      SyncPC();
+    }
+    LabelType label;
+    EmitBindLabel(&label);
+    m_branch_targets.emplace_back(cbi.pc, std::move(label));
+    m_load_delay_dirty = true;
+  }
+
   // move instruction offsets forward
   m_current_instruction_pc_offset = m_pc_offset;
   m_pc_offset = m_next_pc_offset;
@@ -1063,6 +1092,17 @@ void CodeGenerator::WriteNewPC(const Value& value, bool commit)
   m_next_pc_offset = 0;
 }

+void CodeGenerator::SyncPC()
+{
+  if (m_pc_offset == 0)
+    return;
+
+  EmitAddCPUStructField(offsetof(State, regs.pc), Value::FromConstantU32(m_pc_offset));
+
+  m_pc_offset = 0;
+  m_next_pc_offset = 4;
+}
+
 bool CodeGenerator::Compile_Fallback(const CodeBlockInstruction& cbi)
 {
   InstructionPrologue(cbi, 1, true);
@@ -1956,7 +1996,8 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
 {
   InstructionPrologue(cbi, 1);

-  auto DoBranch = [this](Condition condition, const Value& lhs, const Value& rhs, Reg lr_reg, Value&& branch_target) {
+  auto DoBranch = [this, &cbi](Condition condition, const Value& lhs, const Value& rhs, Reg lr_reg,
+                               Value&& branch_target) {
     // ensure the lr register is flushed, since we want it's correct value after the branch
     // we don't want to invalidate it yet because of "jalr r0, r0", branch_target could be the lr_reg.
     if (lr_reg != Reg::count && lr_reg != Reg::zero)
@@ -1967,16 +2008,62 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
     if (condition != Condition::Always || lr_reg != Reg::count)
       next_pc = CalculatePC(4);

-    LabelType branch_not_taken;
+    LabelType* in_block_target = nullptr;
+    if (cbi.is_direct_branch_in_block)
+      in_block_target = GetBranchTargetLabel(GetDirectBranchTarget(cbi.instruction, cbi.pc));
+
+    Value take_branch;
+    LabelType branch_taken, branch_not_taken;
     if (condition != Condition::Always)
     {
-      // condition is inverted because we want the case for skipping it
-      if (lhs.IsValid() && rhs.IsValid())
-        EmitConditionalBranch(condition, true, lhs.host_reg, rhs, &branch_not_taken);
-      else if (lhs.IsValid())
-        EmitConditionalBranch(condition, true, lhs.host_reg, lhs.size, &branch_not_taken);
+      if (!in_block_target)
+      {
+        // condition is inverted because we want the case for skipping it
+        if (lhs.IsValid() && rhs.IsValid())
+          EmitConditionalBranch(condition, true, lhs.host_reg, rhs, &branch_not_taken);
+        else if (lhs.IsValid())
+          EmitConditionalBranch(condition, true, lhs.host_reg, lhs.size, &branch_not_taken);
+        else
+          EmitConditionalBranch(condition, true, &branch_not_taken);
+      }
       else
-        EmitConditionalBranch(condition, true, &branch_not_taken);
+      {
+        take_branch = m_register_cache.AllocateScratch(RegSize_32);
+        switch (condition)
+        {
+          case Condition::NotEqual:
+          case Condition::Equal:
+          case Condition::Overflow:
+          case Condition::Greater:
+          case Condition::GreaterEqual:
+          case Condition::LessEqual:
+          case Condition::Less:
+          case Condition::Above:
+          case Condition::AboveEqual:
+          case Condition::Below:
+          case Condition::BelowEqual:
+          {
+            EmitCmp(lhs.GetHostRegister(), rhs);
+            EmitSetConditionResult(take_branch.GetHostRegister(), take_branch.size, condition);
+          }
+          break;
+
+          case Condition::Negative:
+          case Condition::PositiveOrZero:
+          case Condition::NotZero:
+          case Condition::Zero:
+          {
+            Assert(!rhs.IsValid() || (rhs.IsConstant() && rhs.GetS64ConstantValue() == 0));
+            EmitTest(lhs.GetHostRegister(), lhs);
+            EmitSetConditionResult(take_branch.GetHostRegister(), take_branch.size, condition);
+          }
+          break;
+
+          default:
+            UnreachableCode();
+            break;
+        }
+      }
     }

     // save the old PC if we want to
@@ -2024,9 +2111,54 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
       m_register_cache.PopState();
     }

+    if (in_block_target)
+    {
+      // if it's an in-block branch, compile the delay slot now
+      Assert((m_current_instruction + 1) != m_block_end);
+      InstructionEpilogue(cbi);
+      m_current_instruction++;
+      if (!CompileInstruction(*m_current_instruction))
+        return false;
+
+      // flush all regs since we're at the end of the block now
+      m_register_cache.FlushAllGuestRegisters(false, true);
+      if (m_register_cache.HasLoadDelay())
+        m_register_cache.WriteLoadDelayToCPU(true);
+      AddPendingCycles(true);
+
+      // branch not taken?
+      EmitConditionalBranch(Condition::NotZero, true, take_branch.GetHostRegister(), take_branch.size,
+                            &branch_not_taken);
+
+      m_register_cache.PushState();
+      {
+        // check downcount
+        {
+          Value pending_ticks = m_register_cache.AllocateScratch(RegSize_32);
+          Value downcount = m_register_cache.AllocateScratch(RegSize_32);
+          EmitLoadCPUStructField(pending_ticks.GetHostRegister(), RegSize_32, offsetof(State, pending_ticks));
+          EmitLoadCPUStructField(downcount.GetHostRegister(), RegSize_32, offsetof(State, downcount));

+          // pending < downcount
+          EmitConditionalBranch(Condition::GreaterEqual, false, pending_ticks.GetHostRegister(), downcount,
+                                &branch_taken);
+        }
+
+        // EmitFunctionCall(nullptr, &CPU::Recompiler::Thunks::templog);
+
+        // now, we can jump back in the block
+        // if it's an in-branch block, we can skip writing the PC since it's synced anyway
+        EmitBranch(in_block_target);
+      }
+
+      // restore back
+      m_register_cache.PopState();
+    }
+
     if (condition != Condition::Always)
     {
       // branch taken path - modify the next pc
+      EmitBindLabel(&branch_taken);
       EmitCopyValue(next_pc.GetHostRegister(), branch_target);

       // converge point
@@ -2042,6 +2174,11 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
     // now invalidate lr becuase it was possibly written in the branch
     if (lr_reg != Reg::count && lr_reg != Reg::zero)
       m_register_cache.InvalidateGuestRegister(lr_reg);
+
+    if (!in_block_target)
+      InstructionEpilogue(cbi);
+
+    return true;
   };

   // Compute the branch target.
@@ -2055,10 +2192,9 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
       Value branch_target = OrValues(AndValues(CalculatePC(), Value::FromConstantU32(0xF0000000)),
                                      Value::FromConstantU32(cbi.instruction.j.target << 2));

-      DoBranch(Condition::Always, Value(), Value(), (cbi.instruction.op == InstructionOp::jal) ? Reg::ra : Reg::count,
-               std::move(branch_target));
+      return DoBranch(Condition::Always, Value(), Value(),
+                      (cbi.instruction.op == InstructionOp::jal) ? Reg::ra : Reg::count, std::move(branch_target));
     }
-    break;

     case InstructionOp::funct:
     {
@@ -2066,9 +2202,9 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
       {
         // npc = rs, link to rt
         Value branch_target = m_register_cache.ReadGuestRegister(cbi.instruction.r.rs);
-        DoBranch(Condition::Always, Value(), Value(),
-                 (cbi.instruction.r.funct == InstructionFunct::jalr) ? cbi.instruction.r.rd : Reg::count,
-                 std::move(branch_target));
+        return DoBranch(Condition::Always, Value(), Value(),
+                        (cbi.instruction.r.funct == InstructionFunct::jalr) ? cbi.instruction.r.rd : Reg::count,
+                        std::move(branch_target));
       }
       else if (cbi.instruction.r.funct == InstructionFunct::syscall ||
                cbi.instruction.r.funct == InstructionFunct::break_)
@@ -2076,13 +2212,15 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
       {
         const Exception excode = (cbi.instruction.r.funct == InstructionFunct::syscall) ? Exception::Syscall : Exception::BP;
         GenerateExceptionExit(cbi, excode);
+        InstructionEpilogue(cbi);
+        return true;
       }
       else
       {
         UnreachableCode();
+        return false;
       }
     }
-    break;

     case InstructionOp::beq:
     case InstructionOp::bne:
@@ -2094,7 +2232,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
       if (cbi.instruction.op == InstructionOp::beq && cbi.instruction.i.rs == Reg::zero &&
          cbi.instruction.i.rt == Reg::zero)
       {
-        DoBranch(Condition::Always, Value(), Value(), Reg::count, std::move(branch_target));
+        return DoBranch(Condition::Always, Value(), Value(), Reg::count, std::move(branch_target));
       }
       else
       {
@@ -2102,10 +2240,9 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
         // we need to compare the value of rs and rt
         Value lhs = m_register_cache.ReadGuestRegister(cbi.instruction.i.rs, true, true);
         Value rhs = m_register_cache.ReadGuestRegister(cbi.instruction.i.rt);
         const Condition condition = (cbi.instruction.op == InstructionOp::beq) ? Condition::Equal : Condition::NotEqual;
-        DoBranch(condition, lhs, rhs, Reg::count, std::move(branch_target));
+        return DoBranch(condition, lhs, rhs, Reg::count, std::move(branch_target));
       }
     }
-    break;

     case InstructionOp::bgtz:
     case InstructionOp::blez:
@@ -2118,9 +2255,8 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)

       const Condition condition =
         (cbi.instruction.op == InstructionOp::bgtz) ? Condition::Greater : Condition::LessEqual;
-      DoBranch(condition, lhs, Value::FromConstantU32(0), Reg::count, std::move(branch_target));
+      return DoBranch(condition, lhs, Value::FromConstantU32(0), Reg::count, std::move(branch_target));
     }
-    break;

     case InstructionOp::b:
     {
@@ -2142,17 +2278,13 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
         m_register_cache.WriteGuestRegister(Reg::ra, CalculatePC(4));
       }

-      DoBranch(condition, lhs, Value(), Reg::count, std::move(branch_target));
+      return DoBranch(condition, lhs, Value(), Reg::count, std::move(branch_target));
     }
-    break;

     default:
       UnreachableCode();
-      break;
+      return false;
   }
-
-  InstructionEpilogue(cbi);
-  return true;
 }

 bool CodeGenerator::Compile_lui(const CodeBlockInstruction& cbi)
 {
diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h
index fcd8a4d20..bc9971085 100644
--- a/src/core/cpu_recompiler_code_generator.h
+++ b/src/core/cpu_recompiler_code_generator.h
@@ -2,6 +2,7 @@
 #include <array>
 #include <initializer_list>
 #include <utility>
+#include <vector>

 #include "common/jit_code_buffer.h"

@@ -188,6 +189,8 @@ private:
   void* GetCurrentNearCodePointer() const;
   void* GetCurrentFarCodePointer() const;

+  LabelType* GetBranchTargetLabel(VirtualMemoryAddress pc);
+
   //////////////////////////////////////////////////////////////////////////
   // Code Generation Helpers
   //////////////////////////////////////////////////////////////////////////
@@ -202,6 +205,7 @@ private:
   Value GetCurrentInstructionPC(u32 offset = 0);
   void UpdateCurrentInstructionPC(bool commit);
   void WriteNewPC(const Value& value, bool commit);
+  void SyncPC();

   Value DoGTERegisterRead(u32 index);
   void DoGTERegisterWrite(u32 index, const Value& value);
@@ -239,6 +243,8 @@ private:
   CodeEmitter m_far_emitter;
   CodeEmitter* m_emit;

+  std::vector<std::pair<u32, LabelType>> m_branch_targets;
+
   TickCount m_delayed_cycles_add = 0;
   TickCount m_pc_offset = 0;
   TickCount m_current_instruction_pc_offset = 0;
diff --git a/src/core/cpu_recompiler_register_cache.cpp b/src/core/cpu_recompiler_register_cache.cpp
index 3aae2d167..8c089b917 100644
--- a/src/core/cpu_recompiler_register_cache.cpp
+++ b/src/core/cpu_recompiler_register_cache.cpp
@@ -280,6 +280,21 @@ Value RegisterCache::AllocateScratch(RegSize size, HostReg reg /* = HostReg_Inva
   return Value::FromScratch(this, reg, size);
 }

+void RegisterCache::ReserveCallerSavedRegisters()
+{
+  for (u32 reg = 0; reg < HostReg_Count; reg++)
+  {
+    if ((m_state.host_reg_state[reg] & (HostRegState::CalleeSaved | HostRegState::CalleeSavedAllocated)) ==
+        HostRegState::CalleeSaved)
+    {
+      DebugAssert(m_state.callee_saved_order_count < HostReg_Count);
+      m_code_generator.EmitPushHostReg(static_cast<HostReg>(reg), GetActiveCalleeSavedRegisterCount());
+      m_state.callee_saved_order[m_state.callee_saved_order_count++] = static_cast<HostReg>(reg);
+      m_state.host_reg_state[reg] |= HostRegState::CalleeSavedAllocated;
+    }
+  }
+}
+
 u32 RegisterCache::PushCallerSavedRegisters() const
 {
   u32 position = GetActiveCalleeSavedRegisterCount();
diff --git a/src/core/cpu_recompiler_register_cache.h b/src/core/cpu_recompiler_register_cache.h
index 4863f318d..f0c0da6e2 100644
--- a/src/core/cpu_recompiler_register_cache.h
+++ b/src/core/cpu_recompiler_register_cache.h
@@ -241,6 +241,9 @@ public:
   /// Ensures a host register is free, removing any value cached.
   void EnsureHostRegFree(HostReg reg);

+  /// Preallocates caller saved registers, enabling later use without stack pushes.
+  void ReserveCallerSavedRegisters();
+
   /// Push/pop volatile host registers. Returns the number of registers pushed/popped.
   u32 PushCallerSavedRegisters() const;
   u32 PopCallerSavedRegisters() const;
diff --git a/src/core/cpu_recompiler_thunks.h b/src/core/cpu_recompiler_thunks.h
index 400ff9131..347c679fc 100644
--- a/src/core/cpu_recompiler_thunks.h
+++ b/src/core/cpu_recompiler_thunks.h
@@ -33,6 +33,7 @@
 void UncheckedWriteMemoryHalfWord(u32 address, u16 value);
 void UncheckedWriteMemoryWord(u32 address, u32 value);

 void UpdateFastmemMapping();
+void templog();

 } // namespace Recompiler::Thunks
diff --git a/src/core/cpu_types.cpp b/src/core/cpu_types.cpp
index db9960e45..a1c75b30f 100644
--- a/src/core/cpu_types.cpp
+++ b/src/core/cpu_types.cpp
@@ -98,24 +98,25 @@ bool IsDirectBranchInstruction(const Instruction& instruction)
   }
 }

-u32 GetBranchInstructionTarget(const Instruction& instruction, u32 instruction_pc)
+VirtualMemoryAddress GetDirectBranchTarget(const Instruction& instruction, VirtualMemoryAddress instruction_pc)
 {
+  const VirtualMemoryAddress pc = instruction_pc + 4;
+
   switch (instruction.op)
   {
     case InstructionOp::j:
     case InstructionOp::jal:
-      return ((instruction_pc + 4) & UINT32_C(0xF0000000)) | (instruction.j.target << 2);
+      return (pc & UINT32_C(0xF0000000)) | (instruction.j.target << 2);

     case InstructionOp::b:
     case InstructionOp::beq:
     case InstructionOp::bgtz:
     case InstructionOp::blez:
     case InstructionOp::bne:
-      return instruction_pc + 4 + (instruction.i.imm_sext32() << 2);
+      return (pc + (instruction.i.imm_sext32() << 2));

     default:
-      Panic("Trying to get branch target of indirect or invalid branch");
-      return instruction_pc;
+      return pc;
   }
 }
diff --git a/src/core/cpu_types.h b/src/core/cpu_types.h
index 9fbb0b12c..6fd4154cb 100644
--- a/src/core/cpu_types.h
+++ b/src/core/cpu_types.h
@@ -223,7 +223,7 @@ union Instruction
 bool IsBranchInstruction(const Instruction& instruction);
 bool IsUnconditionalBranchInstruction(const Instruction& instruction);
 bool IsDirectBranchInstruction(const Instruction& instruction);
-u32 GetBranchInstructionTarget(const Instruction& instruction, u32 instruction_pc);
+VirtualMemoryAddress GetDirectBranchTarget(const Instruction& instruction, VirtualMemoryAddress instruction_pc);
 bool IsCallInstruction(const Instruction& instruction);
 bool IsReturnInstruction(const Instruction& instruction);
 bool IsMemoryLoadInstruction(const Instruction& instruction);