From 8c5fcc8f4826a1eb1a7429e2e895b7fc4eb3997a Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sun, 24 Nov 2019 01:10:51 +1000 Subject: [PATCH] CPU: Fix more load delay slot issues Fixes Spyro again. b{ltz,gez}(al)? disabled in the recompiler until issues are fixed. --- src/core/cpu_code_cache.cpp | 22 +++-- src/core/cpu_core.cpp | 8 +- src/core/cpu_core.h | 5 +- src/core/cpu_recompiler_code_generator.cpp | 19 ++-- src/core/cpu_recompiler_code_generator.h | 6 +- .../cpu_recompiler_code_generator_generic.cpp | 16 +--- .../cpu_recompiler_code_generator_x64.cpp | 87 +++++++++++-------- src/core/cpu_recompiler_register_cache.cpp | 35 +++++--- src/core/cpu_recompiler_register_cache.h | 1 + 9 files changed, 109 insertions(+), 90 deletions(-) diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index 9805305e8..9a3928504 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -55,22 +55,23 @@ void CodeCache::Execute() break; } + reexecute_block: + #if 0 const u32 tick = m_system->GetGlobalTickCounter() + m_core->GetPendingTicks(); - if (tick == 58672386) + if (tick == 61033207) __debugbreak(); #endif - reexecute_block: - if (m_use_recompiler) - block->host_code(m_core); - else - InterpretCachedBlock(*block); - #if 0 LogCurrentState(); #endif + if (m_use_recompiler) + block->host_code(m_core); + else + InterpretCachedBlock(*block); + if (m_core->m_downcount < 0) break; else if (m_core->HasPendingInterrupt() || !USE_BLOCK_LINKING) @@ -142,11 +143,14 @@ void CodeCache::LogCurrentState() WriteToExecutionLog( "tick=%u pc=%08X npc=%08X zero=%08X at=%08X v0=%08X v1=%08X a0=%08X a1=%08X a2=%08X a3=%08X t0=%08X " "t1=%08X t2=%08X t3=%08X t4=%08X t5=%08X t6=%08X t7=%08X s0=%08X s1=%08X s2=%08X s3=%08X s4=%08X " - "s5=%08X s6=%08X s7=%08X t8=%08X t9=%08X k0=%08X k1=%08X gp=%08X sp=%08X fp=%08X ra=%08X\n", + "s5=%08X s6=%08X s7=%08X t8=%08X t9=%08X k0=%08X k1=%08X gp=%08X sp=%08X fp=%08X ra=%08X npc=%08X ldr=%s " + "ldv=%08X\n", m_system->GetGlobalTickCounter() + m_core->GetPendingTicks(), regs.pc, regs.npc, regs.zero, regs.at, regs.v0, regs.v1, regs.a0, regs.a1, regs.a2, regs.a3, regs.t0, regs.t1, regs.t2, regs.t3, regs.t4, regs.t5, regs.t6, regs.t7, regs.s0, regs.s1, regs.s2, regs.s3, regs.s4, regs.s5, regs.s6, regs.s7, regs.t8, regs.t9, regs.k0, regs.k1, regs.gp, - regs.sp, regs.fp, regs.ra); + regs.sp, regs.fp, regs.ra, regs.npc, + (m_core->m_next_load_delay_reg == Reg::count) ? "NONE" : GetRegName(m_core->m_next_load_delay_reg), + (m_core->m_next_load_delay_reg == Reg::count) ? 0 : m_core->m_next_load_delay_value); } CodeBlockKey CodeCache::GetNextBlockKey() const diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp index 405a69543..a79f2e24b 100644 --- a/src/core/cpu_core.cpp +++ b/src/core/cpu_core.cpp @@ -94,10 +94,8 @@ bool Core::DoState(StateWrapper& sw) sw.Do(&m_branch_was_taken); sw.Do(&m_load_delay_reg); sw.Do(&m_load_delay_value); - sw.Do(&m_load_delay_old_value); sw.Do(&m_next_load_delay_reg); sw.Do(&m_next_load_delay_value); - sw.Do(&m_next_load_delay_old_value); sw.Do(&m_cache_control); sw.DoBytes(m_dcache.data(), m_dcache.size()); @@ -389,6 +387,7 @@ u32 Core::ReadReg(Reg rs) void Core::WriteReg(Reg rd, u32 value) { m_regs.r[static_cast(rd)] = value; + m_load_delay_reg = (rd == m_load_delay_reg) ? Reg::count : m_load_delay_reg; // prevent writes to $zero from going through - better than branching/cmov m_regs.zero = 0; @@ -407,7 +406,6 @@ void Core::WriteRegDelayed(Reg rd, u32 value) // save the old value, if something else overwrites this reg we want to preserve it m_next_load_delay_reg = rd; m_next_load_delay_value = value; - m_next_load_delay_old_value = m_regs.r[static_cast(rd)]; } std::optional Core::ReadCop0Reg(Cop0Reg reg) @@ -1120,7 +1118,7 @@ void Core::ExecuteInstruction() case InstructionOp::jal: { - m_regs.ra = m_regs.npc; + WriteReg(Reg::ra, m_regs.npc); m_next_instruction_is_branch_delay_slot = true; Branch((m_regs.pc & UINT32_C(0xF0000000)) | (inst.j.target << 2)); } @@ -1175,7 +1173,7 @@ void Core::ExecuteInstruction() // register is still linked even if the branch isn't taken const bool link = (rt & u8(0x1E)) == u8(0x10); if (link) - m_regs.ra = m_regs.npc; + WriteReg(Reg::ra, m_regs.npc); if (branch) Branch(m_regs.pc + (inst.i.imm_sext32() << 2)); diff --git a/src/core/cpu_core.h b/src/core/cpu_core.h index 8641220f2..45c75e16c 100644 --- a/src/core/cpu_core.h +++ b/src/core/cpu_core.h @@ -106,12 +106,11 @@ private: ALWAYS_INLINE void UpdateLoadDelay() { // the old value is needed in case the delay slot instruction overwrites the same register - if (m_load_delay_reg != Reg::count && m_regs.r[static_cast(m_load_delay_reg)] == m_load_delay_old_value) + if (m_load_delay_reg != Reg::count) m_regs.r[static_cast(m_load_delay_reg)] = m_load_delay_value; m_load_delay_reg = m_next_load_delay_reg; m_load_delay_value = m_next_load_delay_value; - m_load_delay_old_value = m_next_load_delay_old_value; m_next_load_delay_reg = Reg::count; } @@ -168,10 +167,8 @@ private: // load delays Reg m_load_delay_reg = Reg::count; u32 m_load_delay_value = 0; - u32 m_load_delay_old_value = 0; Reg m_next_load_delay_reg = Reg::count; u32 m_next_load_delay_value = 0; - u32 m_next_load_delay_old_value = 0; u32 m_cache_control = 0; diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp index ee6cd97c5..952ffdafa 100644 --- a/src/core/cpu_recompiler_code_generator.cpp +++ b/src/core/cpu_recompiler_code_generator.cpp @@ -93,7 +93,7 @@ bool CodeGenerator::CompileInstruction(const CodeBlockInstruction& cbi) case InstructionOp::j: case InstructionOp::jal: - case InstructionOp::b: + //case InstructionOp::b: case InstructionOp::beq: case InstructionOp::bne: case InstructionOp::bgtz: @@ -812,20 +812,23 @@ void CodeGenerator::InstructionEpilogue(const CodeBlockInstruction& cbi) { m_register_cache.UpdateLoadDelay(); + if (m_load_delay_dirty) + { + // we have to invalidate the register cache, since the load delayed register might've been cached + Log_DebugPrint("Emitting delay slot flush"); + EmitFlushInterpreterLoadDelay(); + m_register_cache.InvalidateAllNonDirtyGuestRegisters(); + m_load_delay_dirty = false; + } + // copy if the previous instruction was a load, reset the current value on the next instruction if (m_next_load_delay_dirty) { Log_DebugPrint("Emitting delay slot flush (with move next)"); - EmitDelaySlotUpdate(false, false, true); + EmitMoveNextInterpreterLoadDelay(); m_next_load_delay_dirty = false; m_load_delay_dirty = true; } - else if (m_load_delay_dirty) - { - Log_DebugPrint("Emitting delay slot flush"); - EmitDelaySlotUpdate(true, false, false); - m_load_delay_dirty = false; - } } void CodeGenerator::SyncCurrentInstructionPC() diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h index 02a401fc1..05f639d5c 100644 --- a/src/core/cpu_recompiler_code_generator.h +++ b/src/core/cpu_recompiler_code_generator.h @@ -65,7 +65,10 @@ public: void EmitLoadGuestRegister(HostReg host_reg, Reg guest_reg); void EmitStoreGuestRegister(Reg guest_reg, const Value& value); - void EmitStoreLoadDelay(Reg reg, const Value& value); + void EmitStoreInterpreterLoadDelay(Reg reg, const Value& value); + void EmitFlushInterpreterLoadDelay(); + void EmitMoveNextInterpreterLoadDelay(); + void EmitCancelInterpreterLoadDelayForReg(Reg reg); void EmitLoadCPUStructField(HostReg host_reg, RegSize size, u32 offset); void EmitStoreCPUStructField(u32 offset, const Value& value); void EmitAddCPUStructField(u32 offset, const Value& value); @@ -168,7 +171,6 @@ private: void SyncCurrentInstructionPC(); void SyncPC(); void AddPendingCycles(); - void EmitDelaySlotUpdate(bool skip_check_for_delay, bool skip_check_old_value, bool move_next); ////////////////////////////////////////////////////////////////////////// // Instruction Code Generators diff --git a/src/core/cpu_recompiler_code_generator_generic.cpp b/src/core/cpu_recompiler_code_generator_generic.cpp index cafa90ed6..8808016d9 100644 --- a/src/core/cpu_recompiler_code_generator_generic.cpp +++ b/src/core/cpu_recompiler_code_generator_generic.cpp @@ -17,25 +17,11 @@ void CodeGenerator::EmitStoreGuestRegister(Reg guest_reg, const Value& value) EmitStoreCPUStructField(CalculateRegisterOffset(guest_reg), value); } -void CodeGenerator::EmitStoreLoadDelay(Reg reg, const Value& value) +void CodeGenerator::EmitStoreInterpreterLoadDelay(Reg reg, const Value& value) { DebugAssert(value.size == RegSize_32 && value.IsInHostRegister()); EmitStoreCPUStructField(offsetof(Core, m_load_delay_reg), Value::FromConstantU8(static_cast(reg))); EmitStoreCPUStructField(offsetof(Core, m_load_delay_value), value); - - // We don't want to allocate a register since this could be in a block exit, so re-use the value. - if (m_register_cache.IsGuestRegisterCached(reg)) - { - EmitStoreCPUStructField(offsetof(Core, m_load_delay_old_value), m_register_cache.ReadGuestRegister(reg)); - } - else - { - EmitPushHostReg(value.host_reg); - EmitLoadCPUStructField(value.host_reg, RegSize_32, CalculateRegisterOffset(reg)); - EmitStoreCPUStructField(offsetof(Core, m_load_delay_old_value), value); - EmitPopHostReg(value.host_reg); - } - m_load_delay_dirty = true; } diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index 9b16468b5..c304fc6c2 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -1614,60 +1614,65 @@ void CodeGenerator::EmitStoreGuestMemory(const Value& address, const Value& valu SwitchToNearCode(); } -void CodeGenerator::EmitDelaySlotUpdate(bool skip_check_for_delay, bool skip_check_old_value, bool move_next) +void CodeGenerator::EmitFlushInterpreterLoadDelay() { Value reg = m_register_cache.AllocateScratch(RegSize_8); Value value = m_register_cache.AllocateScratch(RegSize_32); - Xbyak::Label skip_flush; - auto load_delay_reg = m_emit->byte[GetCPUPtrReg() + offsetof(Core, m_load_delay_reg)]; - auto load_delay_old_value = m_emit->dword[GetCPUPtrReg() + offsetof(Core, m_load_delay_old_value)]; auto load_delay_value = m_emit->dword[GetCPUPtrReg() + offsetof(Core, m_load_delay_value)]; auto reg_ptr = m_emit->dword[GetCPUPtrReg() + offsetof(Core, m_regs.r[0]) + GetHostReg64(reg.host_reg) * 4]; + Xbyak::Label skip_flush; + // reg = load_delay_reg m_emit->movzx(GetHostReg32(reg.host_reg), load_delay_reg); - if (!skip_check_old_value) - m_emit->mov(GetHostReg32(value), load_delay_old_value); - if (!skip_check_for_delay) - { - // if load_delay_reg == Reg::count goto skip_flush - m_emit->cmp(GetHostReg32(reg.host_reg), static_cast(Reg::count)); - m_emit->je(skip_flush); - } - - if (!skip_check_old_value) - { - // if r[reg] != load_delay_old_value goto skip_flush - m_emit->cmp(GetHostReg32(value), reg_ptr); - m_emit->jne(skip_flush); - } + // if load_delay_reg == Reg::count goto skip_flush + m_emit->cmp(GetHostReg32(reg.host_reg), static_cast(Reg::count)); + m_emit->je(skip_flush); // r[reg] = load_delay_value m_emit->mov(GetHostReg32(value), load_delay_value); m_emit->mov(reg_ptr, GetHostReg32(value)); - // if !move_next load_delay_reg = Reg::count - if (!move_next) - m_emit->mov(load_delay_reg, static_cast(Reg::count)); + // load_delay_reg = Reg::count + m_emit->mov(load_delay_reg, static_cast(Reg::count)); m_emit->L(skip_flush); +} - if (move_next) - { - auto next_load_delay_reg = m_emit->byte[GetCPUPtrReg() + offsetof(Core, m_next_load_delay_reg)]; - auto next_load_delay_old_value = m_emit->dword[GetCPUPtrReg() + offsetof(Core, m_next_load_delay_old_value)]; - auto next_load_delay_value = m_emit->dword[GetCPUPtrReg() + offsetof(Core, m_next_load_delay_value)]; - m_emit->mov(GetHostReg32(value), next_load_delay_value); - m_emit->mov(GetHostReg8(reg), next_load_delay_reg); - m_emit->mov(load_delay_value, GetHostReg32(value)); - m_emit->mov(GetHostReg32(value), next_load_delay_old_value); - m_emit->mov(load_delay_reg, GetHostReg8(reg)); - m_emit->mov(load_delay_old_value, GetHostReg32(value)); - m_emit->mov(next_load_delay_reg, static_cast(Reg::count)); - } +void CodeGenerator::EmitMoveNextInterpreterLoadDelay() +{ + Value reg = m_register_cache.AllocateScratch(RegSize_8); + Value value = m_register_cache.AllocateScratch(RegSize_32); + + auto load_delay_reg = m_emit->byte[GetCPUPtrReg() + offsetof(Core, m_load_delay_reg)]; + auto load_delay_value = m_emit->dword[GetCPUPtrReg() + offsetof(Core, m_load_delay_value)]; + auto next_load_delay_reg = m_emit->byte[GetCPUPtrReg() + offsetof(Core, m_next_load_delay_reg)]; + auto next_load_delay_value = m_emit->dword[GetCPUPtrReg() + offsetof(Core, m_next_load_delay_value)]; + + m_emit->mov(GetHostReg32(value), next_load_delay_value); + m_emit->mov(GetHostReg8(reg), next_load_delay_reg); + m_emit->mov(load_delay_value, GetHostReg32(value)); + m_emit->mov(load_delay_reg, GetHostReg8(reg)); + m_emit->mov(next_load_delay_reg, static_cast(Reg::count)); +} + +void CodeGenerator::EmitCancelInterpreterLoadDelayForReg(Reg reg) +{ + auto load_delay_reg = m_emit->byte[GetCPUPtrReg() + offsetof(Core, m_load_delay_reg)]; + + Xbyak::Label skip_cancel; + + // if load_delay_reg != reg goto skip_cancel + m_emit->cmp(load_delay_reg, static_cast(reg)); + m_emit->jne(skip_cancel); + + // load_delay_reg = Reg::count + m_emit->mov(load_delay_reg, static_cast(Reg::count)); + + m_emit->L(skip_cancel); } template @@ -1746,7 +1751,9 @@ void CodeGenerator::EmitBranch(Condition condition, Reg lr_reg, bool always_link old_npc = m_register_cache.ReadGuestRegister(Reg::npc, false, true); if (always_link) { - // can't cache because we have two branches + // Can't cache because we have two branches. Load delay cancel is due to the immediate flush afterwards, + // if we don't cancel it, at the end of the instruction the value we write can be overridden. + EmitCancelInterpreterLoadDelayForReg(lr_reg); m_register_cache.WriteGuestRegister(lr_reg, std::move(old_npc)); m_register_cache.FlushGuestRegister(lr_reg, true, true); } @@ -1760,7 +1767,9 @@ void CodeGenerator::EmitBranch(Condition condition, Reg lr_reg, bool always_link // save the old PC if we want to if (lr_reg != Reg::count && !always_link) { - // can't cache because we have two branches + // Can't cache because we have two branches. Load delay cancel is due to the immediate flush afterwards, + // if we don't cancel it, at the end of the instruction the value we write can be overridden. + EmitCancelInterpreterLoadDelayForReg(lr_reg); m_register_cache.WriteGuestRegister(lr_reg, std::move(old_npc)); m_register_cache.FlushGuestRegister(lr_reg, true, true); } @@ -1807,6 +1816,10 @@ void CodeGenerator::EmitRaiseException(Exception excode, Condition condition /* Value::FromConstantU8(static_cast(excode))); m_register_cache.FlushAllGuestRegisters(true, true); m_register_cache.FlushLoadDelay(true); + + // PC should be synced at this point. If we leave the 4 on here for this instruction, we mess up npc. + Assert(m_delayed_pc_add == 4); + m_delayed_pc_add = 0; return; } diff --git a/src/core/cpu_recompiler_register_cache.cpp b/src/core/cpu_recompiler_register_cache.cpp index 955a5a168..600065bc6 100644 --- a/src/core/cpu_recompiler_register_cache.cpp +++ b/src/core/cpu_recompiler_register_cache.cpp @@ -509,6 +509,9 @@ void RegisterCache::WriteGuestRegisterDelayed(Reg guest_reg, Value&& value) m_load_delay_value.ReleaseAndClear(); } + // two load delay case with interpreter load delay + m_code_generator.EmitCancelInterpreterLoadDelayForReg(guest_reg); + // set up the load delay at the end of this instruction Value& cache_value = m_next_load_delay_value; Assert(m_next_load_delay_register == Reg::count); @@ -561,7 +564,7 @@ void RegisterCache::WriteLoadDelayToCPU(bool clear) if (m_load_delay_register != Reg::count) { Log_DebugPrintf("Flushing pending load delay of %s", GetRegName(m_load_delay_register)); - m_code_generator.EmitStoreLoadDelay(m_load_delay_register, m_load_delay_value); + m_code_generator.EmitStoreInterpreterLoadDelay(m_load_delay_register, m_load_delay_value); if (clear) { m_load_delay_register = Reg::count; @@ -573,17 +576,18 @@ void RegisterCache::WriteLoadDelayToCPU(bool clear) void RegisterCache::FlushLoadDelay(bool clear) { Assert(m_next_load_delay_register == Reg::count); - if (m_load_delay_register == Reg::count) - return; - // if this is an exception exit, write the new value to the CPU register file, but keep it tracked for the next - // non-exception-raised path. TODO: push/pop whole state would avoid this issue - m_code_generator.EmitStoreGuestRegister(m_load_delay_register, m_load_delay_value); - - if (clear) + if (m_load_delay_register != Reg::count) { - m_load_delay_register = Reg::count; - m_load_delay_value.ReleaseAndClear(); + // if this is an exception exit, write the new value to the CPU register file, but keep it tracked for the next + // non-exception-raised path. TODO: push/pop whole state would avoid this issue + m_code_generator.EmitStoreGuestRegister(m_load_delay_register, m_load_delay_value); + + if (clear) + { + m_load_delay_register = Reg::count; + m_load_delay_value.ReleaseAndClear(); + } } } @@ -627,6 +631,16 @@ void RegisterCache::InvalidateGuestRegister(Reg guest_reg) cache_value.Clear(); } +void RegisterCache::InvalidateAllNonDirtyGuestRegisters() +{ + for (u8 reg = 0; reg < static_cast(Reg::count); reg++) + { + Value& cache_value = m_guest_reg_cache[reg]; + if (cache_value.IsValid() && !cache_value.IsDirty()) + InvalidateGuestRegister(static_cast(reg)); + } +} + void RegisterCache::FlushAllGuestRegisters(bool invalidate, bool clear_dirty) { for (u8 reg = 0; reg < static_cast(Reg::count); reg++) @@ -694,4 +708,5 @@ void RegisterCache::AppendRegisterToOrder(Reg reg) m_guest_register_order[0] = reg; m_guest_register_order_count++; } + } // namespace CPU::Recompiler diff --git a/src/core/cpu_recompiler_register_cache.h b/src/core/cpu_recompiler_register_cache.h index b8e068beb..818681a3b 100644 --- a/src/core/cpu_recompiler_register_cache.h +++ b/src/core/cpu_recompiler_register_cache.h @@ -257,6 +257,7 @@ public: void FlushGuestRegister(Reg guest_reg, bool invalidate, bool clear_dirty); void InvalidateGuestRegister(Reg guest_reg); + void InvalidateAllNonDirtyGuestRegisters(); void FlushAllGuestRegisters(bool invalidate, bool clear_dirty); bool EvictOneGuestRegister();