CPU: Fix more load delay slot issues

Fixes Spyro again. The b{ltz,gez}(al)? instructions are disabled in the
recompiler until the remaining issues are fixed.
Connor McLaughlin 2019-11-24 01:10:51 +10:00
parent 897e75fa8e
commit 8c5fcc8f48
9 changed files with 109 additions and 90 deletions


@@ -55,22 +55,23 @@ void CodeCache::Execute()
break;
}
reexecute_block:
#if 0
const u32 tick = m_system->GetGlobalTickCounter() + m_core->GetPendingTicks();
if (tick == 58672386)
if (tick == 61033207)
__debugbreak();
#endif
reexecute_block:
if (m_use_recompiler)
block->host_code(m_core);
else
InterpretCachedBlock(*block);
#if 0
LogCurrentState();
#endif
if (m_use_recompiler)
block->host_code(m_core);
else
InterpretCachedBlock(*block);
if (m_core->m_downcount < 0)
break;
else if (m_core->HasPendingInterrupt() || !USE_BLOCK_LINKING)
@@ -142,11 +143,14 @@ void CodeCache::LogCurrentState()
WriteToExecutionLog(
"tick=%u pc=%08X npc=%08X zero=%08X at=%08X v0=%08X v1=%08X a0=%08X a1=%08X a2=%08X a3=%08X t0=%08X "
"t1=%08X t2=%08X t3=%08X t4=%08X t5=%08X t6=%08X t7=%08X s0=%08X s1=%08X s2=%08X s3=%08X s4=%08X "
"s5=%08X s6=%08X s7=%08X t8=%08X t9=%08X k0=%08X k1=%08X gp=%08X sp=%08X fp=%08X ra=%08X\n",
"s5=%08X s6=%08X s7=%08X t8=%08X t9=%08X k0=%08X k1=%08X gp=%08X sp=%08X fp=%08X ra=%08X npc=%08X ldr=%s "
"ldv=%08X\n",
m_system->GetGlobalTickCounter() + m_core->GetPendingTicks(), regs.pc, regs.npc, regs.zero, regs.at, regs.v0,
regs.v1, regs.a0, regs.a1, regs.a2, regs.a3, regs.t0, regs.t1, regs.t2, regs.t3, regs.t4, regs.t5, regs.t6, regs.t7,
regs.s0, regs.s1, regs.s2, regs.s3, regs.s4, regs.s5, regs.s6, regs.s7, regs.t8, regs.t9, regs.k0, regs.k1, regs.gp,
regs.sp, regs.fp, regs.ra);
regs.sp, regs.fp, regs.ra, regs.npc,
(m_core->m_next_load_delay_reg == Reg::count) ? "NONE" : GetRegName(m_core->m_next_load_delay_reg),
(m_core->m_next_load_delay_reg == Reg::count) ? 0 : m_core->m_next_load_delay_value);
}
CodeBlockKey CodeCache::GetNextBlockKey() const


@@ -94,10 +94,8 @@ bool Core::DoState(StateWrapper& sw)
sw.Do(&m_branch_was_taken);
sw.Do(&m_load_delay_reg);
sw.Do(&m_load_delay_value);
sw.Do(&m_load_delay_old_value);
sw.Do(&m_next_load_delay_reg);
sw.Do(&m_next_load_delay_value);
sw.Do(&m_next_load_delay_old_value);
sw.Do(&m_cache_control);
sw.DoBytes(m_dcache.data(), m_dcache.size());
@@ -389,6 +387,7 @@ u32 Core::ReadReg(Reg rs)
void Core::WriteReg(Reg rd, u32 value)
{
m_regs.r[static_cast<u8>(rd)] = value;
m_load_delay_reg = (rd == m_load_delay_reg) ? Reg::count : m_load_delay_reg;
// prevent writes to $zero from going through - better than branching/cmov
m_regs.zero = 0;
@@ -407,7 +406,6 @@ void Core::WriteRegDelayed(Reg rd, u32 value)
// save the old value, if something else overwrites this reg we want to preserve it
m_next_load_delay_reg = rd;
m_next_load_delay_value = value;
m_next_load_delay_old_value = m_regs.r[static_cast<u8>(rd)];
}
std::optional<u32> Core::ReadCop0Reg(Cop0Reg reg)
@@ -1120,7 +1118,7 @@ void Core::ExecuteInstruction()
case InstructionOp::jal:
{
m_regs.ra = m_regs.npc;
WriteReg(Reg::ra, m_regs.npc);
m_next_instruction_is_branch_delay_slot = true;
Branch((m_regs.pc & UINT32_C(0xF0000000)) | (inst.j.target << 2));
}
@@ -1175,7 +1173,7 @@ void Core::ExecuteInstruction()
// register is still linked even if the branch isn't taken
const bool link = (rt & u8(0x1E)) == u8(0x10);
if (link)
m_regs.ra = m_regs.npc;
WriteReg(Reg::ra, m_regs.npc);
if (branch)
Branch(m_regs.pc + (inst.i.imm_sext32() << 2));


@@ -106,12 +106,11 @@ private:
ALWAYS_INLINE void UpdateLoadDelay()
{
// the old value is needed in case the delay slot instruction overwrites the same register
if (m_load_delay_reg != Reg::count && m_regs.r[static_cast<u8>(m_load_delay_reg)] == m_load_delay_old_value)
if (m_load_delay_reg != Reg::count)
m_regs.r[static_cast<u8>(m_load_delay_reg)] = m_load_delay_value;
m_load_delay_reg = m_next_load_delay_reg;
m_load_delay_value = m_next_load_delay_value;
m_load_delay_old_value = m_next_load_delay_old_value;
m_next_load_delay_reg = Reg::count;
}
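
Taken together with the Core::WriteReg() change above, the simplified UpdateLoadDelay() is sufficient: an overwrite in the delay slot now cancels the pending load, so there is nothing to compare against a saved old value. Below is a minimal, runnable model of the new behaviour; the names are simplified stand-ins, not the actual Core class.

#include <cassert>
#include <cstdint>

// Hedged, self-contained model of the two-stage load delay after this commit.
// "-1" stands in for Reg::count (no pending load).
struct DelayModel
{
  uint32_t r[32] = {};
  int load_delay_reg = -1, next_load_delay_reg = -1;
  uint32_t load_delay_value = 0, next_load_delay_value = 0;

  void WriteReg(int rd, uint32_t value) // mirrors the new Core::WriteReg
  {
    r[rd] = value;
    if (load_delay_reg == rd)
      load_delay_reg = -1; // overwriting the register cancels the in-flight load
  }

  void WriteRegDelayed(int rd, uint32_t value) // mirrors Core::WriteRegDelayed
  {
    next_load_delay_reg = rd;
    next_load_delay_value = value;
  }

  void UpdateLoadDelay() // mirrors the simplified UpdateLoadDelay() above
  {
    if (load_delay_reg >= 0)
      r[load_delay_reg] = load_delay_value;
    load_delay_reg = next_load_delay_reg;
    load_delay_value = next_load_delay_value;
    next_load_delay_reg = -1;
  }
};

int main()
{
  DelayModel m;
  m.WriteRegDelayed(8, 0x1234); // lw $t0, ... (result only visible after the delay slot)
  m.UpdateLoadDelay();          // end of the lw: load is now pending on $t0
  m.WriteReg(8, 1);             // delay-slot instruction overwrites $t0, cancelling the load
  m.UpdateLoadDelay();          // end of the delay slot: nothing left to retire
  assert(m.r[8] == 1);          // the overwrite survives; no old-value tracking required
  return 0;
}
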
@@ -168,10 +167,8 @@ private:
// load delays
Reg m_load_delay_reg = Reg::count;
u32 m_load_delay_value = 0;
u32 m_load_delay_old_value = 0;
Reg m_next_load_delay_reg = Reg::count;
u32 m_next_load_delay_value = 0;
u32 m_next_load_delay_old_value = 0;
u32 m_cache_control = 0;


@@ -93,7 +93,7 @@ bool CodeGenerator::CompileInstruction(const CodeBlockInstruction& cbi)
case InstructionOp::j:
case InstructionOp::jal:
case InstructionOp::b:
//case InstructionOp::b:
case InstructionOp::beq:
case InstructionOp::bne:
case InstructionOp::bgtz:
@@ -812,20 +812,23 @@ void CodeGenerator::InstructionEpilogue(const CodeBlockInstruction& cbi)
{
m_register_cache.UpdateLoadDelay();
if (m_load_delay_dirty)
{
// we have to invalidate the register cache, since the load delayed register might've been cached
Log_DebugPrint("Emitting delay slot flush");
EmitFlushInterpreterLoadDelay();
m_register_cache.InvalidateAllNonDirtyGuestRegisters();
m_load_delay_dirty = false;
}
// copy if the previous instruction was a load, reset the current value on the next instruction
if (m_next_load_delay_dirty)
{
Log_DebugPrint("Emitting delay slot flush (with move next)");
EmitDelaySlotUpdate(false, false, true);
EmitMoveNextInterpreterLoadDelay();
m_next_load_delay_dirty = false;
m_load_delay_dirty = true;
}
else if (m_load_delay_dirty)
{
Log_DebugPrint("Emitting delay slot flush");
EmitDelaySlotUpdate(true, false, false);
m_load_delay_dirty = false;
}
}
void CodeGenerator::SyncCurrentInstructionPC()
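
A hedged reading of the new epilogue ordering for a two-instruction guest sequence, assuming the load left its result in the interpreter-visible next-load-delay state (the condition that sets m_next_load_delay_dirty lies outside this hunk):

// lw   $t0, 0($a0)   -> epilogue: m_next_load_delay_dirty, so EmitMoveNextInterpreterLoadDelay()
//                       runs; the load is now pending in m_load_delay_* and m_load_delay_dirty is set
// addu $v0, $t0, $t1 -> body still reads the pre-load $t0 (the loaded word only lives in
//                       m_load_delay_value); epilogue: m_load_delay_dirty, so
//                       EmitFlushInterpreterLoadDelay() retires the load into $t0 and
//                       InvalidateAllNonDirtyGuestRegisters() drops any clean cached copy
//                       of $t0 that just went stale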


@@ -65,7 +65,10 @@ public:
void EmitLoadGuestRegister(HostReg host_reg, Reg guest_reg);
void EmitStoreGuestRegister(Reg guest_reg, const Value& value);
void EmitStoreLoadDelay(Reg reg, const Value& value);
void EmitStoreInterpreterLoadDelay(Reg reg, const Value& value);
void EmitFlushInterpreterLoadDelay();
void EmitMoveNextInterpreterLoadDelay();
void EmitCancelInterpreterLoadDelayForReg(Reg reg);
void EmitLoadCPUStructField(HostReg host_reg, RegSize size, u32 offset);
void EmitStoreCPUStructField(u32 offset, const Value& value);
void EmitAddCPUStructField(u32 offset, const Value& value);
@@ -168,7 +171,6 @@ private:
void SyncCurrentInstructionPC();
void SyncPC();
void AddPendingCycles();
void EmitDelaySlotUpdate(bool skip_check_for_delay, bool skip_check_old_value, bool move_next);
//////////////////////////////////////////////////////////////////////////
// Instruction Code Generators


@@ -17,25 +17,11 @@ void CodeGenerator::EmitStoreGuestRegister(Reg guest_reg, const Value& value)
EmitStoreCPUStructField(CalculateRegisterOffset(guest_reg), value);
}
void CodeGenerator::EmitStoreLoadDelay(Reg reg, const Value& value)
void CodeGenerator::EmitStoreInterpreterLoadDelay(Reg reg, const Value& value)
{
DebugAssert(value.size == RegSize_32 && value.IsInHostRegister());
EmitStoreCPUStructField(offsetof(Core, m_load_delay_reg), Value::FromConstantU8(static_cast<u8>(reg)));
EmitStoreCPUStructField(offsetof(Core, m_load_delay_value), value);
// We don't want to allocate a register since this could be in a block exit, so re-use the value.
if (m_register_cache.IsGuestRegisterCached(reg))
{
EmitStoreCPUStructField(offsetof(Core, m_load_delay_old_value), m_register_cache.ReadGuestRegister(reg));
}
else
{
EmitPushHostReg(value.host_reg);
EmitLoadCPUStructField(value.host_reg, RegSize_32, CalculateRegisterOffset(reg));
EmitStoreCPUStructField(offsetof(Core, m_load_delay_old_value), value);
EmitPopHostReg(value.host_reg);
}
m_load_delay_dirty = true;
}
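
For reference, the runtime effect of the slimmed-down helper is just the two stores shown above (equivalent C++, not the emitted host code):

// Equivalent C++ for the emitted stores (illustrative only):
// m_load_delay_reg   = reg;
// m_load_delay_value = value;

The old-value bookkeeping, including the push/pop of the value's host register, is no longer needed because Core::WriteReg() now cancels a pending load delay whenever the targeted register is overwritten.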


@@ -1614,60 +1614,65 @@ void CodeGenerator::EmitStoreGuestMemory(const Value& address, const Value& valu
SwitchToNearCode();
}
void CodeGenerator::EmitDelaySlotUpdate(bool skip_check_for_delay, bool skip_check_old_value, bool move_next)
void CodeGenerator::EmitFlushInterpreterLoadDelay()
{
Value reg = m_register_cache.AllocateScratch(RegSize_8);
Value value = m_register_cache.AllocateScratch(RegSize_32);
Xbyak::Label skip_flush;
auto load_delay_reg = m_emit->byte[GetCPUPtrReg() + offsetof(Core, m_load_delay_reg)];
auto load_delay_old_value = m_emit->dword[GetCPUPtrReg() + offsetof(Core, m_load_delay_old_value)];
auto load_delay_value = m_emit->dword[GetCPUPtrReg() + offsetof(Core, m_load_delay_value)];
auto reg_ptr = m_emit->dword[GetCPUPtrReg() + offsetof(Core, m_regs.r[0]) + GetHostReg64(reg.host_reg) * 4];
Xbyak::Label skip_flush;
// reg = load_delay_reg
m_emit->movzx(GetHostReg32(reg.host_reg), load_delay_reg);
if (!skip_check_old_value)
m_emit->mov(GetHostReg32(value), load_delay_old_value);
if (!skip_check_for_delay)
{
// if load_delay_reg == Reg::count goto skip_flush
m_emit->cmp(GetHostReg32(reg.host_reg), static_cast<u8>(Reg::count));
m_emit->je(skip_flush);
}
if (!skip_check_old_value)
{
// if r[reg] != load_delay_old_value goto skip_flush
m_emit->cmp(GetHostReg32(value), reg_ptr);
m_emit->jne(skip_flush);
}
// r[reg] = load_delay_value
m_emit->mov(GetHostReg32(value), load_delay_value);
m_emit->mov(reg_ptr, GetHostReg32(value));
// if !move_next load_delay_reg = Reg::count
if (!move_next)
// load_delay_reg = Reg::count
m_emit->mov(load_delay_reg, static_cast<u8>(Reg::count));
m_emit->L(skip_flush);
}
if (move_next)
{
void CodeGenerator::EmitMoveNextInterpreterLoadDelay()
{
Value reg = m_register_cache.AllocateScratch(RegSize_8);
Value value = m_register_cache.AllocateScratch(RegSize_32);
auto load_delay_reg = m_emit->byte[GetCPUPtrReg() + offsetof(Core, m_load_delay_reg)];
auto load_delay_value = m_emit->dword[GetCPUPtrReg() + offsetof(Core, m_load_delay_value)];
auto next_load_delay_reg = m_emit->byte[GetCPUPtrReg() + offsetof(Core, m_next_load_delay_reg)];
auto next_load_delay_old_value = m_emit->dword[GetCPUPtrReg() + offsetof(Core, m_next_load_delay_old_value)];
auto next_load_delay_value = m_emit->dword[GetCPUPtrReg() + offsetof(Core, m_next_load_delay_value)];
m_emit->mov(GetHostReg32(value), next_load_delay_value);
m_emit->mov(GetHostReg8(reg), next_load_delay_reg);
m_emit->mov(load_delay_value, GetHostReg32(value));
m_emit->mov(GetHostReg32(value), next_load_delay_old_value);
m_emit->mov(load_delay_reg, GetHostReg8(reg));
m_emit->mov(load_delay_old_value, GetHostReg32(value));
m_emit->mov(next_load_delay_reg, static_cast<u8>(Reg::count));
}
}
void CodeGenerator::EmitCancelInterpreterLoadDelayForReg(Reg reg)
{
auto load_delay_reg = m_emit->byte[GetCPUPtrReg() + offsetof(Core, m_load_delay_reg)];
Xbyak::Label skip_cancel;
// if load_delay_reg != reg goto skip_cancel
m_emit->cmp(load_delay_reg, static_cast<u8>(reg));
m_emit->jne(skip_cancel);
// load_delay_reg = Reg::count
m_emit->mov(load_delay_reg, static_cast<u8>(Reg::count));
m_emit->L(skip_cancel);
}
template<typename T>
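
Read as plain C++, the three helpers above generate code with the following runtime effect (a paraphrase of the emitted x86 sequences, not generator code):

// EmitFlushInterpreterLoadDelay:
//   if (m_load_delay_reg != Reg::count) {
//     m_regs.r[static_cast<u8>(m_load_delay_reg)] = m_load_delay_value;
//     m_load_delay_reg = Reg::count;
//   }
// EmitMoveNextInterpreterLoadDelay:
//   m_load_delay_reg   = m_next_load_delay_reg;
//   m_load_delay_value = m_next_load_delay_value;
//   m_next_load_delay_reg = Reg::count;
// EmitCancelInterpreterLoadDelayForReg(reg):
//   if (m_load_delay_reg == reg)
//     m_load_delay_reg = Reg::count;
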
@@ -1746,7 +1751,9 @@ void CodeGenerator::EmitBranch(Condition condition, Reg lr_reg, bool always_link
old_npc = m_register_cache.ReadGuestRegister(Reg::npc, false, true);
if (always_link)
{
// can't cache because we have two branches
// Can't cache because we have two branches. Load delay cancel is due to the immediate flush afterwards,
// if we don't cancel it, at the end of the instruction the value we write can be overridden.
EmitCancelInterpreterLoadDelayForReg(lr_reg);
m_register_cache.WriteGuestRegister(lr_reg, std::move(old_npc));
m_register_cache.FlushGuestRegister(lr_reg, true, true);
}
@@ -1760,7 +1767,9 @@ void CodeGenerator::EmitBranch(Condition condition, Reg lr_reg, bool always_link
// save the old PC if we want to
if (lr_reg != Reg::count && !always_link)
{
// can't cache because we have two branches
// Can't cache because we have two branches. Load delay cancel is due to the immediate flush afterwards,
// if we don't cancel it, at the end of the instruction the value we write can be overridden.
EmitCancelInterpreterLoadDelayForReg(lr_reg);
m_register_cache.WriteGuestRegister(lr_reg, std::move(old_npc));
m_register_cache.FlushGuestRegister(lr_reg, true, true);
}
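
A hedged example of the hazard the cancel guards against (the instruction pairing is illustrative; jal is assumed to reach this path with lr_reg == Reg::ra):

// lw  $ra, 0($sp)   ; previous instruction leaves a pending interpreter load delay on $ra
// jal subroutine    ; links: writes the return address to $ra and flushes it immediately
// Without EmitCancelInterpreterLoadDelayForReg(lr_reg), the still-pending load of $ra would
// retire at the end of the instruction and overwrite the return address just written.
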
@@ -1807,6 +1816,10 @@ void CodeGenerator::EmitRaiseException(Exception excode, Condition condition /*
Value::FromConstantU8(static_cast<u8>(excode)));
m_register_cache.FlushAllGuestRegisters(true, true);
m_register_cache.FlushLoadDelay(true);
// PC should be synced at this point. If we leave the 4 on here for this instruction, we mess up npc.
Assert(m_delayed_pc_add == 4);
m_delayed_pc_add = 0;
return;
}


@@ -509,6 +509,9 @@ void RegisterCache::WriteGuestRegisterDelayed(Reg guest_reg, Value&& value)
m_load_delay_value.ReleaseAndClear();
}
// two load delay case with interpreter load delay
m_code_generator.EmitCancelInterpreterLoadDelayForReg(guest_reg);
// set up the load delay at the end of this instruction
Value& cache_value = m_next_load_delay_value;
Assert(m_next_load_delay_register == Reg::count);
@@ -561,7 +564,7 @@ void RegisterCache::WriteLoadDelayToCPU(bool clear)
if (m_load_delay_register != Reg::count)
{
Log_DebugPrintf("Flushing pending load delay of %s", GetRegName(m_load_delay_register));
m_code_generator.EmitStoreLoadDelay(m_load_delay_register, m_load_delay_value);
m_code_generator.EmitStoreInterpreterLoadDelay(m_load_delay_register, m_load_delay_value);
if (clear)
{
m_load_delay_register = Reg::count;
@@ -573,9 +576,9 @@ void RegisterCache::WriteLoadDelayToCPU(bool clear)
void RegisterCache::FlushLoadDelay(bool clear)
{
Assert(m_next_load_delay_register == Reg::count);
if (m_load_delay_register == Reg::count)
return;
if (m_load_delay_register != Reg::count)
{
// if this is an exception exit, write the new value to the CPU register file, but keep it tracked for the next
// non-exception-raised path. TODO: push/pop whole state would avoid this issue
m_code_generator.EmitStoreGuestRegister(m_load_delay_register, m_load_delay_value);
@@ -585,6 +588,7 @@ void RegisterCache::FlushLoadDelay(bool clear)
m_load_delay_register = Reg::count;
m_load_delay_value.ReleaseAndClear();
}
}
}
void RegisterCache::FlushGuestRegister(Reg guest_reg, bool invalidate, bool clear_dirty)
@@ -627,6 +631,16 @@ void RegisterCache::InvalidateGuestRegister(Reg guest_reg)
cache_value.Clear();
}
void RegisterCache::InvalidateAllNonDirtyGuestRegisters()
{
for (u8 reg = 0; reg < static_cast<u8>(Reg::count); reg++)
{
Value& cache_value = m_guest_reg_cache[reg];
if (cache_value.IsValid() && !cache_value.IsDirty())
InvalidateGuestRegister(static_cast<Reg>(reg));
}
}
void RegisterCache::FlushAllGuestRegisters(bool invalidate, bool clear_dirty)
{
for (u8 reg = 0; reg < static_cast<u8>(Reg::count); reg++)
@@ -694,4 +708,5 @@ void RegisterCache::AppendRegisterToOrder(Reg reg)
m_guest_register_order[0] = reg;
m_guest_register_order_count++;
}
} // namespace CPU::Recompiler


@@ -257,6 +257,7 @@ public:
void FlushGuestRegister(Reg guest_reg, bool invalidate, bool clear_dirty);
void InvalidateGuestRegister(Reg guest_reg);
void InvalidateAllNonDirtyGuestRegisters();
void FlushAllGuestRegisters(bool invalidate, bool clear_dirty);
bool EvictOneGuestRegister();