From aa52dbfeb8e00fb385122cfe1f7a4525e5ca0cde Mon Sep 17 00:00:00 2001
From: Connor McLaughlin <stenzek@gmail.com>
Date: Thu, 12 Dec 2019 23:34:53 +1000
Subject: [PATCH] CPU/Recompiler: Use register cache for managing pc

Reduces the number of loads/stores after each instruction.
---
 src/core/cpu_code_cache.cpp                   | 37 +++++----
 src/core/cpu_code_cache.h                     | 11 ++-
 src/core/cpu_recompiler_code_generator.cpp    | 83 +++++--------------
 src/core/cpu_recompiler_code_generator.h      |  3 -
 .../cpu_recompiler_code_generator_x64.cpp     | 50 ++++++-----
 src/core/cpu_recompiler_register_cache.h      |  7 ++
 src/core/cpu_types.cpp                        |  2 +-
 7 files changed, 88 insertions(+), 105 deletions(-)

diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp
index cc7f6eaef..6c55ba21f 100644
--- a/src/core/cpu_code_cache.cpp
+++ b/src/core/cpu_code_cache.cpp
@@ -120,6 +120,9 @@ void CodeCache::Execute()
       }
     }
   }
+
+  // in case we switch to interpreter...
+  m_core->m_regs.npc = m_core->m_regs.pc;
 }
 
 void CodeCache::SetUseRecompiler(bool enable)
@@ -148,17 +151,17 @@ void CodeCache::Flush()
 void CodeCache::LogCurrentState()
 {
   const auto& regs = m_core->m_regs;
-  WriteToExecutionLog(
-    "tick=%u pc=%08X npc=%08X zero=%08X at=%08X v0=%08X v1=%08X a0=%08X a1=%08X a2=%08X a3=%08X t0=%08X "
-    "t1=%08X t2=%08X t3=%08X t4=%08X t5=%08X t6=%08X t7=%08X s0=%08X s1=%08X s2=%08X s3=%08X s4=%08X "
-    "s5=%08X s6=%08X s7=%08X t8=%08X t9=%08X k0=%08X k1=%08X gp=%08X sp=%08X fp=%08X ra=%08X npc=%08X ldr=%s "
-    "ldv=%08X\n",
-    m_system->GetGlobalTickCounter() + m_core->GetPendingTicks(), regs.pc, regs.npc, regs.zero, regs.at, regs.v0,
-    regs.v1, regs.a0, regs.a1, regs.a2, regs.a3, regs.t0, regs.t1, regs.t2, regs.t3, regs.t4, regs.t5, regs.t6, regs.t7,
-    regs.s0, regs.s1, regs.s2, regs.s3, regs.s4, regs.s5, regs.s6, regs.s7, regs.t8, regs.t9, regs.k0, regs.k1, regs.gp,
-    regs.sp, regs.fp, regs.ra, regs.npc,
-    (m_core->m_next_load_delay_reg == Reg::count) ? "NONE" : GetRegName(m_core->m_next_load_delay_reg),
-    (m_core->m_next_load_delay_reg == Reg::count) ? 0 : m_core->m_next_load_delay_value);
+  WriteToExecutionLog("tick=%u pc=%08X zero=%08X at=%08X v0=%08X v1=%08X a0=%08X a1=%08X a2=%08X a3=%08X t0=%08X "
+                      "t1=%08X t2=%08X t3=%08X t4=%08X t5=%08X t6=%08X t7=%08X s0=%08X s1=%08X s2=%08X s3=%08X s4=%08X "
+                      "s5=%08X s6=%08X s7=%08X t8=%08X t9=%08X k0=%08X k1=%08X gp=%08X sp=%08X fp=%08X ra=%08X ldr=%s "
+                      "ldv=%08X\n",
+                      m_system->GetGlobalTickCounter() + m_core->GetPendingTicks(), regs.pc, regs.zero, regs.at,
+                      regs.v0, regs.v1, regs.a0, regs.a1, regs.a2, regs.a3, regs.t0, regs.t1, regs.t2, regs.t3, regs.t4,
+                      regs.t5, regs.t6, regs.t7, regs.s0, regs.s1, regs.s2, regs.s3, regs.s4, regs.s5, regs.s6, regs.s7,
+                      regs.t8, regs.t9, regs.k0, regs.k1, regs.gp, regs.sp, regs.fp, regs.ra,
+                      (m_core->m_next_load_delay_reg == Reg::count) ? "NONE" :
+                                                                      GetRegName(m_core->m_next_load_delay_reg),
+                      (m_core->m_next_load_delay_reg == Reg::count) ? 0 : m_core->m_next_load_delay_value);
 }
 
 CodeBlockKey CodeCache::GetNextBlockKey() const
@@ -202,7 +205,8 @@ bool CodeCache::RevalidateBlock(CodeBlock* block)
   for (const CodeBlockInstruction& cbi : block->instructions)
   {
     u32 new_code = 0;
-    m_bus->DispatchAccess<MemoryAccessType::Read, MemoryAccessSize::Word>(cbi.pc, new_code);
+    m_bus->DispatchAccess<MemoryAccessType::Read, MemoryAccessSize::Word>(cbi.pc & PHYSICAL_MEMORY_ADDRESS_MASK,
+                                                                          new_code);
     if (cbi.instruction.bits != new_code)
     {
       Log_DebugPrintf("Block 0x%08X changed at PC 0x%08X - %08X to %08X - recompiling.", block->GetPC(), cbi.pc,
@@ -419,7 +423,9 @@ void CodeCache::UnlinkBlock(CodeBlock* block)
 void CodeCache::InterpretCachedBlock(const CodeBlock& block)
 {
   // set up the state so we've already fetched the instruction
-  DebugAssert((m_core->m_regs.pc & PHYSICAL_MEMORY_ADDRESS_MASK) == block.GetPC());
+  DebugAssert(m_core->m_regs.pc == block.GetPC());
+
+  m_core->m_regs.npc = block.GetPC() + 4;
 
   for (const CodeBlockInstruction& cbi : block.instructions)
   {
@@ -427,14 +433,13 @@ void CodeCache::InterpretCachedBlock(const CodeBlock& block)
 
     // now executing the instruction we previously fetched
     m_core->m_current_instruction.bits = cbi.instruction.bits;
-    m_core->m_current_instruction_pc = m_core->m_regs.pc;
+    m_core->m_current_instruction_pc = cbi.pc;
     m_core->m_current_instruction_in_branch_delay_slot = cbi.is_branch_delay_slot;
     m_core->m_current_instruction_was_branch_taken = m_core->m_branch_was_taken;
     m_core->m_branch_was_taken = false;
     m_core->m_exception_raised = false;
 
     // update pc
-    DebugAssert((m_core->m_regs.pc & PHYSICAL_MEMORY_ADDRESS_MASK) == cbi.pc);
     m_core->m_regs.pc = m_core->m_regs.npc;
     m_core->m_regs.npc += 4;
 
@@ -454,6 +459,8 @@ void CodeCache::InterpretCachedBlock(const CodeBlock& block)
 
 void CodeCache::InterpretUncachedBlock()
 {
+  Panic("Fixme with regards to re-fetching PC");
+
   // At this point, pc contains the last address executed (in the previous block). The instruction has not been fetched
   // yet. pc shouldn't be updated until the fetch occurs, that way the exception occurs in the delay slot.
   bool in_branch_delay_slot = false;
diff --git a/src/core/cpu_code_cache.h b/src/core/cpu_code_cache.h
index 9c00dd821..4dd264782 100644
--- a/src/core/cpu_code_cache.h
+++ b/src/core/cpu_code_cache.h
@@ -28,6 +28,8 @@ union CodeBlockKey
   ALWAYS_INLINE u32 GetPC() const { return aligned_pc << 2; }
   ALWAYS_INLINE void SetPC(u32 pc) { aligned_pc = pc >> 2; }
 
+  ALWAYS_INLINE u32 GetPCPhysicalAddress() const { return (aligned_pc << 2) & PHYSICAL_MEMORY_ADDRESS_MASK; }
+
   ALWAYS_INLINE CodeBlockKey& operator=(const CodeBlockKey& rhs)
   {
     bits = rhs.bits;
@@ -72,12 +74,15 @@ struct CodeBlock
 
   const u32 GetPC() const { return key.GetPC(); }
   const u32 GetSizeInBytes() const { return static_cast<u32>(instructions.size()) * sizeof(Instruction); }
-  const u32 GetStartPageIndex() const { return (key.GetPC() / CPU_CODE_CACHE_PAGE_SIZE); }
-  const u32 GetEndPageIndex() const { return ((key.GetPC() + GetSizeInBytes()) / CPU_CODE_CACHE_PAGE_SIZE); }
+  const u32 GetStartPageIndex() const { return (key.GetPCPhysicalAddress() / CPU_CODE_CACHE_PAGE_SIZE); }
+  const u32 GetEndPageIndex() const
+  {
+    return ((key.GetPCPhysicalAddress() + GetSizeInBytes()) / CPU_CODE_CACHE_PAGE_SIZE);
+  }
   bool IsInRAM() const
   {
     // TODO: Constant
-    return key.GetPC() < 0x200000;
+    return key.GetPCPhysicalAddress() < 0x200000;
   }
 };
 
diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp
index cad54b6f4..08c94bfa8 100644
--- a/src/core/cpu_recompiler_code_generator.cpp
+++ b/src/core/cpu_recompiler_code_generator.cpp
@@ -50,6 +50,8 @@ bool CodeGenerator::CompileBlock(const CodeBlock* block, CodeBlock::HostCodePoin
   EmitEndBlock();
 
   FinalizeBlock(out_host_code, out_host_code_size);
+  Log_ProfilePrintf("JIT block 0x%08X: %zu instructions (%u bytes), %u host bytes", block->GetPC(),
+                    block->instructions.size(), block->GetSizeInBytes(), *out_host_code_size);
 
   DebugAssert(m_register_cache.GetUsedHostRegisters() == 0);
 
@@ -709,14 +711,6 @@ void CodeGenerator::BlockPrologue()
   m_branch_was_taken_dirty = true;
   m_current_instruction_was_branch_taken_dirty = false;
   m_load_delay_dirty = true;
-
-  // sync m_current_instruction_pc so we can simply add to it
-  SyncCurrentInstructionPC();
-
-  // and the same for m_regs.pc
-  SyncPC();
-
-  EmitAddCPUStructField(offsetof(Core, m_regs.npc), Value::FromConstantU32(4));
 }
 
 void CodeGenerator::BlockEpilogue()
@@ -729,17 +723,7 @@ void CodeGenerator::BlockEpilogue()
   if (m_register_cache.HasLoadDelay())
     m_register_cache.WriteLoadDelayToCPU(true);
 
-  // if the last instruction wasn't a fallback, we need to add its fetch
-  if (m_delayed_pc_add > 0)
-  {
-    EmitAddCPUStructField(offsetof(Core, m_regs.npc), Value::FromConstantU32(m_delayed_pc_add));
-    m_delayed_pc_add = 0;
-  }
-
   AddPendingCycles();
-
-  // TODO: correct value for is_branch_delay_slot - branches in branch delay slot.
-  EmitStoreCPUStructField(offsetof(Core, m_next_instruction_is_branch_delay_slot), Value::FromConstantU8(0));
 }
 
 void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCount cycles,
@@ -771,42 +755,29 @@ void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCou
     m_current_instruction_in_branch_delay_slot_dirty = false;
   }
 
-  if (cbi.is_branch_delay_slot)
+  // increment PC, except if we're in the branch delay slot where it was just changed
+  if (!cbi.is_branch_delay_slot)
   {
-    // m_regs.pc should be synced for the next block, as the branch wrote to npc
-    SyncCurrentInstructionPC();
-    SyncPC();
-
-    // m_current_instruction_in_branch_delay_slot = true
-    EmitStoreCPUStructField(offsetof(Core, m_current_instruction_in_branch_delay_slot), Value::FromConstantU8(1));
-    m_current_instruction_in_branch_delay_slot_dirty = true;
+    Assert(!m_register_cache.IsGuestRegisterInHostRegister(Reg::pc));
+    m_register_cache.WriteGuestRegister(Reg::pc, Value::FromConstantU32(cbi.pc + 4));
   }
 
   if (!CanInstructionTrap(cbi.instruction, m_block->key.user_mode) && !force_sync)
   {
     // Defer updates for non-faulting instructions.
-    m_delayed_pc_add += INSTRUCTION_SIZE;
     m_delayed_cycles_add += cycles;
     return;
   }
 
-  if (m_delayed_pc_add > 0)
+  if (cbi.is_branch_delay_slot)
   {
-    // m_current_instruction_pc += m_delayed_pc_add
-    EmitAddCPUStructField(offsetof(Core, m_current_instruction_pc), Value::FromConstantU32(m_delayed_pc_add));
-
-    // m_regs.pc += m_delayed_pc_add
-    EmitAddCPUStructField(offsetof(Core, m_regs.pc), Value::FromConstantU32(m_delayed_pc_add));
-
-    // m_regs.npc += m_delayed_pc_add
-    // TODO: This can go once we recompile branch instructions and unconditionally set npc
-    EmitAddCPUStructField(offsetof(Core, m_regs.npc), Value::FromConstantU32(m_delayed_pc_add));
-
-    m_delayed_pc_add = 0;
+    // m_current_instruction_in_branch_delay_slot = true
+    EmitStoreCPUStructField(offsetof(Core, m_current_instruction_in_branch_delay_slot), Value::FromConstantU8(1));
+    m_current_instruction_in_branch_delay_slot_dirty = true;
   }
 
-  if (!cbi.is_branch_instruction)
-    m_delayed_pc_add = INSTRUCTION_SIZE;
+  // Sync current instruction PC
+  EmitStoreCPUStructField(offsetof(Core, m_current_instruction_pc), Value::FromConstantU32(cbi.pc));
 
   m_delayed_cycles_add += cycles;
   AddPendingCycles();
@@ -835,22 +806,6 @@ void CodeGenerator::InstructionEpilogue(const CodeBlockInstruction& cbi)
   }
 }
 
-void CodeGenerator::SyncCurrentInstructionPC()
-{
-  // m_current_instruction_pc = m_regs.pc
-  Value pc_value = m_register_cache.AllocateScratch(RegSize_32);
-  EmitLoadCPUStructField(pc_value.host_reg, RegSize_32, offsetof(Core, m_regs.pc));
-  EmitStoreCPUStructField(offsetof(Core, m_current_instruction_pc), pc_value);
-}
-
-void CodeGenerator::SyncPC()
-{
-  // m_regs.pc = m_regs.npc
-  Value npc_value = m_register_cache.AllocateScratch(RegSize_32);
-  EmitLoadCPUStructField(npc_value.host_reg, RegSize_32, offsetof(Core, m_regs.npc));
-  EmitStoreCPUStructField(offsetof(Core, m_regs.pc), npc_value);
-}
-
 void CodeGenerator::AddPendingCycles()
 {
   if (m_delayed_cycles_add == 0)
@@ -1246,8 +1201,7 @@ bool CodeGenerator::Compile_SetLess(const CodeBlockInstruction& cbi)
 
 bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
 {
-  // Force sync since we branches are PC-relative.
-  InstructionPrologue(cbi, 1, true);
+  InstructionPrologue(cbi, 1);
 
   // Compute the branch target.
   // This depends on the form of the instruction.
@@ -1258,7 +1212,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
     {
       // npc = (pc & 0xF0000000) | (target << 2)
       Value branch_target =
-        OrValues(AndValues(m_register_cache.ReadGuestRegister(Reg::pc, false), Value::FromConstantU32(0xF0000000)),
+        OrValues(AndValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(0xF0000000)),
                  Value::FromConstantU32(cbi.instruction.j.target << 2));
 
       EmitBranch(Condition::Always, (cbi.instruction.op == InstructionOp::jal) ? Reg::ra : Reg::count,
@@ -1294,7 +1248,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
     case InstructionOp::bne:
     {
       // npc = pc + (sext(imm) << 2)
-      Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc, false),
+      Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc),
                                       Value::FromConstantU32(cbi.instruction.i.imm_sext32() << 2), false);
 
       // branch <- rs op rt
@@ -1311,7 +1265,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
     case InstructionOp::blez:
     {
       // npc = pc + (sext(imm) << 2)
-      Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc, false),
+      Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc),
                                       Value::FromConstantU32(cbi.instruction.i.imm_sext32() << 2), false);
 
       // branch <- rs op 0
@@ -1327,7 +1281,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
     case InstructionOp::b:
     {
       // npc = pc + (sext(imm) << 2)
-      Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc, false),
+      Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc),
                                       Value::FromConstantU32(cbi.instruction.i.imm_sext32() << 2), false);
 
       const u8 rt = static_cast<u8>(cbi.instruction.i.rt.GetValue());
@@ -1344,7 +1298,8 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
       if (link)
       {
         EmitCancelInterpreterLoadDelayForReg(Reg::ra);
-        m_register_cache.WriteGuestRegister(Reg::ra, m_register_cache.ReadGuestRegister(Reg::npc, false));
+        m_register_cache.WriteGuestRegister(
+          Reg::ra, AddValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(4), false));
       }
 
       EmitTest(lhs.host_reg, lhs);
diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h
index b4ccf42e6..c401d7e07 100644
--- a/src/core/cpu_recompiler_code_generator.h
+++ b/src/core/cpu_recompiler_code_generator.h
@@ -165,8 +165,6 @@ private:
   void BlockEpilogue();
   void InstructionPrologue(const CodeBlockInstruction& cbi, TickCount cycles, bool force_sync = false);
   void InstructionEpilogue(const CodeBlockInstruction& cbi);
-  void SyncCurrentInstructionPC();
-  void SyncPC();
   void AddPendingCycles();
 
   Value DoGTERegisterRead(u32 index);
@@ -202,7 +200,6 @@ private:
   CodeEmitter m_far_emitter;
   CodeEmitter* m_emit;
 
-  u32 m_delayed_pc_add = 0;
   TickCount m_delayed_cycles_add = 0;
 
   // whether various flags need to be reset.
diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp
index 3c9a7ba83..f4bdaabf9 100644
--- a/src/core/cpu_recompiler_code_generator_x64.cpp
+++ b/src/core/cpu_recompiler_code_generator_x64.cpp
@@ -201,6 +201,9 @@ void CodeGenerator::EmitEndBlock()
 
 void CodeGenerator::EmitExceptionExit()
 {
+  // toss away our PC value since we're jumping to the exception handler
+  m_register_cache.InvalidateGuestRegister(Reg::pc);
+
   // ensure all unflushed registers are written back
   m_register_cache.FlushAllGuestRegisters(false, false);
 
@@ -1762,28 +1765,33 @@ static void EmitConditionalJump(Condition condition, bool invert, Xbyak::CodeGen
 
 void CodeGenerator::EmitBranch(Condition condition, Reg lr_reg, Value&& branch_target)
 {
-  // allocate scratch register for reading npc - we return to the main path, so this could cause a reg flush
-  Value old_npc = m_register_cache.AllocateScratch(RegSize_32);
-
-  // npc gets modified by the branch, so we can't trust it on returning. same for lr_reg, which might contain a dirty
-  // value
-  m_register_cache.FlushGuestRegister(Reg::npc, true, true);
-  if (lr_reg != Reg::count)
+  // ensure the lr register is flushed, since we want its correct value after the branch
+  if (lr_reg != Reg::count && lr_reg != Reg::zero)
     m_register_cache.FlushGuestRegister(lr_reg, true, true);
 
-  // condition is inverted because we want the case for skipping it
+  // compute return address, which is also set as the new pc when the branch isn't taken
+  Value new_pc;
+  if (condition != Condition::Always || lr_reg != Reg::count)
+  {
+    new_pc = AddValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(4), false);
+    if (!new_pc.IsInHostRegister())
+      new_pc = GetValueInHostRegister(new_pc);
+  }
+
   Xbyak::Label skip_branch;
   if (condition != Condition::Always)
+  {
+    // condition is inverted because we want the case for skipping it
     EmitConditionalJump(condition, true, m_emit, skip_branch);
+  }
 
   // save the old PC if we want to
-  if (lr_reg != Reg::count)
+  if (lr_reg != Reg::count && lr_reg != Reg::zero)
   {
     // Can't cache because we have two branches. Load delay cancel is due to the immediate flush afterwards,
     // if we don't cancel it, at the end of the instruction the value we write can be overridden.
     EmitCancelInterpreterLoadDelayForReg(lr_reg);
-    EmitLoadGuestRegister(old_npc.host_reg, Reg::npc);
-    EmitStoreGuestRegister(lr_reg, old_npc);
+    EmitStoreGuestRegister(lr_reg, new_pc);
   }
 
   // we don't need to test the address of constant branches unless they're definitely misaligned, which would be
@@ -1814,12 +1822,18 @@ void CodeGenerator::EmitBranch(Condition condition, Reg lr_reg, Value&& branch_t
     m_register_cache.PopState();
   }
 
-  // branch taken path - write new PC and flush it, since two branches
-  EmitStoreGuestRegister(Reg::npc, branch_target);
-  EmitStoreCPUStructField(offsetof(Core, m_current_instruction_was_branch_taken), Value::FromConstantU8(1));
+  // branch taken path - change the return address/new pc
+  if (condition != Condition::Always)
+    EmitCopyValue(new_pc.GetHostRegister(), branch_target);
 
   // converge point
   m_emit->L(skip_branch);
+
+  // update pc
+  if (condition != Condition::Always)
+    m_register_cache.WriteGuestRegister(Reg::pc, std::move(new_pc));
+  else
+    m_register_cache.WriteGuestRegister(Reg::pc, std::move(branch_target));
 }
 
 void CodeGenerator::EmitRaiseException(Exception excode, Condition condition /* = Condition::Always */)
@@ -1827,14 +1841,12 @@ void CodeGenerator::EmitRaiseException(Exception excode, Condition condition /*
   if (condition == Condition::Always)
   {
     // no need to use far code if we're always raising the exception
-    EmitFunctionCall(nullptr, &Thunks::RaiseException, m_register_cache.GetCPUPtr(),
-                     Value::FromConstantU8(static_cast<u8>(excode)));
+    m_register_cache.InvalidateGuestRegister(Reg::pc);
     m_register_cache.FlushAllGuestRegisters(true, true);
     m_register_cache.FlushLoadDelay(true);
 
-    // PC should be synced at this point. If we leave the 4 on here for this instruction, we mess up npc.
-    Assert(m_delayed_pc_add == 4);
-    m_delayed_pc_add = 0;
+    EmitFunctionCall(nullptr, &Thunks::RaiseException, m_register_cache.GetCPUPtr(),
+                     Value::FromConstantU8(static_cast<u8>(excode)));
     return;
   }
 
diff --git a/src/core/cpu_recompiler_register_cache.h b/src/core/cpu_recompiler_register_cache.h
index 67406cf8e..dfaa13857 100644
--- a/src/core/cpu_recompiler_register_cache.h
+++ b/src/core/cpu_recompiler_register_cache.h
@@ -252,6 +252,13 @@ public:
     return cache_value.IsConstant() || cache_value.IsInHostRegister();
   }
 
+  /// Returns true if the specified guest register is cached and in a host register.
+  bool IsGuestRegisterInHostRegister(Reg guest_reg) const
+  {
+    const Value& cache_value = m_state.guest_reg_state[static_cast<u8>(guest_reg)];
+    return cache_value.IsInHostRegister();
+  }
+
   /// Returns the host register if the guest register is cached.
   std::optional<HostReg> GetHostRegisterForGuestRegister(Reg guest_reg) const
   {
diff --git a/src/core/cpu_types.cpp b/src/core/cpu_types.cpp
index 5c52f9e08..c8fd684d3 100644
--- a/src/core/cpu_types.cpp
+++ b/src/core/cpu_types.cpp
@@ -189,7 +189,7 @@ bool CanInstructionTrap(const Instruction& instruction, bool in_user_mode)
     case InstructionOp::bgtz:
     case InstructionOp::blez:
     case InstructionOp::bne:
-      return true;
+      return false;
 
     case InstructionOp::funct:
     {