CPU/Recompiler: Use register cache for managing pc

Reduces the number of loads/stores emitted after each instruction.
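
In rough terms, the win looks like this (a minimal sketch with stand-in types, not the actual RegisterCache/emitter code):

    // Each compiled instruction's address (cbi.pc) is known at compile time,
    // so once pc lives in the register cache it can be tracked as a constant
    // and materialized with a single store at block exits, branches, or
    // potentially-faulting instructions, instead of a load+store per step.
    #include <cstdint>

    struct Regs { uint32_t pc; };

    // Old scheme: one memory round-trip per emulated instruction.
    inline void AdvancePcOld(Regs& r) { r.pc += 4; }

    // New scheme (sketch): pc values inside the block are compile-time
    // constants; only the final value is written back.
    inline void FlushPc(Regs& r, uint32_t block_start_pc, uint32_t n_instrs) {
      r.pc = block_start_pc + n_instrs * 4;
    }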
Connor McLaughlin 2019-12-12 23:34:53 +10:00
parent 002d1cd4fd
commit aa52dbfeb8
7 changed files with 88 additions and 105 deletions

View File

@@ -120,6 +120,9 @@ void CodeCache::Execute()
}
}
}
// in case we switch to interpreter...
m_core->m_regs.npc = m_core->m_regs.pc;
}
void CodeCache::SetUseRecompiler(bool enable)
@@ -148,17 +151,17 @@ void CodeCache::Flush()
void CodeCache::LogCurrentState()
{
const auto& regs = m_core->m_regs;
WriteToExecutionLog(
"tick=%u pc=%08X npc=%08X zero=%08X at=%08X v0=%08X v1=%08X a0=%08X a1=%08X a2=%08X a3=%08X t0=%08X "
"t1=%08X t2=%08X t3=%08X t4=%08X t5=%08X t6=%08X t7=%08X s0=%08X s1=%08X s2=%08X s3=%08X s4=%08X "
"s5=%08X s6=%08X s7=%08X t8=%08X t9=%08X k0=%08X k1=%08X gp=%08X sp=%08X fp=%08X ra=%08X npc=%08X ldr=%s "
"ldv=%08X\n",
m_system->GetGlobalTickCounter() + m_core->GetPendingTicks(), regs.pc, regs.npc, regs.zero, regs.at, regs.v0,
regs.v1, regs.a0, regs.a1, regs.a2, regs.a3, regs.t0, regs.t1, regs.t2, regs.t3, regs.t4, regs.t5, regs.t6, regs.t7,
regs.s0, regs.s1, regs.s2, regs.s3, regs.s4, regs.s5, regs.s6, regs.s7, regs.t8, regs.t9, regs.k0, regs.k1, regs.gp,
regs.sp, regs.fp, regs.ra, regs.npc,
(m_core->m_next_load_delay_reg == Reg::count) ? "NONE" : GetRegName(m_core->m_next_load_delay_reg),
(m_core->m_next_load_delay_reg == Reg::count) ? 0 : m_core->m_next_load_delay_value);
WriteToExecutionLog("tick=%u pc=%08X zero=%08X at=%08X v0=%08X v1=%08X a0=%08X a1=%08X a2=%08X a3=%08X t0=%08X "
"t1=%08X t2=%08X t3=%08X t4=%08X t5=%08X t6=%08X t7=%08X s0=%08X s1=%08X s2=%08X s3=%08X s4=%08X "
"s5=%08X s6=%08X s7=%08X t8=%08X t9=%08X k0=%08X k1=%08X gp=%08X sp=%08X fp=%08X ra=%08X ldr=%s "
"ldv=%08X\n",
m_system->GetGlobalTickCounter() + m_core->GetPendingTicks(), regs.pc, regs.zero, regs.at,
regs.v0, regs.v1, regs.a0, regs.a1, regs.a2, regs.a3, regs.t0, regs.t1, regs.t2, regs.t3, regs.t4,
regs.t5, regs.t6, regs.t7, regs.s0, regs.s1, regs.s2, regs.s3, regs.s4, regs.s5, regs.s6, regs.s7,
regs.t8, regs.t9, regs.k0, regs.k1, regs.gp, regs.sp, regs.fp, regs.ra,
(m_core->m_next_load_delay_reg == Reg::count) ? "NONE" :
GetRegName(m_core->m_next_load_delay_reg),
(m_core->m_next_load_delay_reg == Reg::count) ? 0 : m_core->m_next_load_delay_value);
}
CodeBlockKey CodeCache::GetNextBlockKey() const
@@ -202,7 +205,8 @@ bool CodeCache::RevalidateBlock(CodeBlock* block)
for (const CodeBlockInstruction& cbi : block->instructions)
{
u32 new_code = 0;
m_bus->DispatchAccess<MemoryAccessType::Read, MemoryAccessSize::Word>(cbi.pc, new_code);
m_bus->DispatchAccess<MemoryAccessType::Read, MemoryAccessSize::Word>(cbi.pc & PHYSICAL_MEMORY_ADDRESS_MASK,
new_code);
if (cbi.instruction.bits != new_code)
{
Log_DebugPrintf("Block 0x%08X changed at PC 0x%08X - %08X to %08X - recompiling.", block->GetPC(), cbi.pc,
@@ -419,7 +423,9 @@ void CodeCache::UnlinkBlock(CodeBlock* block)
void CodeCache::InterpretCachedBlock(const CodeBlock& block)
{
// set up the state so we've already fetched the instruction
DebugAssert((m_core->m_regs.pc & PHYSICAL_MEMORY_ADDRESS_MASK) == block.GetPC());
DebugAssert(m_core->m_regs.pc == block.GetPC());
m_core->m_regs.npc = block.GetPC() + 4;
for (const CodeBlockInstruction& cbi : block.instructions)
{
@@ -427,14 +433,13 @@ void CodeCache::InterpretCachedBlock(const CodeBlock& block)
// now executing the instruction we previously fetched
m_core->m_current_instruction.bits = cbi.instruction.bits;
m_core->m_current_instruction_pc = m_core->m_regs.pc;
m_core->m_current_instruction_pc = cbi.pc;
m_core->m_current_instruction_in_branch_delay_slot = cbi.is_branch_delay_slot;
m_core->m_current_instruction_was_branch_taken = m_core->m_branch_was_taken;
m_core->m_branch_was_taken = false;
m_core->m_exception_raised = false;
// update pc
DebugAssert((m_core->m_regs.pc & PHYSICAL_MEMORY_ADDRESS_MASK) == cbi.pc);
m_core->m_regs.pc = m_core->m_regs.npc;
m_core->m_regs.npc += 4;
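
For reference, the loop above reduces to this model (stand-in types; the execute call is elided). pc always points past the current instruction while it runs, so a branch rewrites npc with pc already at its delay slot, and an exception observes a consistent pc/npc pair:

    #include <cstdint>
    #include <vector>

    struct Inst { uint32_t pc; };
    struct CoreModel { uint32_t pc, npc, current_instruction_pc; };

    void InterpretBlockModel(CoreModel& core, const std::vector<Inst>& block) {
      core.npc = block.front().pc + 4;        // block entry, as in the diff
      for (const Inst& i : block) {
        core.current_instruction_pc = i.pc;   // instruction being executed
        core.pc = core.npc;                   // advance past the fetch...
        core.npc += 4;                        // ...before the body runs
        // ExecuteInstruction(core, i);       // may overwrite npc (branches)
      }
    }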
@@ -454,6 +459,8 @@ void CodeCache::InterpretCachedBlock(const CodeBlock& block)
void CodeCache::InterpretUncachedBlock()
{
Panic("Fixme with regards to re-fetching PC");
// At this point, pc contains the last address executed (in the previous block). The instruction has not been fetched
// yet. pc shouldn't be updated until the fetch occurs, that way the exception occurs in the delay slot.
bool in_branch_delay_slot = false;

View File

@@ -28,6 +28,8 @@ union CodeBlockKey
ALWAYS_INLINE u32 GetPC() const { return aligned_pc << 2; }
ALWAYS_INLINE void SetPC(u32 pc) { aligned_pc = pc >> 2; }
ALWAYS_INLINE u32 GetPCPhysicalAddress() const { return (aligned_pc << 2) & PHYSICAL_MEMORY_ADDRESS_MASK; }
ALWAYS_INLINE CodeBlockKey& operator=(const CodeBlockKey& rhs)
{
bits = rhs.bits;
@@ -72,12 +74,15 @@ struct CodeBlock
const u32 GetPC() const { return key.GetPC(); }
const u32 GetSizeInBytes() const { return static_cast<u32>(instructions.size()) * sizeof(Instruction); }
const u32 GetStartPageIndex() const { return (key.GetPC() / CPU_CODE_CACHE_PAGE_SIZE); }
const u32 GetEndPageIndex() const { return ((key.GetPC() + GetSizeInBytes()) / CPU_CODE_CACHE_PAGE_SIZE); }
const u32 GetStartPageIndex() const { return (key.GetPCPhysicalAddress() / CPU_CODE_CACHE_PAGE_SIZE); }
const u32 GetEndPageIndex() const
{
return ((key.GetPCPhysicalAddress() + GetSizeInBytes()) / CPU_CODE_CACHE_PAGE_SIZE);
}
bool IsInRAM() const
{
// TODO: Constant
return key.GetPC() < 0x200000;
return key.GetPCPhysicalAddress() < 0x200000;
}
};
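
The switch to GetPCPhysicalAddress() matters because the PS1's KUSEG, KSEG0 and KSEG1 segments alias the same physical RAM, so page indices must come from the masked address. A small sketch, assuming PHYSICAL_MEMORY_ADDRESS_MASK is 0x1FFFFFFF:

    #include <cassert>
    #include <cstdint>

    constexpr uint32_t kPhysMask = 0x1FFFFFFFu;  // assumed mask value

    constexpr uint32_t ToPhysical(uint32_t vaddr) { return vaddr & kPhysMask; }

    int main() {
      // The same code at its KSEG0 and KSEG1 aliases lands on one cache page.
      assert(ToPhysical(0x80001000u) == ToPhysical(0xA0001000u));
      assert(ToPhysical(0x00001000u) == 0x00001000u);  // KUSEG low RAM
    }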

View File

@@ -50,6 +50,8 @@ bool CodeGenerator::CompileBlock(const CodeBlock* block, CodeBlock::HostCodePoin
EmitEndBlock();
FinalizeBlock(out_host_code, out_host_code_size);
Log_ProfilePrintf("JIT block 0x%08X: %zu instructions (%u bytes), %u host bytes", block->GetPC(),
block->instructions.size(), block->GetSizeInBytes(), *out_host_code_size);
DebugAssert(m_register_cache.GetUsedHostRegisters() == 0);
@@ -709,14 +711,6 @@ void CodeGenerator::BlockPrologue()
m_branch_was_taken_dirty = true;
m_current_instruction_was_branch_taken_dirty = false;
m_load_delay_dirty = true;
// sync m_current_instruction_pc so we can simply add to it
SyncCurrentInstructionPC();
// and the same for m_regs.pc
SyncPC();
EmitAddCPUStructField(offsetof(Core, m_regs.npc), Value::FromConstantU32(4));
}
void CodeGenerator::BlockEpilogue()
@@ -729,17 +723,7 @@ void CodeGenerator::BlockEpilogue()
if (m_register_cache.HasLoadDelay())
m_register_cache.WriteLoadDelayToCPU(true);
// if the last instruction wasn't a fallback, we need to add its fetch
if (m_delayed_pc_add > 0)
{
EmitAddCPUStructField(offsetof(Core, m_regs.npc), Value::FromConstantU32(m_delayed_pc_add));
m_delayed_pc_add = 0;
}
AddPendingCycles();
// TODO: correct value for is_branch_delay_slot - branches in branch delay slot.
EmitStoreCPUStructField(offsetof(Core, m_next_instruction_is_branch_delay_slot), Value::FromConstantU8(0));
}
void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCount cycles,
@@ -771,42 +755,29 @@ void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCou
m_current_instruction_in_branch_delay_slot_dirty = false;
}
if (cbi.is_branch_delay_slot)
// increment PC, except if we're in the branch delay slot where it was just changed
if (!cbi.is_branch_delay_slot)
{
// m_regs.pc should be synced for the next block, as the branch wrote to npc
SyncCurrentInstructionPC();
SyncPC();
// m_current_instruction_in_branch_delay_slot = true
EmitStoreCPUStructField(offsetof(Core, m_current_instruction_in_branch_delay_slot), Value::FromConstantU8(1));
m_current_instruction_in_branch_delay_slot_dirty = true;
Assert(!m_register_cache.IsGuestRegisterInHostRegister(Reg::pc));
m_register_cache.WriteGuestRegister(Reg::pc, Value::FromConstantU32(cbi.pc + 4));
}
if (!CanInstructionTrap(cbi.instruction, m_block->key.user_mode) && !force_sync)
{
// Defer updates for non-faulting instructions.
m_delayed_pc_add += INSTRUCTION_SIZE;
m_delayed_cycles_add += cycles;
return;
}
if (m_delayed_pc_add > 0)
if (cbi.is_branch_delay_slot)
{
// m_current_instruction_pc += m_delayed_pc_add
EmitAddCPUStructField(offsetof(Core, m_current_instruction_pc), Value::FromConstantU32(m_delayed_pc_add));
// m_regs.pc += m_delayed_pc_add
EmitAddCPUStructField(offsetof(Core, m_regs.pc), Value::FromConstantU32(m_delayed_pc_add));
// m_regs.npc += m_delayed_pc_add
// TODO: This can go once we recompile branch instructions and unconditionally set npc
EmitAddCPUStructField(offsetof(Core, m_regs.npc), Value::FromConstantU32(m_delayed_pc_add));
m_delayed_pc_add = 0;
// m_current_instruction_in_branch_delay_slot = true
EmitStoreCPUStructField(offsetof(Core, m_current_instruction_in_branch_delay_slot), Value::FromConstantU8(1));
m_current_instruction_in_branch_delay_slot_dirty = true;
}
if (!cbi.is_branch_instruction)
m_delayed_pc_add = INSTRUCTION_SIZE;
// Sync current instruction PC
EmitStoreCPUStructField(offsetof(Core, m_current_instruction_pc), Value::FromConstantU32(cbi.pc));
m_delayed_cycles_add += cycles;
AddPendingCycles();
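
The deleted m_delayed_pc_add accumulator is redundant here because the register cache batches the same way for free: writing a constant pc emits no host code until the value is flushed. A sketch of that behaviour (a stand-in for the real RegisterCache/Value machinery):

    #include <cstdint>
    #include <optional>

    struct PcCacheModel {
      std::optional<uint32_t> constant;  // pc known at compile time

      void Write(uint32_t pc) { constant = pc; }  // no host code emitted
      void Flush(uint32_t& cpu_pc) {              // one store, on demand
        if (constant) { cpu_pc = *constant; constant.reset(); }
      }
    };
    // Compiling N non-faulting instructions:
    //   cache.Write(pc0 + 4); cache.Write(pc0 + 8); ...; cache.Flush(cpu.pc);
    // emits exactly one pc store, just like the old m_delayed_pc_add path.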
@@ -835,22 +806,6 @@ void CodeGenerator::InstructionEpilogue(const CodeBlockInstruction& cbi)
}
}
void CodeGenerator::SyncCurrentInstructionPC()
{
// m_current_instruction_pc = m_regs.pc
Value pc_value = m_register_cache.AllocateScratch(RegSize_32);
EmitLoadCPUStructField(pc_value.host_reg, RegSize_32, offsetof(Core, m_regs.pc));
EmitStoreCPUStructField(offsetof(Core, m_current_instruction_pc), pc_value);
}
void CodeGenerator::SyncPC()
{
// m_regs.pc = m_regs.npc
Value npc_value = m_register_cache.AllocateScratch(RegSize_32);
EmitLoadCPUStructField(npc_value.host_reg, RegSize_32, offsetof(Core, m_regs.npc));
EmitStoreCPUStructField(offsetof(Core, m_regs.pc), npc_value);
}
void CodeGenerator::AddPendingCycles()
{
if (m_delayed_cycles_add == 0)
@@ -1246,8 +1201,7 @@ bool CodeGenerator::Compile_SetLess(const CodeBlockInstruction& cbi)
bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
{
// Force sync since branches are PC-relative.
InstructionPrologue(cbi, 1, true);
InstructionPrologue(cbi, 1);
// Compute the branch target.
// This depends on the form of the instruction.
@@ -1258,7 +1212,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
{
// npc = (pc & 0xF0000000) | (target << 2)
Value branch_target =
OrValues(AndValues(m_register_cache.ReadGuestRegister(Reg::pc, false), Value::FromConstantU32(0xF0000000)),
OrValues(AndValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(0xF0000000)),
Value::FromConstantU32(cbi.instruction.j.target << 2));
EmitBranch(Condition::Always, (cbi.instruction.op == InstructionOp::jal) ? Reg::ra : Reg::count,
@@ -1294,7 +1248,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
case InstructionOp::bne:
{
// npc = pc + (sext(imm) << 2)
Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc, false),
Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc),
Value::FromConstantU32(cbi.instruction.i.imm_sext32() << 2), false);
// branch <- rs op rt
@@ -1311,7 +1265,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
case InstructionOp::blez:
{
// npc = pc + (sext(imm) << 2)
Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc, false),
Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc),
Value::FromConstantU32(cbi.instruction.i.imm_sext32() << 2), false);
// branch <- rs op 0
@@ -1327,7 +1281,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
case InstructionOp::b:
{
// npc = pc + (sext(imm) << 2)
Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc, false),
Value branch_target = AddValues(m_register_cache.ReadGuestRegister(Reg::pc),
Value::FromConstantU32(cbi.instruction.i.imm_sext32() << 2), false);
const u8 rt = static_cast<u8>(cbi.instruction.i.rt.GetValue());
@@ -1344,7 +1298,8 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi)
if (link)
{
EmitCancelInterpreterLoadDelayForReg(Reg::ra);
m_register_cache.WriteGuestRegister(Reg::ra, m_register_cache.ReadGuestRegister(Reg::npc, false));
m_register_cache.WriteGuestRegister(
Reg::ra, AddValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(4), false));
}
EmitTest(lhs.host_reg, lhs);
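
This works because the prologue already wrote pc = cbi.pc + 4 into the cache, so adding 4 yields the MIPS return address, i.e. the instruction after the delay slot. A worked example with an assumed address:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t branch_pc = 0x80010000u;    // hypothetical jal address
      const uint32_t cached_pc = branch_pc + 4;  // written by the prologue
      const uint32_t ra = cached_pc + 4;         // AddValues(pc, 4) above
      assert(ra == branch_pc + 8);               // skips the delay slot
    }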

View File

@@ -165,8 +165,6 @@ private:
void BlockEpilogue();
void InstructionPrologue(const CodeBlockInstruction& cbi, TickCount cycles, bool force_sync = false);
void InstructionEpilogue(const CodeBlockInstruction& cbi);
void SyncCurrentInstructionPC();
void SyncPC();
void AddPendingCycles();
Value DoGTERegisterRead(u32 index);
@@ -202,7 +200,6 @@ private:
CodeEmitter m_far_emitter;
CodeEmitter* m_emit;
u32 m_delayed_pc_add = 0;
TickCount m_delayed_cycles_add = 0;
// whether various flags need to be reset.

View File

@@ -201,6 +201,9 @@ void CodeGenerator::EmitEndBlock()
void CodeGenerator::EmitExceptionExit()
{
// toss away our PC value since we're jumping to the exception handler
m_register_cache.InvalidateGuestRegister(Reg::pc);
// ensure all unflushed registers are written back
m_register_cache.FlushAllGuestRegisters(false, false);
@@ -1762,28 +1765,33 @@ static void EmitConditionalJump(Condition condition, bool invert, Xbyak::CodeGen
void CodeGenerator::EmitBranch(Condition condition, Reg lr_reg, Value&& branch_target)
{
// allocate scratch register for reading npc - we return to the main path, so this could cause a reg flush
Value old_npc = m_register_cache.AllocateScratch(RegSize_32);
// npc gets modified by the branch, so we can't trust it on returning. same for lr_reg, which might contain a dirty
// value
m_register_cache.FlushGuestRegister(Reg::npc, true, true);
if (lr_reg != Reg::count)
// ensure the lr register is flushed, since we want its correct value after the branch
if (lr_reg != Reg::count && lr_reg != Reg::zero)
m_register_cache.FlushGuestRegister(lr_reg, true, true);
// condition is inverted because we want the case for skipping it
// compute return address, which is also set as the new pc when the branch isn't taken
Value new_pc;
if (condition != Condition::Always || lr_reg != Reg::count)
{
new_pc = AddValues(m_register_cache.ReadGuestRegister(Reg::pc), Value::FromConstantU32(4), false);
if (!new_pc.IsInHostRegister())
new_pc = GetValueInHostRegister(new_pc);
}
Xbyak::Label skip_branch;
if (condition != Condition::Always)
{
// condition is inverted because we want the case for skipping it
EmitConditionalJump(condition, true, m_emit, skip_branch);
}
// save the old PC if we want to
if (lr_reg != Reg::count)
if (lr_reg != Reg::count && lr_reg != Reg::zero)
{
// Can't cache because we have two branches. Load delay cancel is needed because of the immediate flush afterwards;
// if we don't cancel it, the value we write can be overwritten at the end of the instruction.
EmitCancelInterpreterLoadDelayForReg(lr_reg);
EmitLoadGuestRegister(old_npc.host_reg, Reg::npc);
EmitStoreGuestRegister(lr_reg, old_npc);
EmitStoreGuestRegister(lr_reg, new_pc);
}
// we don't need to test the address of constant branches unless they're definitely misaligned, which would be
@@ -1814,12 +1822,18 @@ void CodeGenerator::EmitBranch(Condition condition, Reg lr_reg, Value&& branch_t
m_register_cache.PopState();
}
// branch taken path - write new PC and flush it, since two branches
EmitStoreGuestRegister(Reg::npc, branch_target);
EmitStoreCPUStructField(offsetof(Core, m_current_instruction_was_branch_taken), Value::FromConstantU8(1));
// branch taken path - change the return address/new pc
if (condition != Condition::Always)
EmitCopyValue(new_pc.GetHostRegister(), branch_target);
// converge point
m_emit->L(skip_branch);
// update pc
if (condition != Condition::Always)
m_register_cache.WriteGuestRegister(Reg::pc, std::move(new_pc));
else
m_register_cache.WriteGuestRegister(Reg::pc, std::move(branch_target));
}
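
Condensed, the emitted control flow for a conditional branch now converges on a single host register which the cache then owns, replacing the old double flush of npc. A host-side model (a sketch, not actual emitter output):

    #include <cstdint>

    uint32_t BranchModel(uint32_t cached_pc, uint32_t branch_target,
                         bool taken, bool link, uint32_t& guest_lr) {
      uint32_t new_pc = cached_pc + 4;  // fall-through; cached_pc is already
                                        // instruction_pc + 4 from the prologue
      if (taken) {                      // the inverted EmitConditionalJump
        if (link)
          guest_lr = new_pc;            // return address on the taken path
        new_pc = branch_target;         // overwritten before the converge point
      }
      return new_pc;                    // register cache takes ownership
    }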
void CodeGenerator::EmitRaiseException(Exception excode, Condition condition /* = Condition::Always */)
@@ -1827,14 +1841,12 @@ void CodeGenerator::EmitRaiseException(Exception excode, Condition condition /*
if (condition == Condition::Always)
{
// no need to use far code if we're always raising the exception
EmitFunctionCall(nullptr, &Thunks::RaiseException, m_register_cache.GetCPUPtr(),
Value::FromConstantU8(static_cast<u8>(excode)));
m_register_cache.InvalidateGuestRegister(Reg::pc);
m_register_cache.FlushAllGuestRegisters(true, true);
m_register_cache.FlushLoadDelay(true);
// PC should be synced at this point. If we leave the 4 on here for this instruction, we mess up npc.
Assert(m_delayed_pc_add == 4);
m_delayed_pc_add = 0;
EmitFunctionCall(nullptr, &Thunks::RaiseException, m_register_cache.GetCPUPtr(),
Value::FromConstantU8(static_cast<u8>(excode)));
return;
}

View File

@@ -252,6 +252,13 @@ public:
return cache_value.IsConstant() || cache_value.IsInHostRegister();
}
/// Returns true if the specified guest register is cached and in a host register.
bool IsGuestRegisterInHostRegister(Reg guest_reg) const
{
const Value& cache_value = m_state.guest_reg_state[static_cast<u8>(guest_reg)];
return cache_value.IsInHostRegister();
}
/// Returns the host register if the guest register is cached.
std::optional<HostReg> GetHostRegisterForGuestRegister(Reg guest_reg) const
{

View File

@@ -189,7 +189,7 @@ bool CanInstructionTrap(const Instruction& instruction, bool in_user_mode)
case InstructionOp::bgtz:
case InstructionOp::blez:
case InstructionOp::bne:
return true;
return false;
case InstructionOp::funct:
{