From fc90d847885ca641cd51e27eeab08f1b28af9a96 Mon Sep 17 00:00:00 2001
From: Stenzek
Date: Sun, 2 Mar 2025 15:48:28 +1000
Subject: [PATCH] tailcall

---
Reviewer notes (not part of the commit message; text between the separator
above and the diffstat is ignored when the patch is applied).

What the diff below does:
* CachedInterpreterHandler changes from void(*)(u32 arg) to
  void(*)(const CachedInterpreterInstruction*); every handler now reads its
  operand from cinst->arg.
* The lambdas and the reinterpret_cast'ed CPU::AddPendingTicks handler used in
  CompileCachedInterpreterBlock() are replaced by named functions declared in
  the new CPU::CodeCache::CachedInterpreterFunctions namespace.
* With HAS_MUSTTAIL (defined only under __clang__ in
  cpu_code_cache_private.h), END_CACHED_INTERPRETER_INSTRUCTION(cinst)
  tail-calls the handler of (cinst + 1) through __attribute__((musttail)),
  compiled blocks are terminated by the EndBlock handler, and
  ExecuteCachedInterpreter() enters a block with one cinst->handler(cinst)
  call.
* Without HAS_MUSTTAIL the macro expands to nothing, compiled blocks are
  terminated by a null handler, and the do/while loop that used to live in
  ExecuteCachedInterpreterBlock() is written out in ExecuteCachedInterpreter()
  and in LookupAndExecuteBlock().
* CompileCachedInterpreterBlock() no longer executes the block it emitted;
  the compile_or_revalidate_block list gains a LookupAndExecuteBlock entry
  that fetches the handler list for g_state.pc from g_code_lut and runs it.

Points to confirm before merging:
* HAS_MUSTTAIL is keyed on __clang__ alone, with no version or
  __has_attribute check; presumably every supported clang accepts musttail.
* END_CACHED_INTERPRETER_INSTRUCTION uses its argument unparenthesised and,
  under HAS_MUSTTAIL, ends in its own ';', so each call site that also writes
  ';' expands to an extra empty statement.
* EndBlock names its cinst parameter but never uses it, and is referenced
  only inside #ifdef HAS_MUSTTAIL.
* This copy of the patch looks damaged in transit: text that would sit in
  angle brackets is absent (the author address, both #include targets, the
  template parameter list, the target types of static_cast and
  reinterpret_cast), and the line breaks and indentation shown here have
  been re-created. Regenerate the patch from the commit before applying it.
  One context line of the first cpu_core.cpp hunk (29 old lines declared)
  appears to be missing and could not be placed.

 src/core/cpu_code_cache.cpp       | 113 +++++++++++++++++++++++-------
 src/core/cpu_code_cache_private.h |  15 +++-
 src/core/cpu_core.cpp             |  15 ++--
 3 files changed, 111 insertions(+), 32 deletions(-)

diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp
index a65816933..a48834c4d 100644
--- a/src/core/cpu_code_cache.cpp
+++ b/src/core/cpu_code_cache.cpp
@@ -75,7 +75,6 @@ static void RemoveBlockFromPageList(Block* block);
 
 static void SetCachedInterpreterHandlers();
 static void CompileCachedInterpreterBlock(const u32);
-static void ExecuteCachedInterpreterBlock(const CachedInterpreterInstruction* cinst);
 [[noreturn]] static void ExecuteCachedInterpreter();
 
 // Fast map provides lookup from PC to function
@@ -714,10 +713,78 @@ PageFaultHandler::HandlerResult PageFaultHandler::HandlePageFault(void* exceptio
 // MARK: - Cached Interpreter
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
+namespace CPU::CodeCache::CachedInterpreterFunctions {
+static void CompileOrRevalidateBlock(const CachedInterpreterInstruction* cinst);
+static void LookupAndExecuteBlock(const CachedInterpreterInstruction* cinst);
+static void LogCurrentState(const CachedInterpreterInstruction* cinst);
+static void CheckAndUpdateICacheTags(const CachedInterpreterInstruction* cinst);
+static void AddDynamicFetchTicks(const CachedInterpreterInstruction* cinst);
+static void AddUncachedFetchTicks(const CachedInterpreterInstruction* cinst);
+static void EndBlock(const CachedInterpreterInstruction* cinst);
+} // namespace CPU::CodeCache::CachedInterpreterFunctions
+
+void CPU::CodeCache::CachedInterpreterFunctions::CompileOrRevalidateBlock(const CachedInterpreterInstruction* cinst)
+{
+  CompileCachedInterpreterBlock(g_state.pc);
+  END_CACHED_INTERPRETER_INSTRUCTION(cinst);
+}
+
+void CPU::CodeCache::CachedInterpreterFunctions::LookupAndExecuteBlock(const CachedInterpreterInstruction*)
+{
+  const u32 pc = g_state.pc;
+  const u32 table = pc >> LUT_TABLE_SHIFT;
+  const u32 idx = (pc & 0xFFFF) >> 2;
+  const CachedInterpreterInstruction* cinst =
+    reinterpret_cast(g_code_lut[table][idx]);
+
+#ifdef HAS_MUSTTAIL
+  RETURN_MUSTTAIL(cinst->handler(cinst));
+#else
+  do
+  {
+    cinst->handler(cinst);
+    cinst++;
+  } while (cinst->handler);
+#endif
+}
+
+void CPU::CodeCache::CachedInterpreterFunctions::LogCurrentState(const CachedInterpreterInstruction* cinst)
+{
+  CPU::CodeCache::LogCurrentState();
+  END_CACHED_INTERPRETER_INSTRUCTION(cinst);
+}
+
+void CPU::CodeCache::CachedInterpreterFunctions::CheckAndUpdateICacheTags(const CachedInterpreterInstruction* cinst)
+{
+  CPU::CheckAndUpdateICacheTags(cinst->arg);
+  END_CACHED_INTERPRETER_INSTRUCTION(cinst);
+}
+
+void CPU::CodeCache::CachedInterpreterFunctions::AddDynamicFetchTicks(const CachedInterpreterInstruction* cinst)
+{
+  AddPendingTicks(static_cast(
+    cinst->arg *
+    static_cast(*Bus::GetMemoryAccessTimePtr(g_state.pc & PHYSICAL_MEMORY_ADDRESS_MASK, MemoryAccessSize::Word))));
+  END_CACHED_INTERPRETER_INSTRUCTION(cinst);
+}
+
+void CPU::CodeCache::CachedInterpreterFunctions::AddUncachedFetchTicks(const CachedInterpreterInstruction* cinst)
+{
+  CPU::AddPendingTicks(static_cast(cinst->arg));
+  END_CACHED_INTERPRETER_INSTRUCTION(cinst);
+}
+
+void CPU::CodeCache::CachedInterpreterFunctions::EndBlock(const CachedInterpreterInstruction* cinst)
+{
+  // TODO: jump to top of block if looping, block linking, etc.
+  return;
+}
+
 void CPU::CodeCache::SetCachedInterpreterHandlers()
 {
   static constexpr const CachedInterpreterInstruction compile_or_revalidate_block[] = {
-    {&CompileCachedInterpreterBlock, 0u},
+    {&CachedInterpreterFunctions::CompileOrRevalidateBlock, 0u},
+    {&CachedInterpreterFunctions::LookupAndExecuteBlock, 0u},
     {nullptr, 0u},
   };
 
@@ -775,32 +842,26 @@ void CPU::CodeCache::CompileCachedInterpreterBlock(const u32)
 
   if (false)
   {
-    cinst->handler = [](u32) { LogCurrentState(); };
+    cinst->handler = &CachedInterpreterFunctions::LogCurrentState;
     cinst->arg = 0;
     cinst++;
   }
 
   if (block->HasFlag(BlockFlags::IsUsingICache))
   {
-    cinst->handler = &CheckAndUpdateICacheTags;
+    cinst->handler = &CachedInterpreterFunctions::CheckAndUpdateICacheTags;
     cinst->arg = block->icache_line_count;
     cinst++;
   }
   else if (block->HasFlag(BlockFlags::NeedsDynamicFetchTicks))
   {
-    static const auto dynamic_fetch_handler = [](u32 size) {
-      AddPendingTicks(
-        static_cast(size * static_cast(*Bus::GetMemoryAccessTimePtr(
-                                         g_state.pc & PHYSICAL_MEMORY_ADDRESS_MASK, MemoryAccessSize::Word))));
-    };
-
-    cinst->handler = dynamic_fetch_handler;
+    cinst->handler = &CachedInterpreterFunctions::AddDynamicFetchTicks;
     cinst->arg = block->size;
     cinst++;
   }
   else if (block->uncached_fetch_ticks > 0)
   {
-    cinst->handler = reinterpret_cast(&CPU::AddPendingTicks);
+    cinst->handler = &CachedInterpreterFunctions::AddUncachedFetchTicks;
     cinst->arg = static_cast(block->uncached_fetch_ticks);
     cinst++;
   }
@@ -817,7 +878,11 @@ void CPU::CodeCache::CompileCachedInterpreterBlock(const u32)
   }
 
   // end
+#ifdef HAS_MUSTTAIL
+  cinst->handler = &CachedInterpreterFunctions::EndBlock;
+#else
   cinst->handler = nullptr;
+#endif
   cinst->arg = 0;
   cinst++;
 
@@ -828,21 +893,9 @@ void CPU::CodeCache::CompileCachedInterpreterBlock(const u32)
   CommitCode(required_space);
   MemMap::EndCodeWrite();
 
-  // execute it
-  ExecuteCachedInterpreterBlock(cstart);
-
   // TODO: Block linking!
 }
 
-ALWAYS_INLINE_RELEASE void CPU::CodeCache::ExecuteCachedInterpreterBlock(const CachedInterpreterInstruction* cinst)
-{
-  do
-  {
-    cinst->handler(cinst->arg);
-    cinst++;
-  } while (cinst->handler);
-}
-
 [[noreturn]] void CPU::CodeCache::ExecuteCachedInterpreter()
 {
 #define CHECK_DOWNCOUNT() \
@@ -874,7 +927,17 @@ ALWAYS_INLINE_RELEASE void CPU::CodeCache::ExecuteCachedInterpreterBlock(const C
     reexecute_block:
       // Execute block.
       DebugAssert(!(HasPendingInterrupt()));
-      ExecuteCachedInterpreterBlock(cinst);
+
+#ifdef HAS_MUSTTAIL
+      cinst->handler(cinst);
+#else
+      do
+      {
+        cinst->handler(cinst);
+        cinst++;
+      } while (cinst->handler);
+#endif
+
       CHECK_DOWNCOUNT();
 
       // Handle self-looping blocks
diff --git a/src/core/cpu_code_cache_private.h b/src/core/cpu_code_cache_private.h
index c74140862..e7718ea01 100644
--- a/src/core/cpu_code_cache_private.h
+++ b/src/core/cpu_code_cache_private.h
@@ -13,6 +13,11 @@
 #include
 #include
 
+#ifdef __clang__
+#define HAS_MUSTTAIL 1
+#define RETURN_MUSTTAIL(val) __attribute__((musttail)) return val
+#endif
+
 namespace CPU::CodeCache {
 
 enum : u32
@@ -205,9 +210,17 @@ struct PageProtectionInfo
 };
 static_assert(sizeof(PageProtectionInfo) == (sizeof(Block*) * 2 + 8));
 
-using CachedInterpreterHandler = void(*)(u32 arg);
+struct CachedInterpreterInstruction;
+
+using CachedInterpreterHandler = void (*)(const CachedInterpreterInstruction*);
 CachedInterpreterHandler GetCachedInterpreterHandler(const Instruction inst);
 
+#ifdef HAS_MUSTTAIL
+#define END_CACHED_INTERPRETER_INSTRUCTION(arg) RETURN_MUSTTAIL((arg + 1)->handler(arg + 1));
+#else
+#define END_CACHED_INTERPRETER_INSTRUCTION(arg)
+#endif
+
 struct CachedInterpreterInstruction
 {
   CachedInterpreterHandler handler;
diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp
index 8c08a3900..3a345979f 100644
--- a/src/core/cpu_core.cpp
+++ b/src/core/cpu_core.cpp
@@ -2775,29 +2775,31 @@ void CPU::SetSingleStepFlag()
 namespace CPU {
 #define MAKE_CACHED_INSTRUCTION_HANDLER(insn) \
   template \
-  static void CachedInstructionHandler_##insn(u32 arg) \
+  static void CachedInstructionHandler_##insn(const CPU::CodeCache::CachedInterpreterInstruction* cinst) \
   { \
+    const Instruction inst{cinst->arg}; \
     g_state.pending_ticks++; \
-    g_state.current_instruction.bits = arg; \
+    g_state.current_instruction.bits = inst.bits; \
     g_state.current_instruction_pc = g_state.pc; \
     g_state.current_instruction_was_branch_taken = g_state.branch_was_taken; \
     g_state.branch_was_taken = false; \
     g_state.exception_raised = false; \
     g_state.pc = g_state.npc; \
     g_state.npc += 4; \
-    Execute_##insn(Instruction{arg}); \
+    Execute_##insn(inst); \
     UpdateLoadDelay(); /* TODO: For non-load instructions, we don't need to update next_load_delay_reg */ \
     g_state.next_instruction_is_branch_delay_slot = false; /* FIXME */ \
+    END_CACHED_INTERPRETER_INSTRUCTION(cinst); \
   }
 
 CPU_FOR_EACH_INSTRUCTION(MAKE_CACHED_INSTRUCTION_HANDLER);
 
 // TODO: inline gte ops
-static void CachedInstructionHandler_gte(u32 arg)
+static void CachedInstructionHandler_gte(const CPU::CodeCache::CachedInterpreterInstruction* cinst)
 {
   g_state.pending_ticks++;
-  g_state.current_instruction.bits = arg;
+  g_state.current_instruction.bits = cinst->arg;
   g_state.current_instruction_pc = g_state.pc;
   g_state.current_instruction_was_branch_taken = g_state.branch_was_taken;
   g_state.branch_was_taken = false;
@@ -2805,9 +2807,10 @@ static void CachedInstructionHandler_gte(u32 arg)
   g_state.pc = g_state.npc;
   g_state.npc += 4;
   StallUntilGTEComplete();
-  GTE::ExecuteInstruction(arg);
+  GTE::ExecuteInstruction(cinst->arg);
   UpdateLoadDelay();
   g_state.next_instruction_is_branch_delay_slot = false; /* FIXME */
+  END_CACHED_INTERPRETER_INSTRUCTION(cinst);
 }
 
 } // namespace CPU