diff --git a/CMakeLists.txt b/CMakeLists.txt
index 39546d85b..49179481b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -986,8 +986,6 @@ target_sources(${PROJECT_NAME} PRIVATE
 	core/hw/sh4/interpr/sh4_interpreter.cpp
 	core/hw/sh4/interpr/sh4_opcodes.cpp
 	core/hw/sh4/interpr/sh4_opcodes.h
-	core/hw/sh4/interpr/sh4_ultra_interpreter.cpp
-	core/hw/sh4/interpr/sh4_ultra_interpreter.h
 	core/hw/sh4/modules/bsc.cpp
 	core/hw/sh4/modules/ccn.cpp
 	core/hw/sh4/modules/ccn.h
diff --git a/core/hw/sh4/interpr/sh4_interpreter.cpp b/core/hw/sh4/interpr/sh4_interpreter.cpp
index f5ef5d4d4..440a0adfc 100644
--- a/core/hw/sh4/interpr/sh4_interpreter.cpp
+++ b/core/hw/sh4/interpr/sh4_interpreter.cpp
@@ -1,5 +1,5 @@
 /*
-	Highly inefficient and boring interpreter. Nothing special here
+	Optimized SH4 interpreter for better performance
 */
 
 #include "types.h"
@@ -13,7 +13,6 @@
 #include "../sh4_cache.h"
 #include "debug/gdb_server.h"
 #include "../sh4_cycles.h"
-#include "sh4_ultra_interpreter.h"
 
 // SH4 underclock factor when using the interpreter so that it's somewhat usable
 #ifdef STRICT_MODE
@@ -25,40 +24,76 @@ constexpr int CPU_RATIO = 8;
 
 Sh4ICache icache;
 Sh4OCache ocache;
 
-static void ExecuteOpcode(u16 op)
+// === SIMPLE INSTRUCTION CACHE FOR BETTER PERFORMANCE ===
+#define SIMPLE_ICACHE_SIZE 1024
+#define SIMPLE_ICACHE_MASK (SIMPLE_ICACHE_SIZE - 1)
+
+struct SimpleInstructionCache {
+    u32 pc[SIMPLE_ICACHE_SIZE];
+    u16 opcode[SIMPLE_ICACHE_SIZE];
+
+    void reset() {
+        for (int i = 0; i < SIMPLE_ICACHE_SIZE; i++) {
+            pc[i] = 0xFFFFFFFF;
+        }
+    }
+
+    u16 fetch(u32 addr) {
+        u32 index = (addr >> 1) & SIMPLE_ICACHE_MASK;
+
+        if (__builtin_expect(pc[index] == addr, 1)) {
+            return opcode[index];
+        }
+
+        // Cache miss - fetch from memory
+        u16 op = IReadMem16(addr);
+        pc[index] = addr;
+        opcode[index] = op;
+        return op;
+    }
+};
+
+static SimpleInstructionCache g_simple_icache;
+
+static inline void ExecuteOpcode(u16 op)
 {
-    if (sr.FD == 1 && OpDesc[op]->IsFloatingPoint())
+    if (__builtin_expect(sr.FD == 1 && OpDesc[op]->IsFloatingPoint(), 0))
         RaiseFPUDisableException();
     OpPtr[op](op);
     sh4cycles.executeCycles(op);
 }
 
-static u16 ReadNexOp()
+static inline u16 ReadNexOp()
 {
-    if (!mmu_enabled() && (next_pc & 1))
+    if (__builtin_expect(!mmu_enabled() && (next_pc & 1), 0))
         // address error
         throw SH4ThrownException(next_pc, Sh4Ex_AddressErrorRead);
     u32 addr = next_pc;
     next_pc += 2;
-    return IReadMem16(addr);
+    // Use simple instruction cache for better performance
+    return g_simple_icache.fetch(addr);
 }
 
 static void Sh4_int_Run()
 {
     RestoreHostRoundingMode();
+
+    // Reset instruction cache at start
+    g_simple_icache.reset();
+
     try {
         do {
             try {
+                // Optimized inner loop with minimal overhead
                 do {
                     u32 op = ReadNexOp();
                     ExecuteOpcode(op);
-                } while (p_sh4rcb->cntx.cycle_counter > 0);
+                } while (__builtin_expect(p_sh4rcb->cntx.cycle_counter > 0, 1));
+
                 p_sh4rcb->cntx.cycle_counter += SH4_TIMESLICE;
                 UpdateSystem_INTC();
             } catch (const SH4ThrownException& ex) {
@@ -66,7 +101,7 @@ static void Sh4_int_Run()
                 // an exception requires the instruction pipeline to drain, so approx 5 cycles
                 sh4cycles.addCycles(5 * CPU_RATIO);
             }
-        } while (sh4_int_bCpuRun);
+        } while (__builtin_expect(sh4_int_bCpuRun, 1));
     } catch (const debugger::Stop&) {
     }
@@ -128,6 +163,9 @@ static void Sh4_int_Reset(bool hard)
     ocache.Reset(hard);
     sh4cycles.reset();
     p_sh4rcb->cntx.cycle_counter = SH4_TIMESLICE;
+
+    // Reset simple instruction cache
+    g_simple_icache.reset();
 
     INFO_LOG(INTERPRETER, "Sh4 Reset");
 }
@@ -193,6 +231,7 @@ int UpdateSystem_INTC()
 }
 
 static void sh4_int_resetcache()
 {
+    g_simple_icache.reset();
 }
 
 static void Sh4_int_Init()
@@ -200,6 +239,7 @@ static void Sh4_int_Init()
     static_assert(sizeof(Sh4cntx) == 448, "Invalid Sh4Cntx size");
 
     memset(&p_sh4rcb->cntx, 0, sizeof(p_sh4rcb->cntx));
+    g_simple_icache.reset();
 }
 
 static void Sh4_int_Term()
@@ -211,21 +251,7 @@ static void Sh4_int_Term()
 
 #ifndef ENABLE_SH4_CACHED_IR
 void Get_Sh4Interpreter(sh4_if* cpu)
 {
-#ifdef USE_ULTRA_INTERPRETER
-    fprintf(stderr, "🚀 ULTRA-INTERPRETER: Get_Sh4Interpreter called — linking ultra-fast interpreter!\n");
-
-    // Use the ultra-interpreter instead of legacy
-    cpu->Start = Sh4_int_Start;
-    cpu->Run = (void(*)())Get_UltraInterpreter(); // Use ultra-interpreter run function
-    cpu->Stop = Sh4_int_Stop;
-    cpu->Step = Sh4_int_Step;
-    cpu->Reset = Sh4_int_Reset;
-    cpu->Init = Sh4_int_Init;
-    cpu->Term = Sh4_int_Term;
-    cpu->IsCpuRunning = Sh4_int_IsCpuRunning;
-    cpu->ResetCache = sh4_int_resetcache;
-#else
-    fprintf(stderr, "[LEGACY-INT] Get_Sh4Interpreter called — linking legacy interpreter!\n");
+    INFO_LOG(INTERPRETER, "🚀 OPTIMIZED-INTERPRETER: Get_Sh4Interpreter called — linking optimized interpreter!");
     cpu->Start = Sh4_int_Start;
     cpu->Run = Sh4_int_Run;
     cpu->Stop = Sh4_int_Stop;
@@ -235,6 +261,5 @@ void Get_Sh4Interpreter(sh4_if* cpu)
     cpu->Term = Sh4_int_Term;
     cpu->IsCpuRunning = Sh4_int_IsCpuRunning;
     cpu->ResetCache = sh4_int_resetcache;
-#endif
 }
 #endif // ENABLE_SH4_CACHED_IR
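The hunks above replace every `IReadMem16()` instruction fetch with a direct-mapped, 1024-entry lookup keyed on the fetch address. Below is a minimal, self-contained sketch of the same technique; `fake_read_mem16()` is a hypothetical stand-in for the emulator's `IReadMem16()`, added only so the demo can count how many memory reads the cache avoids. A cache like this must be flushed whenever code may have changed, which is why the diff also resets it from `sh4_int_resetcache()`, `Sh4_int_Reset()`, and `Sh4_int_Init()`.

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for IReadMem16(): counts backing-store reads.
static int g_mem_reads = 0;
static uint16_t fake_read_mem16(uint32_t addr) {
    ++g_mem_reads;
    return static_cast<uint16_t>(addr * 2654435761u >> 16); // arbitrary deterministic data
}

constexpr uint32_t CACHE_SIZE = 1024;          // entries; must be a power of two
constexpr uint32_t CACHE_MASK = CACHE_SIZE - 1;

struct DirectMappedICache {
    uint32_t tag[CACHE_SIZE];
    uint16_t data[CACHE_SIZE];

    void reset() {
        for (uint32_t i = 0; i < CACHE_SIZE; ++i)
            tag[i] = 0xFFFFFFFF; // sentinel: never a valid fetch address here
    }

    uint16_t fetch(uint32_t addr) {
        // SH4 opcodes are 2 bytes, so addr >> 1 maps consecutive opcodes to
        // consecutive slots; the full address doubles as the tag.
        uint32_t index = (addr >> 1) & CACHE_MASK;
        if (tag[index] == addr)
            return data[index];              // hit: no memory access
        uint16_t op = fake_read_mem16(addr); // miss: fill the slot
        tag[index] = addr;
        data[index] = op;
        return op;
    }
};

int main() {
    DirectMappedICache cache;
    cache.reset();
    // A tight 8-opcode loop executed 1000 times: 8 misses, 7992 hits.
    for (int iter = 0; iter < 1000; ++iter)
        for (uint32_t pc = 0x0C001000; pc < 0x0C001010; pc += 2)
            cache.fetch(pc);
    std::printf("memory reads: %d (of %d fetches)\n", g_mem_reads, 8 * 1000);
    return 0;
}
```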
diff --git a/core/hw/sh4/interpr/sh4_ultra_interpreter.cpp b/core/hw/sh4/interpr/sh4_ultra_interpreter.cpp
deleted file mode 100644
index 7ad386eb3..000000000
--- a/core/hw/sh4/interpr/sh4_ultra_interpreter.cpp
+++ /dev/null
@@ -1,978 +0,0 @@
-// === NEXT-GENERATION ULTRA-FAST SH4 INTERPRETER ===
-// This interpreter beats the legacy interpreter by being SIMPLER and FASTER
-// - Direct SH4 execution like legacy but with optimizations that actually work
-// - Simple instruction caching without complex block compilation
-// - BLOCK CACHING: Groups instructions into blocks for dynarec-like performance
-// - MMU-aware optimizations
-// - ARM64 prefetching and branch prediction
-
-#include "sh4_ultra_interpreter.h"
-#include "hw/sh4/sh4_interpreter.h"
-#include "hw/sh4/sh4_opcode_list.h"
-#include "hw/sh4/sh4_core.h"
-#include "hw/sh4/sh4_interrupts.h"
-#include "hw/sh4/sh4_mem.h"
-#include "hw/sh4/sh4_sched.h"
-#include "hw/sh4/sh4_cache.h"
-#include "hw/sh4/modules/mmu.h"
-#include <unordered_map>
-#include <vector>
-
-// === BLOCK CACHING SYSTEM ===
-// This is the key to achieving dynarec-like performance!
-#define MAX_BLOCK_SIZE 32
-#define BLOCK_CACHE_SIZE 2048
-#define HOT_BLOCK_THRESHOLD 10
-
-struct CachedBlock {
-    u32 pc_start;
-    u32 pc_end;
-    u32 execution_count;
-    bool is_hot_block;
-    std::vector<u16> opcodes;
-
-    // Block analysis
-    bool has_branches;
-    bool has_memory_ops;
-    bool is_pure_arithmetic;
-    bool can_use_fast_path;
-
-    CachedBlock() : pc_start(0), pc_end(0), execution_count(0), is_hot_block(false),
-        has_branches(false), has_memory_ops(false), is_pure_arithmetic(true), can_use_fast_path(true) {}
-};
-
-// Block cache using unordered_map for fast lookups
-static std::unordered_map<u32, CachedBlock> g_block_cache;
-
-// Block cache statistics
-struct BlockCacheStats {
-    u64 total_blocks_executed;
-    u64 hot_block_executions;
-    u64 cold_block_executions;
-    u64 blocks_created;
-    u64 cache_hits;
-    u64 cache_misses;
-
-    void reset() {
-        total_blocks_executed = 0;
-        hot_block_executions = 0;
-        cold_block_executions = 0;
-        blocks_created = 0;
-        cache_hits = 0;
-        cache_misses = 0;
-    }
-};
-
-static BlockCacheStats g_block_stats;
-
-// === INSTRUCTION SEQUENCE CACHING ===
-#define MAX_SEQUENCE_LENGTH 8
-#define SEQUENCE_CACHE_SIZE 256
-
-struct InstructionSequence {
-    u32 start_pc;
-    u16 opcodes[MAX_SEQUENCE_LENGTH];
-    u32 length;
-    u32 execution_count;
-    bool is_optimized;
-
-    // Optimization flags
-    bool is_pure_arithmetic;  // Only arithmetic ops, no memory/branches
-    bool is_register_shuffle; // Only register moves
-    bool is_memory_block;     // Memory operations that can be batched
-    bool uses_neon;           // Can use NEON SIMD optimization
-};
-
-static InstructionSequence g_sequence_cache[SEQUENCE_CACHE_SIZE];
-
-// === ULTRA-SIMPLE INSTRUCTION CACHE ===
-#define ICACHE_SIZE 1024
-#define ICACHE_MASK (ICACHE_SIZE - 1)
-
-struct UltraInstructionCache {
-    u32 pc[ICACHE_SIZE];
-    u16 opcode[ICACHE_SIZE];
-#ifdef DEBUG
-    u64 hits;
-    u64 misses;
-#endif
-
-    void reset() {
-        for (int i = 0; i < ICACHE_SIZE; i++) {
-            pc[i] = 0xFFFFFFFF;
-            opcode[i] = 0;
-        }
-#ifdef DEBUG
-        hits = 0;
-        misses = 0;
-#endif
-    }
-
-    u16 fetch(u32 addr) {
-        u32 index = (addr >> 1) & ICACHE_MASK;
-
-        if (pc[index] == addr) {
-#ifdef DEBUG
-            hits++;
-#endif
-            return opcode[index];
-        }
-
-        // Cache miss - fetch from memory
-#ifdef DEBUG
-        misses++;
-#endif
-        u16 op = IReadMem16(addr);
-        pc[index] = addr;
-        opcode[index] = op;
-        return op;
-    }
-};
-
-static UltraInstructionCache g_icache;
-
-// === PERFORMANCE STATS ===
-struct UltraStats {
-#ifdef DEBUG
-    u64 instructions;
-    u64 cycles;
-    u32 mmu_state_changes;
-#endif
-    bool mmu_enabled;
-
-    void reset() {
-#ifdef DEBUG
-        instructions = 0;
-        cycles = 0;
-        mmu_state_changes = 0;
-#endif
-        mmu_enabled = ::mmu_enabled();
-    }
-
-    void check_mmu() {
-        bool current_mmu = ::mmu_enabled();
-        if (current_mmu != mmu_enabled) {
-#ifdef DEBUG
-            mmu_state_changes++;
-            INFO_LOG(INTERPRETER, "🔄 MMU state changed: %s", current_mmu ?
"ENABLED" : "DISABLED"); -#endif - mmu_enabled = current_mmu; - } - } -}; - -static UltraStats g_stats; - -// === FORWARD DECLARATIONS === -static inline bool ultra_execute_hot_opcode(u16 op); - -// === BLOCK CACHING FUNCTIONS === - -// Create a new cached block starting at the given PC -static CachedBlock create_cached_block(u32 start_pc) { - CachedBlock block; - block.pc_start = start_pc; - block.execution_count = 0; - block.is_hot_block = false; - - // Decode instructions until we hit a branch or reach max block size - u32 current_pc = start_pc; - for (u32 i = 0; i < MAX_BLOCK_SIZE; i++) { - u16 op = IReadMem16(current_pc); - block.opcodes.push_back(op); - current_pc += 2; - - // Analyze the instruction - if (OpDesc[op]->SetPC()) { - // This is a branch instruction - end the block - block.has_branches = true; - break; - } - - // Analyze instruction type based on opcode patterns - u32 op_high = op >> 12; - u32 op_low = op & 0xF; - - // Check for memory operations (rough heuristic) - if (op_high == 0x2 || op_high == 0x6 || op_high == 0x8 || op_high == 0x9 || op_high == 0xC || op_high == 0xD) { - block.has_memory_ops = true; - } - - // Check if it's arithmetic/logical operation - if (!(op_high == 0x3 || op_high == 0x7 || (op_high == 0x2 && (op_low == 0x9 || op_low == 0xA || op_low == 0xB)))) { - block.is_pure_arithmetic = false; - } - } - - block.pc_end = current_pc; - - // Determine if this block can use fast path execution - block.can_use_fast_path = !block.has_branches && block.opcodes.size() <= 16; - - g_block_stats.blocks_created++; - - INFO_LOG(INTERPRETER, "🔨 Created block PC=0x%08X-0x%08X (%d opcodes, branches=%s, memory=%s)", - block.pc_start, block.pc_end, (int)block.opcodes.size(), - block.has_branches ? "yes" : "no", block.has_memory_ops ? 
"yes" : "no"); - - return block; -} - -// Execute a cached block with proper exception and control flow handling -static void execute_cached_block(CachedBlock& block) { - block.execution_count++; - g_block_stats.total_blocks_executed++; - - // Promote to hot block if executed frequently - if (block.execution_count >= HOT_BLOCK_THRESHOLD && !block.is_hot_block) { - block.is_hot_block = true; - INFO_LOG(INTERPRETER, "🔥 Block at PC=0x%08X promoted to HOT BLOCK (%u executions)", - block.pc_start, block.execution_count); - } - - // Track hot vs cold execution - if (block.is_hot_block) { - g_block_stats.hot_block_executions++; - } else { - g_block_stats.cold_block_executions++; - } - - // CRITICAL: Execute instructions one by one to handle exceptions properly - u32 block_pc = block.pc_start; - - try { - for (size_t i = 0; i < block.opcodes.size(); i++) { - u16 op = block.opcodes[i]; - - // Update PC and next_pc for this instruction - Sh4cntx.pc = block_pc; - next_pc = block_pc + 2; - - // Check for interrupts before each instruction - if (__builtin_expect(UpdateSystem_INTC(), 0)) { - // Interrupt pending - must break out of block - return; - } - - // Check for floating point disable exception - if (__builtin_expect(sr.FD == 1 && OpDesc[op]->IsFloatingPoint(), 0)) { - RaiseFPUDisableException(); - } - - if (block.is_hot_block && block.can_use_fast_path) { - // Try ultra-fast inline execution first - if (!ultra_execute_hot_opcode(op)) { - // Fall back to legacy handler - OpPtr[op](op); - } - } else { - // Execute using legacy handler - OpPtr[op](op); - } - - // Execute cycles - sh4cycles.executeCycles(op); - - // CRITICAL: Check if PC was changed by instruction (jumps, branches, exceptions) - if (next_pc != block_pc + 2) { - // Control flow changed - instruction modified PC - // This means we have a jump, branch, or exception - return; // Exit block execution immediately - } - - // Move to next instruction in block - block_pc += 2; - } - } catch (const SH4ThrownException& ex) { - // Exception occurred during block execution - Do_Exception(ex.epc, ex.expEvn); - // Exception requires pipeline drain, so approx 5 cycles - sh4cycles.addCycles(5 * 8); // 8 = CPU_RATIO from legacy - return; // Exit block execution - } -} - -// === ULTRA-FAST INSTRUCTION FETCH === -static inline u16 ultra_fetch_instruction(u32 pc) { - // Use instruction cache for frequently accessed instructions - return g_icache.fetch(pc); -} - -// === MEMORY ACCESS OPTIMIZATIONS === -// Fast paths for common memory operations - -// Fast path for main RAM access (0x0C000000-0x0CFFFFFF) -static inline u32 ultra_read_mem32_fast(u32 addr) { - // Fast path for main RAM (most common case) - if (__builtin_expect((addr & 0xFF000000) == 0x0C000000, 1)) { - return ReadMem32(addr); // Direct access to main RAM - } - - // Fallback for other memory regions - return ReadMem32(addr); -} - -static inline void ultra_write_mem32_fast(u32 addr, u32 data) { - // Fast path for main RAM (most common case) - if (__builtin_expect((addr & 0xFF000000) == 0x0C000000, 1)) { - WriteMem32(addr, data); // Direct access to main RAM - return; - } - - // Fallback for other memory regions - WriteMem32(addr, data); -} - -// === HOT OPCODE SPECIALIZATION === -// Inline optimized versions of the most common opcodes to bypass function call overhead - -// ULTRA-HOT PATH: Inline the top 10 most critical opcodes for massive speedup -static inline bool ultra_execute_superhot_opcode(u16 op) { - // Fast decode without switches for maximum performance - u32 op_high = op >> 12; - 
u32 op_low = op & 0xF; - - // TOP 1: mov , (0x6xx3) - 25% of all instructions - if (__builtin_expect((op & 0xF00F) == 0x6003, 1)) { - u32 n = (op >> 8) & 0xF; - u32 m = (op >> 4) & 0xF; - r[n] = r[m]; - return true; - } - - // TOP 2: add #, (0x7xxx) - 15% of all instructions - if (__builtin_expect(op_high == 0x7, 1)) { - u32 n = (op >> 8) & 0xF; - s32 imm = (s32)(s8)(op & 0xFF); - r[n] += imm; - return true; - } - - // TOP 3: add , (0x3xxC) - 10% of all instructions - if (__builtin_expect((op & 0xF00F) == 0x300C, 1)) { - u32 n = (op >> 8) & 0xF; - u32 m = (op >> 4) & 0xF; - r[n] += r[m]; - return true; - } - - // TOP 4: mov.l @, (0x6xx2) - 8% of all instructions - if (__builtin_expect((op & 0xF00F) == 0x6002, 1)) { - u32 n = (op >> 8) & 0xF; - u32 m = (op >> 4) & 0xF; - r[n] = ReadMem32(r[m]); - return true; - } - - // TOP 5: mov.l ,@ (0x2xx2) - 6% of all instructions - if (__builtin_expect((op & 0xF00F) == 0x2002, 1)) { - u32 n = (op >> 8) & 0xF; - u32 m = (op >> 4) & 0xF; - WriteMem32(r[n], r[m]); - return true; - } - - // TOP 6: cmp/eq , (0x3xx0) - 5% of all instructions - if (__builtin_expect((op & 0xF00F) == 0x3000, 1)) { - u32 n = (op >> 8) & 0xF; - u32 m = (op >> 4) & 0xF; - sr.T = (r[m] == r[n]) ? 1 : 0; - return true; - } - - // TOP 7: sub , (0x3xx8) - 4% of all instructions - if (__builtin_expect((op & 0xF00F) == 0x3008, 1)) { - u32 n = (op >> 8) & 0xF; - u32 m = (op >> 4) & 0xF; - r[n] -= r[m]; - return true; - } - - // TOP 8: nop (0x0009) - 4% of all instructions - if (__builtin_expect(op == 0x0009, 1)) { - return true; // Nothing to do - } - - // TOP 9: mov.l @+, (0x6xx6) - 3% of all instructions - if (__builtin_expect((op & 0xF00F) == 0x6006, 1)) { - u32 n = (op >> 8) & 0xF; - u32 m = (op >> 4) & 0xF; - r[n] = ReadMem32(r[m]); - if (n != m) r[m] += 4; - return true; - } - - // TOP 10: dt (0x4xx0 with low byte 0x10) - 3% of all instructions - if (__builtin_expect((op & 0xF0FF) == 0x4010, 1)) { - u32 n = (op >> 8) & 0xF; - r[n]--; - sr.T = (r[n] == 0) ? 1 : 0; - return true; - } - - return false; // Not a superhot opcode -} - -// Optimized memory operations for hot opcodes -static inline void ultra_execute_hot_opcode_with_mem_opt(u16 op) { - // Extract opcode patterns for fastest common instructions - u32 opcode_high = (op >> 12) & 0xF; - u32 opcode_low = op & 0xF; - - switch (opcode_high) { - case 0x6: // 6xxx - MOV instructions (super hot!) 
-        switch (opcode_low) {
-        case 0x3: { // mov <REG_M>,<REG_N> - HOTTEST OPCODE
-            u32 n = (op >> 8) & 0xF;
-            u32 m = (op >> 4) & 0xF;
-            r[n] = r[m];
-            return;
-        }
-        case 0x2: { // mov.l @<REG_M>,<REG_N> - OPTIMIZED
-            u32 n = (op >> 8) & 0xF;
-            u32 m = (op >> 4) & 0xF;
-            r[n] = ultra_read_mem32_fast(r[m]);
-
-            // Prefetch next cache line if sequential access pattern
-#ifdef __aarch64__
-            __builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[m] + 32)), 0, 1);
-#endif
-            return;
-        }
-        case 0x6: { // mov.l @<REG_M>+,<REG_N> - OPTIMIZED
-            u32 n = (op >> 8) & 0xF;
-            u32 m = (op >> 4) & 0xF;
-            r[n] = ultra_read_mem32_fast(r[m]);
-            if (n != m) {
-                r[m] += 4;
-                // Prefetch next memory location
-#ifdef __aarch64__
-                __builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[m] + 16)), 0, 1);
-#endif
-            }
-            return;
-        }
-        }
-        break;
-
-    case 0x2: // 2xxx - Memory operations and logic
-        switch (opcode_low) {
-        case 0x2: { // mov.l <REG_M>,@<REG_N> - OPTIMIZED
-            u32 n = (op >> 8) & 0xF;
-            u32 m = (op >> 4) & 0xF;
-            ultra_write_mem32_fast(r[n], r[m]);
-
-            // Prefetch next cache line for sequential writes
-#ifdef __aarch64__
-            __builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[n] + 32)), 1, 1);
-#endif
-            return;
-        }
-        case 0x6: { // mov.l <REG_M>,@-<REG_N> - OPTIMIZED
-            u32 n = (op >> 8) & 0xF;
-            u32 m = (op >> 4) & 0xF;
-            r[n] -= 4;
-            ultra_write_mem32_fast(r[n], r[m]);
-
-            // Prefetch previous cache line for stack operations
-#ifdef __aarch64__
-            __builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[n] - 32)), 1, 1);
-#endif
-            return;
-        }
-        }
-        break;
-    }
-
-    // Fallback to standard hot opcode execution
-    ultra_execute_hot_opcode(op);
-}
-
-// === FORWARD DECLARATIONS ===
-static void ultra_interpreter_run();
-
-// === ARM64 NEON SIMD OPTIMIZATIONS ===
-// Use ARM64 NEON to process multiple registers simultaneously
-
-#ifdef __aarch64__
-#include <arm_neon.h>
-
-// Bulk register clear using NEON (4 registers at once)
-static inline void neon_clear_registers(u32* reg_base, int count) {
-    uint32x4_t zero = vdupq_n_u32(0);
-    for (int i = 0; i < count; i += 4) {
-        vst1q_u32(&reg_base[i], zero);
-    }
-}
-
-// Bulk register copy using NEON (4 registers at once)
-static inline void neon_copy_registers(u32* dst, const u32* src, int count) {
-    for (int i = 0; i < count; i += 4) {
-        uint32x4_t data = vld1q_u32(&src[i]);
-        vst1q_u32(&dst[i], data);
-    }
-}
-
-// NEON-optimized register bank switching
-static inline void neon_switch_register_bank() {
-    // Save current bank using NEON (only 8 registers in r_bank)
-    uint32x4_t bank0_3 = vld1q_u32(&r[0]);
-    uint32x4_t bank4_7 = vld1q_u32(&r[4]);
-
-    // Load shadow bank using NEON (only 8 registers)
-    vst1q_u32(&r[0], vld1q_u32(&r_bank[0]));
-    vst1q_u32(&r[4], vld1q_u32(&r_bank[4]));
-
-    // Store old bank to shadow (only 8 registers)
-    vst1q_u32(&r_bank[0], bank0_3);
-    vst1q_u32(&r_bank[4], bank4_7);
-}
-
-// Detect patterns for NEON optimization
-static inline bool is_bulk_mov_pattern(u16* opcodes, int count) {
-    // Check if we have 4+ consecutive MOV operations
-    int mov_count = 0;
-    for (int i = 0; i < count && i < 8; i++) {
-        if ((opcodes[i] & 0xF00F) == 0x6003) { // mov <REG_M>,<REG_N>
-            mov_count++;
-        } else {
-            break;
-        }
-    }
-    return mov_count >= 4;
-}
-
-// Execute bulk MOV operations with NEON
-static inline int execute_bulk_mov_neon(u16* opcodes, int count) {
-    // Extract source and destination registers
-    u32 src_regs[4], dst_regs[4];
-    for (int i = 0; i < 4; i++) {
-        src_regs[i] = (opcodes[i] >> 4) & 0xF;
-        dst_regs[i] = (opcodes[i] >> 8) & 0xF;
-    }
-
-    // Load source values using NEON gather (simulated)
-    uint32x4_t values = {r[src_regs[0]], r[src_regs[1]],
-                         r[src_regs[2]], r[src_regs[3]]};
-
-    // Store to destinations
-    r[dst_regs[0]] = vgetq_lane_u32(values, 0);
-    r[dst_regs[1]] = vgetq_lane_u32(values, 1);
-    r[dst_regs[2]] = vgetq_lane_u32(values, 2);
-    r[dst_regs[3]] = vgetq_lane_u32(values, 3);
-
-    return 4; // Processed 4 instructions
-}
-
-#else
-// Fallback for non-ARM64 platforms
-static inline void neon_clear_registers(u32* reg_base, int count) {
-    for (int i = 0; i < count; i++) {
-        reg_base[i] = 0;
-    }
-}
-
-static inline void neon_copy_registers(u32* dst, const u32* src, int count) {
-    for (int i = 0; i < count; i++) {
-        dst[i] = src[i];
-    }
-}
-
-static inline void neon_switch_register_bank() {
-    // Standard register bank switch
-    for (int i = 0; i < 16; i++) {
-        u32 temp = r[i];
-        r[i] = r_bank[i];
-        r_bank[i] = temp;
-    }
-}
-
-static inline bool is_bulk_mov_pattern(u16* opcodes, int count) { return false; }
-static inline int execute_bulk_mov_neon(u16* opcodes, int count) { return 0; }
-#endif
-
-// === SEQUENCE CACHING IMPLEMENTATION ===
-
-// Hash function for instruction sequences
-static inline u32 hash_sequence(u32 pc, u16* opcodes, u32 length) {
-    u32 hash = pc;
-    for (u32 i = 0; i < length; i++) {
-        hash = hash * 31 + opcodes[i];
-    }
-    return hash % SEQUENCE_CACHE_SIZE;
-}
-
-// Analyze instruction sequence for optimization opportunities
-static inline void analyze_sequence(InstructionSequence* seq) {
-    seq->is_pure_arithmetic = true;
-    seq->is_register_shuffle = true;
-    seq->is_memory_block = true;
-    seq->uses_neon = false;
-
-    int mov_count = 0;
-    int arith_count = 0;
-    int mem_count = 0;
-
-    for (u32 i = 0; i < seq->length; i++) {
-        u16 op = seq->opcodes[i];
-        u32 op_high = (op >> 12) & 0xF;
-        u32 op_low = op & 0xF;
-
-        // Check instruction types
-        if (op_high == 0x6 && op_low == 0x3) {
-            mov_count++; // mov <REG_M>,<REG_N>
-        } else if (op_high == 0x3 && (op_low == 0xC || op_low == 0x8)) {
-            arith_count++; // add/sub
-            seq->is_register_shuffle = false;
-        } else if (op_high == 0x6 && (op_low == 0x2 || op_low == 0x6)) {
-            mem_count++; // memory load
-            seq->is_pure_arithmetic = false;
-            seq->is_register_shuffle = false;
-        } else {
-            // Complex instruction - disable optimizations
-            seq->is_pure_arithmetic = false;
-            seq->is_register_shuffle = false;
-            seq->is_memory_block = false;
-        }
-    }
-
-    // Enable NEON if we have enough parallel operations
-    if (mov_count >= 4 || arith_count >= 4) {
-        seq->uses_neon = true;
-    }
-
-    // Set memory block flag based on memory operations
-    if (mem_count < seq->length / 2) {
-        seq->is_memory_block = false;
-    }
-
-    seq->is_optimized = true;
-}
-
-// Execute optimized instruction sequence
-static inline u32 execute_optimized_sequence(InstructionSequence* seq, u32 pc) {
-    if (!seq->is_optimized) {
-        analyze_sequence(seq);
-    }
-
-    // Fast path for register shuffles with NEON
-    if (seq->uses_neon && seq->is_register_shuffle) {
-        int processed = execute_bulk_mov_neon(seq->opcodes, seq->length);
-        if (processed > 0) {
-            return processed * 2; // Each instruction is 2 bytes
-        }
-    }
-
-    // Fast path for pure arithmetic
-    if (seq->is_pure_arithmetic && seq->length <= 4) {
-        for (u32 i = 0; i < seq->length; i++) {
-            ultra_execute_hot_opcode(seq->opcodes[i]);
-        }
-        return seq->length * 2;
-    }
-
-    // Fallback to individual execution
-    for (u32 i = 0; i < seq->length; i++) {
-        ultra_execute_hot_opcode(seq->opcodes[i]);
-    }
-    return seq->length * 2;
-}
-
-// Try to find or create a cached sequence
-static inline InstructionSequence* find_cached_sequence(u32 pc) {
-    // Look ahead and try to build a sequence
-    u16 opcodes[MAX_SEQUENCE_LENGTH];
-    u32 length = 0;
-    u32 current_pc = pc;
-
-    // Build sequence until we hit a branch or complex instruction
-    for (length = 0; length < MAX_SEQUENCE_LENGTH; length++) {
-        u16 op = ultra_fetch_instruction(current_pc);
-        opcodes[length] = op;
-        current_pc += 2;
-
-        // Stop at branches or complex instructions
-        u32 op_high = (op >> 12) & 0xF;
-        if (op_high == 0xA || op_high == 0xB || // Branch instructions
-            op_high == 0xF ||                   // FPU instructions
-            op == 0x001B) {                     // sleep instruction
-            length++; // Include this instruction
-            break;
-        }
-    }
-
-    if (length < 2) return nullptr; // Too short to optimize
-
-    // Check cache
-    u32 hash = hash_sequence(pc, opcodes, length);
-    InstructionSequence* seq = &g_sequence_cache[hash];
-
-    if (seq->start_pc == pc && seq->length == length) {
-        // Cache hit - verify opcodes match
-        bool match = true;
-        for (u32 i = 0; i < length; i++) {
-            if (seq->opcodes[i] != opcodes[i]) {
-                match = false;
-                break;
-            }
-        }
-        if (match) {
-            seq->execution_count++;
-            return seq;
-        }
-    }
-
-    // Cache miss - create new sequence
-    seq->start_pc = pc;
-    seq->length = length;
-    seq->execution_count = 1;
-    seq->is_optimized = false;
-    for (u32 i = 0; i < length; i++) {
-        seq->opcodes[i] = opcodes[i];
-    }
-
-    return seq;
-}
-
-// Build a new instruction sequence starting at the given PC
-static inline void build_instruction_sequence(u32 start_pc, u32 length) {
-    // Don't build sequences if we're in an exception handler or delay slot
-    if (sr.BL || next_pc != start_pc + 2) {
-        return;
-    }
-
-    // Find an empty slot in the cache
-    u32 hash = start_pc % SEQUENCE_CACHE_SIZE;
-    InstructionSequence* seq = &g_sequence_cache[hash];
-
-    // If slot is occupied by a different sequence, check if we should replace it
-    if (seq->start_pc != 0 && seq->start_pc != start_pc) {
-        // Only replace if the existing sequence has low execution count
-        if (seq->execution_count > 10) {
-            return; // Keep the existing sequence
-        }
-    }
-
-    // Initialize the sequence
-    seq->start_pc = start_pc;
-    seq->length = 0;
-    seq->execution_count = 1;
-    seq->is_optimized = false;
-
-    // Fetch instructions for the sequence
-    u32 current_pc = start_pc;
-    for (u32 i = 0; i < length && i < MAX_SEQUENCE_LENGTH; i++) {
-        u16 op = ultra_fetch_instruction(current_pc);
-        seq->opcodes[i] = op;
-        seq->length++;
-        current_pc += 2;
-
-        // Stop building if we hit a branch or jump
-        u32 op_high = (op >> 12) & 0xF;
-        if (op_high == 0x8 || op_high == 0x9 || op_high == 0xA || op_high == 0xB) {
-            // Branch instructions - stop sequence here
-            break;
-        }
-
-        // Stop if we hit a system call or privileged instruction
-        if (op == 0x000B || op == 0x0093 || op == 0x0083) {
-            // sleep, rte, pref - stop sequence
-            break;
-        }
-    }
-
-    // Only keep sequences with at least 2 instructions
-    if (seq->length < 2) {
-        seq->start_pc = 0; // Mark as empty
-        return;
-    }
-
-    // Analyze the sequence for optimization opportunities
-    analyze_sequence(seq);
-
-#ifdef DEBUG
-    INFO_LOG(INTERPRETER, "🔗 Built sequence at PC=0x%08X, length=%d, arithmetic=%s, shuffle=%s",
-        start_pc, seq->length,
-        seq->is_pure_arithmetic ? "yes" : "no",
-        seq->is_register_shuffle ?
"yes" : "no"); -#endif -} - -// === ULTRA-FAST MAIN EXECUTION LOOP WITH BLOCK CACHING === -// This uses block caching like the dynarec for maximum performance -static void ultra_interpreter_run() { - INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Starting block-cached execution"); - - // Reset stats - g_stats.reset(); - g_icache.reset(); - g_block_stats.reset(); - - // Main execution loop - BLOCK-BASED like dynarec! - while (sh4_int_bCpuRun) { - try { - // Inner loop with block execution - do { - // CRITICAL: Check for system updates and interrupts - if (UpdateSystem()) { - break; // System update occurred, restart loop - } - - // Get current PC - u32 current_pc = next_pc; - - // Look up block in cache first - auto it = g_block_cache.find(current_pc); - if (it != g_block_cache.end()) { - // CACHE HIT: Execute cached block - g_block_stats.cache_hits++; - execute_cached_block(it->second); - } else { - // CACHE MISS: Create new block and execute it - g_block_stats.cache_misses++; - - // Create new block - CachedBlock new_block = create_cached_block(current_pc); - - // Add to cache - g_block_cache[current_pc] = std::move(new_block); - - // Execute the new block - execute_cached_block(g_block_cache[current_pc]); - } - - // CRITICAL: Check if we're stuck in an infinite loop - if (next_pc == current_pc) { - // PC hasn't changed - this could be an infinite loop - // Fall back to single instruction execution - u16 op = IReadMem16(current_pc); - Sh4cntx.pc = current_pc; - next_pc = current_pc + 2; - OpPtr[op](op); - sh4cycles.executeCycles(op); - } - - // Periodic stats reporting (every 10000 blocks) - static u32 stats_counter = 0; - if ((++stats_counter % 10000) == 0) { - INFO_LOG(INTERPRETER, "📊 BLOCK STATS: %llu executed, %llu hot, %llu cold, %llu created, %.1f%% hit ratio", - g_block_stats.total_blocks_executed, g_block_stats.hot_block_executions, - g_block_stats.cold_block_executions, g_block_stats.blocks_created, - (g_block_stats.cache_hits + g_block_stats.cache_misses) > 0 ? - (float)g_block_stats.cache_hits / (g_block_stats.cache_hits + g_block_stats.cache_misses) * 100.0f : 0.0f); - } - - } while (p_sh4rcb->cntx.cycle_counter > 0 && sh4_int_bCpuRun); - - // Update system timing - p_sh4rcb->cntx.cycle_counter += SH4_TIMESLICE; - - } catch (const SH4ThrownException& ex) { - Do_Exception(ex.epc, ex.expEvn); - // Exception requires pipeline drain, so approx 5 cycles - sh4cycles.addCycles(5 * 8); // 8 = CPU_RATIO from legacy - } - } - - INFO_LOG(INTERPRETER, "🏁 ULTRA-INTERPRETER: Finished block-cached execution"); - - // Print final block cache statistics - INFO_LOG(INTERPRETER, "📊 FINAL BLOCK STATS:"); - INFO_LOG(INTERPRETER, " Total blocks executed: %llu", g_block_stats.total_blocks_executed); - INFO_LOG(INTERPRETER, " Hot block executions: %llu (%.1f%%)", - g_block_stats.hot_block_executions, - g_block_stats.total_blocks_executed > 0 ? - (double)g_block_stats.hot_block_executions / g_block_stats.total_blocks_executed * 100.0 : 0.0); - INFO_LOG(INTERPRETER, " Cold block executions: %llu (%.1f%%)", - g_block_stats.cold_block_executions, - g_block_stats.total_blocks_executed > 0 ? - (double)g_block_stats.cold_block_executions / g_block_stats.total_blocks_executed * 100.0 : 0.0); - INFO_LOG(INTERPRETER, " Blocks created: %llu", g_block_stats.blocks_created); - INFO_LOG(INTERPRETER, " Cache hit ratio: %.1f%%", - (g_block_stats.cache_hits + g_block_stats.cache_misses) > 0 ? 
diff --git a/core/hw/sh4/interpr/sh4_ultra_interpreter.h b/core/hw/sh4/interpr/sh4_ultra_interpreter.h
deleted file mode 100644
index 983183d0c..000000000
--- a/core/hw/sh4/interpr/sh4_ultra_interpreter.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#pragma once
-
-#include "types.h"
-
-// === NEXT-GENERATION ULTRA-FAST SH4 INTERPRETER ===
-// This interpreter beats the legacy interpreter using modern optimization techniques
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Ultra-interpreter factory function
-void* Get_UltraInterpreter();
-
-#ifdef __cplusplus
-}
-#endif
-
-// Enable ultra-interpreter by default
-#define USE_ULTRA_INTERPRETER 1
\ No newline at end of file
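The removed interpreter dispatched through a `std::unordered_map` keyed by block start PC and promoted a block to "hot" after `HOT_BLOCK_THRESHOLD` executions, paying a hash lookup on every dispatch. A minimal sketch of that lookup-and-promote policy follows; the names (`Block`, `lookup`, the placeholder opcodes) are illustrative, not Flycast API, and a real implementation would decode until a branch (`OpDesc[op]->SetPC()`) on a miss.

```cpp
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

constexpr uint32_t HOT_THRESHOLD = 10; // mirrors HOT_BLOCK_THRESHOLD above

struct Block {
    std::vector<uint16_t> opcodes;
    uint32_t exec_count = 0;
    bool hot = false;
};

static std::unordered_map<uint32_t, Block> cache;

static Block& lookup(uint32_t pc) {
    auto it = cache.find(pc);
    if (it == cache.end()) {
        // Miss: decode a new block (placeholder opcodes stand in for decoding).
        Block b;
        b.opcodes = {0x6003, 0x7001, 0x0009};
        it = cache.emplace(pc, std::move(b)).first;
    }
    Block& b = it->second;
    if (++b.exec_count >= HOT_THRESHOLD)
        b.hot = true; // hot blocks may take a faster dispatch path
    return b;
}

int main() {
    for (int i = 0; i < 12; ++i) {
        Block& b = lookup(0x8C001000);
        std::printf("run %2d: %zu opcodes, hot=%d\n", i + 1, b.opcodes.size(), b.hot);
    }
    return 0;
}
```

The per-dispatch hash lookup and per-instruction interrupt checks in `execute_cached_block()` are plausible reasons this design lost to the simpler direct-mapped fetch cache that the diff keeps in `sh4_interpreter.cpp`.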