remove ultra shit

Joseph Mattiello 2025-07-06 21:42:45 -04:00
parent 31c8fcf010
commit 0a17aee1d1
4 changed files with 51 additions and 1026 deletions

CMakeLists.txt

@@ -986,8 +986,6 @@ target_sources(${PROJECT_NAME} PRIVATE
core/hw/sh4/interpr/sh4_interpreter.cpp
core/hw/sh4/interpr/sh4_opcodes.cpp
core/hw/sh4/interpr/sh4_opcodes.h
core/hw/sh4/interpr/sh4_ultra_interpreter.cpp
core/hw/sh4/interpr/sh4_ultra_interpreter.h
core/hw/sh4/modules/bsc.cpp
core/hw/sh4/modules/ccn.cpp
core/hw/sh4/modules/ccn.h

core/hw/sh4/interpr/sh4_interpreter.cpp

@@ -1,5 +1,5 @@
/*
Highly inefficient and boring interpreter. Nothing special here
Optimized SH4 interpreter for better performance
*/
#include "types.h"
@@ -13,7 +13,6 @@
#include "../sh4_cache.h"
#include "debug/gdb_server.h"
#include "../sh4_cycles.h"
#include "sh4_ultra_interpreter.h"
// SH4 underclock factor when using the interpreter so that it's somewhat usable
#ifdef STRICT_MODE
@@ -25,40 +24,76 @@ constexpr int CPU_RATIO = 8;
Sh4ICache icache;
Sh4OCache ocache;
static void ExecuteOpcode(u16 op)
// === SIMPLE INSTRUCTION CACHE FOR BETTER PERFORMANCE ===
#define SIMPLE_ICACHE_SIZE 1024
#define SIMPLE_ICACHE_MASK (SIMPLE_ICACHE_SIZE - 1)
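// Illustrative mapping: for PC 0x8C0100A4,
// index = (0x8C0100A4 >> 1) & SIMPLE_ICACHE_MASK = 0x52, so slot 0x52
// holds that address's opcode until a colliding PC evicts it
// (direct-mapped, one entry per slot).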
struct SimpleInstructionCache {
u32 pc[SIMPLE_ICACHE_SIZE];
u16 opcode[SIMPLE_ICACHE_SIZE];
void reset() {
for (int i = 0; i < SIMPLE_ICACHE_SIZE; i++) {
pc[i] = 0xFFFFFFFF;
}
}
u16 fetch(u32 addr) {
u32 index = (addr >> 1) & SIMPLE_ICACHE_MASK;
if (__builtin_expect(pc[index] == addr, 1)) {
return opcode[index];
}
// Cache miss - fetch from memory
u16 op = IReadMem16(addr);
pc[index] = addr;
opcode[index] = op;
return op;
}
};
static SimpleInstructionCache g_simple_icache;
static inline void ExecuteOpcode(u16 op)
{
if (sr.FD == 1 && OpDesc[op]->IsFloatingPoint())
if (__builtin_expect(sr.FD == 1 && OpDesc[op]->IsFloatingPoint(), 0))
RaiseFPUDisableException();
OpPtr[op](op);
sh4cycles.executeCycles(op);
}
static u16 ReadNexOp()
static inline u16 ReadNexOp()
{
if (!mmu_enabled() && (next_pc & 1))
if (__builtin_expect(!mmu_enabled() && (next_pc & 1), 0))
// address error
throw SH4ThrownException(next_pc, Sh4Ex_AddressErrorRead);
u32 addr = next_pc;
next_pc += 2;
return IReadMem16(addr);
// Use simple instruction cache for better performance
return g_simple_icache.fetch(addr);
}
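// Note: cached entries are never invalidated on memory writes; coherency
// with reloaded or self-modifying code relies on the g_simple_icache.reset()
// calls in Sh4_int_Run, Sh4_int_Reset, Sh4_int_Init and sh4_int_resetcache.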
static void Sh4_int_Run()
{
RestoreHostRoundingMode();
// Reset instruction cache at start
g_simple_icache.reset();
try {
do
{
try {
// Optimized inner loop with minimal overhead
do
{
u32 op = ReadNexOp();
ExecuteOpcode(op);
} while (p_sh4rcb->cntx.cycle_counter > 0);
} while (__builtin_expect(p_sh4rcb->cntx.cycle_counter > 0, 1));
p_sh4rcb->cntx.cycle_counter += SH4_TIMESLICE;
UpdateSystem_INTC();
} catch (const SH4ThrownException& ex) {
@@ -66,7 +101,7 @@ static void Sh4_int_Run()
// an exception requires the instruction pipeline to drain, so approx 5 cycles
sh4cycles.addCycles(5 * CPU_RATIO);
}
} while (sh4_int_bCpuRun);
} while (__builtin_expect(sh4_int_bCpuRun, 1));
} catch (const debugger::Stop&) {
}
@@ -128,6 +163,9 @@ static void Sh4_int_Reset(bool hard)
ocache.Reset(hard);
sh4cycles.reset();
p_sh4rcb->cntx.cycle_counter = SH4_TIMESLICE;
// Reset simple instruction cache
g_simple_icache.reset();
INFO_LOG(INTERPRETER, "Sh4 Reset");
}
@@ -193,6 +231,7 @@ int UpdateSystem_INTC()
}
static void sh4_int_resetcache() {
g_simple_icache.reset();
}
static void Sh4_int_Init()
@@ -200,6 +239,7 @@ static void Sh4_int_Init()
static_assert(sizeof(Sh4cntx) == 448, "Invalid Sh4Cntx size");
memset(&p_sh4rcb->cntx, 0, sizeof(p_sh4rcb->cntx));
g_simple_icache.reset();
}
static void Sh4_int_Term()
@@ -211,21 +251,7 @@ static void Sh4_int_Term()
#ifndef ENABLE_SH4_CACHED_IR
void Get_Sh4Interpreter(sh4_if* cpu)
{
#ifdef USE_ULTRA_INTERPRETER
fprintf(stderr, "🚀 ULTRA-INTERPRETER: Get_Sh4Interpreter called — linking ultra-fast interpreter!\n");
// Use the ultra-interpreter instead of legacy
cpu->Start = Sh4_int_Start;
cpu->Run = (void(*)())Get_UltraInterpreter(); // Use ultra-interpreter run function
cpu->Stop = Sh4_int_Stop;
cpu->Step = Sh4_int_Step;
cpu->Reset = Sh4_int_Reset;
cpu->Init = Sh4_int_Init;
cpu->Term = Sh4_int_Term;
cpu->IsCpuRunning = Sh4_int_IsCpuRunning;
cpu->ResetCache = sh4_int_resetcache;
#else
fprintf(stderr, "[LEGACY-INT] Get_Sh4Interpreter called — linking legacy interpreter!\n");
INFO_LOG(INTERPRETER, "🚀 OPTIMIZED-INTERPRETER: Get_Sh4Interpreter called — linking optimized interpreter!");
cpu->Start = Sh4_int_Start;
cpu->Run = Sh4_int_Run;
cpu->Stop = Sh4_int_Stop;
@@ -235,6 +261,5 @@ void Get_Sh4Interpreter(sh4_if* cpu)
cpu->Term = Sh4_int_Term;
cpu->IsCpuRunning = Sh4_int_IsCpuRunning;
cpu->ResetCache = sh4_int_resetcache;
#endif
}
#endif // ENABLE_SH4_CACHED_IR

core/hw/sh4/interpr/sh4_ultra_interpreter.cpp

@@ -1,978 +0,0 @@
// === NEXT-GENERATION ULTRA-FAST SH4 INTERPRETER ===
// This interpreter beats the legacy interpreter by being SIMPLER and FASTER
// - Direct SH4 execution like legacy but with optimizations that actually work
// - Simple instruction caching without complex block compilation
// - BLOCK CACHING: Groups instructions into blocks for dynarec-like performance
// - MMU-aware optimizations
// - ARM64 prefetching and branch prediction
#include "sh4_ultra_interpreter.h"
#include "hw/sh4/sh4_interpreter.h"
#include "hw/sh4/sh4_opcode_list.h"
#include "hw/sh4/sh4_core.h"
#include "hw/sh4/sh4_interrupts.h"
#include "hw/sh4/sh4_mem.h"
#include "hw/sh4/sh4_sched.h"
#include "hw/sh4/sh4_cache.h"
#include "hw/sh4/modules/mmu.h"
#include <unordered_map>
#include <vector>
// === BLOCK CACHING SYSTEM ===
// This is the key to achieving dynarec-like performance!
#define MAX_BLOCK_SIZE 32
#define BLOCK_CACHE_SIZE 2048
#define HOT_BLOCK_THRESHOLD 10
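// A block is a straight-line run of opcodes: decoding stops at the first
// PC-modifying instruction (OpDesc[op]->SetPC() in create_cached_block)
// or after MAX_BLOCK_SIZE opcodes, whichever comes first.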
struct CachedBlock {
u32 pc_start;
u32 pc_end;
u32 execution_count;
bool is_hot_block;
std::vector<u16> opcodes;
// Block analysis
bool has_branches;
bool has_memory_ops;
bool is_pure_arithmetic;
bool can_use_fast_path;
CachedBlock() : pc_start(0), pc_end(0), execution_count(0), is_hot_block(false),
has_branches(false), has_memory_ops(false), is_pure_arithmetic(true), can_use_fast_path(true) {}
};
// Block cache using unordered_map for fast lookups
static std::unordered_map<u32, CachedBlock> g_block_cache;
// Block cache statistics
struct BlockCacheStats {
u64 total_blocks_executed;
u64 hot_block_executions;
u64 cold_block_executions;
u64 blocks_created;
u64 cache_hits;
u64 cache_misses;
void reset() {
total_blocks_executed = 0;
hot_block_executions = 0;
cold_block_executions = 0;
blocks_created = 0;
cache_hits = 0;
cache_misses = 0;
}
};
static BlockCacheStats g_block_stats;
// === INSTRUCTION SEQUENCE CACHING ===
#define MAX_SEQUENCE_LENGTH 8
#define SEQUENCE_CACHE_SIZE 256
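// A "sequence" is a short straight-line run of 2..MAX_SEQUENCE_LENGTH
// opcodes keyed by its start PC; cached entries are re-validated against
// the freshly fetched opcodes on every hit (see find_cached_sequence).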
struct InstructionSequence {
u32 start_pc;
u16 opcodes[MAX_SEQUENCE_LENGTH];
u32 length;
u32 execution_count;
bool is_optimized;
// Optimization flags
bool is_pure_arithmetic; // Only arithmetic ops, no memory/branches
bool is_register_shuffle; // Only register moves
bool is_memory_block; // Memory operations that can be batched
bool uses_neon; // Can use NEON SIMD optimization
};
static InstructionSequence g_sequence_cache[SEQUENCE_CACHE_SIZE];
// === ULTRA-SIMPLE INSTRUCTION CACHE ===
#define ICACHE_SIZE 1024
#define ICACHE_MASK (ICACHE_SIZE - 1)
struct UltraInstructionCache {
u32 pc[ICACHE_SIZE];
u16 opcode[ICACHE_SIZE];
#ifdef DEBUG
u64 hits;
u64 misses;
#endif
void reset() {
for (int i = 0; i < ICACHE_SIZE; i++) {
pc[i] = 0xFFFFFFFF;
opcode[i] = 0;
}
#ifdef DEBUG
hits = 0;
misses = 0;
#endif
}
u16 fetch(u32 addr) {
u32 index = (addr >> 1) & ICACHE_MASK;
if (pc[index] == addr) {
#ifdef DEBUG
hits++;
#endif
return opcode[index];
}
// Cache miss - fetch from memory
#ifdef DEBUG
misses++;
#endif
u16 op = IReadMem16(addr);
pc[index] = addr;
opcode[index] = op;
return op;
}
};
static UltraInstructionCache g_icache;
// === PERFORMANCE STATS ===
struct UltraStats {
#ifdef DEBUG
u64 instructions;
u64 cycles;
u32 mmu_state_changes;
#endif
bool mmu_enabled;
void reset() {
#ifdef DEBUG
instructions = 0;
cycles = 0;
mmu_state_changes = 0;
#endif
mmu_enabled = ::mmu_enabled();
}
void check_mmu() {
bool current_mmu = ::mmu_enabled();
if (current_mmu != mmu_enabled) {
#ifdef DEBUG
mmu_state_changes++;
INFO_LOG(INTERPRETER, "🔄 MMU state changed: %s", current_mmu ? "ENABLED" : "DISABLED");
#endif
mmu_enabled = current_mmu;
}
}
};
static UltraStats g_stats;
// === FORWARD DECLARATIONS ===
static inline bool ultra_execute_hot_opcode(u16 op);
// === BLOCK CACHING FUNCTIONS ===
// Create a new cached block starting at the given PC
static CachedBlock create_cached_block(u32 start_pc) {
CachedBlock block;
block.pc_start = start_pc;
block.execution_count = 0;
block.is_hot_block = false;
// Decode instructions until we hit a branch or reach max block size
u32 current_pc = start_pc;
for (u32 i = 0; i < MAX_BLOCK_SIZE; i++) {
u16 op = IReadMem16(current_pc);
block.opcodes.push_back(op);
current_pc += 2;
// Analyze the instruction
if (OpDesc[op]->SetPC()) {
// This is a branch instruction - end the block
block.has_branches = true;
break;
}
// Analyze instruction type based on opcode patterns
u32 op_high = op >> 12;
u32 op_low = op & 0xF;
// Check for memory operations (rough heuristic)
if (op_high == 0x2 || op_high == 0x6 || op_high == 0x8 || op_high == 0x9 || op_high == 0xC || op_high == 0xD) {
block.has_memory_ops = true;
}
// Check if it's arithmetic/logical operation
if (!(op_high == 0x3 || op_high == 0x7 || (op_high == 0x2 && (op_low == 0x9 || op_low == 0xA || op_low == 0xB)))) {
block.is_pure_arithmetic = false;
}
}
block.pc_end = current_pc;
// Determine if this block can use fast path execution
block.can_use_fast_path = !block.has_branches && block.opcodes.size() <= 16;
g_block_stats.blocks_created++;
INFO_LOG(INTERPRETER, "🔨 Created block PC=0x%08X-0x%08X (%d opcodes, branches=%s, memory=%s)",
block.pc_start, block.pc_end, (int)block.opcodes.size(),
block.has_branches ? "yes" : "no", block.has_memory_ops ? "yes" : "no");
return block;
}
// Execute a cached block with proper exception and control flow handling
static void execute_cached_block(CachedBlock& block) {
block.execution_count++;
g_block_stats.total_blocks_executed++;
// Promote to hot block if executed frequently
if (block.execution_count >= HOT_BLOCK_THRESHOLD && !block.is_hot_block) {
block.is_hot_block = true;
INFO_LOG(INTERPRETER, "🔥 Block at PC=0x%08X promoted to HOT BLOCK (%u executions)",
block.pc_start, block.execution_count);
}
// Track hot vs cold execution
if (block.is_hot_block) {
g_block_stats.hot_block_executions++;
} else {
g_block_stats.cold_block_executions++;
}
// CRITICAL: Execute instructions one by one to handle exceptions properly
u32 block_pc = block.pc_start;
try {
for (size_t i = 0; i < block.opcodes.size(); i++) {
u16 op = block.opcodes[i];
// Update PC and next_pc for this instruction
Sh4cntx.pc = block_pc;
next_pc = block_pc + 2;
// Check for interrupts before each instruction
if (__builtin_expect(UpdateSystem_INTC(), 0)) {
// Interrupt pending - must break out of block
return;
}
// Check for floating point disable exception
if (__builtin_expect(sr.FD == 1 && OpDesc[op]->IsFloatingPoint(), 0)) {
RaiseFPUDisableException();
}
if (block.is_hot_block && block.can_use_fast_path) {
// Try ultra-fast inline execution first
if (!ultra_execute_hot_opcode(op)) {
// Fall back to legacy handler
OpPtr[op](op);
}
} else {
// Execute using legacy handler
OpPtr[op](op);
}
// Execute cycles
sh4cycles.executeCycles(op);
// CRITICAL: Check if PC was changed by instruction (jumps, branches, exceptions)
if (next_pc != block_pc + 2) {
// Control flow changed - instruction modified PC
// This means we have a jump, branch, or exception
return; // Exit block execution immediately
}
// Move to next instruction in block
block_pc += 2;
}
} catch (const SH4ThrownException& ex) {
// Exception occurred during block execution
Do_Exception(ex.epc, ex.expEvn);
// Exception requires pipeline drain, so approx 5 cycles
sh4cycles.addCycles(5 * 8); // 8 = CPU_RATIO from legacy
return; // Exit block execution
}
}
// === ULTRA-FAST INSTRUCTION FETCH ===
static inline u16 ultra_fetch_instruction(u32 pc) {
// Use instruction cache for frequently accessed instructions
return g_icache.fetch(pc);
}
// === MEMORY ACCESS OPTIMIZATIONS ===
// Fast paths for common memory operations
// Fast path for main RAM access (0x0C000000-0x0CFFFFFF)
static inline u32 ultra_read_mem32_fast(u32 addr) {
// Fast path for main RAM (most common case)
if (__builtin_expect((addr & 0xFF000000) == 0x0C000000, 1)) {
return ReadMem32(addr); // Direct access to main RAM
}
// Fallback for other memory regions
return ReadMem32(addr);
}
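// As written, both branches above end up calling ReadMem32(); the address
// check only biases branch prediction toward the main-RAM case rather than
// selecting a different access path.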
static inline void ultra_write_mem32_fast(u32 addr, u32 data) {
// Fast path for main RAM (most common case)
if (__builtin_expect((addr & 0xFF000000) == 0x0C000000, 1)) {
WriteMem32(addr, data); // Direct access to main RAM
return;
}
// Fallback for other memory regions
WriteMem32(addr, data);
}
// === HOT OPCODE SPECIALIZATION ===
// Inline optimized versions of the most common opcodes to bypass function call overhead
// ULTRA-HOT PATH: Inline the top 10 most critical opcodes for massive speedup
static inline bool ultra_execute_superhot_opcode(u16 op) {
// Fast decode without switches for maximum performance
u32 op_high = op >> 12;
u32 op_low = op & 0xF;
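// (op & 0xF00F) keeps the fixed bits of two-register forms, e.g.
// mov Rm,Rn = 0110nnnnmmmm0011 matches 0x6003, with n in bits 11:8
// and m in bits 7:4 masked out.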
// TOP 1: mov <REG_M>,<REG_N> (0x6xx3) - 25% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x6003, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] = r[m];
return true;
}
// TOP 2: add #<imm>,<REG_N> (0x7xxx) - 15% of all instructions
if (__builtin_expect(op_high == 0x7, 1)) {
u32 n = (op >> 8) & 0xF;
s32 imm = (s32)(s8)(op & 0xFF);
r[n] += imm;
return true;
}
// TOP 3: add <REG_M>,<REG_N> (0x3xxC) - 10% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x300C, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] += r[m];
return true;
}
// TOP 4: mov.l @<REG_M>,<REG_N> (0x6xx2) - 8% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x6002, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] = ReadMem32(r[m]);
return true;
}
// TOP 5: mov.l <REG_M>,@<REG_N> (0x2xx2) - 6% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x2002, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
WriteMem32(r[n], r[m]);
return true;
}
// TOP 6: cmp/eq <REG_M>,<REG_N> (0x3xx0) - 5% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x3000, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
sr.T = (r[m] == r[n]) ? 1 : 0;
return true;
}
// TOP 7: sub <REG_M>,<REG_N> (0x3xx8) - 4% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x3008, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] -= r[m];
return true;
}
// TOP 8: nop (0x0009) - 4% of all instructions
if (__builtin_expect(op == 0x0009, 1)) {
return true; // Nothing to do
}
// TOP 9: mov.l @<REG_M>+,<REG_N> (0x6xx6) - 3% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x6006, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] = ReadMem32(r[m]);
if (n != m) r[m] += 4;
return true;
}
// TOP 10: dt <REG_N> (0x4xx0 with low byte 0x10) - 3% of all instructions
if (__builtin_expect((op & 0xF0FF) == 0x4010, 1)) {
u32 n = (op >> 8) & 0xF;
r[n]--;
sr.T = (r[n] == 0) ? 1 : 0;
return true;
}
return false; // Not a superhot opcode
}
// Optimized memory operations for hot opcodes
static inline void ultra_execute_hot_opcode_with_mem_opt(u16 op) {
// Extract opcode patterns for fastest common instructions
u32 opcode_high = (op >> 12) & 0xF;
u32 opcode_low = op & 0xF;
switch (opcode_high) {
case 0x6: // 6xxx - MOV instructions (super hot!)
switch (opcode_low) {
case 0x3: { // mov <REG_M>,<REG_N> - HOTTEST OPCODE
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] = r[m];
return;
}
case 0x2: { // mov.l @<REG_M>,<REG_N> - OPTIMIZED
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] = ultra_read_mem32_fast(r[m]);
// Prefetch next cache line if sequential access pattern
#ifdef __aarch64__
__builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[m] + 32)), 0, 1);
#endif
return;
}
case 0x6: { // mov.l @<REG_M>+,<REG_N> - OPTIMIZED
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] = ultra_read_mem32_fast(r[m]);
if (n != m) {
r[m] += 4;
// Prefetch next memory location
#ifdef __aarch64__
__builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[m] + 16)), 0, 1);
#endif
}
return;
}
}
break;
case 0x2: // 2xxx - Memory operations and logic
switch (opcode_low) {
case 0x2: { // mov.l <REG_M>,@<REG_N> - OPTIMIZED
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
ultra_write_mem32_fast(r[n], r[m]);
// Prefetch next cache line for sequential writes
#ifdef __aarch64__
__builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[n] + 32)), 1, 1);
#endif
return;
}
case 0x6: { // mov.l <REG_M>,@-<REG_N> - OPTIMIZED
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] -= 4;
ultra_write_mem32_fast(r[n], r[m]);
// Prefetch previous cache line for stack operations
#ifdef __aarch64__
__builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[n] - 32)), 1, 1);
#endif
return;
}
}
break;
}
// Fallback to standard hot opcode execution
ultra_execute_hot_opcode(op);
}
// === FORWARD DECLARATIONS ===
static void ultra_interpreter_run();
// === ARM64 NEON SIMD OPTIMIZATIONS ===
// Use ARM64 NEON to process multiple registers simultaneously
#ifdef __aarch64__
#include <arm_neon.h>
// Bulk register clear using NEON (4 registers at once)
static inline void neon_clear_registers(u32* reg_base, int count) {
uint32x4_t zero = vdupq_n_u32(0);
for (int i = 0; i < count; i += 4) {
vst1q_u32(&reg_base[i], zero);
}
}
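// Assumes count is a multiple of 4; a non-multiple would store one
// extra 128-bit lane past the requested range.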
// Bulk register copy using NEON (4 registers at once)
static inline void neon_copy_registers(u32* dst, const u32* src, int count) {
for (int i = 0; i < count; i += 4) {
uint32x4_t data = vld1q_u32(&src[i]);
vst1q_u32(&dst[i], data);
}
}
// NEON-optimized register bank switching
static inline void neon_switch_register_bank() {
// Save current bank using NEON (only 8 registers in r_bank)
uint32x4_t bank0_3 = vld1q_u32(&r[0]);
uint32x4_t bank4_7 = vld1q_u32(&r[4]);
// Load shadow bank using NEON (only 8 registers)
vst1q_u32(&r[0], vld1q_u32(&r_bank[0]));
vst1q_u32(&r[4], vld1q_u32(&r_bank[4]));
// Store old bank to shadow (only 8 registers)
vst1q_u32(&r_bank[0], bank0_3);
vst1q_u32(&r_bank[4], bank4_7);
}
// Detect patterns for NEON optimization
static inline bool is_bulk_mov_pattern(u16* opcodes, int count) {
// Check if we have 4+ consecutive MOV operations
int mov_count = 0;
for (int i = 0; i < count && i < 8; i++) {
if ((opcodes[i] & 0xF00F) == 0x6003) { // mov <REG_M>,<REG_N>
mov_count++;
} else {
break;
}
}
return mov_count >= 4;
}
// Execute bulk MOV operations with NEON
static inline int execute_bulk_mov_neon(u16* opcodes, int count) {
// Extract source and destination registers
u32 src_regs[4], dst_regs[4];
for (int i = 0; i < 4; i++) {
src_regs[i] = (opcodes[i] >> 4) & 0xF;
dst_regs[i] = (opcodes[i] >> 8) & 0xF;
}
// Load source values using NEON gather (simulated)
uint32x4_t values = {r[src_regs[0]], r[src_regs[1]], r[src_regs[2]], r[src_regs[3]]};
// Store to destinations
r[dst_regs[0]] = vgetq_lane_u32(values, 0);
r[dst_regs[1]] = vgetq_lane_u32(values, 1);
r[dst_regs[2]] = vgetq_lane_u32(values, 2);
r[dst_regs[3]] = vgetq_lane_u32(values, 3);
return 4; // Processed 4 instructions
}
#else
// Fallback for non-ARM64 platforms
static inline void neon_clear_registers(u32* reg_base, int count) {
for (int i = 0; i < count; i++) {
reg_base[i] = 0;
}
}
static inline void neon_copy_registers(u32* dst, const u32* src, int count) {
for (int i = 0; i < count; i++) {
dst[i] = src[i];
}
}
static inline void neon_switch_register_bank() {
// Standard register bank switch
for (int i = 0; i < 16; i++) {
u32 temp = r[i];
r[i] = r_bank[i];
r_bank[i] = temp;
}
}
static inline bool is_bulk_mov_pattern(u16* opcodes, int count) { return false; }
static inline int execute_bulk_mov_neon(u16* opcodes, int count) { return 0; }
#endif
// === SEQUENCE CACHING IMPLEMENTATION ===
// Hash function for instruction sequences
static inline u32 hash_sequence(u32 pc, u16* opcodes, u32 length) {
u32 hash = pc;
for (u32 i = 0; i < length; i++) {
hash = hash * 31 + opcodes[i];
}
return hash % SEQUENCE_CACHE_SIZE;
}
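// Illustrative: for two opcodes, hash = ((pc * 31) + op0) * 31 + op1
// (mod 2^32), reduced mod SEQUENCE_CACHE_SIZE (256) to pick a slot;
// collisions are tolerated because hits re-verify the cached opcodes.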
// Analyze instruction sequence for optimization opportunities
static inline void analyze_sequence(InstructionSequence* seq) {
seq->is_pure_arithmetic = true;
seq->is_register_shuffle = true;
seq->is_memory_block = true;
seq->uses_neon = false;
int mov_count = 0;
int arith_count = 0;
int mem_count = 0;
for (u32 i = 0; i < seq->length; i++) {
u16 op = seq->opcodes[i];
u32 op_high = (op >> 12) & 0xF;
u32 op_low = op & 0xF;
// Check instruction types
if (op_high == 0x6 && op_low == 0x3) {
mov_count++; // mov <REG_M>,<REG_N>
} else if (op_high == 0x3 && (op_low == 0xC || op_low == 0x8)) {
arith_count++; // add/sub
seq->is_register_shuffle = false;
} else if (op_high == 0x6 && (op_low == 0x2 || op_low == 0x6)) {
mem_count++; // memory load
seq->is_pure_arithmetic = false;
seq->is_register_shuffle = false;
} else {
// Complex instruction - disable optimizations
seq->is_pure_arithmetic = false;
seq->is_register_shuffle = false;
seq->is_memory_block = false;
}
}
// Enable NEON if we have enough parallel operations
if (mov_count >= 4 || arith_count >= 4) {
seq->uses_neon = true;
}
// Set memory block flag based on memory operations
if (mem_count < seq->length / 2) {
seq->is_memory_block = false;
}
seq->is_optimized = true;
}
// Execute optimized instruction sequence
static inline u32 execute_optimized_sequence(InstructionSequence* seq, u32 pc) {
if (!seq->is_optimized) {
analyze_sequence(seq);
}
// Fast path for register shuffles with NEON
if (seq->uses_neon && seq->is_register_shuffle) {
int processed = execute_bulk_mov_neon(seq->opcodes, seq->length);
if (processed > 0) {
return processed * 2; // Each instruction is 2 bytes
}
}
// Fast path for pure arithmetic
if (seq->is_pure_arithmetic && seq->length <= 4) {
for (u32 i = 0; i < seq->length; i++) {
ultra_execute_hot_opcode(seq->opcodes[i]);
}
return seq->length * 2;
}
// Fallback to individual execution
for (u32 i = 0; i < seq->length; i++) {
ultra_execute_hot_opcode(seq->opcodes[i]);
}
return seq->length * 2;
}
// Try to find or create a cached sequence
static inline InstructionSequence* find_cached_sequence(u32 pc) {
// Look ahead and try to build a sequence
u16 opcodes[MAX_SEQUENCE_LENGTH];
u32 length = 0;
u32 current_pc = pc;
// Build sequence until we hit a branch or complex instruction
for (length = 0; length < MAX_SEQUENCE_LENGTH; length++) {
u16 op = ultra_fetch_instruction(current_pc);
opcodes[length] = op;
current_pc += 2;
// Stop at branches or complex instructions
u32 op_high = (op >> 12) & 0xF;
if (op_high == 0xA || op_high == 0xB || // Branch instructions
op_high == 0xF || // FPU instructions
op == 0x001B) { // sleep instruction
length++; // Include this instruction
break;
}
}
if (length < 2) return nullptr; // Too short to optimize
// Check cache
u32 hash = hash_sequence(pc, opcodes, length);
InstructionSequence* seq = &g_sequence_cache[hash];
if (seq->start_pc == pc && seq->length == length) {
// Cache hit - verify opcodes match
bool match = true;
for (u32 i = 0; i < length; i++) {
if (seq->opcodes[i] != opcodes[i]) {
match = false;
break;
}
}
if (match) {
seq->execution_count++;
return seq;
}
}
// Cache miss - create new sequence
seq->start_pc = pc;
seq->length = length;
seq->execution_count = 1;
seq->is_optimized = false;
for (u32 i = 0; i < length; i++) {
seq->opcodes[i] = opcodes[i];
}
return seq;
}
// Build a new instruction sequence starting at the given PC
static inline void build_instruction_sequence(u32 start_pc, u32 length) {
// Don't build sequences if we're in an exception handler or delay slot
if (sr.BL || next_pc != start_pc + 2) {
return;
}
// Find an empty slot in the cache
u32 hash = start_pc % SEQUENCE_CACHE_SIZE;
InstructionSequence* seq = &g_sequence_cache[hash];
// If slot is occupied by a different sequence, check if we should replace it
if (seq->start_pc != 0 && seq->start_pc != start_pc) {
// Only replace if the existing sequence has low execution count
if (seq->execution_count > 10) {
return; // Keep the existing sequence
}
}
// Initialize the sequence
seq->start_pc = start_pc;
seq->length = 0;
seq->execution_count = 1;
seq->is_optimized = false;
// Fetch instructions for the sequence
u32 current_pc = start_pc;
for (u32 i = 0; i < length && i < MAX_SEQUENCE_LENGTH; i++) {
u16 op = ultra_fetch_instruction(current_pc);
seq->opcodes[i] = op;
seq->length++;
current_pc += 2;
// Stop building if we hit a branch or jump
u32 op_high = (op >> 12) & 0xF;
if (op_high == 0x8 || op_high == 0x9 || op_high == 0xA || op_high == 0xB) {
// Conservatively treat these groups as branches (0x8 includes bt/bf,
// 0xA/0xB are bra/bsr; 0x9 is a PC-relative load) - stop sequence here
break;
}
// Stop at control-transfer or cache-control instructions
if (op == 0x000B || op == 0x0093 || op == 0x0083) {
// rts (0x000B), ocbi @R0 (0x0093), pref @R0 (0x0083) - stop sequence
break;
}
}
// Only keep sequences with at least 2 instructions
if (seq->length < 2) {
seq->start_pc = 0; // Mark as empty
return;
}
// Analyze the sequence for optimization opportunities
analyze_sequence(seq);
#ifdef DEBUG
INFO_LOG(INTERPRETER, "🔗 Built sequence at PC=0x%08X, length=%d, arithmetic=%s, shuffle=%s",
start_pc, seq->length,
seq->is_pure_arithmetic ? "yes" : "no",
seq->is_register_shuffle ? "yes" : "no");
#endif
}
// === ULTRA-FAST MAIN EXECUTION LOOP WITH BLOCK CACHING ===
// This uses block caching like the dynarec for maximum performance
static void ultra_interpreter_run() {
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Starting block-cached execution");
// Reset stats
g_stats.reset();
g_icache.reset();
g_block_stats.reset();
// Main execution loop - BLOCK-BASED like dynarec!
while (sh4_int_bCpuRun) {
try {
// Inner loop with block execution
do {
// CRITICAL: Check for system updates and interrupts
if (UpdateSystem()) {
break; // System update occurred, restart loop
}
// Get current PC
u32 current_pc = next_pc;
// Look up block in cache first
auto it = g_block_cache.find(current_pc);
if (it != g_block_cache.end()) {
// CACHE HIT: Execute cached block
g_block_stats.cache_hits++;
execute_cached_block(it->second);
} else {
// CACHE MISS: Create new block and execute it
g_block_stats.cache_misses++;
// Create new block
CachedBlock new_block = create_cached_block(current_pc);
// Add to cache
g_block_cache[current_pc] = std::move(new_block);
// Execute the new block
execute_cached_block(g_block_cache[current_pc]);
}
// CRITICAL: Check if we're stuck in an infinite loop
if (next_pc == current_pc) {
// PC hasn't changed - this could be an infinite loop
// Fall back to single instruction execution
u16 op = IReadMem16(current_pc);
Sh4cntx.pc = current_pc;
next_pc = current_pc + 2;
OpPtr[op](op);
sh4cycles.executeCycles(op);
}
// Periodic stats reporting (every 10000 blocks)
static u32 stats_counter = 0;
if ((++stats_counter % 10000) == 0) {
INFO_LOG(INTERPRETER, "📊 BLOCK STATS: %llu executed, %llu hot, %llu cold, %llu created, %.1f%% hit ratio",
g_block_stats.total_blocks_executed, g_block_stats.hot_block_executions,
g_block_stats.cold_block_executions, g_block_stats.blocks_created,
(g_block_stats.cache_hits + g_block_stats.cache_misses) > 0 ?
(float)g_block_stats.cache_hits / (g_block_stats.cache_hits + g_block_stats.cache_misses) * 100.0f : 0.0f);
}
} while (p_sh4rcb->cntx.cycle_counter > 0 && sh4_int_bCpuRun);
// Update system timing
p_sh4rcb->cntx.cycle_counter += SH4_TIMESLICE;
} catch (const SH4ThrownException& ex) {
Do_Exception(ex.epc, ex.expEvn);
// Exception requires pipeline drain, so approx 5 cycles
sh4cycles.addCycles(5 * 8); // 8 = CPU_RATIO from legacy
}
}
INFO_LOG(INTERPRETER, "🏁 ULTRA-INTERPRETER: Finished block-cached execution");
// Print final block cache statistics
INFO_LOG(INTERPRETER, "📊 FINAL BLOCK STATS:");
INFO_LOG(INTERPRETER, " Total blocks executed: %llu", g_block_stats.total_blocks_executed);
INFO_LOG(INTERPRETER, " Hot block executions: %llu (%.1f%%)",
g_block_stats.hot_block_executions,
g_block_stats.total_blocks_executed > 0 ?
(double)g_block_stats.hot_block_executions / g_block_stats.total_blocks_executed * 100.0 : 0.0);
INFO_LOG(INTERPRETER, " Cold block executions: %llu (%.1f%%)",
g_block_stats.cold_block_executions,
g_block_stats.total_blocks_executed > 0 ?
(double)g_block_stats.cold_block_executions / g_block_stats.total_blocks_executed * 100.0 : 0.0);
INFO_LOG(INTERPRETER, " Blocks created: %llu", g_block_stats.blocks_created);
INFO_LOG(INTERPRETER, " Cache hit ratio: %.1f%%",
(g_block_stats.cache_hits + g_block_stats.cache_misses) > 0 ?
(float)g_block_stats.cache_hits / (g_block_stats.cache_hits + g_block_stats.cache_misses) * 100.0f : 0.0f);
#ifdef DEBUG
INFO_LOG(INTERPRETER, "📊 Final stats: %llu instructions, %llu cycles, %d MMU changes",
g_stats.instructions, g_stats.cycles, g_stats.mmu_state_changes);
float cache_hit_ratio = (g_icache.hits + g_icache.misses) > 0 ?
(float)g_icache.hits / (g_icache.hits + g_icache.misses) * 100.0f : 0.0f;
INFO_LOG(INTERPRETER, "📊 Instruction cache: %llu hits, %llu misses, %.1f%% hit ratio",
g_icache.hits, g_icache.misses, cache_hit_ratio);
#endif
}
// === ULTRA-INTERPRETER INTERFACE ===
void* Get_UltraInterpreter() {
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Get_UltraInterpreter called — linking block-cached interpreter!");
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Block caching: ENABLED (%d max blocks, %d max size)", BLOCK_CACHE_SIZE, MAX_BLOCK_SIZE);
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Hot block threshold: %d executions", HOT_BLOCK_THRESHOLD);
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Instruction caching: ENABLED (%d entries)", ICACHE_SIZE);
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: ARM64 prefetching: ENABLED");
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: MMU-aware optimizations: ENABLED");
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Block-based execution like dynarec but simpler!");
return (void*)ultra_interpreter_run;
}
// SECONDARY HOT PATH: Handle next tier of opcodes with minimal overhead
static inline bool ultra_execute_hot_opcode(u16 op) {
u32 op_high = op >> 12;
u32 op_low = op & 0xF;
// Memory operations with pre-decrement/post-increment
if (op_high == 0x2) {
if (op_low == 0x6) { // mov.l <REG_M>,@-<REG_N>
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] -= 4;
WriteMem32(r[n], r[m]);
return true;
} else if (op_low == 0x9) { // and <REG_M>,<REG_N>
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] &= r[m];
return true;
} else if (op_low == 0xB) { // or <REG_M>,<REG_N>
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] |= r[m];
return true;
} else if (op_low == 0xA) { // xor <REG_M>,<REG_N>
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] ^= r[m];
return true;
}
}
// Shift operations
else if (op_high == 0x4) {
u32 low_byte = op & 0xFF;
if (low_byte == 0x00) { // shll <REG_N>
u32 n = (op >> 8) & 0xF;
sr.T = r[n] >> 31;
r[n] <<= 1;
return true;
} else if (low_byte == 0x01) { // shlr <REG_N>
u32 n = (op >> 8) & 0xF;
sr.T = r[n] & 1;
r[n] >>= 1;
return true;
}
}
// Control flow and special operations
else if (op_high == 0x0) {
u32 low_byte = op & 0xFF;
if (low_byte == 0x08) { // clrt
sr.T = 0;
return true;
} else if (low_byte == 0x18) { // sett
sr.T = 1;
return true;
}
}
return false; // Not handled in hot path
}

core/hw/sh4/interpr/sh4_ultra_interpreter.h

@@ -1,20 +0,0 @@
#pragma once
#include "types.h"
// === NEXT-GENERATION ULTRA-FAST SH4 INTERPRETER ===
// This interpreter beats the legacy interpreter using modern optimization techniques
#ifdef __cplusplus
extern "C" {
#endif
// Ultra-interpreter factory function
void* Get_UltraInterpreter();
#ifdef __cplusplus
}
#endif
// Enable ultra-interpreter by default
#define USE_ULTRA_INTERPRETER 1