remove ultra shit

Joseph Mattiello 2025-07-06 21:42:45 -04:00
parent 31c8fcf010
commit 0a17aee1d1
4 changed files with 51 additions and 1026 deletions

CMakeLists.txt

@@ -986,8 +986,6 @@ target_sources(${PROJECT_NAME} PRIVATE
core/hw/sh4/interpr/sh4_interpreter.cpp
core/hw/sh4/interpr/sh4_opcodes.cpp
core/hw/sh4/interpr/sh4_opcodes.h
core/hw/sh4/interpr/sh4_ultra_interpreter.cpp
core/hw/sh4/interpr/sh4_ultra_interpreter.h
core/hw/sh4/modules/bsc.cpp
core/hw/sh4/modules/ccn.cpp
core/hw/sh4/modules/ccn.h

core/hw/sh4/interpr/sh4_interpreter.cpp

@@ -1,5 +1,5 @@
/*
Highly inefficient and boring interpreter. Nothing special here
Optimized SH4 interpreter for better performance
*/
#include "types.h"
@@ -13,7 +13,6 @@
#include "../sh4_cache.h"
#include "debug/gdb_server.h"
#include "../sh4_cycles.h"
#include "sh4_ultra_interpreter.h"
// SH4 underclock factor when using the interpreter so that it's somewhat usable
#ifdef STRICT_MODE
@@ -25,40 +24,76 @@ constexpr int CPU_RATIO = 8;
Sh4ICache icache;
Sh4OCache ocache;
static void ExecuteOpcode(u16 op)
// === SIMPLE INSTRUCTION CACHE FOR BETTER PERFORMANCE ===
#define SIMPLE_ICACHE_SIZE 1024
#define SIMPLE_ICACHE_MASK (SIMPLE_ICACHE_SIZE - 1)
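// Illustrative mapping: for PC 0x8C0100A4,
// index = (0x8C0100A4 >> 1) & SIMPLE_ICACHE_MASK = 0x52, so slot 0x52
// holds that address's opcode until a colliding PC evicts it
// (direct-mapped, one entry per slot).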
struct SimpleInstructionCache {
u32 pc[SIMPLE_ICACHE_SIZE];
u16 opcode[SIMPLE_ICACHE_SIZE];
void reset() {
for (int i = 0; i < SIMPLE_ICACHE_SIZE; i++) {
pc[i] = 0xFFFFFFFF;
}
}
u16 fetch(u32 addr) {
u32 index = (addr >> 1) & SIMPLE_ICACHE_MASK;
if (__builtin_expect(pc[index] == addr, 1)) {
return opcode[index];
}
// Cache miss - fetch from memory
u16 op = IReadMem16(addr);
pc[index] = addr;
opcode[index] = op;
return op;
}
};
static SimpleInstructionCache g_simple_icache;
static inline void ExecuteOpcode(u16 op)
{
if (sr.FD == 1 && OpDesc[op]->IsFloatingPoint())
if (__builtin_expect(sr.FD == 1 && OpDesc[op]->IsFloatingPoint(), 0))
RaiseFPUDisableException();
OpPtr[op](op);
sh4cycles.executeCycles(op);
}
static u16 ReadNexOp()
static inline u16 ReadNexOp()
{
if (!mmu_enabled() && (next_pc & 1))
if (__builtin_expect(!mmu_enabled() && (next_pc & 1), 0))
// address error
throw SH4ThrownException(next_pc, Sh4Ex_AddressErrorRead);
u32 addr = next_pc;
next_pc += 2;
return IReadMem16(addr);
// Use simple instruction cache for better performance
return g_simple_icache.fetch(addr);
}
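// Note: cached entries are never invalidated on memory writes; coherency
// with reloaded or self-modifying code relies on the g_simple_icache.reset()
// calls in Sh4_int_Run, Sh4_int_Reset, Sh4_int_Init and sh4_int_resetcache.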
static void Sh4_int_Run()
{
RestoreHostRoundingMode();
// Reset instruction cache at start
g_simple_icache.reset();
try {
do
{
try {
// Optimized inner loop with minimal overhead
do
{
u32 op = ReadNexOp();
ExecuteOpcode(op);
} while (p_sh4rcb->cntx.cycle_counter > 0);
} while (__builtin_expect(p_sh4rcb->cntx.cycle_counter > 0, 1));
p_sh4rcb->cntx.cycle_counter += SH4_TIMESLICE;
UpdateSystem_INTC();
} catch (const SH4ThrownException& ex) {
@@ -66,7 +101,7 @@ static void Sh4_int_Run()
// an exception requires the instruction pipeline to drain, so approx 5 cycles
sh4cycles.addCycles(5 * CPU_RATIO);
}
} while (sh4_int_bCpuRun);
} while (__builtin_expect(sh4_int_bCpuRun, 1));
} catch (const debugger::Stop&) {
}
@@ -128,6 +163,9 @@ static void Sh4_int_Reset(bool hard)
ocache.Reset(hard);
sh4cycles.reset();
p_sh4rcb->cntx.cycle_counter = SH4_TIMESLICE;
// Reset simple instruction cache
g_simple_icache.reset();
INFO_LOG(INTERPRETER, "Sh4 Reset");
}
@@ -193,6 +231,7 @@ int UpdateSystem_INTC()
}
static void sh4_int_resetcache() {
g_simple_icache.reset();
}
static void Sh4_int_Init()
@@ -200,6 +239,7 @@ static void Sh4_int_Init()
static_assert(sizeof(Sh4cntx) == 448, "Invalid Sh4Cntx size");
memset(&p_sh4rcb->cntx, 0, sizeof(p_sh4rcb->cntx));
g_simple_icache.reset();
}
static void Sh4_int_Term()
@@ -211,21 +251,7 @@ static void Sh4_int_Term()
#ifndef ENABLE_SH4_CACHED_IR
void Get_Sh4Interpreter(sh4_if* cpu)
{
#ifdef USE_ULTRA_INTERPRETER
fprintf(stderr, "🚀 ULTRA-INTERPRETER: Get_Sh4Interpreter called — linking ultra-fast interpreter!\n");
// Use the ultra-interpreter instead of legacy
cpu->Start = Sh4_int_Start;
cpu->Run = (void(*)())Get_UltraInterpreter(); // Use ultra-interpreter run function
cpu->Stop = Sh4_int_Stop;
cpu->Step = Sh4_int_Step;
cpu->Reset = Sh4_int_Reset;
cpu->Init = Sh4_int_Init;
cpu->Term = Sh4_int_Term;
cpu->IsCpuRunning = Sh4_int_IsCpuRunning;
cpu->ResetCache = sh4_int_resetcache;
#else
fprintf(stderr, "[LEGACY-INT] Get_Sh4Interpreter called — linking legacy interpreter!\n");
INFO_LOG(INTERPRETER, "🚀 OPTIMIZED-INTERPRETER: Get_Sh4Interpreter called — linking optimized interpreter!");
cpu->Start = Sh4_int_Start;
cpu->Run = Sh4_int_Run;
cpu->Stop = Sh4_int_Stop;
@@ -235,6 +261,5 @@ void Get_Sh4Interpreter(sh4_if* cpu)
cpu->Term = Sh4_int_Term;
cpu->IsCpuRunning = Sh4_int_IsCpuRunning;
cpu->ResetCache = sh4_int_resetcache;
#endif
}
#endif // ENABLE_SH4_CACHED_IR

core/hw/sh4/interpr/sh4_ultra_interpreter.cpp

@@ -1,978 +0,0 @@
// === NEXT-GENERATION ULTRA-FAST SH4 INTERPRETER ===
// This interpreter beats the legacy interpreter by being SIMPLER and FASTER
// - Direct SH4 execution like legacy but with optimizations that actually work
// - Simple instruction caching without complex block compilation
// - BLOCK CACHING: Groups instructions into blocks for dynarec-like performance
// - MMU-aware optimizations
// - ARM64 prefetching and branch prediction
#include "sh4_ultra_interpreter.h"
#include "hw/sh4/sh4_interpreter.h"
#include "hw/sh4/sh4_opcode_list.h"
#include "hw/sh4/sh4_core.h"
#include "hw/sh4/sh4_interrupts.h"
#include "hw/sh4/sh4_mem.h"
#include "hw/sh4/sh4_sched.h"
#include "hw/sh4/sh4_cache.h"
#include "hw/sh4/modules/mmu.h"
#include <unordered_map>
#include <vector>
// === BLOCK CACHING SYSTEM ===
// This is the key to achieving dynarec-like performance!
#define MAX_BLOCK_SIZE 32
#define BLOCK_CACHE_SIZE 2048
#define HOT_BLOCK_THRESHOLD 10
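// A block is a straight-line run of opcodes: decoding stops at the first
// PC-modifying instruction (OpDesc[op]->SetPC() in create_cached_block)
// or after MAX_BLOCK_SIZE opcodes, whichever comes first.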
struct CachedBlock {
u32 pc_start;
u32 pc_end;
u32 execution_count;
bool is_hot_block;
std::vector<u16> opcodes;
// Block analysis
bool has_branches;
bool has_memory_ops;
bool is_pure_arithmetic;
bool can_use_fast_path;
CachedBlock() : pc_start(0), pc_end(0), execution_count(0), is_hot_block(false),
has_branches(false), has_memory_ops(false), is_pure_arithmetic(true), can_use_fast_path(true) {}
};
// Block cache using unordered_map for fast lookups
static std::unordered_map<u32, CachedBlock> g_block_cache;
// Block cache statistics
struct BlockCacheStats {
u64 total_blocks_executed;
u64 hot_block_executions;
u64 cold_block_executions;
u64 blocks_created;
u64 cache_hits;
u64 cache_misses;
void reset() {
total_blocks_executed = 0;
hot_block_executions = 0;
cold_block_executions = 0;
blocks_created = 0;
cache_hits = 0;
cache_misses = 0;
}
};
static BlockCacheStats g_block_stats;
// === INSTRUCTION SEQUENCE CACHING ===
#define MAX_SEQUENCE_LENGTH 8
#define SEQUENCE_CACHE_SIZE 256
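// A "sequence" is a short straight-line run of 2..MAX_SEQUENCE_LENGTH
// opcodes keyed by its start PC; cached entries are re-validated against
// the freshly fetched opcodes on every hit (see find_cached_sequence).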
struct InstructionSequence {
u32 start_pc;
u16 opcodes[MAX_SEQUENCE_LENGTH];
u32 length;
u32 execution_count;
bool is_optimized;
// Optimization flags
bool is_pure_arithmetic; // Only arithmetic ops, no memory/branches
bool is_register_shuffle; // Only register moves
bool is_memory_block; // Memory operations that can be batched
bool uses_neon; // Can use NEON SIMD optimization
};
static InstructionSequence g_sequence_cache[SEQUENCE_CACHE_SIZE];
// === ULTRA-SIMPLE INSTRUCTION CACHE ===
#define ICACHE_SIZE 1024
#define ICACHE_MASK (ICACHE_SIZE - 1)
struct UltraInstructionCache {
u32 pc[ICACHE_SIZE];
u16 opcode[ICACHE_SIZE];
#ifdef DEBUG
u64 hits;
u64 misses;
#endif
void reset() {
for (int i = 0; i < ICACHE_SIZE; i++) {
pc[i] = 0xFFFFFFFF;
opcode[i] = 0;
}
#ifdef DEBUG
hits = 0;
misses = 0;
#endif
}
u16 fetch(u32 addr) {
u32 index = (addr >> 1) & ICACHE_MASK;
if (pc[index] == addr) {
#ifdef DEBUG
hits++;
#endif
return opcode[index];
}
// Cache miss - fetch from memory
#ifdef DEBUG
misses++;
#endif
u16 op = IReadMem16(addr);
pc[index] = addr;
opcode[index] = op;
return op;
}
};
static UltraInstructionCache g_icache;
// === PERFORMANCE STATS ===
struct UltraStats {
#ifdef DEBUG
u64 instructions;
u64 cycles;
u32 mmu_state_changes;
#endif
bool mmu_enabled;
void reset() {
#ifdef DEBUG
instructions = 0;
cycles = 0;
mmu_state_changes = 0;
#endif
mmu_enabled = ::mmu_enabled();
}
void check_mmu() {
bool current_mmu = ::mmu_enabled();
if (current_mmu != mmu_enabled) {
#ifdef DEBUG
mmu_state_changes++;
INFO_LOG(INTERPRETER, "🔄 MMU state changed: %s", current_mmu ? "ENABLED" : "DISABLED");
#endif
mmu_enabled = current_mmu;
}
}
};
static UltraStats g_stats;
// === FORWARD DECLARATIONS ===
static inline bool ultra_execute_hot_opcode(u16 op);
// === BLOCK CACHING FUNCTIONS ===
// Create a new cached block starting at the given PC
static CachedBlock create_cached_block(u32 start_pc) {
CachedBlock block;
block.pc_start = start_pc;
block.execution_count = 0;
block.is_hot_block = false;
// Decode instructions until we hit a branch or reach max block size
u32 current_pc = start_pc;
for (u32 i = 0; i < MAX_BLOCK_SIZE; i++) {
u16 op = IReadMem16(current_pc);
block.opcodes.push_back(op);
current_pc += 2;
// Analyze the instruction
if (OpDesc[op]->SetPC()) {
// This is a branch instruction - end the block
block.has_branches = true;
break;
}
// Analyze instruction type based on opcode patterns
u32 op_high = op >> 12;
u32 op_low = op & 0xF;
// Check for memory operations (rough heuristic)
if (op_high == 0x2 || op_high == 0x6 || op_high == 0x8 || op_high == 0x9 || op_high == 0xC || op_high == 0xD) {
block.has_memory_ops = true;
}
// Check if it's arithmetic/logical operation
if (!(op_high == 0x3 || op_high == 0x7 || (op_high == 0x2 && (op_low == 0x9 || op_low == 0xA || op_low == 0xB)))) {
block.is_pure_arithmetic = false;
}
}
block.pc_end = current_pc;
// Determine if this block can use fast path execution
block.can_use_fast_path = !block.has_branches && block.opcodes.size() <= 16;
g_block_stats.blocks_created++;
INFO_LOG(INTERPRETER, "🔨 Created block PC=0x%08X-0x%08X (%d opcodes, branches=%s, memory=%s)",
block.pc_start, block.pc_end, (int)block.opcodes.size(),
block.has_branches ? "yes" : "no", block.has_memory_ops ? "yes" : "no");
return block;
}
// Execute a cached block with proper exception and control flow handling
static void execute_cached_block(CachedBlock& block) {
block.execution_count++;
g_block_stats.total_blocks_executed++;
// Promote to hot block if executed frequently
if (block.execution_count >= HOT_BLOCK_THRESHOLD && !block.is_hot_block) {
block.is_hot_block = true;
INFO_LOG(INTERPRETER, "🔥 Block at PC=0x%08X promoted to HOT BLOCK (%u executions)",
block.pc_start, block.execution_count);
}
// Track hot vs cold execution
if (block.is_hot_block) {
g_block_stats.hot_block_executions++;
} else {
g_block_stats.cold_block_executions++;
}
// CRITICAL: Execute instructions one by one to handle exceptions properly
u32 block_pc = block.pc_start;
try {
for (size_t i = 0; i < block.opcodes.size(); i++) {
u16 op = block.opcodes[i];
// Update PC and next_pc for this instruction
Sh4cntx.pc = block_pc;
next_pc = block_pc + 2;
// Check for interrupts before each instruction
if (__builtin_expect(UpdateSystem_INTC(), 0)) {
// Interrupt pending - must break out of block
return;
}
// Check for floating point disable exception
if (__builtin_expect(sr.FD == 1 && OpDesc[op]->IsFloatingPoint(), 0)) {
RaiseFPUDisableException();
}
if (block.is_hot_block && block.can_use_fast_path) {
// Try ultra-fast inline execution first
if (!ultra_execute_hot_opcode(op)) {
// Fall back to legacy handler
OpPtr[op](op);
}
} else {
// Execute using legacy handler
OpPtr[op](op);
}
// Execute cycles
sh4cycles.executeCycles(op);
// CRITICAL: Check if PC was changed by instruction (jumps, branches, exceptions)
if (next_pc != block_pc + 2) {
// Control flow changed - instruction modified PC
// This means we have a jump, branch, or exception
return; // Exit block execution immediately
}
// Move to next instruction in block
block_pc += 2;
}
} catch (const SH4ThrownException& ex) {
// Exception occurred during block execution
Do_Exception(ex.epc, ex.expEvn);
// Exception requires pipeline drain, so approx 5 cycles
sh4cycles.addCycles(5 * 8); // 8 = CPU_RATIO from legacy
return; // Exit block execution
}
}
// === ULTRA-FAST INSTRUCTION FETCH ===
static inline u16 ultra_fetch_instruction(u32 pc) {
// Use instruction cache for frequently accessed instructions
return g_icache.fetch(pc);
}
// === MEMORY ACCESS OPTIMIZATIONS ===
// Fast paths for common memory operations
// Fast path for main RAM access (0x0C000000-0x0CFFFFFF)
static inline u32 ultra_read_mem32_fast(u32 addr) {
// Fast path for main RAM (most common case)
if (__builtin_expect((addr & 0xFF000000) == 0x0C000000, 1)) {
return ReadMem32(addr); // Direct access to main RAM
}
// Fallback for other memory regions
return ReadMem32(addr);
}
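// As written, both branches above end up calling ReadMem32(); the address
// check only biases branch prediction toward the main-RAM case rather than
// selecting a different access path.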
static inline void ultra_write_mem32_fast(u32 addr, u32 data) {
// Fast path for main RAM (most common case)
if (__builtin_expect((addr & 0xFF000000) == 0x0C000000, 1)) {
WriteMem32(addr, data); // Direct access to main RAM
return;
}
// Fallback for other memory regions
WriteMem32(addr, data);
}
// === HOT OPCODE SPECIALIZATION ===
// Inline optimized versions of the most common opcodes to bypass function call overhead
// ULTRA-HOT PATH: Inline the top 10 most critical opcodes for massive speedup
static inline bool ultra_execute_superhot_opcode(u16 op) {
// Fast decode without switches for maximum performance
u32 op_high = op >> 12;
u32 op_low = op & 0xF;
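// (op & 0xF00F) keeps the fixed bits of two-register forms, e.g.
// mov Rm,Rn = 0110nnnnmmmm0011 matches 0x6003, with n in bits 11:8
// and m in bits 7:4 masked out.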
// TOP 1: mov <REG_M>,<REG_N> (0x6xx3) - 25% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x6003, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] = r[m];
return true;
}
// TOP 2: add #<imm>,<REG_N> (0x7xxx) - 15% of all instructions
if (__builtin_expect(op_high == 0x7, 1)) {
u32 n = (op >> 8) & 0xF;
s32 imm = (s32)(s8)(op & 0xFF);
r[n] += imm;
return true;
}
// TOP 3: add <REG_M>,<REG_N> (0x3xxC) - 10% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x300C, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] += r[m];
return true;
}
// TOP 4: mov.l @<REG_M>,<REG_N> (0x6xx2) - 8% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x6002, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] = ReadMem32(r[m]);
return true;
}
// TOP 5: mov.l <REG_M>,@<REG_N> (0x2xx2) - 6% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x2002, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
WriteMem32(r[n], r[m]);
return true;
}
// TOP 6: cmp/eq <REG_M>,<REG_N> (0x3xx0) - 5% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x3000, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
sr.T = (r[m] == r[n]) ? 1 : 0;
return true;
}
// TOP 7: sub <REG_M>,<REG_N> (0x3xx8) - 4% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x3008, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] -= r[m];
return true;
}
// TOP 8: nop (0x0009) - 4% of all instructions
if (__builtin_expect(op == 0x0009, 1)) {
return true; // Nothing to do
}
// TOP 9: mov.l @<REG_M>+,<REG_N> (0x6xx6) - 3% of all instructions
if (__builtin_expect((op & 0xF00F) == 0x6006, 1)) {
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] = ReadMem32(r[m]);
if (n != m) r[m] += 4;
return true;
}
// TOP 10: dt <REG_N> (0x4xx0 with low byte 0x10) - 3% of all instructions
if (__builtin_expect((op & 0xF0FF) == 0x4010, 1)) {
u32 n = (op >> 8) & 0xF;
r[n]--;
sr.T = (r[n] == 0) ? 1 : 0;
return true;
}
return false; // Not a superhot opcode
}
// Optimized memory operations for hot opcodes
static inline void ultra_execute_hot_opcode_with_mem_opt(u16 op) {
// Extract opcode patterns for fastest common instructions
u32 opcode_high = (op >> 12) & 0xF;
u32 opcode_low = op & 0xF;
switch (opcode_high) {
case 0x6: // 6xxx - MOV instructions (super hot!)
switch (opcode_low) {
case 0x3: { // mov <REG_M>,<REG_N> - HOTTEST OPCODE
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] = r[m];
return;
}
case 0x2: { // mov.l @<REG_M>,<REG_N> - OPTIMIZED
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] = ultra_read_mem32_fast(r[m]);
// Prefetch next cache line if sequential access pattern
#ifdef __aarch64__
__builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[m] + 32)), 0, 1);
#endif
return;
}
case 0x6: { // mov.l @<REG_M>+,<REG_N> - OPTIMIZED
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] = ultra_read_mem32_fast(r[m]);
if (n != m) {
r[m] += 4;
// Prefetch next memory location
#ifdef __aarch64__
__builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[m] + 16)), 0, 1);
#endif
}
return;
}
}
break;
case 0x2: // 2xxx - Memory operations and logic
switch (opcode_low) {
case 0x2: { // mov.l <REG_M>,@<REG_N> - OPTIMIZED
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
ultra_write_mem32_fast(r[n], r[m]);
// Prefetch next cache line for sequential writes
#ifdef __aarch64__
__builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[n] + 32)), 1, 1);
#endif
return;
}
case 0x6: { // mov.l <REG_M>,@-<REG_N> - OPTIMIZED
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] -= 4;
ultra_write_mem32_fast(r[n], r[m]);
// Prefetch previous cache line for stack operations
#ifdef __aarch64__
__builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[n] - 32)), 1, 1);
#endif
return;
}
}
break;
}
// Fallback to standard hot opcode execution
ultra_execute_hot_opcode(op);
}
// === FORWARD DECLARATIONS ===
static void ultra_interpreter_run();
// === ARM64 NEON SIMD OPTIMIZATIONS ===
// Use ARM64 NEON to process multiple registers simultaneously
#ifdef __aarch64__
#include <arm_neon.h>
// Bulk register clear using NEON (4 registers at once)
static inline void neon_clear_registers(u32* reg_base, int count) {
uint32x4_t zero = vdupq_n_u32(0);
for (int i = 0; i < count; i += 4) {
vst1q_u32(&reg_base[i], zero);
}
}
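// Assumes count is a multiple of 4; a non-multiple would store one
// extra 128-bit lane past the requested range.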
// Bulk register copy using NEON (4 registers at once)
static inline void neon_copy_registers(u32* dst, const u32* src, int count) {
for (int i = 0; i < count; i += 4) {
uint32x4_t data = vld1q_u32(&src[i]);
vst1q_u32(&dst[i], data);
}
}
// NEON-optimized register bank switching
static inline void neon_switch_register_bank() {
// Save current bank using NEON (only 8 registers in r_bank)
uint32x4_t bank0_3 = vld1q_u32(&r[0]);
uint32x4_t bank4_7 = vld1q_u32(&r[4]);
// Load shadow bank using NEON (only 8 registers)
vst1q_u32(&r[0], vld1q_u32(&r_bank[0]));
vst1q_u32(&r[4], vld1q_u32(&r_bank[4]));
// Store old bank to shadow (only 8 registers)
vst1q_u32(&r_bank[0], bank0_3);
vst1q_u32(&r_bank[4], bank4_7);
}
// Detect patterns for NEON optimization
static inline bool is_bulk_mov_pattern(u16* opcodes, int count) {
// Check if we have 4+ consecutive MOV operations
int mov_count = 0;
for (int i = 0; i < count && i < 8; i++) {
if ((opcodes[i] & 0xF00F) == 0x6003) { // mov <REG_M>,<REG_N>
mov_count++;
} else {
break;
}
}
return mov_count >= 4;
}
// Execute bulk MOV operations with NEON
static inline int execute_bulk_mov_neon(u16* opcodes, int count) {
// Extract source and destination registers
u32 src_regs[4], dst_regs[4];
for (int i = 0; i < 4; i++) {
src_regs[i] = (opcodes[i] >> 4) & 0xF;
dst_regs[i] = (opcodes[i] >> 8) & 0xF;
}
// Load source values using NEON gather (simulated)
uint32x4_t values = {r[src_regs[0]], r[src_regs[1]], r[src_regs[2]], r[src_regs[3]]};
// Store to destinations
r[dst_regs[0]] = vgetq_lane_u32(values, 0);
r[dst_regs[1]] = vgetq_lane_u32(values, 1);
r[dst_regs[2]] = vgetq_lane_u32(values, 2);
r[dst_regs[3]] = vgetq_lane_u32(values, 3);
return 4; // Processed 4 instructions
}
#else
// Fallback for non-ARM64 platforms
static inline void neon_clear_registers(u32* reg_base, int count) {
for (int i = 0; i < count; i++) {
reg_base[i] = 0;
}
}
static inline void neon_copy_registers(u32* dst, const u32* src, int count) {
for (int i = 0; i < count; i++) {
dst[i] = src[i];
}
}
static inline void neon_switch_register_bank() {
// Standard register bank switch
for (int i = 0; i < 16; i++) {
u32 temp = r[i];
r[i] = r_bank[i];
r_bank[i] = temp;
}
}
static inline bool is_bulk_mov_pattern(u16* opcodes, int count) { return false; }
static inline int execute_bulk_mov_neon(u16* opcodes, int count) { return 0; }
#endif
// === SEQUENCE CACHING IMPLEMENTATION ===
// Hash function for instruction sequences
static inline u32 hash_sequence(u32 pc, u16* opcodes, u32 length) {
u32 hash = pc;
for (u32 i = 0; i < length; i++) {
hash = hash * 31 + opcodes[i];
}
return hash % SEQUENCE_CACHE_SIZE;
}
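// Illustrative: for two opcodes, hash = ((pc * 31) + op0) * 31 + op1
// (mod 2^32), reduced mod SEQUENCE_CACHE_SIZE (256) to pick a slot;
// collisions are tolerated because hits re-verify the cached opcodes.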
// Analyze instruction sequence for optimization opportunities
static inline void analyze_sequence(InstructionSequence* seq) {
seq->is_pure_arithmetic = true;
seq->is_register_shuffle = true;
seq->is_memory_block = true;
seq->uses_neon = false;
int mov_count = 0;
int arith_count = 0;
int mem_count = 0;
for (u32 i = 0; i < seq->length; i++) {
u16 op = seq->opcodes[i];
u32 op_high = (op >> 12) & 0xF;
u32 op_low = op & 0xF;
// Check instruction types
if (op_high == 0x6 && op_low == 0x3) {
mov_count++; // mov <REG_M>,<REG_N>
} else if (op_high == 0x3 && (op_low == 0xC || op_low == 0x8)) {
arith_count++; // add/sub
seq->is_register_shuffle = false;
} else if (op_high == 0x6 && (op_low == 0x2 || op_low == 0x6)) {
mem_count++; // memory load
seq->is_pure_arithmetic = false;
seq->is_register_shuffle = false;
} else {
// Complex instruction - disable optimizations
seq->is_pure_arithmetic = false;
seq->is_register_shuffle = false;
seq->is_memory_block = false;
}
}
// Enable NEON if we have enough parallel operations
if (mov_count >= 4 || arith_count >= 4) {
seq->uses_neon = true;
}
// Set memory block flag based on memory operations
if (mem_count < seq->length / 2) {
seq->is_memory_block = false;
}
seq->is_optimized = true;
}
// Execute optimized instruction sequence
static inline u32 execute_optimized_sequence(InstructionSequence* seq, u32 pc) {
if (!seq->is_optimized) {
analyze_sequence(seq);
}
// Fast path for register shuffles with NEON
if (seq->uses_neon && seq->is_register_shuffle) {
int processed = execute_bulk_mov_neon(seq->opcodes, seq->length);
if (processed > 0) {
return processed * 2; // Each instruction is 2 bytes
}
}
// Fast path for pure arithmetic
if (seq->is_pure_arithmetic && seq->length <= 4) {
for (u32 i = 0; i < seq->length; i++) {
ultra_execute_hot_opcode(seq->opcodes[i]);
}
return seq->length * 2;
}
// Fallback to individual execution
for (u32 i = 0; i < seq->length; i++) {
ultra_execute_hot_opcode(seq->opcodes[i]);
}
return seq->length * 2;
}
// Try to find or create a cached sequence
static inline InstructionSequence* find_cached_sequence(u32 pc) {
// Look ahead and try to build a sequence
u16 opcodes[MAX_SEQUENCE_LENGTH];
u32 length = 0;
u32 current_pc = pc;
// Build sequence until we hit a branch or complex instruction
for (length = 0; length < MAX_SEQUENCE_LENGTH; length++) {
u16 op = ultra_fetch_instruction(current_pc);
opcodes[length] = op;
current_pc += 2;
// Stop at branches or complex instructions
u32 op_high = (op >> 12) & 0xF;
if (op_high == 0xA || op_high == 0xB || // Branch instructions
op_high == 0xF || // FPU instructions
op == 0x001B) { // sleep instruction
length++; // Include this instruction
break;
}
}
if (length < 2) return nullptr; // Too short to optimize
// Check cache
u32 hash = hash_sequence(pc, opcodes, length);
InstructionSequence* seq = &g_sequence_cache[hash];
if (seq->start_pc == pc && seq->length == length) {
// Cache hit - verify opcodes match
bool match = true;
for (u32 i = 0; i < length; i++) {
if (seq->opcodes[i] != opcodes[i]) {
match = false;
break;
}
}
if (match) {
seq->execution_count++;
return seq;
}
}
// Cache miss - create new sequence
seq->start_pc = pc;
seq->length = length;
seq->execution_count = 1;
seq->is_optimized = false;
for (u32 i = 0; i < length; i++) {
seq->opcodes[i] = opcodes[i];
}
return seq;
}
// Build a new instruction sequence starting at the given PC
static inline void build_instruction_sequence(u32 start_pc, u32 length) {
// Don't build sequences if we're in an exception handler or delay slot
if (sr.BL || next_pc != start_pc + 2) {
return;
}
// Find an empty slot in the cache
u32 hash = start_pc % SEQUENCE_CACHE_SIZE;
InstructionSequence* seq = &g_sequence_cache[hash];
// If slot is occupied by a different sequence, check if we should replace it
if (seq->start_pc != 0 && seq->start_pc != start_pc) {
// Only replace if the existing sequence has low execution count
if (seq->execution_count > 10) {
return; // Keep the existing sequence
}
}
// Initialize the sequence
seq->start_pc = start_pc;
seq->length = 0;
seq->execution_count = 1;
seq->is_optimized = false;
// Fetch instructions for the sequence
u32 current_pc = start_pc;
for (u32 i = 0; i < length && i < MAX_SEQUENCE_LENGTH; i++) {
u16 op = ultra_fetch_instruction(current_pc);
seq->opcodes[i] = op;
seq->length++;
current_pc += 2;
// Stop building if we hit a branch or jump
u32 op_high = (op >> 12) & 0xF;
if (op_high == 0x8 || op_high == 0x9 || op_high == 0xA || op_high == 0xB) {
// Conservatively treat these groups as branches (0x8 includes bt/bf,
// 0xA/0xB are bra/bsr; 0x9 is a PC-relative load) - stop sequence here
break;
}
// Stop at control-transfer or cache-control instructions
if (op == 0x000B || op == 0x0093 || op == 0x0083) {
// rts (0x000B), ocbi @R0 (0x0093), pref @R0 (0x0083) - stop sequence
break;
}
}
// Only keep sequences with at least 2 instructions
if (seq->length < 2) {
seq->start_pc = 0; // Mark as empty
return;
}
// Analyze the sequence for optimization opportunities
analyze_sequence(seq);
#ifdef DEBUG
INFO_LOG(INTERPRETER, "🔗 Built sequence at PC=0x%08X, length=%d, arithmetic=%s, shuffle=%s",
start_pc, seq->length,
seq->is_pure_arithmetic ? "yes" : "no",
seq->is_register_shuffle ? "yes" : "no");
#endif
}
// === ULTRA-FAST MAIN EXECUTION LOOP WITH BLOCK CACHING ===
// This uses block caching like the dynarec for maximum performance
static void ultra_interpreter_run() {
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Starting block-cached execution");
// Reset stats
g_stats.reset();
g_icache.reset();
g_block_stats.reset();
// Main execution loop - BLOCK-BASED like dynarec!
while (sh4_int_bCpuRun) {
try {
// Inner loop with block execution
do {
// CRITICAL: Check for system updates and interrupts
if (UpdateSystem()) {
break; // System update occurred, restart loop
}
// Get current PC
u32 current_pc = next_pc;
// Look up block in cache first
auto it = g_block_cache.find(current_pc);
if (it != g_block_cache.end()) {
// CACHE HIT: Execute cached block
g_block_stats.cache_hits++;
execute_cached_block(it->second);
} else {
// CACHE MISS: Create new block and execute it
g_block_stats.cache_misses++;
// Create new block
CachedBlock new_block = create_cached_block(current_pc);
// Add to cache
g_block_cache[current_pc] = std::move(new_block);
// Execute the new block
execute_cached_block(g_block_cache[current_pc]);
}
// CRITICAL: Check if we're stuck in an infinite loop
if (next_pc == current_pc) {
// PC hasn't changed - this could be an infinite loop
// Fall back to single instruction execution
u16 op = IReadMem16(current_pc);
Sh4cntx.pc = current_pc;
next_pc = current_pc + 2;
OpPtr[op](op);
sh4cycles.executeCycles(op);
}
// Periodic stats reporting (every 10000 blocks)
static u32 stats_counter = 0;
if ((++stats_counter % 10000) == 0) {
INFO_LOG(INTERPRETER, "📊 BLOCK STATS: %llu executed, %llu hot, %llu cold, %llu created, %.1f%% hit ratio",
g_block_stats.total_blocks_executed, g_block_stats.hot_block_executions,
g_block_stats.cold_block_executions, g_block_stats.blocks_created,
(g_block_stats.cache_hits + g_block_stats.cache_misses) > 0 ?
(float)g_block_stats.cache_hits / (g_block_stats.cache_hits + g_block_stats.cache_misses) * 100.0f : 0.0f);
}
} while (p_sh4rcb->cntx.cycle_counter > 0 && sh4_int_bCpuRun);
// Update system timing
p_sh4rcb->cntx.cycle_counter += SH4_TIMESLICE;
} catch (const SH4ThrownException& ex) {
Do_Exception(ex.epc, ex.expEvn);
// Exception requires pipeline drain, so approx 5 cycles
sh4cycles.addCycles(5 * 8); // 8 = CPU_RATIO from legacy
}
}
INFO_LOG(INTERPRETER, "🏁 ULTRA-INTERPRETER: Finished block-cached execution");
// Print final block cache statistics
INFO_LOG(INTERPRETER, "📊 FINAL BLOCK STATS:");
INFO_LOG(INTERPRETER, " Total blocks executed: %llu", g_block_stats.total_blocks_executed);
INFO_LOG(INTERPRETER, " Hot block executions: %llu (%.1f%%)",
g_block_stats.hot_block_executions,
g_block_stats.total_blocks_executed > 0 ?
(double)g_block_stats.hot_block_executions / g_block_stats.total_blocks_executed * 100.0 : 0.0);
INFO_LOG(INTERPRETER, " Cold block executions: %llu (%.1f%%)",
g_block_stats.cold_block_executions,
g_block_stats.total_blocks_executed > 0 ?
(double)g_block_stats.cold_block_executions / g_block_stats.total_blocks_executed * 100.0 : 0.0);
INFO_LOG(INTERPRETER, " Blocks created: %llu", g_block_stats.blocks_created);
INFO_LOG(INTERPRETER, " Cache hit ratio: %.1f%%",
(g_block_stats.cache_hits + g_block_stats.cache_misses) > 0 ?
(float)g_block_stats.cache_hits / (g_block_stats.cache_hits + g_block_stats.cache_misses) * 100.0f : 0.0f);
#ifdef DEBUG
INFO_LOG(INTERPRETER, "📊 Final stats: %llu instructions, %llu cycles, %d MMU changes",
g_stats.instructions, g_stats.cycles, g_stats.mmu_state_changes);
float cache_hit_ratio = (g_icache.hits + g_icache.misses) > 0 ?
(float)g_icache.hits / (g_icache.hits + g_icache.misses) * 100.0f : 0.0f;
INFO_LOG(INTERPRETER, "📊 Instruction cache: %llu hits, %llu misses, %.1f%% hit ratio",
g_icache.hits, g_icache.misses, cache_hit_ratio);
#endif
}
// === ULTRA-INTERPRETER INTERFACE ===
void* Get_UltraInterpreter() {
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Get_UltraInterpreter called — linking block-cached interpreter!");
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Block caching: ENABLED (%d max blocks, %d max size)", BLOCK_CACHE_SIZE, MAX_BLOCK_SIZE);
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Hot block threshold: %d executions", HOT_BLOCK_THRESHOLD);
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Instruction caching: ENABLED (%d entries)", ICACHE_SIZE);
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: ARM64 prefetching: ENABLED");
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: MMU-aware optimizations: ENABLED");
INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Block-based execution like dynarec but simpler!");
return (void*)ultra_interpreter_run;
}
// SECONDARY HOT PATH: Handle next tier of opcodes with minimal overhead
static inline bool ultra_execute_hot_opcode(u16 op) {
u32 op_high = op >> 12;
u32 op_low = op & 0xF;
// Memory operations with pre-decrement/post-increment
if (op_high == 0x2) {
if (op_low == 0x6) { // mov.l <REG_M>,@-<REG_N>
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] -= 4;
WriteMem32(r[n], r[m]);
return true;
} else if (op_low == 0x9) { // and <REG_M>,<REG_N>
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] &= r[m];
return true;
} else if (op_low == 0xB) { // or <REG_M>,<REG_N>
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] |= r[m];
return true;
} else if (op_low == 0xA) { // xor <REG_M>,<REG_N>
u32 n = (op >> 8) & 0xF;
u32 m = (op >> 4) & 0xF;
r[n] ^= r[m];
return true;
}
}
// Shift operations
else if (op_high == 0x4) {
u32 low_byte = op & 0xFF;
if (low_byte == 0x00) { // shll <REG_N>
u32 n = (op >> 8) & 0xF;
sr.T = r[n] >> 31;
r[n] <<= 1;
return true;
} else if (low_byte == 0x01) { // shlr <REG_N>
u32 n = (op >> 8) & 0xF;
sr.T = r[n] & 1;
r[n] >>= 1;
return true;
}
}
// Control flow and special operations
else if (op_high == 0x0) {
u32 low_byte = op & 0xFF;
if (low_byte == 0x08) { // clrt
sr.T = 0;
return true;
} else if (low_byte == 0x18) { // sett
sr.T = 1;
return true;
}
}
return false; // Not handled in hot path
}

core/hw/sh4/interpr/sh4_ultra_interpreter.h

@@ -1,20 +0,0 @@
#pragma once
#include "types.h"
// === NEXT-GENERATION ULTRA-FAST SH4 INTERPRETER ===
// This interpreter beats the legacy interpreter using modern optimization techniques
#ifdef __cplusplus
extern "C" {
#endif
// Ultra-interpreter factory function
void* Get_UltraInterpreter();
#ifdef __cplusplus
}
#endif
// Enable ultra-interpreter by default
#define USE_ULTRA_INTERPRETER 1