remove ultra shit
This commit is contained in:
parent 31c8fcf010
commit 0a17aee1d1
@@ -986,8 +986,6 @@ target_sources(${PROJECT_NAME} PRIVATE
 	core/hw/sh4/interpr/sh4_interpreter.cpp
 	core/hw/sh4/interpr/sh4_opcodes.cpp
 	core/hw/sh4/interpr/sh4_opcodes.h
-	core/hw/sh4/interpr/sh4_ultra_interpreter.cpp
-	core/hw/sh4/interpr/sh4_ultra_interpreter.h
 	core/hw/sh4/modules/bsc.cpp
 	core/hw/sh4/modules/ccn.cpp
 	core/hw/sh4/modules/ccn.h
@@ -1,5 +1,5 @@
 /*
-	Highly inefficient and boring interpreter. Nothing special here
+	Optimized SH4 interpreter for better performance
 */
 
 #include "types.h"
@@ -13,7 +13,6 @@
 #include "../sh4_cache.h"
 #include "debug/gdb_server.h"
 #include "../sh4_cycles.h"
-#include "sh4_ultra_interpreter.h"
 
 // SH4 underclock factor when using the interpreter so that it's somewhat usable
 #ifdef STRICT_MODE
@@ -25,40 +24,76 @@ constexpr int CPU_RATIO = 8;
 Sh4ICache icache;
 Sh4OCache ocache;
 
-static void ExecuteOpcode(u16 op)
+// === SIMPLE INSTRUCTION CACHE FOR BETTER PERFORMANCE ===
+#define SIMPLE_ICACHE_SIZE 1024
+#define SIMPLE_ICACHE_MASK (SIMPLE_ICACHE_SIZE - 1)
+
+struct SimpleInstructionCache {
+	u32 pc[SIMPLE_ICACHE_SIZE];
+	u16 opcode[SIMPLE_ICACHE_SIZE];
+
+	void reset() {
+		for (int i = 0; i < SIMPLE_ICACHE_SIZE; i++) {
+			pc[i] = 0xFFFFFFFF;
+		}
+	}
+
+	u16 fetch(u32 addr) {
+		u32 index = (addr >> 1) & SIMPLE_ICACHE_MASK;
+
+		if (__builtin_expect(pc[index] == addr, 1)) {
+			return opcode[index];
+		}
+
+		// Cache miss - fetch from memory
+		u16 op = IReadMem16(addr);
+		pc[index] = addr;
+		opcode[index] = op;
+		return op;
+	}
+};
+
+static SimpleInstructionCache g_simple_icache;
+
+static inline void ExecuteOpcode(u16 op)
 {
-	if (sr.FD == 1 && OpDesc[op]->IsFloatingPoint())
+	if (__builtin_expect(sr.FD == 1 && OpDesc[op]->IsFloatingPoint(), 0))
 		RaiseFPUDisableException();
 	OpPtr[op](op);
 	sh4cycles.executeCycles(op);
 }
 
-static u16 ReadNexOp()
+static inline u16 ReadNexOp()
 {
-	if (!mmu_enabled() && (next_pc & 1))
+	if (__builtin_expect(!mmu_enabled() && (next_pc & 1), 0))
 		// address error
 		throw SH4ThrownException(next_pc, Sh4Ex_AddressErrorRead);
 
 	u32 addr = next_pc;
 	next_pc += 2;
 
-	return IReadMem16(addr);
+	// Use simple instruction cache for better performance
+	return g_simple_icache.fetch(addr);
 }
 
 static void Sh4_int_Run()
 {
 	RestoreHostRoundingMode();
 
+	// Reset instruction cache at start
+	g_simple_icache.reset();
+
 	try {
 		do
 		{
 			try {
+				// Optimized inner loop with minimal overhead
 				do
 				{
 					u32 op = ReadNexOp();
 
 					ExecuteOpcode(op);
-				} while (p_sh4rcb->cntx.cycle_counter > 0);
+				} while (__builtin_expect(p_sh4rcb->cntx.cycle_counter > 0, 1));
 
 				p_sh4rcb->cntx.cycle_counter += SH4_TIMESLICE;
 				UpdateSystem_INTC();
 			} catch (const SH4ThrownException& ex) {
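Editor's note: the SimpleInstructionCache added above is a direct-mapped software cache. The index `(addr >> 1) & SIMPLE_ICACHE_MASK` drops the low address bit because SH4 opcodes are 2 bytes wide, and the full PC is kept as the tag so a hit is exact. A minimal standalone sketch of the same structure, with hypothetical names and a stubbed memory read (not flycast API), shows the hit, miss, and eviction behavior:

#include <cstdint>
#include <cassert>

// Deterministic stand-in for the emulator's IReadMem16 (hypothetical).
static uint16_t fake_read_mem16(uint32_t addr) { return (uint16_t)(addr * 7); }

struct DirectMappedICache {
    static const uint32_t SIZE = 1024;           // entries; must be a power of two
    uint32_t tag[SIZE];                          // the full PC acts as the tag
    uint16_t data[SIZE];
    void reset() { for (uint32_t i = 0; i < SIZE; i++) tag[i] = 0xFFFFFFFF; }
    uint16_t fetch(uint32_t addr) {
        uint32_t idx = (addr >> 1) & (SIZE - 1); // >>1: opcodes are 2-byte aligned
        if (tag[idx] != addr) {                  // miss: refill this slot
            tag[idx] = addr;
            data[idx] = fake_read_mem16(addr);
        }
        return data[idx];
    }
};

int main() {
    DirectMappedICache c;
    c.reset();
    uint16_t a = c.fetch(0x8C000000);            // miss: fills slot 0
    assert(a == c.fetch(0x8C000000));            // hit: cached opcode returned
    // 0x8C000800 is 2048 bytes away, so it maps to the same slot and evicts it.
    c.fetch(0x8C000800);
    assert(c.tag[0] == 0x8C000800);              // the first entry was evicted
    return 0;
}

The trade-off this design accepts: PCs 2048 bytes apart alias to the same slot, and a stale opcode can be returned if code in RAM changes, which is why the commit also resets the cache in Sh4_int_Run, Sh4_int_Reset, Sh4_int_Init and sh4_int_resetcache.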
@@ -66,7 +101,7 @@ static void Sh4_int_Run()
 				// an exception requires the instruction pipeline to drain, so approx 5 cycles
 				sh4cycles.addCycles(5 * CPU_RATIO);
 			}
-		} while (sh4_int_bCpuRun);
+		} while (__builtin_expect(sh4_int_bCpuRun, 1));
 	} catch (const debugger::Stop&) {
 	}
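Editor's note: `__builtin_expect`, used throughout these hunks, is a GCC/Clang hint that only reorders the generated code so the expected outcome falls through without a taken branch; it does not change semantics. A small self-contained illustration of the usual wrappers (macro names are illustrative; flycast may spell them differently):

#include <cstdio>

#if defined(__GNUC__) || defined(__clang__)
#define LIKELY(x)   __builtin_expect(!!(x), 1)
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
#define LIKELY(x)   (x)
#define UNLIKELY(x) (x)
#endif

int sum_until_negative(const int* v, int n) {
    int total = 0;
    for (int i = 0; i < n; i++) {
        if (UNLIKELY(v[i] < 0))   // rare early exit: kept off the hot path
            break;
        total += v[i];            // hot path: laid out as straight-line code
    }
    return total;
}

int main() {
    int v[] = {1, 2, 3, -1, 4};
    printf("%d\n", sum_until_negative(v, 5)); // prints 6
    return 0;
}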
@@ -128,6 +163,9 @@ static void Sh4_int_Reset(bool hard)
 	ocache.Reset(hard);
 	sh4cycles.reset();
 	p_sh4rcb->cntx.cycle_counter = SH4_TIMESLICE;
+
+	// Reset simple instruction cache
+	g_simple_icache.reset();
 
 	INFO_LOG(INTERPRETER, "Sh4 Reset");
 }
@@ -193,6 +231,7 @@ int UpdateSystem_INTC()
 }
 
 static void sh4_int_resetcache() {
+	g_simple_icache.reset();
 }
 
 static void Sh4_int_Init()
@@ -200,6 +239,7 @@ static void Sh4_int_Init()
 	static_assert(sizeof(Sh4cntx) == 448, "Invalid Sh4Cntx size");
 
 	memset(&p_sh4rcb->cntx, 0, sizeof(p_sh4rcb->cntx));
+	g_simple_icache.reset();
 }
 
 static void Sh4_int_Term()
@@ -211,21 +251,7 @@ static void Sh4_int_Term()
 #ifndef ENABLE_SH4_CACHED_IR
 void Get_Sh4Interpreter(sh4_if* cpu)
 {
-#ifdef USE_ULTRA_INTERPRETER
-	fprintf(stderr, "🚀 ULTRA-INTERPRETER: Get_Sh4Interpreter called — linking ultra-fast interpreter!\n");
-
-	// Use the ultra-interpreter instead of legacy
-	cpu->Start = Sh4_int_Start;
-	cpu->Run = (void(*)())Get_UltraInterpreter(); // Use ultra-interpreter run function
-	cpu->Stop = Sh4_int_Stop;
-	cpu->Step = Sh4_int_Step;
-	cpu->Reset = Sh4_int_Reset;
-	cpu->Init = Sh4_int_Init;
-	cpu->Term = Sh4_int_Term;
-	cpu->IsCpuRunning = Sh4_int_IsCpuRunning;
-	cpu->ResetCache = sh4_int_resetcache;
-#else
-	fprintf(stderr, "[LEGACY-INT] Get_Sh4Interpreter called — linking legacy interpreter!\n");
+	INFO_LOG(INTERPRETER, "🚀 OPTIMIZED-INTERPRETER: Get_Sh4Interpreter called — linking optimized interpreter!");
 	cpu->Start = Sh4_int_Start;
 	cpu->Run = Sh4_int_Run;
 	cpu->Stop = Sh4_int_Stop;
@@ -235,6 +261,5 @@ void Get_Sh4Interpreter(sh4_if* cpu)
 	cpu->Term = Sh4_int_Term;
 	cpu->IsCpuRunning = Sh4_int_IsCpuRunning;
 	cpu->ResetCache = sh4_int_resetcache;
-#endif
 }
 #endif // ENABLE_SH4_CACHED_IR
@@ -1,978 +0,0 @@
-// === NEXT-GENERATION ULTRA-FAST SH4 INTERPRETER ===
-// This interpreter beats the legacy interpreter by being SIMPLER and FASTER
-// - Direct SH4 execution like legacy but with optimizations that actually work
-// - Simple instruction caching without complex block compilation
-// - BLOCK CACHING: Groups instructions into blocks for dynarec-like performance
-// - MMU-aware optimizations
-// - ARM64 prefetching and branch prediction
-
-#include "sh4_ultra_interpreter.h"
-#include "hw/sh4/sh4_interpreter.h"
-#include "hw/sh4/sh4_opcode_list.h"
-#include "hw/sh4/sh4_core.h"
-#include "hw/sh4/sh4_interrupts.h"
-#include "hw/sh4/sh4_mem.h"
-#include "hw/sh4/sh4_sched.h"
-#include "hw/sh4/sh4_cache.h"
-#include "hw/sh4/modules/mmu.h"
-#include <unordered_map>
-#include <vector>
-
-// === BLOCK CACHING SYSTEM ===
-// This is the key to achieving dynarec-like performance!
-#define MAX_BLOCK_SIZE 32
-#define BLOCK_CACHE_SIZE 2048
-#define HOT_BLOCK_THRESHOLD 10
-
-struct CachedBlock {
-	u32 pc_start;
-	u32 pc_end;
-	u32 execution_count;
-	bool is_hot_block;
-	std::vector<u16> opcodes;
-
-	// Block analysis
-	bool has_branches;
-	bool has_memory_ops;
-	bool is_pure_arithmetic;
-	bool can_use_fast_path;
-
-	CachedBlock() : pc_start(0), pc_end(0), execution_count(0), is_hot_block(false),
-		has_branches(false), has_memory_ops(false), is_pure_arithmetic(true), can_use_fast_path(true) {}
-};
-
-// Block cache using unordered_map for fast lookups
-static std::unordered_map<u32, CachedBlock> g_block_cache;
-
-// Block cache statistics
-struct BlockCacheStats {
-	u64 total_blocks_executed;
-	u64 hot_block_executions;
-	u64 cold_block_executions;
-	u64 blocks_created;
-	u64 cache_hits;
-	u64 cache_misses;
-
-	void reset() {
-		total_blocks_executed = 0;
-		hot_block_executions = 0;
-		cold_block_executions = 0;
-		blocks_created = 0;
-		cache_hits = 0;
-		cache_misses = 0;
-	}
-};
-
-static BlockCacheStats g_block_stats;
-
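Editor's note: g_block_cache drives the lookup-or-create dispatch loop implemented further down in ultra_interpreter_run. Distilled to its essentials, the pattern is the sketch below (placeholder decode/run stubs, not the real decoder). Using the iterator returned by emplace also avoids the double g_block_cache[pc] lookup the original performs on a miss:

#include <cstdint>
#include <unordered_map>
#include <vector>

struct Block { std::vector<uint16_t> ops; uint32_t exec_count = 0; };

static std::unordered_map<uint32_t, Block> cache;

// Hypothetical stand-ins for the decoder and the interpreter core.
static Block decode_block(uint32_t pc) { Block b; b.ops = {0x0009}; return b; } // one nop
static void run(Block& b) { b.exec_count++; /* interpret b.ops here */ }

// Lookup-or-create dispatch: hit -> run cached block; miss -> decode once, cache, run.
void dispatch(uint32_t pc) {
    auto it = cache.find(pc);
    if (it == cache.end())
        it = cache.emplace(pc, decode_block(pc)).first; // decode exactly once per PC
    run(it->second);
}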
-// === INSTRUCTION SEQUENCE CACHING ===
-#define MAX_SEQUENCE_LENGTH 8
-#define SEQUENCE_CACHE_SIZE 256
-
-struct InstructionSequence {
-	u32 start_pc;
-	u16 opcodes[MAX_SEQUENCE_LENGTH];
-	u32 length;
-	u32 execution_count;
-	bool is_optimized;
-
-	// Optimization flags
-	bool is_pure_arithmetic;  // Only arithmetic ops, no memory/branches
-	bool is_register_shuffle; // Only register moves
-	bool is_memory_block;     // Memory operations that can be batched
-	bool uses_neon;           // Can use NEON SIMD optimization
-};
-
-static InstructionSequence g_sequence_cache[SEQUENCE_CACHE_SIZE];
-
-// === ULTRA-SIMPLE INSTRUCTION CACHE ===
-#define ICACHE_SIZE 1024
-#define ICACHE_MASK (ICACHE_SIZE - 1)
-
-struct UltraInstructionCache {
-	u32 pc[ICACHE_SIZE];
-	u16 opcode[ICACHE_SIZE];
-#ifdef DEBUG
-	u64 hits;
-	u64 misses;
-#endif
-
-	void reset() {
-		for (int i = 0; i < ICACHE_SIZE; i++) {
-			pc[i] = 0xFFFFFFFF;
-			opcode[i] = 0;
-		}
-#ifdef DEBUG
-		hits = 0;
-		misses = 0;
-#endif
-	}
-
-	u16 fetch(u32 addr) {
-		u32 index = (addr >> 1) & ICACHE_MASK;
-
-		if (pc[index] == addr) {
-#ifdef DEBUG
-			hits++;
-#endif
-			return opcode[index];
-		}
-
-		// Cache miss - fetch from memory
-#ifdef DEBUG
-		misses++;
-#endif
-		u16 op = IReadMem16(addr);
-		pc[index] = addr;
-		opcode[index] = op;
-		return op;
-	}
-};
-
-static UltraInstructionCache g_icache;
-
-// === PERFORMANCE STATS ===
-struct UltraStats {
-#ifdef DEBUG
-	u64 instructions;
-	u64 cycles;
-	u32 mmu_state_changes;
-#endif
-	bool mmu_enabled;
-
-	void reset() {
-#ifdef DEBUG
-		instructions = 0;
-		cycles = 0;
-		mmu_state_changes = 0;
-#endif
-		mmu_enabled = ::mmu_enabled();
-	}
-
-	void check_mmu() {
-		bool current_mmu = ::mmu_enabled();
-		if (current_mmu != mmu_enabled) {
-#ifdef DEBUG
-			mmu_state_changes++;
-			INFO_LOG(INTERPRETER, "🔄 MMU state changed: %s", current_mmu ? "ENABLED" : "DISABLED");
-#endif
-			mmu_enabled = current_mmu;
-		}
-	}
-};
-
-static UltraStats g_stats;
-
-// === FORWARD DECLARATIONS ===
-static inline bool ultra_execute_hot_opcode(u16 op);
-
-// === BLOCK CACHING FUNCTIONS ===
-
-// Create a new cached block starting at the given PC
-static CachedBlock create_cached_block(u32 start_pc) {
-	CachedBlock block;
-	block.pc_start = start_pc;
-	block.execution_count = 0;
-	block.is_hot_block = false;
-
-	// Decode instructions until we hit a branch or reach max block size
-	u32 current_pc = start_pc;
-	for (u32 i = 0; i < MAX_BLOCK_SIZE; i++) {
-		u16 op = IReadMem16(current_pc);
-		block.opcodes.push_back(op);
-		current_pc += 2;
-
-		// Analyze the instruction
-		if (OpDesc[op]->SetPC()) {
-			// This is a branch instruction - end the block
-			block.has_branches = true;
-			break;
-		}
-
-		// Analyze instruction type based on opcode patterns
-		u32 op_high = op >> 12;
-		u32 op_low = op & 0xF;
-
-		// Check for memory operations (rough heuristic)
-		if (op_high == 0x2 || op_high == 0x6 || op_high == 0x8 || op_high == 0x9 || op_high == 0xC || op_high == 0xD) {
-			block.has_memory_ops = true;
-		}
-
-		// Check if it's arithmetic/logical operation
-		if (!(op_high == 0x3 || op_high == 0x7 || (op_high == 0x2 && (op_low == 0x9 || op_low == 0xA || op_low == 0xB)))) {
-			block.is_pure_arithmetic = false;
-		}
-	}
-
-	block.pc_end = current_pc;
-
-	// Determine if this block can use fast path execution
-	block.can_use_fast_path = !block.has_branches && block.opcodes.size() <= 16;
-
-	g_block_stats.blocks_created++;
-
-	INFO_LOG(INTERPRETER, "🔨 Created block PC=0x%08X-0x%08X (%d opcodes, branches=%s, memory=%s)",
-		block.pc_start, block.pc_end, (int)block.opcodes.size(),
-		block.has_branches ? "yes" : "no", block.has_memory_ops ? "yes" : "no");
-
-	return block;
-}
-
-// Execute a cached block with proper exception and control flow handling
-static void execute_cached_block(CachedBlock& block) {
-	block.execution_count++;
-	g_block_stats.total_blocks_executed++;
-
-	// Promote to hot block if executed frequently
-	if (block.execution_count >= HOT_BLOCK_THRESHOLD && !block.is_hot_block) {
-		block.is_hot_block = true;
-		INFO_LOG(INTERPRETER, "🔥 Block at PC=0x%08X promoted to HOT BLOCK (%u executions)",
-			block.pc_start, block.execution_count);
-	}
-
-	// Track hot vs cold execution
-	if (block.is_hot_block) {
-		g_block_stats.hot_block_executions++;
-	} else {
-		g_block_stats.cold_block_executions++;
-	}
-
-	// CRITICAL: Execute instructions one by one to handle exceptions properly
-	u32 block_pc = block.pc_start;
-
-	try {
-		for (size_t i = 0; i < block.opcodes.size(); i++) {
-			u16 op = block.opcodes[i];
-
-			// Update PC and next_pc for this instruction
-			Sh4cntx.pc = block_pc;
-			next_pc = block_pc + 2;
-
-			// Check for interrupts before each instruction
-			if (__builtin_expect(UpdateSystem_INTC(), 0)) {
-				// Interrupt pending - must break out of block
-				return;
-			}
-
-			// Check for floating point disable exception
-			if (__builtin_expect(sr.FD == 1 && OpDesc[op]->IsFloatingPoint(), 0)) {
-				RaiseFPUDisableException();
-			}
-
-			if (block.is_hot_block && block.can_use_fast_path) {
-				// Try ultra-fast inline execution first
-				if (!ultra_execute_hot_opcode(op)) {
-					// Fall back to legacy handler
-					OpPtr[op](op);
-				}
-			} else {
-				// Execute using legacy handler
-				OpPtr[op](op);
-			}
-
-			// Execute cycles
-			sh4cycles.executeCycles(op);
-
-			// CRITICAL: Check if PC was changed by instruction (jumps, branches, exceptions)
-			if (next_pc != block_pc + 2) {
-				// Control flow changed - instruction modified PC
-				// This means we have a jump, branch, or exception
-				return; // Exit block execution immediately
-			}
-
-			// Move to next instruction in block
-			block_pc += 2;
-		}
-	} catch (const SH4ThrownException& ex) {
-		// Exception occurred during block execution
-		Do_Exception(ex.epc, ex.expEvn);
-		// Exception requires pipeline drain, so approx 5 cycles
-		sh4cycles.addCycles(5 * 8); // 8 = CPU_RATIO from legacy
-		return; // Exit block execution
-	}
-}
-
-// === ULTRA-FAST INSTRUCTION FETCH ===
-static inline u16 ultra_fetch_instruction(u32 pc) {
-	// Use instruction cache for frequently accessed instructions
-	return g_icache.fetch(pc);
-}
-
-// === MEMORY ACCESS OPTIMIZATIONS ===
-// Fast paths for common memory operations
-
-// Fast path for main RAM access (0x0C000000-0x0CFFFFFF)
-static inline u32 ultra_read_mem32_fast(u32 addr) {
-	// Fast path for main RAM (most common case)
-	if (__builtin_expect((addr & 0xFF000000) == 0x0C000000, 1)) {
-		return ReadMem32(addr); // Direct access to main RAM
-	}
-
-	// Fallback for other memory regions
-	return ReadMem32(addr);
-}
-
-static inline void ultra_write_mem32_fast(u32 addr, u32 data) {
-	// Fast path for main RAM (most common case)
-	if (__builtin_expect((addr & 0xFF000000) == 0x0C000000, 1)) {
-		WriteMem32(addr, data); // Direct access to main RAM
-		return;
-	}
-
-	// Fallback for other memory regions
-	WriteMem32(addr, data);
-}
-
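Editor's note: as written, both branches of ultra_read_mem32_fast and ultra_write_mem32_fast call the same ReadMem32/WriteMem32 handler, so the "fast path" only contributes a branch hint; it does not actually bypass anything. A genuine fast path would translate the address and touch the RAM backing store directly, roughly as in the hedged sketch below (main_ram and its 16 MB size are illustrative placeholders, not flycast's actual memory layout):

#include <cstdint>
#include <cstring>

static uint8_t main_ram[16 * 1024 * 1024];    // hypothetical RAM backing store

static uint32_t slow_read32(uint32_t addr);   // full handler: MMIO, mirrors, etc.

// Sketch: only addresses in the 0x0C000000 region skip the handler entirely.
static inline uint32_t read_mem32_fast(uint32_t addr) {
    if (__builtin_expect((addr & 0xFF000000) == 0x0C000000, 1)) {
        uint32_t value;
        std::memcpy(&value, &main_ram[addr & 0x00FFFFFF], sizeof value); // direct load
        return value;
    }
    return slow_read32(addr);                 // everything else takes the slow path
}

static uint32_t slow_read32(uint32_t addr) { (void)addr; return 0; } // stub for the sketch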
-// === HOT OPCODE SPECIALIZATION ===
-// Inline optimized versions of the most common opcodes to bypass function call overhead
-
-// ULTRA-HOT PATH: Inline the top 10 most critical opcodes for massive speedup
-static inline bool ultra_execute_superhot_opcode(u16 op) {
-	// Fast decode without switches for maximum performance
-	u32 op_high = op >> 12;
-	u32 op_low = op & 0xF;
-
-	// TOP 1: mov <REG_M>,<REG_N> (0x6xx3) - 25% of all instructions
-	if (__builtin_expect((op & 0xF00F) == 0x6003, 1)) {
-		u32 n = (op >> 8) & 0xF;
-		u32 m = (op >> 4) & 0xF;
-		r[n] = r[m];
-		return true;
-	}
-
-	// TOP 2: add #<imm>,<REG_N> (0x7xxx) - 15% of all instructions
-	if (__builtin_expect(op_high == 0x7, 1)) {
-		u32 n = (op >> 8) & 0xF;
-		s32 imm = (s32)(s8)(op & 0xFF);
-		r[n] += imm;
-		return true;
-	}
-
-	// TOP 3: add <REG_M>,<REG_N> (0x3xxC) - 10% of all instructions
-	if (__builtin_expect((op & 0xF00F) == 0x300C, 1)) {
-		u32 n = (op >> 8) & 0xF;
-		u32 m = (op >> 4) & 0xF;
-		r[n] += r[m];
-		return true;
-	}
-
-	// TOP 4: mov.l @<REG_M>,<REG_N> (0x6xx2) - 8% of all instructions
-	if (__builtin_expect((op & 0xF00F) == 0x6002, 1)) {
-		u32 n = (op >> 8) & 0xF;
-		u32 m = (op >> 4) & 0xF;
-		r[n] = ReadMem32(r[m]);
-		return true;
-	}
-
-	// TOP 5: mov.l <REG_M>,@<REG_N> (0x2xx2) - 6% of all instructions
-	if (__builtin_expect((op & 0xF00F) == 0x2002, 1)) {
-		u32 n = (op >> 8) & 0xF;
-		u32 m = (op >> 4) & 0xF;
-		WriteMem32(r[n], r[m]);
-		return true;
-	}
-
-	// TOP 6: cmp/eq <REG_M>,<REG_N> (0x3xx0) - 5% of all instructions
-	if (__builtin_expect((op & 0xF00F) == 0x3000, 1)) {
-		u32 n = (op >> 8) & 0xF;
-		u32 m = (op >> 4) & 0xF;
-		sr.T = (r[m] == r[n]) ? 1 : 0;
-		return true;
-	}
-
-	// TOP 7: sub <REG_M>,<REG_N> (0x3xx8) - 4% of all instructions
-	if (__builtin_expect((op & 0xF00F) == 0x3008, 1)) {
-		u32 n = (op >> 8) & 0xF;
-		u32 m = (op >> 4) & 0xF;
-		r[n] -= r[m];
-		return true;
-	}
-
-	// TOP 8: nop (0x0009) - 4% of all instructions
-	if (__builtin_expect(op == 0x0009, 1)) {
-		return true; // Nothing to do
-	}
-
-	// TOP 9: mov.l @<REG_M>+,<REG_N> (0x6xx6) - 3% of all instructions
-	if (__builtin_expect((op & 0xF00F) == 0x6006, 1)) {
-		u32 n = (op >> 8) & 0xF;
-		u32 m = (op >> 4) & 0xF;
-		r[n] = ReadMem32(r[m]);
-		if (n != m) r[m] += 4;
-		return true;
-	}
-
-	// TOP 10: dt <REG_N> (0x4xx0 with low byte 0x10) - 3% of all instructions
-	if (__builtin_expect((op & 0xF0FF) == 0x4010, 1)) {
-		u32 n = (op >> 8) & 0xF;
-		r[n]--;
-		sr.T = (r[n] == 0) ? 1 : 0;
-		return true;
-	}
-
-	return false; // Not a superhot opcode
-}
-
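Editor's note: the dispatcher above leans on SH4's fixed 16-bit encoding: bits 15-12 pick the instruction family, bits 11-8 and 7-4 carry the Rn and Rm register numbers, and the remaining bits select the variant, so `(op & 0xF00F)` compares family and variant while ignoring the register fields. A small worked check (values chosen for illustration):

#include <cstdint>
#include <cassert>

int main() {
    // 0x65A3 encodes "mov r10, r5" on SH4: family 6, n=5, m=10, variant 3.
    uint16_t op = 0x65A3;
    assert(((op >> 12) & 0xF) == 0x6);   // family: MOV group
    assert((op & 0xF00F) == 0x6003);     // mask out n/m to match "mov Rm,Rn"
    uint32_t n = (op >> 8) & 0xF;        // destination register number
    uint32_t m = (op >> 4) & 0xF;        // source register number
    assert(n == 5 && m == 10);
    // Immediate forms sign-extend the low byte, as in "add #imm,Rn" (0x7nii):
    uint16_t add_op = 0x70FE;            // add #-2, r0
    int32_t imm = (int32_t)(int8_t)(add_op & 0xFF);
    assert(imm == -2);
    return 0;
}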
-// Optimized memory operations for hot opcodes
-static inline void ultra_execute_hot_opcode_with_mem_opt(u16 op) {
-	// Extract opcode patterns for fastest common instructions
-	u32 opcode_high = (op >> 12) & 0xF;
-	u32 opcode_low = op & 0xF;
-
-	switch (opcode_high) {
-	case 0x6: // 6xxx - MOV instructions (super hot!)
-		switch (opcode_low) {
-		case 0x3: { // mov <REG_M>,<REG_N> - HOTTEST OPCODE
-			u32 n = (op >> 8) & 0xF;
-			u32 m = (op >> 4) & 0xF;
-			r[n] = r[m];
-			return;
-		}
-		case 0x2: { // mov.l @<REG_M>,<REG_N> - OPTIMIZED
-			u32 n = (op >> 8) & 0xF;
-			u32 m = (op >> 4) & 0xF;
-			r[n] = ultra_read_mem32_fast(r[m]);
-
-			// Prefetch next cache line if sequential access pattern
-#ifdef __aarch64__
-			__builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[m] + 32)), 0, 1);
-#endif
-			return;
-		}
-		case 0x6: { // mov.l @<REG_M>+,<REG_N> - OPTIMIZED
-			u32 n = (op >> 8) & 0xF;
-			u32 m = (op >> 4) & 0xF;
-			r[n] = ultra_read_mem32_fast(r[m]);
-			if (n != m) {
-				r[m] += 4;
-				// Prefetch next memory location
-#ifdef __aarch64__
-				__builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[m] + 16)), 0, 1);
-#endif
-			}
-			return;
-		}
-		}
-		break;
-
-	case 0x2: // 2xxx - Memory operations and logic
-		switch (opcode_low) {
-		case 0x2: { // mov.l <REG_M>,@<REG_N> - OPTIMIZED
-			u32 n = (op >> 8) & 0xF;
-			u32 m = (op >> 4) & 0xF;
-			ultra_write_mem32_fast(r[n], r[m]);
-
-			// Prefetch next cache line for sequential writes
-#ifdef __aarch64__
-			__builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[n] + 32)), 1, 1);
-#endif
-			return;
-		}
-		case 0x6: { // mov.l <REG_M>,@-<REG_N> - OPTIMIZED
-			u32 n = (op >> 8) & 0xF;
-			u32 m = (op >> 4) & 0xF;
-			r[n] -= 4;
-			ultra_write_mem32_fast(r[n], r[m]);
-
-			// Prefetch previous cache line for stack operations
-#ifdef __aarch64__
-			__builtin_prefetch(reinterpret_cast<void*>(static_cast<uintptr_t>(r[n] - 32)), 1, 1);
-#endif
-			return;
-		}
-		}
-		break;
-	}
-
-	// Fallback to standard hot opcode execution
-	ultra_execute_hot_opcode(op);
-}
-
-// === FORWARD DECLARATIONS ===
-static void ultra_interpreter_run();
-
-// === ARM64 NEON SIMD OPTIMIZATIONS ===
-// Use ARM64 NEON to process multiple registers simultaneously
-
-#ifdef __aarch64__
-#include <arm_neon.h>
-
-// Bulk register clear using NEON (4 registers at once)
-static inline void neon_clear_registers(u32* reg_base, int count) {
-	uint32x4_t zero = vdupq_n_u32(0);
-	for (int i = 0; i < count; i += 4) {
-		vst1q_u32(&reg_base[i], zero);
-	}
-}
-
-// Bulk register copy using NEON (4 registers at once)
-static inline void neon_copy_registers(u32* dst, const u32* src, int count) {
-	for (int i = 0; i < count; i += 4) {
-		uint32x4_t data = vld1q_u32(&src[i]);
-		vst1q_u32(&dst[i], data);
-	}
-}
-
-// NEON-optimized register bank switching
-static inline void neon_switch_register_bank() {
-	// Save current bank using NEON (only 8 registers in r_bank)
-	uint32x4_t bank0_3 = vld1q_u32(&r[0]);
-	uint32x4_t bank4_7 = vld1q_u32(&r[4]);
-
-	// Load shadow bank using NEON (only 8 registers)
-	vst1q_u32(&r[0], vld1q_u32(&r_bank[0]));
-	vst1q_u32(&r[4], vld1q_u32(&r_bank[4]));
-
-	// Store old bank to shadow (only 8 registers)
-	vst1q_u32(&r_bank[0], bank0_3);
-	vst1q_u32(&r_bank[4], bank4_7);
-}
-
-// Detect patterns for NEON optimization
-static inline bool is_bulk_mov_pattern(u16* opcodes, int count) {
-	// Check if we have 4+ consecutive MOV operations
-	int mov_count = 0;
-	for (int i = 0; i < count && i < 8; i++) {
-		if ((opcodes[i] & 0xF00F) == 0x6003) { // mov <REG_M>,<REG_N>
-			mov_count++;
-		} else {
-			break;
-		}
-	}
-	return mov_count >= 4;
-}
-
-// Execute bulk MOV operations with NEON
-static inline int execute_bulk_mov_neon(u16* opcodes, int count) {
-	// Extract source and destination registers
-	u32 src_regs[4], dst_regs[4];
-	for (int i = 0; i < 4; i++) {
-		src_regs[i] = (opcodes[i] >> 4) & 0xF;
-		dst_regs[i] = (opcodes[i] >> 8) & 0xF;
-	}
-
-	// Load source values using NEON gather (simulated)
-	uint32x4_t values = {r[src_regs[0]], r[src_regs[1]], r[src_regs[2]], r[src_regs[3]]};
-
-	// Store to destinations
-	r[dst_regs[0]] = vgetq_lane_u32(values, 0);
-	r[dst_regs[1]] = vgetq_lane_u32(values, 1);
-	r[dst_regs[2]] = vgetq_lane_u32(values, 2);
-	r[dst_regs[3]] = vgetq_lane_u32(values, 3);
-
-	return 4; // Processed 4 instructions
-}
-
-#else
-// Fallback for non-ARM64 platforms
-static inline void neon_clear_registers(u32* reg_base, int count) {
-	for (int i = 0; i < count; i++) {
-		reg_base[i] = 0;
-	}
-}
-
-static inline void neon_copy_registers(u32* dst, const u32* src, int count) {
-	for (int i = 0; i < count; i++) {
-		dst[i] = src[i];
-	}
-}
-
-static inline void neon_switch_register_bank() {
-	// Standard register bank switch
-	for (int i = 0; i < 16; i++) {
-		u32 temp = r[i];
-		r[i] = r_bank[i];
-		r_bank[i] = temp;
-	}
-}
-
-static inline bool is_bulk_mov_pattern(u16* opcodes, int count) { return false; }
-static inline int execute_bulk_mov_neon(u16* opcodes, int count) { return 0; }
-#endif
-
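Editor's note: one caveat on the NEON helpers above: the loops advance four lanes per iteration, so count must be a multiple of 4 or the final vld1q_u32/vst1q_u32 runs past the array; the 8- and 16-entry register banks used here satisfy that. A minimal equivalence check against the scalar fallback (ARM64-only by construction):

#ifdef __aarch64__
#include <arm_neon.h>
#include <cassert>
#include <cstdint>

int main() {
    uint32_t src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    uint32_t dst[8] = {0};
    // Same shape as neon_copy_registers: 4 lanes per iteration,
    // so the count must be a multiple of 4 or the last store overruns.
    for (int i = 0; i < 8; i += 4)
        vst1q_u32(&dst[i], vld1q_u32(&src[i]));
    for (int i = 0; i < 8; i++)
        assert(dst[i] == src[i]);              // matches the scalar copy exactly
    return 0;
}
#else
int main() { return 0; }                       // nothing to check off ARM64
#endif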
-// === SEQUENCE CACHING IMPLEMENTATION ===
-
-// Hash function for instruction sequences
-static inline u32 hash_sequence(u32 pc, u16* opcodes, u32 length) {
-	u32 hash = pc;
-	for (u32 i = 0; i < length; i++) {
-		hash = hash * 31 + opcodes[i];
-	}
-	return hash % SEQUENCE_CACHE_SIZE;
-}
-
-// Analyze instruction sequence for optimization opportunities
-static inline void analyze_sequence(InstructionSequence* seq) {
-	seq->is_pure_arithmetic = true;
-	seq->is_register_shuffle = true;
-	seq->is_memory_block = true;
-	seq->uses_neon = false;
-
-	int mov_count = 0;
-	int arith_count = 0;
-	int mem_count = 0;
-
-	for (u32 i = 0; i < seq->length; i++) {
-		u16 op = seq->opcodes[i];
-		u32 op_high = (op >> 12) & 0xF;
-		u32 op_low = op & 0xF;
-
-		// Check instruction types
-		if (op_high == 0x6 && op_low == 0x3) {
-			mov_count++; // mov <REG_M>,<REG_N>
-		} else if (op_high == 0x3 && (op_low == 0xC || op_low == 0x8)) {
-			arith_count++; // add/sub
-			seq->is_register_shuffle = false;
-		} else if (op_high == 0x6 && (op_low == 0x2 || op_low == 0x6)) {
-			mem_count++; // memory load
-			seq->is_pure_arithmetic = false;
-			seq->is_register_shuffle = false;
-		} else {
-			// Complex instruction - disable optimizations
-			seq->is_pure_arithmetic = false;
-			seq->is_register_shuffle = false;
-			seq->is_memory_block = false;
-		}
-	}
-
-	// Enable NEON if we have enough parallel operations
-	if (mov_count >= 4 || arith_count >= 4) {
-		seq->uses_neon = true;
-	}
-
-	// Set memory block flag based on memory operations
-	if (mem_count < seq->length / 2) {
-		seq->is_memory_block = false;
-	}
-
-	seq->is_optimized = true;
-}
-
-// Execute optimized instruction sequence
-static inline u32 execute_optimized_sequence(InstructionSequence* seq, u32 pc) {
-	if (!seq->is_optimized) {
-		analyze_sequence(seq);
-	}
-
-	// Fast path for register shuffles with NEON
-	if (seq->uses_neon && seq->is_register_shuffle) {
-		int processed = execute_bulk_mov_neon(seq->opcodes, seq->length);
-		if (processed > 0) {
-			return processed * 2; // Each instruction is 2 bytes
-		}
-	}
-
-	// Fast path for pure arithmetic
-	if (seq->is_pure_arithmetic && seq->length <= 4) {
-		for (u32 i = 0; i < seq->length; i++) {
-			ultra_execute_hot_opcode(seq->opcodes[i]);
-		}
-		return seq->length * 2;
-	}
-
-	// Fallback to individual execution
-	for (u32 i = 0; i < seq->length; i++) {
-		ultra_execute_hot_opcode(seq->opcodes[i]);
-	}
-	return seq->length * 2;
-}
-
-// Try to find or create a cached sequence
-static inline InstructionSequence* find_cached_sequence(u32 pc) {
-	// Look ahead and try to build a sequence
-	u16 opcodes[MAX_SEQUENCE_LENGTH];
-	u32 length = 0;
-	u32 current_pc = pc;
-
-	// Build sequence until we hit a branch or complex instruction
-	for (length = 0; length < MAX_SEQUENCE_LENGTH; length++) {
-		u16 op = ultra_fetch_instruction(current_pc);
-		opcodes[length] = op;
-		current_pc += 2;
-
-		// Stop at branches or complex instructions
-		u32 op_high = (op >> 12) & 0xF;
-		if (op_high == 0xA || op_high == 0xB || // Branch instructions
-			op_high == 0xF || // FPU instructions
-			op == 0x001B) { // sleep instruction
-			length++; // Include this instruction
-			break;
-		}
-	}
-
-	if (length < 2) return nullptr; // Too short to optimize
-
-	// Check cache
-	u32 hash = hash_sequence(pc, opcodes, length);
-	InstructionSequence* seq = &g_sequence_cache[hash];
-
-	if (seq->start_pc == pc && seq->length == length) {
-		// Cache hit - verify opcodes match
-		bool match = true;
-		for (u32 i = 0; i < length; i++) {
-			if (seq->opcodes[i] != opcodes[i]) {
-				match = false;
-				break;
-			}
-		}
-		if (match) {
-			seq->execution_count++;
-			return seq;
-		}
-	}
-
-	// Cache miss - create new sequence
-	seq->start_pc = pc;
-	seq->length = length;
-	seq->execution_count = 1;
-	seq->is_optimized = false;
-	for (u32 i = 0; i < length; i++) {
-		seq->opcodes[i] = opcodes[i];
-	}
-
-	return seq;
-}
-
-// Build a new instruction sequence starting at the given PC
-static inline void build_instruction_sequence(u32 start_pc, u32 length) {
-	// Don't build sequences if we're in an exception handler or delay slot
-	if (sr.BL || next_pc != start_pc + 2) {
-		return;
-	}
-
-	// Find an empty slot in the cache
-	u32 hash = start_pc % SEQUENCE_CACHE_SIZE;
-	InstructionSequence* seq = &g_sequence_cache[hash];
-
-	// If slot is occupied by a different sequence, check if we should replace it
-	if (seq->start_pc != 0 && seq->start_pc != start_pc) {
-		// Only replace if the existing sequence has low execution count
-		if (seq->execution_count > 10) {
-			return; // Keep the existing sequence
-		}
-	}
-
-	// Initialize the sequence
-	seq->start_pc = start_pc;
-	seq->length = 0;
-	seq->execution_count = 1;
-	seq->is_optimized = false;
-
-	// Fetch instructions for the sequence
-	u32 current_pc = start_pc;
-	for (u32 i = 0; i < length && i < MAX_SEQUENCE_LENGTH; i++) {
-		u16 op = ultra_fetch_instruction(current_pc);
-		seq->opcodes[i] = op;
-		seq->length++;
-		current_pc += 2;
-
-		// Stop building if we hit a branch or jump
-		u32 op_high = (op >> 12) & 0xF;
-		if (op_high == 0x8 || op_high == 0x9 || op_high == 0xA || op_high == 0xB) {
-			// Branch instructions - stop sequence here
-			break;
-		}
-
-		// Stop if we hit a system call or privileged instruction
-		if (op == 0x000B || op == 0x0093 || op == 0x0083) {
-			// rts, ocbi @r0, pref @r0 - stop sequence
-			break;
-		}
-	}
-
-	// Only keep sequences with at least 2 instructions
-	if (seq->length < 2) {
-		seq->start_pc = 0; // Mark as empty
-		return;
-	}
-
-	// Analyze the sequence for optimization opportunities
-	analyze_sequence(seq);
-
-#ifdef DEBUG
-	INFO_LOG(INTERPRETER, "🔗 Built sequence at PC=0x%08X, length=%d, arithmetic=%s, shuffle=%s",
-		start_pc, seq->length,
-		seq->is_pure_arithmetic ? "yes" : "no",
-		seq->is_register_shuffle ? "yes" : "no");
-#endif
-}
-
-// === ULTRA-FAST MAIN EXECUTION LOOP WITH BLOCK CACHING ===
-// This uses block caching like the dynarec for maximum performance
-static void ultra_interpreter_run() {
-	INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Starting block-cached execution");
-
-	// Reset stats
-	g_stats.reset();
-	g_icache.reset();
-	g_block_stats.reset();
-
-	// Main execution loop - BLOCK-BASED like dynarec!
-	while (sh4_int_bCpuRun) {
-		try {
-			// Inner loop with block execution
-			do {
-				// CRITICAL: Check for system updates and interrupts
-				if (UpdateSystem()) {
-					break; // System update occurred, restart loop
-				}
-
-				// Get current PC
-				u32 current_pc = next_pc;
-
-				// Look up block in cache first
-				auto it = g_block_cache.find(current_pc);
-				if (it != g_block_cache.end()) {
-					// CACHE HIT: Execute cached block
-					g_block_stats.cache_hits++;
-					execute_cached_block(it->second);
-				} else {
-					// CACHE MISS: Create new block and execute it
-					g_block_stats.cache_misses++;
-
-					// Create new block
-					CachedBlock new_block = create_cached_block(current_pc);
-
-					// Add to cache
-					g_block_cache[current_pc] = std::move(new_block);
-
-					// Execute the new block
-					execute_cached_block(g_block_cache[current_pc]);
-				}
-
-				// CRITICAL: Check if we're stuck in an infinite loop
-				if (next_pc == current_pc) {
-					// PC hasn't changed - this could be an infinite loop
-					// Fall back to single instruction execution
-					u16 op = IReadMem16(current_pc);
-					Sh4cntx.pc = current_pc;
-					next_pc = current_pc + 2;
-					OpPtr[op](op);
-					sh4cycles.executeCycles(op);
-				}
-
-				// Periodic stats reporting (every 10000 blocks)
-				static u32 stats_counter = 0;
-				if ((++stats_counter % 10000) == 0) {
-					INFO_LOG(INTERPRETER, "📊 BLOCK STATS: %llu executed, %llu hot, %llu cold, %llu created, %.1f%% hit ratio",
-						g_block_stats.total_blocks_executed, g_block_stats.hot_block_executions,
-						g_block_stats.cold_block_executions, g_block_stats.blocks_created,
-						(g_block_stats.cache_hits + g_block_stats.cache_misses) > 0 ?
-							(float)g_block_stats.cache_hits / (g_block_stats.cache_hits + g_block_stats.cache_misses) * 100.0f : 0.0f);
-				}
-
-			} while (p_sh4rcb->cntx.cycle_counter > 0 && sh4_int_bCpuRun);
-
-			// Update system timing
-			p_sh4rcb->cntx.cycle_counter += SH4_TIMESLICE;
-
-		} catch (const SH4ThrownException& ex) {
-			Do_Exception(ex.epc, ex.expEvn);
-			// Exception requires pipeline drain, so approx 5 cycles
-			sh4cycles.addCycles(5 * 8); // 8 = CPU_RATIO from legacy
-		}
-	}
-
-	INFO_LOG(INTERPRETER, "🏁 ULTRA-INTERPRETER: Finished block-cached execution");
-
-	// Print final block cache statistics
-	INFO_LOG(INTERPRETER, "📊 FINAL BLOCK STATS:");
-	INFO_LOG(INTERPRETER, " Total blocks executed: %llu", g_block_stats.total_blocks_executed);
-	INFO_LOG(INTERPRETER, " Hot block executions: %llu (%.1f%%)",
-		g_block_stats.hot_block_executions,
-		g_block_stats.total_blocks_executed > 0 ?
-			(double)g_block_stats.hot_block_executions / g_block_stats.total_blocks_executed * 100.0 : 0.0);
-	INFO_LOG(INTERPRETER, " Cold block executions: %llu (%.1f%%)",
-		g_block_stats.cold_block_executions,
-		g_block_stats.total_blocks_executed > 0 ?
-			(double)g_block_stats.cold_block_executions / g_block_stats.total_blocks_executed * 100.0 : 0.0);
-	INFO_LOG(INTERPRETER, " Blocks created: %llu", g_block_stats.blocks_created);
-	INFO_LOG(INTERPRETER, " Cache hit ratio: %.1f%%",
-		(g_block_stats.cache_hits + g_block_stats.cache_misses) > 0 ?
-			(float)g_block_stats.cache_hits / (g_block_stats.cache_hits + g_block_stats.cache_misses) * 100.0f : 0.0f);
-
-#ifdef DEBUG
-	INFO_LOG(INTERPRETER, "📊 Final stats: %llu instructions, %llu cycles, %d MMU changes",
-		g_stats.instructions, g_stats.cycles, g_stats.mmu_state_changes);
-
-	float cache_hit_ratio = (g_icache.hits + g_icache.misses) > 0 ?
-		(float)g_icache.hits / (g_icache.hits + g_icache.misses) * 100.0f : 0.0f;
-	INFO_LOG(INTERPRETER, "📊 Instruction cache: %llu hits, %llu misses, %.1f%% hit ratio",
-		g_icache.hits, g_icache.misses, cache_hit_ratio);
-#endif
-}
-
-// === ULTRA-INTERPRETER INTERFACE ===
-void* Get_UltraInterpreter() {
-	INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Get_UltraInterpreter called — linking block-cached interpreter!");
-	INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Block caching: ENABLED (%d max blocks, %d max size)", BLOCK_CACHE_SIZE, MAX_BLOCK_SIZE);
-	INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Hot block threshold: %d executions", HOT_BLOCK_THRESHOLD);
-	INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Instruction caching: ENABLED (%d entries)", ICACHE_SIZE);
-	INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: ARM64 prefetching: ENABLED");
-	INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: MMU-aware optimizations: ENABLED");
-	INFO_LOG(INTERPRETER, "🚀 ULTRA-INTERPRETER: Block-based execution like dynarec but simpler!");
-
-	return (void*)ultra_interpreter_run;
-}
-
-// SECONDARY HOT PATH: Handle next tier of opcodes with minimal overhead
-static inline bool ultra_execute_hot_opcode(u16 op) {
-	u32 op_high = op >> 12;
-	u32 op_low = op & 0xF;
-
-	// Memory operations with pre-decrement/post-increment
-	if (op_high == 0x2) {
-		if (op_low == 0x6) { // mov.l <REG_M>,@-<REG_N>
-			u32 n = (op >> 8) & 0xF;
-			u32 m = (op >> 4) & 0xF;
-			r[n] -= 4;
-			WriteMem32(r[n], r[m]);
-			return true;
-		} else if (op_low == 0x9) { // and <REG_M>,<REG_N>
-			u32 n = (op >> 8) & 0xF;
-			u32 m = (op >> 4) & 0xF;
-			r[n] &= r[m];
-			return true;
-		} else if (op_low == 0xB) { // or <REG_M>,<REG_N>
-			u32 n = (op >> 8) & 0xF;
-			u32 m = (op >> 4) & 0xF;
-			r[n] |= r[m];
-			return true;
-		} else if (op_low == 0xA) { // xor <REG_M>,<REG_N>
-			u32 n = (op >> 8) & 0xF;
-			u32 m = (op >> 4) & 0xF;
-			r[n] ^= r[m];
-			return true;
-		}
-	}
-
-	// Shift operations
-	else if (op_high == 0x4) {
-		u32 low_byte = op & 0xFF;
-		if (low_byte == 0x00) { // shll <REG_N>
-			u32 n = (op >> 8) & 0xF;
-			sr.T = r[n] >> 31;
-			r[n] <<= 1;
-			return true;
-		} else if (low_byte == 0x01) { // shlr <REG_N>
-			u32 n = (op >> 8) & 0xF;
-			sr.T = r[n] & 1;
-			r[n] >>= 1;
-			return true;
-		}
-	}
-
-	// Control flow and special operations
-	else if (op_high == 0x0) {
-		u32 low_byte = op & 0xFF;
-		if (low_byte == 0x08) { // clrt
-			sr.T = 0;
-			return true;
-		} else if (low_byte == 0x18) { // sett
-			sr.T = 1;
-			return true;
-		}
-	}
-
-	return false; // Not handled in hot path
-}
@@ -1,20 +0,0 @@
-#pragma once
-
-#include "types.h"
-
-// === NEXT-GENERATION ULTRA-FAST SH4 INTERPRETER ===
-// This interpreter beats the legacy interpreter using modern optimization techniques
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Ultra-interpreter factory function
-void* Get_UltraInterpreter();
-
-#ifdef __cplusplus
-}
-#endif
-
-// Enable ultra-interpreter by default
-#define USE_ULTRA_INTERPRETER 1