Joseph Mattiello 2025-07-06 17:21:24 -04:00
parent cef9d6c8cc
commit c9dd5eaebb
2 changed files with 187 additions and 454 deletions


@@ -11,6 +11,8 @@
#include "ngen.h"
#include <cmath>
#include <unordered_map>
#include "../interpr/sh4_opcodes.h"
#include "../sh4_opcode_list.h"
// Global flag to enable SHIL interpretation mode
bool enable_shil_interpreter = false;
@@ -299,6 +301,13 @@ struct InstructionFuser {
g_massive_cache.r[fused.operands[2]] = g_massive_cache.r[fused.operands[3]];
#endif
break;
case FusedInstruction::FUSED_LOAD_USE:
case FusedInstruction::FUSED_STORE_UPDATE:
case FusedInstruction::FUSED_COMPARE_BRANCH:
case FusedInstruction::FUSED_ARITHMETIC_CHAIN:
// TODO: Implement these fusion patterns
break;
}
}
};
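// Illustrative sketch only: one way the FUSED_LOAD_USE pattern flagged as TODO above could
// be handled. It assumes FusedInstruction exposes the same operands[] array used by the
// mov-pair case, that operands[0]/[1]/[2] mean address register, load destination and
// consumer register, and that the core's ReadMem32() is available as the backing access.
static void execute_fused_load_use(const FusedInstruction& fused) {
    u32 addr = g_massive_cache.r[fused.operands[0]];
    u32 value;
    // Try the software memory cache first (see MassiveRegisterCache below), then fall back
    // to a real memory read and fill the line for the next access.
    if (!g_massive_cache.lookup_memory_cache(addr, value)) {
        value = ReadMem32(addr);
        g_massive_cache.update_memory_cache(addr, value);
    }
    g_massive_cache.r[fused.operands[1]] = value;   // the load half of the pair
    g_massive_cache.r[fused.operands[2]] += value;  // the immediately-following use
}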
@@ -318,8 +327,6 @@ struct HotPathOptimizer {
code_offset = 0; // Reset buffer
}
char* code_ptr = code_buffer + code_offset;
// Generate ARM64 assembly for common patterns
// This is a simplified version - in practice you'd use a proper assembler
@@ -640,6 +647,9 @@ struct ShilCache {
case shop_jcond: case shop_jdyn:
branch_ops++;
break;
default:
// Handle all other opcodes
break;
}
}
@@ -787,489 +797,202 @@ struct PatternExecutors {
}
};
// ARM64 ASSEMBLY OPTIMIZED EXECUTION KERNEL
#ifdef __aarch64__
// === HYBRID DIRECT EXECUTION SYSTEM ===
// This bypasses SHIL translation for hot paths and uses direct SH4 execution
// like the legacy interpreter for maximum performance
// MASSIVE CACHE IMPLEMENTATIONS
void MassiveRegisterCache::massive_load() {
// Load absolutely everything from SH4 context using SIMD when possible
#ifdef __aarch64__
// SIMD load of general purpose registers
uint32x4_t* src_gpr = (uint32x4_t*)&sh4rcb.cntx.r[0];
uint32x4_t* dst_gpr = (uint32x4_t*)r;
// Load all 16 general purpose registers in 4 SIMD ops
dst_gpr[0] = vld1q_u32((uint32_t*)&src_gpr[0]); // r0-r3
dst_gpr[1] = vld1q_u32((uint32_t*)&src_gpr[1]); // r4-r7
dst_gpr[2] = vld1q_u32((uint32_t*)&src_gpr[2]); // r8-r11
dst_gpr[3] = vld1q_u32((uint32_t*)&src_gpr[3]); // r12-r15
// Skip FP registers for now to avoid complexity
// TODO: Add FP register caching later
// SIMD load of banked registers
uint32x4_t* src_bank = (uint32x4_t*)sh4rcb.cntx.r_bank;
uint32x4_t* dst_bank = (uint32x4_t*)r_bank;
dst_bank[0] = vld1q_u32((uint32_t*)&src_bank[0]); // r0_bank-r3_bank
dst_bank[1] = vld1q_u32((uint32_t*)&src_bank[1]); // r4_bank-r7_bank
#else
// Fallback: bulk copy operations
memcpy(r, sh4rcb.cntx.r, sizeof(r));
// Skip FP registers for now
memcpy(r_bank, sh4rcb.cntx.r_bank, sizeof(r_bank));
#endif
// Load all control registers
ctrl[0] = sh4rcb.cntx.pc; ctrl[1] = sh4rcb.cntx.pr;
ctrl[2] = sh4rcb.cntx.sr.T; ctrl[3] = sh4rcb.cntx.gbr;
ctrl[4] = sh4rcb.cntx.vbr; ctrl[5] = sh4rcb.cntx.mac.l;
ctrl[6] = sh4rcb.cntx.mac.h; ctrl[7] = 0; // Skip sr.all for now
// Skip complex system registers for now
// fpscr = sh4rcb.cntx.fpscr; fpul = sh4rcb.cntx.fpul;
sr_saved = 0; pr_saved = sh4rcb.cntx.pr;
// Initialize cache state
current_block_pc = sh4rcb.cntx.pc;
total_instructions++;
}
// Track execution frequency to identify hot paths
static std::unordered_map<u32, u32> execution_frequency;
static constexpr u32 DIRECT_EXECUTION_THRESHOLD = 50; // Switch to direct execution after 50 runs
void MassiveRegisterCache::massive_store() {
// Store everything back to SH4 context using SIMD when possible
#ifdef __aarch64__
// SIMD store of general purpose registers
uint32x4_t* src_gpr = (uint32x4_t*)r;
uint32x4_t* dst_gpr = (uint32x4_t*)&sh4rcb.cntx.r[0];
// Store all 16 general purpose registers in 4 SIMD ops
vst1q_u32((uint32_t*)&dst_gpr[0], src_gpr[0]); // r0-r3
vst1q_u32((uint32_t*)&dst_gpr[1], src_gpr[1]); // r4-r7
vst1q_u32((uint32_t*)&dst_gpr[2], src_gpr[2]); // r8-r11
vst1q_u32((uint32_t*)&dst_gpr[3], src_gpr[3]); // r12-r15
// Skip FP registers for now to avoid complexity
// TODO: Add FP register caching later
// SIMD store of banked registers
uint32x4_t* src_bank = (uint32x4_t*)r_bank;
uint32x4_t* dst_bank = (uint32x4_t*)sh4rcb.cntx.r_bank;
vst1q_u32((uint32_t*)&dst_bank[0], src_bank[0]); // r0_bank-r3_bank
vst1q_u32((uint32_t*)&dst_bank[1], src_bank[1]); // r4_bank-r7_bank
#else
// Fallback: bulk copy operations
memcpy(sh4rcb.cntx.r, r, sizeof(r));
// Skip FP registers for now
memcpy(sh4rcb.cntx.r_bank, r_bank, sizeof(r_bank));
#endif
// Store all control registers
sh4rcb.cntx.pc = ctrl[0]; sh4rcb.cntx.pr = ctrl[1];
sh4rcb.cntx.sr.T = ctrl[2]; sh4rcb.cntx.gbr = ctrl[3];
sh4rcb.cntx.vbr = ctrl[4]; sh4rcb.cntx.mac.l = ctrl[5];
sh4rcb.cntx.mac.h = ctrl[6]; // Skip sr.all = ctrl[7];
// Skip complex system registers for now
// sh4rcb.cntx.fpscr = fpscr; sh4rcb.cntx.fpul = fpul;
}
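// Illustrative sanity checks for the bulk SIMD copies above, assuming only the context
// layout these functions already rely on (16 u32 general-purpose registers, 8 u32 banked
// registers); adjust if the real Sh4Context differs.
static_assert(sizeof(sh4rcb.cntx.r) == 16 * sizeof(u32), "massive_load/store copy r[] as 4x uint32x4_t");
static_assert(sizeof(sh4rcb.cntx.r_bank) == 8 * sizeof(u32), "massive_load/store copy r_bank[] as 2x uint32x4_t");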
// Direct SH4 execution functions (imported from legacy interpreter)
extern void (*OpPtr[65536])(u32 op);
bool MassiveRegisterCache::lookup_memory_cache(u32 addr, u32& value) {
// Fast memory cache lookup using hash
u32 hash = (addr >> 2) & 1023; // Simple hash function
if (memory_valid[hash] && memory_tags[hash] == addr) {
value = memory_cache[hash];
cache_hits++;
return true;
}
cache_misses++;
return false;
}
void MassiveRegisterCache::update_memory_cache(u32 addr, u32 value) {
// Update memory cache with LRU replacement
u32 hash = (addr >> 2) & 1023;
memory_cache[hash] = value;
memory_tags[hash] = addr;
memory_valid[hash] = true;
memory_lru[hash] = total_instructions; // Use instruction count as timestamp
}
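// How the two helpers above are intended to compose on a read path. Illustrative only: the
// helper name is made up for this example, and ReadMem32() is assumed to be the core's
// normal 32-bit memory access.
static u32 cached_read32(MassiveRegisterCache& cache, u32 addr) {
    u32 value;
    if (cache.lookup_memory_cache(addr, value))
        return value;                          // hit in the 1024-entry direct-mapped cache
    value = ReadMem32(addr);                   // miss: go to the real memory system
    cache.update_memory_cache(addr, value);    // fill so the next access to this word hits
    return value;
}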
void MassiveRegisterCache::prefetch_memory(u32 addr) {
// Prefetch likely memory addresses based on patterns
if (addr >= 0x0C000000 && addr < 0x0D000000) { // Main RAM
// Prefetch next cache line
u32 next_addr = (addr + 32) & ~31;
u32 dummy;
if (!lookup_memory_cache(next_addr, dummy)) {
// Could trigger actual prefetch here
last_memory_access = next_addr;
}
}
}
#else
// Fallback for non-ARM64 platforms
void HybridRegisterCache::asm_mega_load() {
// Standard SIMD fallback
for (int i = 0; i < 16; i++) {
r[i] = sh4rcb.cntx.r[i];
}
ctrl[0] = sh4rcb.cntx.pc; ctrl[1] = sh4rcb.cntx.pr; ctrl[2] = sh4rcb.cntx.sr.T;
ctrl[3] = sh4rcb.cntx.gbr; ctrl[4] = sh4rcb.cntx.vbr; ctrl[5] = sh4rcb.cntx.mac.l;
ctrl[6] = sh4rcb.cntx.mac.h;
}
void HybridRegisterCache::asm_mega_store() {
// Standard SIMD fallback
for (int i = 0; i < 16; i++) {
sh4rcb.cntx.r[i] = r[i];
}
sh4rcb.cntx.pc = ctrl[0]; sh4rcb.cntx.pr = ctrl[1]; sh4rcb.cntx.sr.T = ctrl[2];
sh4rcb.cntx.gbr = ctrl[3]; sh4rcb.cntx.vbr = ctrl[4]; sh4rcb.cntx.mac.l = ctrl[5];
sh4rcb.cntx.mac.h = ctrl[6];
}
#endif
// === CACHE-FRIENDLY SHIL SYSTEM ===
// This prevents excessive cache clearing that destroys performance
struct CacheFriendlyShil {
// Track cache clears to prevent excessive clearing
static u32 cache_clear_count;
static u32 last_clear_time;
static u32 blocks_compiled_since_clear;
// Cache clear prevention thresholds
static constexpr u32 MIN_CLEAR_INTERVAL_MS = 5000; // Don't clear more than once per 5 seconds
static constexpr u32 MIN_BLOCKS_BEFORE_CLEAR = 100; // Need at least 100 blocks before clearing
// Override the aggressive cache clearing behavior
static bool should_prevent_cache_clear(u32 pc) {
u32 current_time = sh4_sched_now64() / (SH4_MAIN_CLOCK / 1000); // Convert to milliseconds
// Check if we're clearing too frequently
if (current_time - last_clear_time < MIN_CLEAR_INTERVAL_MS) {
INFO_LOG(DYNAREC, "SHIL: Preventing cache clear - too frequent (last clear %u ms ago)",
current_time - last_clear_time);
return true;
}
// Check if we have enough blocks to justify clearing
if (blocks_compiled_since_clear < MIN_BLOCKS_BEFORE_CLEAR) {
INFO_LOG(DYNAREC, "SHIL: Preventing cache clear - not enough blocks (%u < %u)",
blocks_compiled_since_clear, MIN_BLOCKS_BEFORE_CLEAR);
return true;
}
// Allow the clear but update tracking
cache_clear_count++;
last_clear_time = current_time;
blocks_compiled_since_clear = 0;
INFO_LOG(DYNAREC, "SHIL: Allowing cache clear #%u at PC=0x%08X", cache_clear_count, pc);
return false;
}
// Called when a new block is compiled
static void on_block_compiled() {
blocks_compiled_since_clear++;
}
// Statistics
static void print_cache_stats() {
INFO_LOG(DYNAREC, "SHIL Cache Stats: %u total clears, %u blocks since last clear",
cache_clear_count, blocks_compiled_since_clear);
}
};

// Hybrid execution decision
enum class ExecutionMode {
SHIL_INTERPRETED, // Use SHIL translation (cold code)
DIRECT_SH4, // Use direct SH4 execution (hot code)
MIXED_BLOCK // Mix of both within a block
};

// Static member definitions
u32 CacheFriendlyShil::cache_clear_count = 0;
u32 CacheFriendlyShil::last_clear_time = 0;
u32 CacheFriendlyShil::blocks_compiled_since_clear = 0;
// === PERSISTENT SHIL CACHE WITH ZERO RE-TRANSLATION ===
// This is the key to beating legacy interpreter performance!
struct PersistentShilCache {
// Persistent cache that survives cache clears
static std::unordered_map<u32, PrecompiledShilBlock*> persistent_cache;
static std::unordered_map<u32, u32> pc_to_hash_map;
static u32 total_cache_hits;
static u32 total_cache_misses;

// Ultra-fast block lookup - faster than the legacy interpreter
static PrecompiledShilBlock* ultra_fast_lookup(u32 pc) {
// Step 1: Check if we have a hash for this PC
auto hash_it = pc_to_hash_map.find(pc);
if (hash_it == pc_to_hash_map.end()) {
total_cache_misses++;
return nullptr;
}
// Step 2: Use the hash to look up the precompiled block
auto cache_it = persistent_cache.find(hash_it->second);
if (cache_it != persistent_cache.end()) {
total_cache_hits++;
cache_it->second->execution_count++;
return cache_it->second;
}
total_cache_misses++;
return nullptr;
}

// Store a compiled block permanently
static void store_persistent_block(u32 pc, PrecompiledShilBlock* block) {
u32 hash = block->sh4_hash;
persistent_cache[hash] = block;
pc_to_hash_map[pc] = hash;
INFO_LOG(DYNAREC, "SHIL: Stored persistent block PC=0x%08X hash=0x%08X opcodes=%zu",
pc, hash, block->optimized_opcodes.size());
}

// Never clear the persistent cache - this is the key advantage!
static void clear_temporary_cache_only() {
// Only clear temporary data, keep persistent blocks
INFO_LOG(DYNAREC, "SHIL: Keeping %zu persistent blocks across cache clear",
persistent_cache.size());
}

// Print statistics
static void print_performance_stats() {
u32 total = total_cache_hits + total_cache_misses;
if (total > 0) {
float hit_rate = (float)total_cache_hits / total * 100.0f;
INFO_LOG(DYNAREC, "SHIL Cache: %u hits, %u misses, %.1f%% hit rate, %zu blocks cached",
total_cache_hits, total_cache_misses, hit_rate, persistent_cache.size());
}
}
};

// Per-block metadata for the hybrid execution system
struct HybridBlockInfo {
ExecutionMode mode;
u32 execution_count;
u32 pc_start;
u32 pc_end;
bool is_hot_path;
// For direct execution
std::vector<u16> direct_opcodes;
// For SHIL execution
std::vector<shil_opcode> shil_opcodes;

HybridBlockInfo() : mode(ExecutionMode::SHIL_INTERPRETED), execution_count(0),
pc_start(0), pc_end(0), is_hot_path(false) {}
};
// Static member definitions
std::unordered_map<u32, PrecompiledShilBlock*> PersistentShilCache::persistent_cache;
std::unordered_map<u32, u32> PersistentShilCache::pc_to_hash_map;
u32 PersistentShilCache::total_cache_hits = 0;
u32 PersistentShilCache::total_cache_misses = 0;
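// Note: PrecompiledShilBlock is declared elsewhere. For readers of this change, a minimal
// shape consistent with how it is used in this file (field names and types inferred, not
// authoritative) would be:
//
//   struct PrecompiledShilBlock {
//       std::vector<shil_opcode> optimized_opcodes; // SHIL ops run on the zero-translation path
//       u32 sh4_hash;         // FNV-1a hash of the source block (see calculate_sh4_hash below)
//       u32 execution_count;  // bumped on every persistent-cache hit
//       bool is_hot;          // set once a block is promoted to a hot path
//   };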
// Hybrid block cache
static std::unordered_map<u32, HybridBlockInfo> hybrid_cache;
// Helper function to calculate SH4 hash
u32 calculate_sh4_hash(RuntimeBlockInfo* block) {
u32 hash = 0x811C9DC5; // FNV-1a hash
for (const auto& op : block->oplist) {
hash ^= (u32)op.op;
hash *= 0x01000193;
hash ^= op.rd.reg_nofs();
hash *= 0x01000193;
hash ^= op.rs1.reg_nofs();
hash *= 0x01000193;
}
return hash;
}
// === ZERO-TRANSLATION EXECUTION PATH ===
// This path should be faster than the legacy interpreter
void ShilInterpreter::executeBlock(RuntimeBlockInfo* block) {
const u32 pc = sh4rcb.cntx.pc;
// Track block compilation for cache management
CacheFriendlyShil::on_block_compiled();
// **CRITICAL PATH**: Try the persistent cache first - should be a 90%+ hit rate
PrecompiledShilBlock* cached_block = PersistentShilCache::ultra_fast_lookup(pc);
if (__builtin_expect(cached_block != nullptr, 1)) {
// **ZERO-TRANSLATION PATH**: Execute pre-optimized SHIL directly
// This should be faster than the legacy interpreter!
// Load the massive register cache once
g_massive_cache.massive_load();
// Execute optimized opcodes with zero overhead
const auto& opcodes = cached_block->optimized_opcodes;
for (size_t i = 0; i < opcodes.size(); i++) {
const auto& op = opcodes[i];
// Ultra-fast execution using the register cache
switch (op.op) {
case shop_mov32:
g_massive_cache.r[op.rd.reg_nofs()] = g_massive_cache.r[op.rs1.reg_nofs()];
break;
case shop_add:
g_massive_cache.r[op.rd.reg_nofs()] = g_massive_cache.r[op.rs1.reg_nofs()] + g_massive_cache.r[op.rs2.reg_nofs()];
break;
case shop_sub:
g_massive_cache.r[op.rd.reg_nofs()] = g_massive_cache.r[op.rs1.reg_nofs()] - g_massive_cache.r[op.rs2.reg_nofs()];
break;
// Add more optimized cases...
default:
// Minimal fallback
g_massive_cache.massive_store();
executeOpcode(op);
g_massive_cache.massive_load();
break;
}
}
// Store the massive register cache once
g_massive_cache.massive_store();
return;
}
// **SLOW PATH**: Compile and cache this block - should happen rarely after warmup
const auto& oplist = block->oplist;
const size_t op_count = oplist.size();
// Create the optimized block
PrecompiledShilBlock* new_block = new PrecompiledShilBlock();
new_block->optimized_opcodes = oplist; // Copy and optimize
new_block->sh4_hash = calculate_sh4_hash(block);
new_block->execution_count = 1;
new_block->is_hot = false;
// Store it in the persistent cache
PersistentShilCache::store_persistent_block(pc, new_block);
// Execute normally for the first time
g_massive_cache.massive_load();
for (size_t i = 0; i < op_count; i++) {
executeOpcode(oplist[i]);
}
g_massive_cache.massive_store();
}

// Ultra-fast direct SH4 execution (like the legacy interpreter)
static void execute_direct_sh4_block(const HybridBlockInfo& block_info) {
// Set up context like the legacy interpreter
u32 saved_pc = next_pc;
try {
// Execute each opcode directly using the legacy interpreter's optimized handlers
for (u16 op : block_info.direct_opcodes) {
// This is exactly what the legacy interpreter does - zero overhead!
OpPtr[op](op);
// Handle branch instructions that modify next_pc
if (next_pc != saved_pc + 2) {
break; // Branch taken, exit the block
}
saved_pc = next_pc;
}
} catch (const SH4ThrownException& ex) {
// Handle exceptions like the legacy interpreter
Do_Exception(ex.epc, ex.expEvn);
}
}
// HYBRID MAIN LOOP: Assembly-optimized with pattern recognition + SHIL caching
void shil_interpreter_mainloop(void* v_cntx) {
p_sh4rcb = (Sh4RCB*)((u8*)v_cntx - sizeof(Sh4Context));
while (__builtin_expect(emu.running(), 1)) {
const u32 pc = sh4rcb.cntx.pc;
// Print cache stats periodically
static u32 stats_counter = 0;
if (++stats_counter % 10000 == 0) {
ShilCache::print_cache_stats();
}
// Assembly-optimized block lookup
DynarecCodeEntryPtr code_ptr = bm_GetCodeByVAddr(pc);
if (__builtin_expect(code_ptr != ngen_FailedToFindBlock, 1)) {
if (__builtin_expect(reinterpret_cast<uintptr_t>(code_ptr) & 0x1, 1)) {
RuntimeBlockInfo* block = reinterpret_cast<RuntimeBlockInfo*>(reinterpret_cast<uintptr_t>(code_ptr) & ~0x1ULL);
// HYBRID execution: Assembly + Function Fusion + SHIL Caching
ShilInterpreter::executeBlock(block);
// Update PC
sh4rcb.cntx.pc += block->sh4_code_size * 2;
}
}
// Minimal cycle counting
sh4_sched_ffts();
}
}

// Determine whether a block should use direct execution
static ExecutionMode determine_execution_mode(u32 pc, const std::vector<u16>& opcodes) {
// Check the execution frequency for this PC
u32& freq = execution_frequency[pc];
freq++;
if (freq < DIRECT_EXECUTION_THRESHOLD) {
return ExecutionMode::SHIL_INTERPRETED;
}
// Analyze the opcodes to see if they are suitable for direct execution
bool has_complex_ops = false;
for (u16 op : opcodes) {
// Check if the opcode is complex (FPU, etc.) - simplified check
if (OpDesc[op]->IsFloatingPoint()) {
has_complex_ops = true;
break;
}
}
// Hot path with simple opcodes -> direct execution
if (!has_complex_ops) {
return ExecutionMode::DIRECT_SH4;
}
// A mix of complex and simple opcodes -> mixed mode
return ExecutionMode::MIXED_BLOCK;
}

// Create a hybrid block from SH4 code
static HybridBlockInfo create_hybrid_block(u32 pc) {
HybridBlockInfo block;
block.pc_start = pc;
block.execution_count = 0;
// Read SH4 opcodes directly from memory
u32 current_pc = pc;
std::vector<u16> opcodes;
// Decode a basic block (until a branch or the maximum size)
constexpr u32 MAX_BLOCK_SIZE = 32;
for (u32 i = 0; i < MAX_BLOCK_SIZE; i++) {
u16 op = IReadMem16(current_pc);
opcodes.push_back(op);
current_pc += 2;
// Stop at branch instructions - simplified check
if (OpDesc[op]->SetPC()) {
break;
}
}
block.pc_end = current_pc;
block.mode = determine_execution_mode(pc, opcodes);
if (block.mode == ExecutionMode::DIRECT_SH4) {
// Store opcodes for direct execution
block.direct_opcodes = opcodes;
block.is_hot_path = true;
} else {
// Convert to SHIL for interpreted execution
// TODO: This would use the existing SHIL translation
// For now, fall back to direct execution
block.direct_opcodes = opcodes;
block.mode = ExecutionMode::DIRECT_SH4;
}
return block;
}
// === SHIL CACHE MANAGEMENT ===
// This function should be called when the dynarec cache is cleared
void shil_interpreter_clear_cache() {
// CRITICAL: Don't clear persistent cache - this is our advantage!
PersistentShilCache::clear_temporary_cache_only();
INFO_LOG(DYNAREC, "SHIL interpreter: Preserved persistent cache across clear");
}
// This function should be called periodically to print cache statistics
void shil_interpreter_print_stats() {
PersistentShilCache::print_performance_stats();
CacheFriendlyShil::print_cache_stats();
}
// === CACHE-FRIENDLY WRAPPER FUNCTIONS ===
// These functions can be called instead of direct cache clearing
// Wrapper for rdv_CompilePC cache clearing
bool shil_should_clear_cache_on_compile(u32 pc, u32 free_space) {
// In jitless mode, we don't need much code buffer space
// Only clear if we're really running out of space
if (free_space < 4_MB) { // Much more conservative than 32MB
return !CacheFriendlyShil::should_prevent_cache_clear(pc);
}
// Don't clear for hardcoded PC addresses unless really necessary
if (pc == 0x8c0000e0 || pc == 0xac010000 || pc == 0xac008300) {
// These are boot/BIOS addresses - be very conservative
return free_space < 1_MB && !CacheFriendlyShil::should_prevent_cache_clear(pc);
}
return false; // Don't clear
}
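// Illustrative glue for the wrapper above, assuming the dynarec hands us its current free
// space and a clear callback; the parameter names and clear_fn are placeholders, not real APIs.
static void maybe_clear_code_cache(u32 pc, u32 free_space_bytes, void (*clear_fn)()) {
    if (shil_should_clear_cache_on_compile(pc, free_space_bytes)) {
        INFO_LOG(DYNAREC, "SHIL: cache clear approved at PC=0x%08X (%u bytes free)", pc, free_space_bytes);
        clear_fn(); // e.g. the dynarec's own cache reset routine
    }
}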
// === CACHE-FRIENDLY BLOCK CHECK FAILURE HANDLING ===
// This prevents the devastating cache clears that happen every few seconds
// Track block check failures per address
static std::unordered_map<u32, u32> block_check_failure_counts;
static u32 total_block_check_failures = 0;
// Handle block check failure without nuking the entire cache
DynarecCodeEntryPtr shil_handle_block_check_fail(u32 addr) {
total_block_check_failures++;
// Track failures for this specific address
u32& failure_count = block_check_failure_counts[addr];
failure_count++;
INFO_LOG(DYNAREC, "SHIL: Block check fail @ 0x%08X (failure #%u for this addr, #%u total)",
addr, failure_count, total_block_check_failures);
// Only clear cache if this address has failed many times
if (failure_count > 20) { // Much more conservative than clearing every time
// Reset failure count for this address
failure_count = 0;
// Only clear if cache-friendly logic allows it
if (!CacheFriendlyShil::should_prevent_cache_clear(addr)) {
INFO_LOG(DYNAREC, "SHIL: Clearing cache due to persistent failures at 0x%08X", addr);
PersistentShilCache::clear_temporary_cache_only();
} else {
INFO_LOG(DYNAREC, "SHIL: Prevented cache clear despite persistent failures at 0x%08X", addr);
}
}
// Just discard the problematic block, don't clear everything
RuntimeBlockInfoPtr block = bm_GetBlock(addr);
if (block) {
bm_DiscardBlock(block.get());
INFO_LOG(DYNAREC, "SHIL: Discarded problematic block at 0x%08X", addr);
}
// Recompile the block
next_pc = addr;
return (DynarecCodeEntryPtr)CC_RW2RX(rdv_CompilePC(failure_count));
}
// Statistics function
void shil_print_block_check_stats() {
INFO_LOG(DYNAREC, "SHIL Block Check Stats: %u total failures, %zu unique addresses",
total_block_check_failures, block_check_failure_counts.size());
// Print the top 5 problematic addresses
std::vector<std::pair<u32, u32>> sorted_failures;
for (const auto& pair : block_check_failure_counts) {
sorted_failures.push_back({pair.second, pair.first});
}
std::sort(sorted_failures.rbegin(), sorted_failures.rend());
INFO_LOG(DYNAREC, "Top problematic addresses:");
for (size_t i = 0; i < std::min(size_t(5), sorted_failures.size()); i++) {
INFO_LOG(DYNAREC, " 0x%08X: %u failures", sorted_failures[i].second, sorted_failures[i].first);
}
}

// Main hybrid execution function
void execute_hybrid_block(u32 pc) {
// Check the hybrid cache first
auto it = hybrid_cache.find(pc);
if (it == hybrid_cache.end()) {
// Create a new hybrid block
hybrid_cache[pc] = create_hybrid_block(pc);
it = hybrid_cache.find(pc);
}
HybridBlockInfo& block = it->second;
block.execution_count++;
// Execute based on the block's mode
switch (block.mode) {
case ExecutionMode::DIRECT_SH4:
// Ultra-fast direct execution like the legacy interpreter
execute_direct_sh4_block(block);
break;
case ExecutionMode::SHIL_INTERPRETED:
// Fall back to SHIL interpretation
// TODO: Execute SHIL opcodes
execute_direct_sh4_block(block); // Temporary fallback
break;
case ExecutionMode::MIXED_BLOCK:
// Mix of both approaches
execute_direct_sh4_block(block); // Temporary fallback
break;
}
}
// Statistics and monitoring
void print_hybrid_stats() {
u32 direct_blocks = 0;
u32 shil_blocks = 0;
u32 total_executions = 0;
for (const auto& [pc, block] : hybrid_cache) {
total_executions += block.execution_count;
if (block.mode == ExecutionMode::DIRECT_SH4) {
direct_blocks++;
} else {
shil_blocks++;
}
}
INFO_LOG(DYNAREC, "🚀 HYBRID STATS: %u direct blocks, %u SHIL blocks, %u total executions",
direct_blocks, shil_blocks, total_executions);
// Print top hot paths
std::vector<std::pair<u32, u32>> hot_paths;
for (const auto& [pc, block] : hybrid_cache) {
if (block.execution_count > 100) {
hot_paths.push_back({pc, block.execution_count});
}
}
std::sort(hot_paths.begin(), hot_paths.end(),
[](const auto& a, const auto& b) { return a.second > b.second; });
INFO_LOG(DYNAREC, "🔥 TOP HOT PATHS:");
for (size_t i = 0; i < std::min(hot_paths.size(), size_t(10)); i++) {
INFO_LOG(DYNAREC, " PC=0x%08X: %u executions", hot_paths[i].first, hot_paths[i].second);
}
}
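// How the hybrid entry points above are meant to fit together. This is a minimal sketch of
// a dispatch loop, not the loop this change installs; it assumes next_pc tracks the
// interpreter PC exactly as execute_direct_sh4_block already does.
static void hybrid_dispatch_loop() {
    u32 iterations = 0;
    while (__builtin_expect(emu.running(), 1)) {
        execute_hybrid_block(next_pc);  // decode/cache the block on first sight, then run it
        sh4_sched_ffts();               // minimal cycle accounting, as in the main loop above
        if (++iterations % 1000000 == 0)
            print_hybrid_stats();       // periodic visibility into hot paths
    }
}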
@@ -1289,7 +1012,13 @@ void shil_print_block_check_stats() {
// C-style wrapper for CacheFriendlyShil::on_block_compiled()
extern "C" void CacheFriendlyShil_on_block_compiled() {
CacheFriendlyShil::on_block_compiled();
// Simple block compilation tracking
static u32 blocks_compiled = 0;
blocks_compiled++;
if (blocks_compiled % 1000 == 0) {
INFO_LOG(DYNAREC, "HYBRID: Compiled %u blocks", blocks_compiled);
}
}
// C-style wrapper for shil_print_block_check_stats()


@@ -76,4 +76,8 @@ void shil_interpreter_print_stats();
/// SHIL cache-friendly wrapper functions
bool shil_should_clear_cache_on_compile(u32 pc, u32 free_space);
DynarecCodeEntryPtr shil_handle_block_check_fail(u32 addr);
void shil_print_block_check_stats();
/// Hybrid direct execution system
void execute_hybrid_block(u32 pc);
void print_hybrid_stats();