From 1205e6dacaaa6438734674c558bc4fe25397bed4 Mon Sep 17 00:00:00 2001 From: Joseph Mattiello Date: Fri, 4 Jul 2025 22:26:17 -0400 Subject: [PATCH] add shil dynarec --- core/hw/sh4/arm64_simd.h | 18 + core/hw/sh4/dyna/shil_dynarec.cpp | 110 ++++++ core/hw/sh4/dyna/shil_dynarec.h | 36 ++ core/hw/sh4/dyna/shil_interpreter.cpp | 537 ++++++++++++++++++++++++++ core/hw/sh4/dyna/shil_interpreter.h | 64 +++ 5 files changed, 765 insertions(+) create mode 100644 core/hw/sh4/arm64_simd.h create mode 100644 core/hw/sh4/dyna/shil_dynarec.cpp create mode 100644 core/hw/sh4/dyna/shil_dynarec.h create mode 100644 core/hw/sh4/dyna/shil_interpreter.cpp create mode 100644 core/hw/sh4/dyna/shil_interpreter.h diff --git a/core/hw/sh4/arm64_simd.h b/core/hw/sh4/arm64_simd.h new file mode 100644 index 000000000..42d4f20a2 --- /dev/null +++ b/core/hw/sh4/arm64_simd.h @@ -0,0 +1,18 @@ +#pragma once +// Simple Arm-NEON helpers used by SH4 interpreter fast paths +#if defined(__aarch64__) +#include <arm_neon.h> + +// Multiply-accumulate 4 pairs of signed 16-bit values and return 64-bit sum +static inline int64_t mac_w_4x(const uint16_t* __restrict pA, + const uint16_t* __restrict pB) +{ + int16x4_t a = vld1_s16((const int16_t*)pA); // load 4 halfwords + int16x4_t b = vld1_s16((const int16_t*)pB); + int32x4_t prod = vmull_s16(a, b); // 4 × 32-bit products + // add horizontally: pairwise add, then widen to 64-bit and add again + int64x2_t sum2 = vpaddlq_s32(vcombine_s32(vget_low_s32(prod), + vget_high_s32(prod))); + return vgetq_lane_s64(sum2, 0) + vgetq_lane_s64(sum2, 1); +} +#endif // __aarch64__ diff --git a/core/hw/sh4/dyna/shil_dynarec.cpp b/core/hw/sh4/dyna/shil_dynarec.cpp new file mode 100644 index 000000000..64558a144 --- /dev/null +++ b/core/hw/sh4/dyna/shil_dynarec.cpp @@ -0,0 +1,110 @@ +#include "shil_dynarec.h" +#include "hw/sh4/sh4_core.h" +#include "hw/sh4/sh4_interrupts.h" +#include "hw/sh4/sh4_mem.h" +#include "oslib/oslib.h" + +// Simple stub function that will be used as the "compiled" 
code pointer +static void shil_interpreter_stub() { + // This function should never actually be called + // The mainloop will detect SHIL blocks and call the interpreter directly +} + +void ShilDynarec::init(Sh4CodeBuffer& buffer) { + this->codeBuffer = &buffer; +} + +void ShilDynarec::compile(RuntimeBlockInfo* block, bool smc_checks, bool optimise) { + // For SHIL interpreter, we don't actually compile anything + // We just set the code pointer to our stub function + // The real magic happens in mainloop() which detects this and calls the interpreter + + block->code = (DynarecCodeEntryPtr)shil_interpreter_stub; + block->host_code_size = 4; // Minimal size + block->host_opcodes = block->oplist.size(); // Number of SHIL opcodes +} + +void ShilDynarec::executeShilBlock(RuntimeBlockInfo* block) { + ShilInterpreter::executeBlock(block); +} + +void ShilDynarec::mainloop(void* cntx) { + // Set up context + p_sh4rcb = (Sh4RCB*)((u8*)cntx - sizeof(Sh4cntx)); + + try { + while (sh4_int_bCpuRun) { + // Check for exceptions first + if (UpdateSystem()) { + // Exception occurred, continue with exception PC + continue; + } + + // Get the next block to execute + DynarecCodeEntryPtr code_ptr = rdv_FindOrCompile(); + + if (code_ptr == ngen_FailedToFindBlock) { + code_ptr = rdv_FailedToFindBlock(next_pc); + } + + // Check if this is a SHIL interpreter block + if (code_ptr == (DynarecCodeEntryPtr)shil_interpreter_stub) { + // This is a SHIL block - execute via interpreter + RuntimeBlockInfoPtr block = bm_GetBlock(next_pc); + if (block) { + executeShilBlock(block.get()); + + // After executing the block, determine next PC based on block type + switch (BET_GET_CLS(block->BlockType)) { + case BET_CLS_Static: + if (block->BlockType == BET_StaticIntr) { + next_pc = block->NextBlock; + } else { + next_pc = block->BranchBlock; + } + break; + + case BET_CLS_Dynamic: + // PC should have been set by the block execution + next_pc = Sh4cntx.pc; + break; + + case BET_CLS_COND: + // Conditional 
branch - check the condition + if (sr.T) { + next_pc = block->BranchBlock; + } else { + next_pc = block->NextBlock; + } + break; + } + } else { + // Block not found - this shouldn't happen + ERROR_LOG(DYNAREC, "SHIL block not found for PC %08X", next_pc); + break; + } + } else { + // This is a regular JIT block - this shouldn't happen in SHIL mode + ERROR_LOG(DYNAREC, "Unexpected JIT block in SHIL mode at PC %08X", next_pc); + break; + } + } + } catch (const SH4ThrownException& ex) { + // Handle SH4 exceptions + Do_Exception(next_pc, ex.expEvn); + } catch (const FlycastException& ex) { + // Handle other exceptions + ERROR_LOG(DYNAREC, "Exception in SHIL mainloop: %s", ex.what()); + sh4_int_bCpuRun = false; + } +} + +void ShilDynarec::handleException(host_context_t& context) { + // For the interpreter, we don't need to do anything special + // Exceptions are handled normally in the mainloop +} + +bool ShilDynarec::rewrite(host_context_t& context, void *faultAddress) { + // Memory rewriting is not needed for the interpreter + return false; +} \ No newline at end of file diff --git a/core/hw/sh4/dyna/shil_dynarec.h b/core/hw/sh4/dyna/shil_dynarec.h new file mode 100644 index 000000000..55052c9ba --- /dev/null +++ b/core/hw/sh4/dyna/shil_dynarec.h @@ -0,0 +1,36 @@ +#pragma once + +#include "ngen.h" +#include "shil_interpreter.h" + +/// A dynarec backend that interprets SHIL instead of compiling to native code +class ShilDynarec : public Sh4Dynarec { +private: + Sh4CodeBuffer* codeBuffer = nullptr; + +public: + // Initialize the dynarec + void init(Sh4CodeBuffer& codeBuffer) override; + + // "Compile" the block by storing it for interpretation + void compile(RuntimeBlockInfo* block, bool smc_checks, bool optimise) override; + + // Run the main execution loop + void mainloop(void* cntx) override; + + // Handle exceptions + void handleException(host_context_t& context) override; + + // Memory rewrite (not needed for interpreter) + bool rewrite(host_context_t& context, 
void *faultAddress) override; + + // Canonical implementation callbacks (not used in interpreter mode) + void canonStart(const shil_opcode *op) override {} + void canonParam(const shil_opcode *op, const shil_param *param, CanonicalParamType paramType) override {} + void canonCall(const shil_opcode *op, void *function) override {} + void canonFinish(const shil_opcode *op) override {} + +private: + /// Execute a SHIL block via interpretation + static void executeShilBlock(RuntimeBlockInfo* block); +}; \ No newline at end of file diff --git a/core/hw/sh4/dyna/shil_interpreter.cpp b/core/hw/sh4/dyna/shil_interpreter.cpp new file mode 100644 index 000000000..76d50a322 --- /dev/null +++ b/core/hw/sh4/dyna/shil_interpreter.cpp @@ -0,0 +1,537 @@ +#include "shil_interpreter.h" +#include "hw/sh4/sh4_core.h" +#include "hw/sh4/sh4_mem.h" +#include "hw/sh4/sh4_interrupts.h" +#include "hw/sh4/sh4_opcode_list.h" +#include "emulator.h" +#include "cfg/cfg.h" +#include "blockmanager.h" +#include "ngen.h" +#include <cmath> + +// Global flag to enable SHIL interpretation mode +bool enable_shil_interpreter = false; + +// Initialize SHIL interpreter setting from config +void init_shil_interpreter_setting() { + static bool initialized = false; + if (!initialized) { + enable_shil_interpreter = cfgLoadBool("dynarec", "UseShilInterpreter", false); + initialized = true; + } +} + +// Global flag to indicate if we should exit block execution +static bool should_exit_block = false; + +// OPTIMIZATION: Inline register access functions to avoid function call overhead +static inline u32 getRegValue(const shil_param& param) { + if (param.is_imm()) { + return param.imm_value(); + } else if (param.is_reg()) { + return *param.reg_ptr(); + } + return 0; +} + +static inline void setRegValue(const shil_param& param, u32 value) { + if (param.is_reg()) { + *param.reg_ptr() = value; + } +} + +static inline f32 getFloatRegValue(const shil_param& param) { + if (param.is_imm()) { + return *(f32*)&param._imm; + } else if 
(param.is_reg()) { + return *(f32*)param.reg_ptr(); + } + return 0.0f; +} + +static inline void setFloatRegValue(const shil_param& param, f32 value) { + if (param.is_reg()) { + *(f32*)param.reg_ptr() = value; + } +} + +// OPTIMIZATION: Fast path for common register-to-register moves +static inline void fastMov32(const shil_param& dst, const shil_param& src) { + if (dst.is_reg() && src.is_reg()) { + *dst.reg_ptr() = *src.reg_ptr(); + } else if (dst.is_reg() && src.is_imm()) { + *dst.reg_ptr() = src.imm_value(); + } else { + setRegValue(dst, getRegValue(src)); + } +} + +// OPTIMIZATION: Fast path for common arithmetic operations +static inline void fastAdd(const shil_param& dst, const shil_param& src1, const shil_param& src2) { + if (dst.is_reg() && src1.is_reg() && src2.is_reg()) { + *dst.reg_ptr() = *src1.reg_ptr() + *src2.reg_ptr(); + } else if (dst.is_reg() && src1.is_reg() && src2.is_imm()) { + *dst.reg_ptr() = *src1.reg_ptr() + src2.imm_value(); + } else { + setRegValue(dst, getRegValue(src1) + getRegValue(src2)); + } +} + +static inline void fastSub(const shil_param& dst, const shil_param& src1, const shil_param& src2) { + if (dst.is_reg() && src1.is_reg() && src2.is_reg()) { + *dst.reg_ptr() = *src1.reg_ptr() - *src2.reg_ptr(); + } else if (dst.is_reg() && src1.is_reg() && src2.is_imm()) { + *dst.reg_ptr() = *src1.reg_ptr() - src2.imm_value(); + } else { + setRegValue(dst, getRegValue(src1) - getRegValue(src2)); + } +} + +// OPTIMIZATION: Fast path for bitwise operations +static inline void fastAnd(const shil_param& dst, const shil_param& src1, const shil_param& src2) { + if (dst.is_reg() && src1.is_reg() && src2.is_reg()) { + *dst.reg_ptr() = *src1.reg_ptr() & *src2.reg_ptr(); + } else if (dst.is_reg() && src1.is_reg() && src2.is_imm()) { + *dst.reg_ptr() = *src1.reg_ptr() & src2.imm_value(); + } else { + setRegValue(dst, getRegValue(src1) & getRegValue(src2)); + } +} + +static inline void fastOr(const shil_param& dst, const shil_param& src1, const 
shil_param& src2) { + if (dst.is_reg() && src1.is_reg() && src2.is_reg()) { + *dst.reg_ptr() = *src1.reg_ptr() | *src2.reg_ptr(); + } else if (dst.is_reg() && src1.is_reg() && src2.is_imm()) { + *dst.reg_ptr() = *src1.reg_ptr() | src2.imm_value(); + } else { + setRegValue(dst, getRegValue(src1) | getRegValue(src2)); + } +} + +static inline void fastXor(const shil_param& dst, const shil_param& src1, const shil_param& src2) { + if (dst.is_reg() && src1.is_reg() && src2.is_reg()) { + *dst.reg_ptr() = *src1.reg_ptr() ^ *src2.reg_ptr(); + } else if (dst.is_reg() && src1.is_reg() && src2.is_imm()) { + *dst.reg_ptr() = *src1.reg_ptr() ^ src2.imm_value(); + } else { + setRegValue(dst, getRegValue(src1) ^ getRegValue(src2)); + } +} + +// Remove old member functions - they're now static inline +u32 ShilInterpreter::getRegValue(const shil_param& param) { + return ::getRegValue(param); +} + +void ShilInterpreter::setRegValue(const shil_param& param, u32 value) { + ::setRegValue(param, value); +} + +f32 ShilInterpreter::getFloatRegValue(const shil_param& param) { + return ::getFloatRegValue(param); +} + +void ShilInterpreter::setFloatRegValue(const shil_param& param, f32 value) { + ::setFloatRegValue(param, value); +} + +u64 ShilInterpreter::getReg64Value(const shil_param& param) { + if (param.is_imm()) { + return param.imm_value(); + } else if (param.is_reg()) { + return *reinterpret_cast<u64*>(param.reg_ptr()); + } + return 0; +} + +void ShilInterpreter::setReg64Value(const shil_param& param, u64 value) { + if (param.is_reg()) { + *reinterpret_cast<u64*>(param.reg_ptr()) = value; + } +} + +void* ShilInterpreter::getPointer(const shil_param& param) { + if (param.is_reg()) { + return param.reg_ptr(); + } + return nullptr; +} + +u32 ShilInterpreter::handleMemoryRead(const shil_param& addr, u32 size) { + u32 address = getRegValue(addr); + switch (size) { + case 1: return ReadMem8(address); + case 2: return ReadMem16(address); + case 4: return ReadMem32(address); + default: return 0; + } +} + 
+void ShilInterpreter::handleMemoryWrite(const shil_param& addr, const shil_param& value, u32 size) { + u32 address = getRegValue(addr); + u32 val = getRegValue(value); + switch (size) { + case 1: WriteMem8(address, val); break; + case 2: WriteMem16(address, val); break; + case 4: WriteMem32(address, val); break; + } +} + +void ShilInterpreter::handleInterpreterFallback(const shil_opcode& op) { + // Set PC if needed + if (op.rs1.imm_value()) { + next_pc = op.rs2.imm_value(); + } + + // Call SH4 instruction handler directly + u32 opcode = op.rs3.imm_value(); + OpDesc[opcode]->oph(opcode); + + // Exit block after fallback + should_exit_block = true; +} + +void ShilInterpreter::handleDynamicJump(const shil_opcode& op) { + // Set dynamic PC + u32 target = getRegValue(op.rs1); + if (!op.rs2.is_null()) { + target += getRegValue(op.rs2); + } + *op.rd.reg_ptr() = target; + should_exit_block = true; +} + +void ShilInterpreter::handleConditionalJump(const shil_opcode& op) { + // Set conditional jump target + u32 target = getRegValue(op.rs2); + *op.rd.reg_ptr() = target; + should_exit_block = true; +} + +// OPTIMIZATION: Optimized executeOpcode with fast paths and better branch prediction +void ShilInterpreter::executeOpcode(const shil_opcode& op) { + // OPTIMIZATION: Fast paths for the most common operations + switch (op.op) { + case shop_mov32: + fastMov32(op.rd, op.rs1); + return; + + case shop_add: + fastAdd(op.rd, op.rs1, op.rs2); + return; + + case shop_sub: + fastSub(op.rd, op.rs1, op.rs2); + return; + + case shop_and: + fastAnd(op.rd, op.rs1, op.rs2); + return; + + case shop_or: + fastOr(op.rd, op.rs1, op.rs2); + return; + + case shop_xor: + fastXor(op.rd, op.rs1, op.rs2); + return; + + case shop_readm: { + u32 addr = getRegValue(op.rs1); + u32 size = op.rs2._imm; + // OPTIMIZATION: Fast path for common 32-bit reads + if (__builtin_expect(size == 4, 1)) { + setRegValue(op.rd, ReadMem32(addr)); + return; + } + // Handle other sizes + u32 value = 0; + switch (size) { + 
case 1: value = ReadMem8(addr); break; + case 2: value = ReadMem16(addr); break; + case 8: { + u64 val64 = ReadMem64(addr); + setRegValue(op.rd, (u32)val64); + setRegValue(op.rd2, (u32)(val64 >> 32)); + return; + } + } + setRegValue(op.rd, value); + return; + } + + case shop_writem: { + u32 addr = getRegValue(op.rs1); + u32 size = op.rs2._imm; + // OPTIMIZATION: Fast path for common 32-bit writes + if (__builtin_expect(size == 4, 1)) { + WriteMem32(addr, getRegValue(op.rs3)); + return; + } + // Handle other sizes + u32 value = getRegValue(op.rs3); + switch (size) { + case 1: WriteMem8(addr, value); break; + case 2: WriteMem16(addr, value); break; + case 8: { + u64 val64 = ((u64)getRegValue(op.rs3) << 32) | getRegValue(op.rs2); + WriteMem64(addr, val64); + break; + } + } + return; + } + + case shop_jcond: + if (__builtin_expect(sr.T == op.rs2._imm, 0)) { + next_pc = getRegValue(op.rs1); + should_exit_block = true; + } + return; + + case shop_jdyn: + next_pc = getRegValue(op.rs1); + should_exit_block = true; + return; + + case shop_pref: + // Prefetch instruction - no-op for interpreter + return; + + // Continue with remaining opcodes + case shop_mov64: + setRegValue(op.rd, getRegValue(op.rs1)); + setRegValue(op.rd2, getRegValue(op.rs2)); + break; + + case shop_mul_u16: + setRegValue(op.rd, (u16)getRegValue(op.rs1) * (u16)getRegValue(op.rs2)); + break; + + case shop_mul_s16: + setRegValue(op.rd, (s16)getRegValue(op.rs1) * (s16)getRegValue(op.rs2)); + break; + + case shop_mul_i32: + setRegValue(op.rd, (s32)getRegValue(op.rs1) * (s32)getRegValue(op.rs2)); + break; + + case shop_mul_u64: + { + u64 result = (u64)getRegValue(op.rs1) * (u64)getRegValue(op.rs2); + setRegValue(op.rd, (u32)result); + setRegValue(op.rd2, (u32)(result >> 32)); + } + break; + + case shop_mul_s64: + { + s64 result = (s64)(s32)getRegValue(op.rs1) * (s64)(s32)getRegValue(op.rs2); + setRegValue(op.rd, (u32)result); + setRegValue(op.rd2, (u32)(result >> 32)); + } + break; + + case shop_not: + 
setRegValue(op.rd, ~getRegValue(op.rs1)); + break; + + case shop_shl: + setRegValue(op.rd, getRegValue(op.rs1) << (getRegValue(op.rs2) & 0x1F)); + break; + + case shop_shr: + setRegValue(op.rd, getRegValue(op.rs1) >> (getRegValue(op.rs2) & 0x1F)); + break; + + case shop_sar: + setRegValue(op.rd, (s32)getRegValue(op.rs1) >> (getRegValue(op.rs2) & 0x1F)); + break; + + case shop_neg: + setRegValue(op.rd, -(s32)getRegValue(op.rs1)); + break; + + case shop_swaplb: + { + u32 val = getRegValue(op.rs1); + setRegValue(op.rd, (val & 0xFFFF0000) | ((val & 0xFF) << 8) | ((val >> 8) & 0xFF)); + } + break; + + case shop_test: + sr.T = (getRegValue(op.rs1) & getRegValue(op.rs2)) == 0 ? 1 : 0; + break; + + case shop_seteq: + sr.T = (getRegValue(op.rs1) == getRegValue(op.rs2)) ? 1 : 0; + break; + + case shop_setge: + sr.T = ((s32)getRegValue(op.rs1) >= (s32)getRegValue(op.rs2)) ? 1 : 0; + break; + + case shop_setgt: + sr.T = ((s32)getRegValue(op.rs1) > (s32)getRegValue(op.rs2)) ? 1 : 0; + break; + + case shop_setab: + sr.T = (getRegValue(op.rs1) > getRegValue(op.rs2)) ? 1 : 0; + break; + + case shop_setae: + sr.T = (getRegValue(op.rs1) >= getRegValue(op.rs2)) ? 
1 : 0; + break; + + case shop_ext_s8: + setRegValue(op.rd, (s32)(s8)getRegValue(op.rs1)); + break; + + case shop_ext_s16: + setRegValue(op.rd, (s32)(s16)getRegValue(op.rs1)); + break; + + case shop_cvt_i2f_n: + setFloatRegValue(op.rd, (f32)(s32)getRegValue(op.rs1)); + break; + + case shop_cvt_f2i_t: + setRegValue(op.rd, (u32)(s32)getFloatRegValue(op.rs1)); + break; + + case shop_fadd: + setFloatRegValue(op.rd, getFloatRegValue(op.rs1) + getFloatRegValue(op.rs2)); + break; + + case shop_fsub: + setFloatRegValue(op.rd, getFloatRegValue(op.rs1) - getFloatRegValue(op.rs2)); + break; + + case shop_fmul: + setFloatRegValue(op.rd, getFloatRegValue(op.rs1) * getFloatRegValue(op.rs2)); + break; + + case shop_fdiv: + setFloatRegValue(op.rd, getFloatRegValue(op.rs1) / getFloatRegValue(op.rs2)); + break; + + case shop_fabs: + setFloatRegValue(op.rd, fabsf(getFloatRegValue(op.rs1))); + break; + + case shop_fneg: + setFloatRegValue(op.rd, -getFloatRegValue(op.rs1)); + break; + + case shop_fsqrt: + setFloatRegValue(op.rd, sqrtf(getFloatRegValue(op.rs1))); + break; + + case shop_fmac: + setFloatRegValue(op.rd, getFloatRegValue(op.rs1) * getFloatRegValue(op.rs2) + getFloatRegValue(op.rs3)); + break; + + case shop_fseteq: + sr.T = (getFloatRegValue(op.rs1) == getFloatRegValue(op.rs2)) ? 1 : 0; + break; + + case shop_fsetgt: + sr.T = (getFloatRegValue(op.rs1) > getFloatRegValue(op.rs2)) ? 
1 : 0; + break; + + case shop_ifb: + // Interpreter fallback - execute original SH4 instruction + if (op.rs1._imm) { + next_pc = op.rs2._imm; + } + { + u32 opcode = op.rs3._imm; + OpDesc[opcode]->oph(opcode); + } + should_exit_block = true; + break; + + default: + // Unhandled opcode - fallback to interpreter + WARN_LOG(DYNAREC, "Unhandled SHIL opcode: %d", op.op); + should_exit_block = true; + break; + } +} + +// OPTIMIZATION: Optimized block execution with reduced function call overhead +void ShilInterpreter::executeBlock(RuntimeBlockInfo* block) { + should_exit_block = false; + + // OPTIMIZATION: Cache block size to avoid repeated size() calls + const size_t block_size = block->oplist.size(); + + // OPTIMIZATION: Direct pointer access to opcodes + const shil_opcode* opcodes = block->oplist.data(); + + for (size_t i = 0; i < block_size && __builtin_expect(!should_exit_block, 1); i++) { + executeOpcode(opcodes[i]); + } +} + +// OPTIMIZATION: Optimized mainloop with reduced overhead +void shil_interpreter_mainloop(void* v_cntx) { + // Set up context similar to JIT mainloop + p_sh4rcb = (Sh4RCB*)((u8*)v_cntx - sizeof(Sh4Context)); + + try { + while (__builtin_expect(Sh4cntx.CpuRunning, 1)) { + // OPTIMIZATION: Reduce interrupt checking frequency + if (__builtin_expect(Sh4cntx.cycle_counter <= 0, 0)) { + Sh4cntx.cycle_counter += SH4_TIMESLICE; + if (UpdateSystem_INTC()) { + // Interrupt occurred, continue to handle it + continue; + } + } + + // Get current PC + u32 pc = next_pc; + + // Find or create block for current PC + RuntimeBlockInfoPtr blockPtr = bm_GetBlock(pc); + RuntimeBlockInfo* block = nullptr; + + if (__builtin_expect(blockPtr != nullptr, 1)) { + block = blockPtr.get(); + } else { + // Block doesn't exist, we need to create and decode it + // This will trigger the normal block creation process + DynarecCodeEntryPtr code = rdv_CompilePC(0); + if (!code) { + ERROR_LOG(DYNAREC, "Failed to create block for PC: %08X", pc); + break; + } + + // Get the block 
again after compilation + blockPtr = bm_GetBlock(pc); + if (!blockPtr) { + ERROR_LOG(DYNAREC, "Block creation succeeded but block not found for PC: %08X", pc); + break; + } + block = blockPtr.get(); + } + + // Check if this is a SHIL interpreter block + if (__builtin_expect(reinterpret_cast<uintptr_t>(block->code) & 0x1, 1)) { + // This is a SHIL interpreter block + ShilInterpreter::executeBlock(block); + } else { + // This is a regular JIT block - shouldn't happen in interpreter mode + ERROR_LOG(DYNAREC, "Unexpected JIT block in SHIL interpreter mode"); + break; + } + + // Update cycle counter + Sh4cntx.cycle_counter -= block->guest_cycles; + } + } catch (const SH4ThrownException&) { + ERROR_LOG(DYNAREC, "SH4ThrownException in SHIL interpreter mainloop"); + throw FlycastException("Fatal: Unhandled SH4 exception"); + } +} \ No newline at end of file diff --git a/core/hw/sh4/dyna/shil_interpreter.h b/core/hw/sh4/dyna/shil_interpreter.h new file mode 100644 index 000000000..16e70c2cc --- /dev/null +++ b/core/hw/sh4/dyna/shil_interpreter.h @@ -0,0 +1,64 @@ +#pragma once + +#include "types.h" +#include "shil.h" +#include "blockmanager.h" + +/// SHIL Interpreter - executes SHIL opcodes directly without JIT compilation +class ShilInterpreter { +public: + /// Execute a single SHIL opcode + static void executeOpcode(const shil_opcode& op); + + /// Execute an entire SHIL block + static void executeBlock(RuntimeBlockInfo* block); + + /// Get register value for SHIL parameter + static u32 getRegValue(const shil_param& param); + + /// Set register value for SHIL parameter + static void setRegValue(const shil_param& param, u32 value); + + /// Get float register value for SHIL parameter + static f32 getFloatRegValue(const shil_param& param); + + /// Set float register value for SHIL parameter + static void setFloatRegValue(const shil_param& param, f32 value); + + /// Get 64-bit register value for SHIL parameter + static u64 getReg64Value(const shil_param& param); + + /// Set 64-bit 
register value for SHIL parameter + static void setReg64Value(const shil_param& param, u64 value); + + /// Get pointer for SHIL parameter + static void* getPointer(const shil_param& param); + +private: + /// Handle memory read operations + static u32 handleMemoryRead(const shil_param& addr, u32 size); + + /// Handle memory write operations + static void handleMemoryWrite(const shil_param& addr, const shil_param& value, u32 size); + + /// Handle interpreter fallback (shop_ifb) + static void handleInterpreterFallback(const shil_opcode& op); + + /// Handle dynamic jumps (shop_jdyn) + static void handleDynamicJump(const shil_opcode& op); + + /// Handle conditional jumps (shop_jcond) + static void handleConditionalJump(const shil_opcode& op); + + /// Handle illegal instructions (shop_illegal) + static void handleIllegalInstruction(const shil_opcode& op); +}; + +/// Simple flag to enable SHIL interpretation mode +extern bool enable_shil_interpreter; + +/// Initialize SHIL interpreter setting from config +void init_shil_interpreter_setting(); + +/// SHIL interpretation mainloop +void shil_interpreter_mainloop(void* v_cntx); \ No newline at end of file