From 594241ffa3f178425b2d2a8a445a5d3015cb16f9 Mon Sep 17 00:00:00 2001 From: Flyinghead Date: Fri, 29 Jan 2021 11:46:05 +0100 Subject: [PATCH] x64 rec: generate main loop. use mem stubs. fast mem access for !mmu enable fast mem path for both !mmu and mmu simplify rewrite get rid of .asm file and inline assembler code --- CMakeLists.txt | 2 +- core/rec-x64/msvc.asm | 77 ---- core/rec-x64/rec_x64.cpp | 907 +++++++++++++++++---------------------- 3 files changed, 400 insertions(+), 586 deletions(-) delete mode 100644 core/rec-x64/msvc.asm diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c3e5f552..f50635f79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -900,7 +900,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|x86_64.*|AMD6 core/rec-x64/rec_x64.cpp core/rec-x64/x64_regalloc.h) if(MSVC) - target_sources(${PROJECT_NAME} PRIVATE core/rec-x64/msvc.asm core/hw/arm7/arm7_x64_msvc.asm) + target_sources(${PROJECT_NAME} PRIVATE core/hw/arm7/arm7_x64_msvc.asm) endif() endif() else() diff --git a/core/rec-x64/msvc.asm b/core/rec-x64/msvc.asm deleted file mode 100644 index 9f77391aa..000000000 --- a/core/rec-x64/msvc.asm +++ /dev/null @@ -1,77 +0,0 @@ -_TEXT SEGMENT - -SH4_TIMESLICE = 448 -CPU_RUNNING = 135266148 -PC = 135266120 - -EXTERN bm_GetCodeByVAddr: PROC -EXTERN UpdateSystem_INTC: PROC -EXTERN setjmp: PROC -EXTERN cycle_counter: dword -EXTERN p_sh4rcb: qword -EXTERN jmp_env: qword - -PUBLIC ngen_mainloop -ngen_mainloop PROC FRAME - - push rbx - .pushreg rbx - push rbp - .pushreg rbp - push rdi - .pushreg rdi - push rsi - .pushreg rsi - push r12 - .pushreg r12 - push r13 - .pushreg r13 - push r14 - .pushreg r14 - push r15 - .pushreg r15 - sub rsp, 40 ; 32-byte shadow space + 8 for stack 16-byte alignment - .allocstack 40 - .endprolog - - mov dword ptr [cycle_counter], SH4_TIMESLICE - - lea rcx, qword ptr[jmp_env] - xor rdx, rdx - call setjmp - -run_loop: - mov rax, qword ptr [p_sh4rcb] - mov edx, dword ptr[CPU_RUNNING + rax] - test edx, edx - je end_run_loop - -slice_loop: - mov rax, qword ptr [p_sh4rcb] - mov ecx, dword ptr[PC + rax] - call bm_GetCodeByVAddr - call rax - mov ecx, dword ptr [cycle_counter] - test ecx, ecx - jg slice_loop - - add ecx, SH4_TIMESLICE - mov dword ptr [cycle_counter], ecx - call UpdateSystem_INTC - jmp run_loop - -end_run_loop: - - add rsp, 40 - pop r15 - pop r14 - pop r13 - pop r12 - pop rsi - pop rdi - pop rbp - pop rbx - ret -ngen_mainloop ENDP -_TEXT ENDS -END diff --git a/core/rec-x64/rec_x64.cpp b/core/rec-x64/rec_x64.cpp index 70d15664b..7fbab8828 100644 --- a/core/rec-x64/rec_x64.cpp +++ b/core/rec-x64/rec_x64.cpp @@ -3,12 +3,12 @@ #if FEAT_SHREC == DYNAREC_JIT && HOST_CPU == CPU_X64 #include -//#define PROFILING //#define CANONICAL_TEST #define XBYAK_NO_OP_NAMES #include #include +using namespace Xbyak::util; #include "types.h" #include "hw/sh4/sh4_opcode_list.h" @@ -20,7 +20,6 @@ #include "hw/sh4/sh4_core.h" #include "hw/sh4/sh4_mem.h" #include "hw/mem/vmem32.h" -#include "profiler/profiler.h" #include "oslib/oslib.h" #include "x64_regalloc.h" #include "xbyak_base.h" @@ -36,190 +35,54 @@ struct DynaRBI : RuntimeBlockInfo } }; -extern "C" { - int cycle_counter; -} - -u64 host_cpu_time; +static int cycle_counter; +static void (*mainloop)(); u32 mem_writes, mem_reads; u32 mem_rewrites_w, mem_rewrites_r; -#ifdef PROFILING -static clock_t slice_start; -int start_cycle; -extern "C" -{ -static __attribute((used)) void* start_slice(void *p) -{ - slice_start = clock(); - start_cycle = cycle_counter; - return p; +static jmp_buf jmp_env; +static u32 exception_raised; + +namespace MemSize { + enum { + S8, + S16, + S32, + S64, + Count + }; } -static __attribute((used)) void end_slice() +namespace MemOp { + enum { + R, + W, + Count + }; +} +namespace MemType { + enum { + Fast, + Slow, + Count + }; +} + +static const void *MemHandlers[MemType::Count][MemSize::Count][MemOp::Count]; +static const u8 *MemHandlerStart, *MemHandlerEnd; + +void ngen_mainloop(void *) { - clock_t now = clock(); - if (slice_start != 0) - { - host_cpu_time += now - slice_start; - guest_cpu_cycles += start_cycle - cycle_counter; + try { + mainloop(); + } catch (const SH4ThrownException&) { + ERROR_LOG(DYNAREC, "SH4ThrownException in mainloop"); + } catch (...) { + ERROR_LOG(DYNAREC, "Uncaught unknown exception in mainloop"); } - slice_start = now; - start_cycle = cycle_counter; } -} -#endif - -#if defined(__APPLE__) -#define _U "_" -#else -#define _U -#endif - -#ifdef _WIN32 -#define WIN32_ONLY(x) x -#else -#define WIN32_ONLY(x) -#endif - -#define STRINGIFY(x) #x -#define _S(x) STRINGIFY(x) -#if RAM_SIZE_MAX == 16*1024*1024 -#define CPU_RUNNING 68157284 -#define PC 68157256 -#elif RAM_SIZE_MAX == 32*1024*1024 -#define CPU_RUNNING 135266148 -#define PC 135266120 -#else -#error RAM_SIZE_MAX unknown -#endif - -extern "C" { - jmp_buf jmp_env; -} - -#ifndef _MSC_VER - -#ifdef _WIN32 - // Fully naked function in win32 for proper SEH prologue - __asm__ ( - ".text \n\t" - ".p2align 4,,15 \n\t" - ".globl ngen_mainloop \n\t" - ".def ngen_mainloop; .scl 2; .type 32; .endef \n\t" - ".seh_proc ngen_mainloop \n\t" - "ngen_mainloop: \n\t" -#else -void ngen_mainloop(void* v_cntx) -{ - __asm__ ( -#endif - "pushq %rbx \n\t" -WIN32_ONLY( ".seh_pushreg %rbx \n\t") -#if !defined(__APPLE__) // rbp is pushed in the standard function prologue - "pushq %rbp \n\t" -#endif -#ifdef _WIN32 - ".seh_pushreg %rbp \n\t" - "pushq %rdi \n\t" - ".seh_pushreg %rdi \n\t" - "pushq %rsi \n\t" - ".seh_pushreg %rsi \n\t" -#endif - "pushq %r12 \n\t" -WIN32_ONLY( ".seh_pushreg %r12 \n\t") - "pushq %r13 \n\t" -WIN32_ONLY( ".seh_pushreg %r13 \n\t") - "pushq %r14 \n\t" -WIN32_ONLY( ".seh_pushreg %r14 \n\t") - "pushq %r15 \n\t" -#ifdef _WIN32 - ".seh_pushreg %r15 \n\t" - "subq $40, %rsp \n\t" // 32-byte shadow space + 8 for stack 16-byte alignment - ".seh_stackalloc 40 \n\t" - ".seh_endprologue \n\t" -#else - "subq $8, %rsp \n\t" // 8 for stack 16-byte alignment -#endif - "movl $" _S(SH4_TIMESLICE) "," _U "cycle_counter(%rip) \n\t" - -#ifdef _WIN32 - "leaq " _U "jmp_env(%rip), %rcx \n\t" // SETJMP - "xor %rdx, %rdx \n\t" // no frame pointer -#else - "leaq " _U "jmp_env(%rip), %rdi \n\t" -#endif - "call " _U "setjmp \n" - - "1: \n\t" // run_loop - "movq " _U "p_sh4rcb(%rip), %rax \n\t" - "movl " _S(CPU_RUNNING) "(%rax), %edx \n\t" - "testl %edx, %edx \n\t" - "je 3f \n" // end_run_loop -#ifdef PROFILING - "call start_slice \n\t" -#endif - - "2: \n\t" // slice_loop - "movq " _U "p_sh4rcb(%rip), %rax \n\t" -#ifdef _WIN32 - "movl " _S(PC)"(%rax), %ecx \n\t" -#else - "movl " _S(PC)"(%rax), %edi \n\t" -#endif - "call " _U "bm_GetCodeByVAddr \n\t" - "call *%rax \n\t" -#ifdef PROFILING - "call end_slice \n\t" -#endif - "movl " _U "cycle_counter(%rip), %ecx \n\t" - "testl %ecx, %ecx \n\t" - "jg 2b \n\t" // slice_loop - - "addl $" _S(SH4_TIMESLICE) ", %ecx \n\t" - "movl %ecx, " _U "cycle_counter(%rip) \n\t" - "call " _U "UpdateSystem_INTC \n\t" - "jmp 1b \n" // run_loop - - "3: \n\t" // end_run_loop - -#ifdef _WIN32 - "addq $40, %rsp \n\t" -#else - "addq $8, %rsp \n\t" -#endif - "popq %r15 \n\t" - "popq %r14 \n\t" - "popq %r13 \n\t" - "popq %r12 \n\t" -#ifdef _WIN32 - "popq %rsi \n\t" - "popq %rdi \n\t" -#endif -#if !defined(__APPLE__) - "popq %rbp \n\t" -#endif - "popq %rbx \n\t" -#ifdef _WIN32 - "ret \n\t" - ".seh_endproc \n" - ); -#else - ); -} -#endif - -#endif // !_MSC_VER -#undef _U -#undef _S void ngen_init() -{ - verify(CPU_RUNNING == offsetof(Sh4RCB, cntx.CpuRunning)); - verify(PC == offsetof(Sh4RCB, cntx.pc)); -} - -void ngen_ResetBlocks() { } @@ -253,8 +116,6 @@ static void handle_mem_exception(u32 exception_raised, u32 pc) } } -static u32 exception_raised; - template static T ReadMemNoEx(u32 addr, u32 pc) { @@ -310,24 +171,19 @@ static void do_sqw_mmu_no_ex(u32 addr, u32 pc) } } -static void do_sqw_nommu_local(u32 addr, u8* sqb) -{ - do_sqw_nommu(addr, sqb); -} - const std::array call_regs #ifdef _WIN32 - { Xbyak::util::ecx, Xbyak::util::edx, Xbyak::util::r8d, Xbyak::util::r9d }; + { ecx, edx, r8d, r9d }; #else - { Xbyak::util::edi, Xbyak::util::esi, Xbyak::util::edx, Xbyak::util::ecx }; + { edi, esi, edx, ecx }; #endif const std::array call_regs64 #ifdef _WIN32 - { Xbyak::util::rcx, Xbyak::util::rdx, Xbyak::util::r8, Xbyak::util::r9 }; + { rcx, rdx, r8, r9 }; #else - { Xbyak::util::rdi, Xbyak::util::rsi, Xbyak::util::rdx, Xbyak::util::rcx }; + { rdi, rsi, rdx, rcx }; #endif -const std::array call_regsxmm { Xbyak::util::xmm0, Xbyak::util::xmm1, Xbyak::util::xmm2, Xbyak::util::xmm3 }; +const std::array call_regsxmm { xmm0, xmm1, xmm2, xmm3 }; class BlockCompiler : public BaseXbyakRec { @@ -432,11 +288,14 @@ public: add(call_regs[0], dword[rax]); } } - if (!optimise || !GenReadMemoryFast(op, block)) - GenReadMemorySlow(op, block); + int size = op.flags & 0x7f; - u32 size = op.flags & 0x7f; - if (size != 8) + if (mmu_enabled()) + mov(call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc + size = size == 1 ? MemSize::S8 : size == 2 ? MemSize::S16 : size == 4 ? MemSize::S32 : MemSize::S64; + GenCall((void (*)())MemHandlers[optimise ? MemType::Fast : MemType::Slow][size][MemOp::R], mmu_enabled()); + + if (size != MemSize::S64) host_reg_to_shil_param(op.rd, eax); else { mov(rcx, (uintptr_t)op.rd.reg_ptr()); @@ -470,8 +329,11 @@ public: mov(rax, (uintptr_t)op.rs2.reg_ptr()); mov(call_regs64[1], qword[rax]); } - if (!optimise || !GenWriteMemoryFast(op, block)) - GenWriteMemorySlow(op, block); + + if (mmu_enabled()) + mov(call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc + size = size == 1 ? MemSize::S8 : size == 2 ? MemSize::S16 : size == 4 ? MemSize::S32 : MemSize::S64; + GenCall((void (*)())MemHandlers[optimise ? MemType::Fast : MemType::Slow][size][MemOp::W], mmu_enabled()); } } break; @@ -514,52 +376,36 @@ public: break; case shop_pref: - if (op.rs1.is_imm()) { - // this test shouldn't be necessary - if ((op.rs1._imm & 0xFC000000) == 0xE0000000) + Xbyak::Label no_sqw; + if (op.rs1.is_imm()) { - mov(call_regs[0], op.rs1._imm); - if (mmu_enabled()) - { - mov(call_regs[1], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc + // this test shouldn't be necessary + if ((op.rs1._imm & 0xFC000000) != 0xE0000000) + break; - GenCall(do_sqw_mmu_no_ex); - } - else - { - if (CCN_MMUCR.AT == 1) - { - GenCall(do_sqw_mmu); - } - else - { - mov(call_regs64[1], (uintptr_t)sq_both); - GenCall(&do_sqw_nommu_local); - } - } - } - } - else - { - Xbyak::Reg32 rn; - if (regalloc.IsAllocg(op.rs1)) - { - rn = regalloc.MapRegister(op.rs1); + mov(call_regs[0], op.rs1._imm); } else { - mov(rax, (uintptr_t)op.rs1.reg_ptr()); - mov(eax, dword[rax]); - rn = eax; - } - mov(ecx, rn); - shr(ecx, 26); - cmp(ecx, 0x38); - Xbyak::Label no_sqw; - jne(no_sqw); + Xbyak::Reg32 rn; + if (regalloc.IsAllocg(op.rs1)) + { + rn = regalloc.MapRegister(op.rs1); + } + else + { + mov(rax, (uintptr_t)op.rs1.reg_ptr()); + mov(eax, dword[rax]); + rn = eax; + } + mov(ecx, rn); + shr(ecx, 26); + cmp(ecx, 0x38); + jne(no_sqw); - mov(call_regs[0], rn); + mov(call_regs[0], rn); + } if (mmu_enabled()) { mov(call_regs[1], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc @@ -575,7 +421,10 @@ public: else { mov(call_regs64[1], (uintptr_t)sq_both); - GenCall(&do_sqw_nommu_local); + mov(rax, (size_t)&do_sqw_nommu); + saveXmmRegisters(); + call(qword[rax]); + restoreXmmRegisters(); } } L(no_sqw); @@ -585,14 +434,14 @@ public: case shop_frswap: mov(rax, (uintptr_t)op.rs1.reg_ptr()); mov(rcx, (uintptr_t)op.rd.reg_ptr()); - if (cpu.has(Xbyak::util::Cpu::tAVX512F)) + if (cpu.has(Cpu::tAVX512F)) { vmovaps(zmm0, zword[rax]); vmovaps(zmm1, zword[rcx]); vmovaps(zword[rax], zmm1); vmovaps(zword[rcx], zmm0); } - else if (cpu.has(Xbyak::util::Cpu::tAVX)) + else if (cpu.has(Cpu::tAVX)) { vmovaps(ymm0, yword[rax]); vmovaps(ymm1, yword[rcx]); @@ -707,113 +556,6 @@ public: emit_Skip(getSize()); } - void GenReadMemorySlow(const shil_opcode& op, RuntimeBlockInfo* block) - { - const u8 *start_addr = getCurr(); - if (mmu_enabled()) - mov(call_regs[1], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc - - u32 size = op.flags & 0x7f; - switch (size) { - case 1: - if (!mmu_enabled()) - GenCall(ReadMem8); - else - GenCall(ReadMemNoEx, true); - movsx(eax, al); - break; - case 2: - if (!mmu_enabled()) - GenCall(ReadMem16); - else - GenCall(ReadMemNoEx, true); - movsx(eax, ax); - break; - - case 4: - if (!mmu_enabled()) - GenCall(ReadMem32); - else - GenCall(ReadMemNoEx, true); - break; - case 8: - if (!mmu_enabled()) - GenCall(ReadMem64); - else - GenCall(ReadMemNoEx, true); - break; - default: - die("1..8 bytes"); - } - - if (mmu_enabled() && vmem32_enabled()) - { - Xbyak::Label quick_exit; - if (getCurr() - start_addr <= read_mem_op_size - 6) - jmp(quick_exit, T_NEAR); - while (getCurr() - start_addr < read_mem_op_size) - nop(); - L(quick_exit); - verify(getCurr() - start_addr == read_mem_op_size); - } - } - - void GenWriteMemorySlow(const shil_opcode& op, RuntimeBlockInfo* block) - { - const u8 *start_addr = getCurr(); - if (mmu_enabled()) - mov(call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc - - u32 size = op.flags & 0x7f; - switch (size) { - case 1: - if (!mmu_enabled()) - GenCall(WriteMem8); - else - GenCall(WriteMemNoEx, true); - break; - case 2: - if (!mmu_enabled()) - GenCall(WriteMem16); - else - GenCall(WriteMemNoEx, true); - break; - case 4: - if (!mmu_enabled()) - GenCall(WriteMem32); - else - GenCall(WriteMemNoEx, true); - break; - case 8: - if (!mmu_enabled()) - GenCall(WriteMem64); - else - GenCall(WriteMemNoEx, true); - break; - default: - die("1..8 bytes"); - } - if (mmu_enabled() && vmem32_enabled()) - { - Xbyak::Label quick_exit; - if (getCurr() - start_addr <= write_mem_op_size - 6) - jmp(quick_exit, T_NEAR); - while (getCurr() - start_addr < write_mem_op_size) - nop(); - L(quick_exit); - verify(getCurr() - start_addr == write_mem_op_size); - } - } - - void InitializeRewrite(RuntimeBlockInfo *block, size_t opid) - { - } - - void FinalizeRewrite() - { - ready(); - } - void ngen_CC_Start(const shil_opcode& op) { CC_pars.clear(); @@ -909,6 +651,124 @@ public: movss(dword[rax], Xbyak::Xmm(nreg)); } + void genMainloop() + { + push(rbx); + push(rbp); +#ifdef _WIN32 + push(rdi); + push(rsi); +#endif + push(r12); + push(r13); + push(r14); + push(r15); +#ifdef _WIN32 + sub(rsp, 40); // 32-byte shadow space + 8 for stack 16-byte alignment +#else + sub(rsp, 8); // stack 16-byte alignment +#endif + + mov(dword[rip + &cycle_counter], SH4_TIMESLICE); + + lea(call_regs64[0], qword[rip + &jmp_env]); +#ifdef _WIN32 + xor_(call_regs64[1], call_regs64[1]); // no frame pointer +#endif +#ifdef _MSC_VER + // FIXME call((const void *)_setjmp); +#else + call((const void *)_setjmp); +#endif + + //run_loop: + Xbyak::Label run_loop; + L(run_loop); + Xbyak::Label end_run_loop; + mov(rax, (size_t)&p_sh4rcb->cntx.CpuRunning); + mov(edx, dword[rax]); + + test(edx, edx); + je(end_run_loop); + + //slice_loop: + Xbyak::Label slice_loop; + L(slice_loop); + mov(rax, (size_t)&p_sh4rcb->cntx.pc); + mov(call_regs[0], dword[rax]); + call(bm_GetCodeByVAddr); + call(rax); + mov(ecx, dword[rip + &cycle_counter]); + test(ecx, ecx); + jg(slice_loop); + + add(ecx, SH4_TIMESLICE); + mov(dword[rip + &cycle_counter], ecx); + call(UpdateSystem_INTC); + jmp(run_loop); + + //end_run_loop: + L(end_run_loop); +#ifdef _WIN32 + add(rsp, 40); +#else + add(rsp, 8); +#endif + pop(r15); + pop(r14); + pop(r13); + pop(r12); +#ifdef _WIN32 + pop(rsi); + pop(rdi); +#endif + pop(rbp); + pop(rbx); + ret(); + + genMemHandlers(); + + ready(); + mainloop = (void (*)())getCode(); + + emit_Skip(getSize()); + } + + bool rewriteMemAccess(size_t& host_pc, size_t retadr, size_t accessedAddress) + { + if (!_nvmem_enabled() || (mmu_enabled() && !vmem32_enabled())) + return false; + + //printf("ngen_Rewrite pc %p\n", host_pc); + if (host_pc < (size_t)MemHandlerStart || host_pc >= (size_t)MemHandlerEnd) + return false; + + size_t ca = *(s32 *)(retadr - 4) + retadr; + for (int size = 0; size < MemSize::Count; size++) + { + for (int op = 0; op < MemOp::Count; op++) + { + if ((size_t)MemHandlers[MemType::Fast][size][op] != ca) + continue; + + //found ! + const u8 *start = getCurr(); + call(MemHandlers[MemType::Slow][size][op]); + verify(getCurr() - start == 5); + + ready(); + + host_pc = retadr - 5; + + return true; + } + } + ERROR_LOG(DYNAREC, "rewriteMemAccess code not found: hpc %08x retadr %08x acc %08x", host_pc, retadr, accessedAddress); + die("Failed to match the code"); + + return false; + } + private: bool GenReadMemImmediate(const shil_opcode& op, RuntimeBlockInfo* block) { @@ -1157,104 +1017,6 @@ private: return true; } - bool GenReadMemoryFast(const shil_opcode& op, RuntimeBlockInfo* block) - { - if (!mmu_enabled() || !vmem32_enabled()) - return false; - mem_reads++; - const u8 *start_addr = getCurr(); - - mov(rax, (uintptr_t)&p_sh4rcb->cntx.exception_pc); - mov(dword[rax], block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0)); - - mov(rax, (uintptr_t)virt_ram_base); - - u32 size = op.flags & 0x7f; - //verify(getCurr() - start_addr == 26); - if (mem_access_offset == 0) - mem_access_offset = getCurr() - start_addr; - else - verify(getCurr() - start_addr == mem_access_offset); - - block->memory_accesses[(void*)getCurr()] = (u32)current_opid; - switch (size) - { - case 1: - movsx(eax, byte[rax + call_regs64[0]]); - break; - - case 2: - movsx(eax, word[rax + call_regs64[0]]); - break; - - case 4: - mov(eax, dword[rax + call_regs64[0]]); - break; - - case 8: - mov(rax, qword[rax + call_regs64[0]]); - break; - - default: - die("1..8 bytes"); - } - - while (getCurr() - start_addr < read_mem_op_size) - nop(); - verify(getCurr() - start_addr == read_mem_op_size); - - return true; - } - - bool GenWriteMemoryFast(const shil_opcode& op, RuntimeBlockInfo* block) - { - if (!mmu_enabled() || !vmem32_enabled()) - return false; - mem_writes++; - const u8 *start_addr = getCurr(); - - mov(rax, (uintptr_t)&p_sh4rcb->cntx.exception_pc); - mov(dword[rax], block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0)); - - mov(rax, (uintptr_t)virt_ram_base); - - u32 size = op.flags & 0x7f; - //verify(getCurr() - start_addr == 26); - if (mem_access_offset == 0) - mem_access_offset = getCurr() - start_addr; - else - verify(getCurr() - start_addr == mem_access_offset); - - block->memory_accesses[(void*)getCurr()] = (u32)current_opid; - switch (size) - { - case 1: - mov(byte[rax + call_regs64[0] + 0], call_regs[1].cvt8()); - break; - - case 2: - mov(word[rax + call_regs64[0]], call_regs[1].cvt16()); - break; - - case 4: - mov(dword[rax + call_regs64[0]], call_regs[1]); - break; - - case 8: - mov(qword[rax + call_regs64[0]], call_regs64[1]); - break; - - default: - die("1..8 bytes"); - } - - while (getCurr() - start_addr < write_mem_op_size) - nop(); - verify(getCurr() - start_addr == write_mem_op_size); - - return true; - } - void CheckBlock(bool force_checks, RuntimeBlockInfo* block) { mov(call_regs[0], block->addr); @@ -1307,23 +1069,156 @@ private: } } - template - void GenCall(Ret(*function)(Params...), bool skip_floats = false) + void genMemHandlers() + { + // make sure the memory handlers are set + verify(ReadMem8 != nullptr); + + MemHandlerStart = getCurr(); + for (int type = 0; type < MemOp::Count; type++) + { + for (int size = 0; size < MemSize::Count; size++) + { + for (int op = 0; op < MemOp::Count; op++) + { + MemHandlers[type][size][op] = getCurr(); + if (type == MemType::Fast && _nvmem_enabled() && (!mmu_enabled() || vmem32_enabled())) + { + if (mmu_enabled()) + { + mov(rax, (uintptr_t)&p_sh4rcb->cntx.exception_pc); + mov(dword[rax], call_regs[2]); + } + mov(rax, (uintptr_t)virt_ram_base); + mov(r9, call_regs64[0]); + if (!_nvmem_4gb_space()) + and_(call_regs[0], 0x1FFFFFFF); + switch (size) + { + case MemSize::S8: + if (op == MemOp::R) + movsx(eax, byte[rax + call_regs64[0]]); + else + mov(byte[rax + call_regs64[0]], call_regs[1].cvt8()); + break; + + case MemSize::S16: + if (op == MemOp::R) + movsx(eax, word[rax + call_regs64[0]]); + else + mov(word[rax + call_regs64[0]], call_regs[1].cvt16()); + break; + + case MemSize::S32: + if (op == MemOp::R) + mov(eax, dword[rax + call_regs64[0]]); + else + mov(dword[rax + call_regs64[0]], call_regs[1]); + break; + + case MemSize::S64: + if (op == MemOp::R) + mov(rax, qword[rax + call_regs64[0]]); + else + mov(qword[rax + call_regs64[0]], call_regs64[1]); + break; + } + } + else + { + // Slow path + if (op == MemOp::R) + { + if (mmu_enabled()) + mov(call_regs[1], call_regs[2]); + switch (size) { + case MemSize::S8: + if (mmu_enabled()) + call((const void *)ReadMemNoEx); + else + call((const void *)ReadMem8); + movsx(eax, al); + break; + case MemSize::S16: + if (mmu_enabled()) + call((const void *)ReadMemNoEx); + else + call((const void *)ReadMem16); + movsx(eax, ax); + break; + case MemSize::S32: + if (mmu_enabled()) + jmp((const void *)ReadMemNoEx); + else + jmp((const void *)ReadMem32); // tail call + continue; + case MemSize::S64: + if (mmu_enabled()) + jmp((const void *)ReadMemNoEx); + else + jmp((const void *)ReadMem64); // tail call + continue; + default: + die("1..8 bytes"); + } + } + else + { + switch (size) { + case MemSize::S8: + if (mmu_enabled()) + jmp((const void *)WriteMemNoEx); + else + jmp((const void *)WriteMem8); // tail call + continue; + case MemSize::S16: + if (mmu_enabled()) + jmp((const void *)WriteMemNoEx); + else + jmp((const void *)WriteMem16); // tail call + continue; + case MemSize::S32: + if (mmu_enabled()) + jmp((const void *)WriteMemNoEx); + else + jmp((const void *)WriteMem32); // tail call + continue; + case MemSize::S64: + if (mmu_enabled()) + jmp((const void *)WriteMemNoEx); + else + jmp((const void *)WriteMem64); // tail call + continue; + default: + die("1..8 bytes"); + } + } + } + ret(); + } + } + } + MemHandlerEnd = getCurr(); + } + + void saveXmmRegisters() { #ifndef _WIN32 - bool xmm8_mapped = !skip_floats && current_opid != (size_t)-1 && regalloc.IsMapped(xmm8, current_opid); - bool xmm9_mapped = !skip_floats && current_opid != (size_t)-1 && regalloc.IsMapped(xmm9, current_opid); - bool xmm10_mapped = !skip_floats && current_opid != (size_t)-1 && regalloc.IsMapped(xmm10, current_opid); - bool xmm11_mapped = !skip_floats && current_opid != (size_t)-1 && regalloc.IsMapped(xmm11, current_opid); + if (current_opid == (size_t)-1) + return; + + bool xmm8_mapped = regalloc.IsMapped(xmm8, current_opid); + bool xmm9_mapped = regalloc.IsMapped(xmm9, current_opid); + bool xmm10_mapped = regalloc.IsMapped(xmm10, current_opid); + bool xmm11_mapped = regalloc.IsMapped(xmm11, current_opid); // Need to save xmm registers as they are not preserved in linux/mach - int offset = 0; - u32 stack_size = 0; if (xmm8_mapped || xmm9_mapped || xmm10_mapped || xmm11_mapped) { - stack_size = 4 * (xmm8_mapped + xmm9_mapped + xmm10_mapped + xmm11_mapped); + u32 stack_size = 4 * (xmm8_mapped + xmm9_mapped + xmm10_mapped + xmm11_mapped); stack_size = (((stack_size + 15) >> 4) << 4); // Stack needs to be 16-byte aligned before the call sub(rsp, stack_size); + int offset = 0; if (xmm8_mapped) { movd(ptr[rsp + offset], xmm8); @@ -1340,43 +1235,62 @@ private: offset += 4; } if (xmm11_mapped) - { movd(ptr[rsp + offset], xmm11); - offset += 4; - } } #endif + } - call(CC_RX2RW(function)); - + void restoreXmmRegisters() + { #ifndef _WIN32 + if (current_opid == (size_t)-1) + return; + + bool xmm8_mapped = regalloc.IsMapped(xmm8, current_opid); + bool xmm9_mapped = regalloc.IsMapped(xmm9, current_opid); + bool xmm10_mapped = regalloc.IsMapped(xmm10, current_opid); + bool xmm11_mapped = regalloc.IsMapped(xmm11, current_opid); if (xmm8_mapped || xmm9_mapped || xmm10_mapped || xmm11_mapped) { + u32 stack_size = 4 * (xmm8_mapped + xmm9_mapped + xmm10_mapped + xmm11_mapped); + int offset = stack_size; + stack_size = (((stack_size + 15) >> 4) << 4); // Stack needs to be 16-byte aligned before the call if (xmm11_mapped) { - offset -= 4; movd(xmm11, ptr[rsp + offset]); + offset -= 4; } if (xmm10_mapped) { - offset -= 4; movd(xmm10, ptr[rsp + offset]); + offset -= 4; } if (xmm9_mapped) { - offset -= 4; movd(xmm9, ptr[rsp + offset]); + offset -= 4; } if (xmm8_mapped) { - offset -= 4; movd(xmm8, ptr[rsp + offset]); + offset -= 4; } + verify(offset == -4); add(rsp, stack_size); } #endif } + template + void GenCall(Ret(*function)(Params...), bool skip_floats = false) + { + if (!skip_floats) + saveXmmRegisters(); + call(CC_RX2RW(function)); + if (!skip_floats) + restoreXmmRegisters(); + } + struct CC_PS { CanonicalParamType type; @@ -1388,16 +1302,8 @@ private: Xbyak::util::Cpu cpu; size_t current_opid; Xbyak::Label exit_block; - static const u32 read_mem_op_size; - static const u32 write_mem_op_size; -public: - static u32 mem_access_offset; }; -const u32 BlockCompiler::read_mem_op_size = 30; -const u32 BlockCompiler::write_mem_op_size = 30; -u32 BlockCompiler::mem_access_offset = 0; - void X64RegAlloc::Preload(u32 reg, Xbyak::Operand::Code nreg) { compiler->RegPreload(reg, nreg); @@ -1422,8 +1328,11 @@ void ngen_Compile(RuntimeBlockInfo* block, bool smc_checks, bool reset, bool sta verify(emit_FreeSpace() >= 16 * 1024); compiler = new BlockCompiler(); - - compiler->compile(block, smc_checks, reset, staging, optimise); + try { + compiler->compile(block, smc_checks, reset, staging, optimise); + } catch (const Xbyak::Error& e) { + ERROR_LOG(DYNAREC, "Fatal xbyak error: %s", e.what()); + } delete compiler; } @@ -1447,52 +1356,34 @@ void ngen_CC_Finish(shil_opcode* op) { } -bool ngen_Rewrite(unat& host_pc, unat, unat) +bool ngen_Rewrite(size_t& host_pc, size_t retadr, size_t acc) { - if (!mmu_enabled() || !vmem32_enabled()) - return false; - - //printf("ngen_Rewrite pc %p\n", host_pc); - RuntimeBlockInfoPtr block = bm_GetBlock((void *)host_pc); - if (block == NULL) - { - WARN_LOG(DYNAREC, "ngen_Rewrite: Block at %p not found", (void *)host_pc); + std::unique_ptr compiler(new BlockCompiler((u8*)(retadr - 5))); + try { + return compiler->rewriteMemAccess(host_pc, retadr, acc); + } catch (const Xbyak::Error& e) { + ERROR_LOG(DYNAREC, "Fatal xbyak error: %s", e.what()); return false; } - u8 *code_ptr = (u8*)host_pc; - auto it = block->memory_accesses.find(code_ptr); - if (it == block->memory_accesses.end()) - { - WARN_LOG(DYNAREC, "ngen_Rewrite: memory access at %p not found (%lu entries)", code_ptr, block->memory_accesses.size()); - return false; - } - u32 opid = it->second; - verify(opid < block->oplist.size()); - const shil_opcode& op = block->oplist[opid]; - - BlockCompiler *assembler = new BlockCompiler(code_ptr - BlockCompiler::mem_access_offset); - assembler->InitializeRewrite(block.get(), opid); - if (op.op == shop_readm) - { - mem_rewrites_r++; - assembler->GenReadMemorySlow(op, block.get()); - } - else - { - mem_rewrites_w++; - assembler->GenWriteMemorySlow(op, block.get()); - } - assembler->FinalizeRewrite(); - verify(block->host_code_size >= assembler->getSize()); - delete assembler; - block->memory_accesses.erase(it); - host_pc = (unat)(code_ptr - BlockCompiler::mem_access_offset); - - return true; } void ngen_HandleException() { longjmp(jmp_env, 1); } + +void ngen_ResetBlocks() +{ + // Avoid generating the main loop more than once + if (mainloop != nullptr && mainloop != emit_GetCCPtr()) + return; + + std::unique_ptr compiler(new BlockCompiler()); + try { + compiler->genMainloop(); + } catch (const Xbyak::Error& e) { + ERROR_LOG(DYNAREC, "Fatal xbyak error: %s", e.what()); + } +} + #endif