From 810b8a59da2bb25e992bd10f6dfe55d61098e236 Mon Sep 17 00:00:00 2001
From: Flyinghead
Date: Mon, 29 Apr 2019 18:23:00 +0200
Subject: [PATCH] wince: 32-bit virtual mem space
 use fast mem read/write for x64 and arm64 dynarecs

---
 core/build.h                            |   4 +
 core/hw/mem/vmem32.cpp                  | 396 +++++++++++++++
 core/hw/mem/vmem32.h                    |  11 +
 core/hw/sh4/dyna/driver.cpp             |   5 +-
 core/hw/sh4/dyna/ngen.h                 |   1 +
 core/hw/sh4/interpr/sh4_interpreter.cpp |   4 +
 core/hw/sh4/interpr/sh4_opcodes.cpp     |   2 +-
 core/hw/sh4/modules/ccn.cpp             |  15 +-
 core/hw/sh4/sh4_if.h                    |   3 +
 core/linux/common.cpp                   |  21 +-
 core/nullDC.cpp                         |  15 +-
 core/rec-ARM64/rec_arm64.cpp            | 185 ++++---
 core/rec-x64/rec_x64.cpp                | 623 +++++++++++++++---------
 core/rend/TexCache.cpp                  |  21 +-
 core/rend/TexCache.h                    |  14 -
 core/types.h                            |   1 +
 core/windows/winmain.cpp                |  20 +-
 17 files changed, 1024 insertions(+), 317 deletions(-)
 create mode 100644 core/hw/mem/vmem32.cpp
 create mode 100644 core/hw/mem/vmem32.h

diff --git a/core/build.h b/core/build.h
index 5add9b580..de8a72034 100755
--- a/core/build.h
+++ b/core/build.h
@@ -288,6 +288,10 @@
 #define FEAT_HAS_SOFTREND BUILD_COMPILER == COMPILER_VC //GCC wants us to enable sse4 globaly to enable intrins
 #endif
 
+#if HOST_CPU == CPU_X64 || HOST_CPU == CPU_ARM64
+#define HOST_64BIT_CPU
+#endif
+
 #define RAM_SIZE_MAX (32*1024*1024)
 #define VRAM_SIZE_MAX (16*1024*1024)
 #define ARAM_SIZE_MAX (8*1024*1024)
diff --git a/core/hw/mem/vmem32.cpp b/core/hw/mem/vmem32.cpp
new file mode 100644
index 000000000..472fc561d
--- /dev/null
+++ b/core/hw/mem/vmem32.cpp
@@ -0,0 +1,396 @@
+/*
+ * vmem32.cpp
+ *
+ *  Created on: Apr 11, 2019
+ *      Author: Flyinghead
+ */
+#include <unordered_set>
+#include "build.h"
+#include "vmem32.h"
+#include "_vmem.h"
+
+#if HOST_OS == OS_WINDOWS
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <sys/stat.h>	/* For mode constants */
+#include <fcntl.h>	/* For O_* constants */
+#include <unistd.h>
+#include <errno.h>
+#ifdef _ANDROID
+#include <linux/ashmem.h>
+#endif
+#endif
+
+#ifndef MAP_NOSYNC
+#define MAP_NOSYNC 0
+#endif
+
+#include "types.h"
+#include "hw/sh4/dyna/ngen.h"
+#include "hw/sh4/modules/mmu.h"
+
+extern bool VramLockedWriteOffset(size_t offset);
+extern cMutex vramlist_lock;
+
+#if HOST_OS == OS_WINDOWS
+extern HANDLE mem_handle;
+#else
+extern int vmem_fd;
+#endif
+
+#define VMEM32_ERROR_NOT_MAPPED 0x100
+
+// FIXME stolen from _vmem.cpp
+#define MAP_RAM_START_OFFSET 0
+#define MAP_VRAM_START_OFFSET (MAP_RAM_START_OFFSET+RAM_SIZE)
+#define MAP_ARAM_START_OFFSET (MAP_VRAM_START_OFFSET+VRAM_SIZE)
+
+static const u64 VMEM32_SIZE = 0x100000000L;
+static const u64 KERNEL_SPACE = 0x80000000L;
+static const u64 AREA7_ADDRESS = 0x7C000000L;
+
+#define VRAM_PROT_SEGMENT (1024 * 1024)	// vram protection regions are grouped by 1MB segment
+
+u8* vmem32_base;
+unordered_set<u32> vram_mapped_pages;
+vector<vram_block*> vram_blocks[VRAM_SIZE / VRAM_PROT_SEGMENT];
+
+// stats
+u64 vmem32_page_faults;
+u64 vmem32_flush;
+
+static void* vmem32_map_buffer(u32 dst, u32 addrsz, u32 offset, u32 size, bool write)
+{
+	void* ptr;
+	void* rv;
+
+	//printf("MAP32 %08X w/ %d\n",dst,offset);
+	u32 map_times = addrsz / size;
+#if HOST_OS == OS_WINDOWS
+	rv = MapViewOfFileEx(mem_handle, FILE_MAP_READ | (write ? FILE_MAP_WRITE : 0), 0, offset, size, &vmem32_base[dst]);
+	if (rv == NULL)
+		return NULL;
+
+	for (u32 i = 1; i < map_times; i++)
+	{
+		dst += size;
+		ptr = MapViewOfFileEx(mem_handle, FILE_MAP_READ | (write ? FILE_MAP_WRITE : 0), 0, offset, size, &vmem32_base[dst]);
+		if (ptr == NULL)
+			return NULL;
+	}
+#else
+	u32 prot = PROT_READ | (write ? PROT_WRITE : 0);
+	rv = mmap(&vmem32_base[dst], size, prot, MAP_SHARED | MAP_NOSYNC | MAP_FIXED, vmem_fd, offset);
+	if (MAP_FAILED == rv)
+	{
+		printf("MAP1 failed %d\n", errno);
+		return NULL;
+	}
+
+	for (u32 i = 1; i < map_times; i++)
+	{
+		dst += size;
+		ptr = mmap(&vmem32_base[dst], size, prot, MAP_SHARED | MAP_NOSYNC | MAP_FIXED, vmem_fd, offset);
+		if (MAP_FAILED == ptr)
+		{
+			printf("MAP2 failed %d\n", errno);
+			return NULL;
+		}
+	}
+#endif
+	return rv;
+}
+
+static void vmem32_unmap_buffer(u32 start, u64 end)
+{
+#if HOST_OS == OS_LINUX
+	mmap(&vmem32_base[start], end - start, PROT_NONE, MAP_FIXED | MAP_PRIVATE | MAP_ANON, -1, 0);
+#elif HOST_OS == OS_WINDOWS
+	VirtualAlloc(&vmem32_base[start], end - start, MEM_RESERVE, PAGE_NOACCESS);
+#else
+#error Unsupported OS
+#endif
+}
+
+static void vmem32_protect_buffer(u32 start, u32 size)
+{
+	verify((start & PAGE_MASK) == 0);
+#if HOST_OS == OS_LINUX
+	mprotect(&vmem32_base[start], size, PROT_READ);
+#elif HOST_OS == OS_WINDOWS
+	DWORD old;
+	VirtualProtect(vmem32_base + start, size, PAGE_READONLY, &old);
+#else
+#error Unsupported OS
+#endif
+}
+
+static void vmem32_unprotect_buffer(u32 start, u32 size)
+{
+	verify((start & PAGE_MASK) == 0);
+#if HOST_OS == OS_LINUX
+	mprotect(&vmem32_base[start], size, PROT_READ | PROT_WRITE);
+#elif HOST_OS == OS_WINDOWS
+	DWORD old;
+	VirtualProtect(vmem32_base + start, size, PAGE_READWRITE, &old);
+#else
+#error Unsupported OS
+#endif
+}
+
+void vmem32_protect_vram(vram_block *block)
+{
+	if (vmem32_base == NULL)
+		return;
+	for (int i = block->start / VRAM_PROT_SEGMENT; i <= block->end / VRAM_PROT_SEGMENT; i++)
+	{
+		vram_blocks[i].push_back(block);
+	}
+}
+void vmem32_unprotect_vram(vram_block *block)
+{
+	if (vmem32_base == NULL)
+		return;
+	for (int page = block->start / VRAM_PROT_SEGMENT; page <= block->end / VRAM_PROT_SEGMENT; page++)
+	{
+		for (int i = 0; i < vram_blocks[page].size(); i++)
+			if (vram_blocks[page][i] == block)
+			{
+				vram_blocks[page].erase(vram_blocks[page].begin() + i);
+				break;
+			}
+	}
+}
+
+static bool vmem32_map_areas()
+{
+	// Aica ram
+	vmem32_map_buffer(0x80800000, 0x00800000, MAP_ARAM_START_OFFSET, ARAM_SIZE, true);	// P1
+	vmem32_map_buffer(0x82800000, ARAM_SIZE, MAP_ARAM_START_OFFSET, ARAM_SIZE, true);
+	vmem32_map_buffer(0xA0800000, 0x00800000, MAP_ARAM_START_OFFSET, ARAM_SIZE, true);	// P2
+	vmem32_map_buffer(0xA2800000, ARAM_SIZE, MAP_ARAM_START_OFFSET, ARAM_SIZE, true);
+
+	// Vram
+	// Note: this should be mapped read/write but doesn't seem to be used
+	vmem32_map_buffer(0x84000000, 0x01000000, MAP_VRAM_START_OFFSET, VRAM_SIZE, false);	// P1
+	vmem32_map_buffer(0x86000000, 0x01000000, MAP_VRAM_START_OFFSET, VRAM_SIZE, false);
+	vmem32_map_buffer(0xA4000000, 0x01000000, MAP_VRAM_START_OFFSET, VRAM_SIZE, false);	// P2
+	vmem32_map_buffer(0xA6000000, 0x01000000, MAP_VRAM_START_OFFSET, VRAM_SIZE, false);
+
+	// System ram
+	vmem32_map_buffer(0x8C000000, 0x04000000, MAP_RAM_START_OFFSET, RAM_SIZE, true);	// P1
+	vmem32_map_buffer(0xAC000000, 0x04000000, MAP_RAM_START_OFFSET, RAM_SIZE, true);	// P2
+
+	return true;
+}
+
+static const u32 page_sizes[] = { 1024, 4 * 1024, 64 * 1024, 1024 * 1024 };
+
+static u32 vmem32_paddr_to_offset(u32 address)
+{
+	u32 low_addr = address & 0x1FFFFFFF;
+	switch ((address >> 26) & 7)
+	{
+	case 0:	// area 0
+		// Aica ram
+		if (low_addr >= 0x00800000 && low_addr < 0x00800000 + 0x00800000)
+		{
+			return ((low_addr - 0x00800000) & (ARAM_SIZE - 1)) + MAP_ARAM_START_OFFSET;
+		}
+		else if (low_addr >= 0x02800000 && low_addr
< 0x02800000 + 0x00800000) + { + return low_addr - 0x02800000 + MAP_ARAM_START_OFFSET; + } + break; + case 1: // area 1 + // Vram + if (low_addr >= 0x04000000 && low_addr < 0x04000000 + 0x01000000) + { + return ((low_addr - 0x04000000) & (VRAM_SIZE - 1)) + MAP_VRAM_START_OFFSET; + } + else if (low_addr >= 0x06000000 && low_addr < 0x06000000 + 0x01000000) + { + return ((low_addr - 0x06000000) & (VRAM_SIZE - 1)) + MAP_VRAM_START_OFFSET; + } + break; + case 3: // area 3 + // System ram + if (low_addr >= 0x0C000000 && low_addr < 0x0C000000 + 0x04000000) + { + return ((low_addr - 0x0C000000) & (RAM_SIZE - 1)) + MAP_RAM_START_OFFSET; + } + break; + //case 4: + // TODO vram? + //break; + default: + break; + } + // Unmapped address + return -1; +} + +static u32 vmem32_map_mmu(u32 address, bool write) +{ +#ifndef NO_MMU + u32 pa; + const TLB_Entry *entry; + u32 rc = mmu_full_lookup(address, &entry, pa); + if (rc == MMU_ERROR_NONE) + { + //0X & User mode-> protection violation + //if ((entry->Data.PR >> 1) == 0 && p_sh4rcb->cntx.sr.MD == 0) + // return MMU_ERROR_PROTECTED; + + //if (write) + //{ + // if ((entry->Data.PR & 1) == 0) + // return MMU_ERROR_PROTECTED; + // if (entry->Data.D == 0) + // return MMU_ERROR_FIRSTWRITE; + //} + u32 page_size = page_sizes[entry->Data.SZ1 * 2 + entry->Data.SZ0]; + if (page_size == 1024) + return VMEM32_ERROR_NOT_MAPPED; + + u32 vpn = (entry->Address.VPN << 10) & ~(page_size - 1); + u32 ppn = (entry->Data.PPN << 10) & ~(page_size - 1); + u32 offset = vmem32_paddr_to_offset(ppn); + if (offset == -1) + return VMEM32_ERROR_NOT_MAPPED; + + if (offset >= MAP_VRAM_START_OFFSET && offset < MAP_VRAM_START_OFFSET + VRAM_SIZE) + { + // Check vram protected regions + u32 start = offset - MAP_VRAM_START_OFFSET; + if (!vram_mapped_pages.insert(vpn).second) + { + // page has been mapped already: vram locked write + vmem32_unprotect_buffer(address & ~PAGE_MASK, PAGE_SIZE); + u32 addr_offset = start + (address & (page_size - 1)); + VramLockedWriteOffset(addr_offset); + + return MMU_ERROR_NONE; + } + verify(vmem32_map_buffer(vpn, page_size, offset, page_size, (entry->Data.PR & 1) != 0) != NULL); + u32 end = start + page_size; + const vector& blocks = vram_blocks[start / VRAM_PROT_SEGMENT]; + + vramlist_lock.Lock(); + for (int i = blocks.size() - 1; i >= 0; i--) + { + if (blocks[i]->start < end && blocks[i]->end >= start) + { + u32 prot_start = max(start, blocks[i]->start); + u32 prot_size = min(end, blocks[i]->end + 1) - prot_start; + prot_size += prot_start % PAGE_SIZE; + prot_start &= ~PAGE_MASK; + vmem32_protect_buffer(vpn + (prot_start & (page_size - 1)), prot_size); + } + } + vramlist_lock.Unlock(); + } + else + // Not vram + verify(vmem32_map_buffer(vpn, page_size, offset, page_size, (entry->Data.PR & 1) != 0) != NULL); + + return MMU_ERROR_NONE; + } +#else + u32 rc = MMU_ERROR_PROTECTED; +#endif + return rc; +} + +static u32 vmem32_map_address(u32 address, bool write) +{ + u32 area = address >> 29; + switch (area) + { + case 3: // P0/U0 + if (address >= AREA7_ADDRESS) + // area 7: unmapped + return VMEM32_ERROR_NOT_MAPPED; + /* no break */ + case 0: + case 1: + case 2: + case 6: // P3 + return vmem32_map_mmu(address, write); + + default: + break; + } + return VMEM32_ERROR_NOT_MAPPED; +} + +#if !defined(NO_MMU) && defined(HOST_64BIT_CPU) +bool vmem32_handle_signal(void *fault_addr, bool write) +{ + if ((u8*)fault_addr < vmem32_base || (u8*)fault_addr >= vmem32_base + VMEM32_SIZE) + return false; + vmem32_page_faults++; + u32 guest_addr = (u8*)fault_addr - vmem32_base; + u32 
rv = vmem32_map_address(guest_addr, write); + //printf("vmem32_handle_signal handled signal %s @ %p -> %08x rv=%d\n", write ? "W" : "R", fault_addr, guest_addr, rv); + if (rv == MMU_ERROR_NONE) + return true; + if (rv == VMEM32_ERROR_NOT_MAPPED) + return false; + p_sh4rcb->cntx.pc = p_sh4rcb->cntx.exception_pc; + DoMMUException(guest_addr, rv, write ? MMU_TT_DWRITE : MMU_TT_DREAD); + ngen_HandleException(); + // not reached + return true; +} +#endif + +void vmem32_flush_mmu() +{ + vmem32_flush++; + vram_mapped_pages.clear(); + vmem32_unmap_buffer(0, KERNEL_SPACE); + // TODO flush P3? +} + +bool vmem32_init() +{ + if (!_nvmem_enabled()) + return false; +#ifdef HOST_64BIT_CPU +#if HOST_OS == OS_LINUX + void* rv = mmap(0, VMEM32_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); + verify(rv != NULL); + munmap(rv, VMEM32_SIZE); + vmem32_base = (u8*)rv; +#elif HOST_OS == OS_WINDOWS + void* rv = (u8 *)VirtualAlloc(0, VMEM32_SIZE, MEM_RESERVE, PAGE_NOACCESS); + if (rv != NULL) + VirtualFree(rv, 0, MEM_RELEASE); + vmem32_base = (u8*)rv; +#else +#error Unsupported OS +#endif + + vmem32_unmap_buffer(0, VMEM32_SIZE); + printf("vmem32_init: allocated %zx bytes from %p to %p\n", VMEM32_SIZE, vmem32_base, vmem32_base + VMEM32_SIZE); + + if (!vmem32_map_areas()) + { + vmem32_term(); + return false; + } +#endif + return true; +} + +void vmem32_term() +{ + if (vmem32_base != NULL) + { + munmap(vmem32_base, VMEM32_SIZE); + vmem32_base = NULL; + } +} + diff --git a/core/hw/mem/vmem32.h b/core/hw/mem/vmem32.h new file mode 100644 index 000000000..692d25b5e --- /dev/null +++ b/core/hw/mem/vmem32.h @@ -0,0 +1,11 @@ +#include "types.h" + +bool vmem32_init(); +void vmem32_term(); +bool vmem32_handle_signal(void *fault_addr, bool write); +void vmem32_flush_mmu(); +void vmem32_protect_vram(vram_block *block); +void vmem32_unprotect_vram(vram_block *block); +static inline bool vmem32_enabled() { + return !settings.dynarec.disable_vmem32; +} diff --git a/core/hw/sh4/dyna/driver.cpp b/core/hw/sh4/dyna/driver.cpp index f27750cc6..a659c82bb 100644 --- a/core/hw/sh4/dyna/driver.cpp +++ b/core/hw/sh4/dyna/driver.cpp @@ -88,12 +88,11 @@ void clear_temp_cache(bool full) void recSh4_ClearCache() { + printf("recSh4:Dynarec Cache clear at %08X free space %d\n",curr_pc, emit_FreeSpace()); LastAddr=LastAddr_min; bm_Reset(); smc_hotspots.clear(); clear_temp_cache(true); - - printf("recSh4:Dynarec Cache clear at %08X\n",curr_pc); } void recSh4_Run() @@ -279,7 +278,7 @@ DynarecCodeEntryPtr rdv_CompilePC(u32 blockcheck_failures) emit_ptr_limit = (u32 *)(TempCodeCache + TEMP_CODE_SIZE); rbi->temp_block = true; } - bool do_opts=((rbi->addr&0x3FFFFFFF)>0x0C010100); + bool do_opts = !rbi->temp_block; //((rbi->addr&0x3FFFFFFF)>0x0C010100); rbi->staging_runs=do_opts?100:-100; ngen_Compile(rbi,DoCheck(rbi->addr),(pc&0xFFFFFF)==0x08300 || (pc&0xFFFFFF)==0x10000,false,do_opts); verify(rbi->code!=0); diff --git a/core/hw/sh4/dyna/ngen.h b/core/hw/sh4/dyna/ngen.h index f6a256510..76abe6210 100644 --- a/core/hw/sh4/dyna/ngen.h +++ b/core/hw/sh4/dyna/ngen.h @@ -100,6 +100,7 @@ extern void (*ngen_FailedToFindBlock)(); void ngen_mainloop(void* cntx); void ngen_GetFeatures(ngen_features* dst); +void ngen_HandleException(); //Canonical callback interface enum CanonicalParamType diff --git a/core/hw/sh4/interpr/sh4_interpreter.cpp b/core/hw/sh4/interpr/sh4_interpreter.cpp index fc358ee05..b9dc3a765 100644 --- a/core/hw/sh4/interpr/sh4_interpreter.cpp +++ b/core/hw/sh4/interpr/sh4_interpreter.cpp @@ -124,6 +124,8 @@ void Sh4_int_Skip() } } +extern u8 
*vmem32_base; + void Sh4_int_Reset(bool Manual) { if (sh4_int_bCpuRun) @@ -148,6 +150,8 @@ void Sh4_int_Reset(bool Manual) old_fpscr=fpscr; UpdateFPSCR(); + p_sh4rcb->cntx.vmem32_base = vmem32_base; + //Any more registers have default value ? printf("Sh4 Reset\n"); } diff --git a/core/hw/sh4/interpr/sh4_opcodes.cpp b/core/hw/sh4/interpr/sh4_opcodes.cpp index 3cb3a52f7..7330db791 100644 --- a/core/hw/sh4/interpr/sh4_opcodes.cpp +++ b/core/hw/sh4/interpr/sh4_opcodes.cpp @@ -2066,7 +2066,7 @@ sh4op(i0000_nnnn_0110_1010) { u32 n = GetN(op); r[n] = fpscr.full; - UpdateFPSCR(); + //UpdateFPSCR(); } //sts.l FPSCR,@- diff --git a/core/hw/sh4/modules/ccn.cpp b/core/hw/sh4/modules/ccn.cpp index d084f58fc..efb1375a9 100644 --- a/core/hw/sh4/modules/ccn.cpp +++ b/core/hw/sh4/modules/ccn.cpp @@ -8,6 +8,7 @@ #include "../sh4_core.h" #include "hw/pvr/pvr_mem.h" #include "hw/mem/_vmem.h" +#include "hw/mem/vmem32.h" #include "mmu.h" //Types @@ -41,6 +42,16 @@ void CCN_QACR_write(u32 addr, u32 value) } } +void CCN_PTEH_write(u32 addr, u32 value) +{ + CCN_PTEH_type temp; + temp.reg_data = value; + if (temp.ASID != CCN_PTEH.ASID && vmem32_enabled()) + vmem32_flush_mmu(); + + CCN_PTEH = temp; +} + void CCN_MMUCR_write(u32 addr, u32 value) { CCN_MMUCR_type temp; @@ -52,6 +63,8 @@ void CCN_MMUCR_write(u32 addr, u32 value) { //sh4_cpu.ResetCache(); mmu_flush_table(); + if (vmem32_enabled()) + vmem32_flush_mmu(); temp.TI = 0; } @@ -99,7 +112,7 @@ static u32 CCN_PRR_read(u32 addr) void ccn_init() { //CCN PTEH 0xFF000000 0x1F000000 32 Undefined Undefined Held Held Iclk - sh4_rio_reg(CCN,CCN_PTEH_addr,RIO_DATA,32); + sh4_rio_reg(CCN,CCN_PTEH_addr,RIO_WF,32,0,&CCN_PTEH_write); //CCN PTEL 0xFF000004 0x1F000004 32 Undefined Undefined Held Held Iclk sh4_rio_reg(CCN,CCN_PTEL_addr,RIO_DATA,32); diff --git a/core/hw/sh4/sh4_if.h b/core/hw/sh4/sh4_if.h index c9505700b..c6999df96 100644 --- a/core/hw/sh4/sh4_if.h +++ b/core/hw/sh4/sh4_if.h @@ -282,6 +282,9 @@ struct Sh4Context int sh4_sched_next; u32 interrupt_pend; + + u32 exception_pc; + u8 *vmem32_base; }; u64 raw[64-8]; }; diff --git a/core/linux/common.cpp b/core/linux/common.cpp index 43c367b09..347b80235 100644 --- a/core/linux/common.cpp +++ b/core/linux/common.cpp @@ -29,6 +29,7 @@ #endif #include #include "hw/sh4/dyna/blockmanager.h" +#include "hw/mem/vmem32.h" #include "linux/context.h" @@ -48,7 +49,7 @@ void sigill_handler(int sn, siginfo_t * si, void *segfault_ctx) { context_from_segfault(&ctx, segfault_ctx); unat pc = (unat)ctx.pc; - bool dyna_cde = (pc>(unat)CodeCache) && (pc<(unat)(CodeCache + CODE_SIZE)); + bool dyna_cde = (pc>(unat)CodeCache) && (pc<(unat)(CodeCache + CODE_SIZE + TEMP_CODE_SIZE)); printf("SIGILL @ %lx -> %p was not in vram, dynacode:%d\n", pc, si->si_addr, dyna_cde); @@ -64,12 +65,21 @@ void fault_handler (int sn, siginfo_t * si, void *segfault_ctx) context_from_segfault(&ctx, segfault_ctx); - bool dyna_cde = ((unat)ctx.pc>(unat)CodeCache) && ((unat)ctx.pc<(unat)(CodeCache + CODE_SIZE)); + bool dyna_cde = ((unat)ctx.pc>(unat)CodeCache) && ((unat)ctx.pc<(unat)(CodeCache + CODE_SIZE + TEMP_CODE_SIZE)); //ucontext_t* ctx=(ucontext_t*)ctxr; //printf("mprot hit @ ptr 0x%08X @@ code: %08X, %d\n",si->si_addr,ctx->uc_mcontext.arm_pc,dyna_cde); - +#if !defined(NO_MMU) && defined(HOST_64BIT_CPU) +#if HOST_CPU == CPU_ARM64 + u32 op = *(u32*)ctx.pc; + bool write = (op & 0x00400000) == 0; +#elif HOST_CPU == CPU_X64 + bool write = false; // TODO? 
+#endif + if (vmem32_handle_signal(si->si_addr, write)) + return; +#endif if (VramLockedWrite((u8*)si->si_addr) || BM_LockedWrite((u8*)si->si_addr)) return; #if FEAT_SHREC == DYNAREC_JIT @@ -91,7 +101,10 @@ void fault_handler (int sn, siginfo_t * si, void *segfault_ctx) context_to_segfault(&ctx, segfault_ctx); } #elif HOST_CPU == CPU_X64 - //x64 has no rewrite support + else if (dyna_cde && ngen_Rewrite((unat&)ctx.pc, 0, 0)) + { + context_to_segfault(&ctx, segfault_ctx); + } #elif HOST_CPU == CPU_ARM64 else if (dyna_cde && ngen_Rewrite(ctx.pc, 0, 0)) { diff --git a/core/nullDC.cpp b/core/nullDC.cpp index 388d8418d..d46cee630 100755 --- a/core/nullDC.cpp +++ b/core/nullDC.cpp @@ -6,6 +6,7 @@ #include "oslib/oslib.h" #include "oslib/audiostream.h" #include "hw/mem/_vmem.h" +#include "hw/mem/vmem32.h" #include "stdclass.h" #include "cfg/cfg.h" @@ -140,7 +141,9 @@ void LoadSpecialSettings() extra_depth_game = false; full_mmu_game = false; - if (reios_windows_ce) + if (reios_windows_ce + // Half-life + || !strncmp("MK-51035", reios_product_number, 8)) { printf("Enabling Full MMU and Extra depth scaling for Windows CE game\n"); settings.rend.ExtraDepthScale = 0.1; @@ -275,6 +278,13 @@ int reicast_init(int argc, char* argv[]) printf("Failed to alloc mem\n"); return -1; } +#ifdef HOST_64BIT_CPU + if (!vmem32_init()) + { + printf("Failed to alloc 32-bit mem space\n"); + return -1; + } +#endif if (ParseCommandLine(argc, argv)) { return 69; @@ -461,6 +471,7 @@ void InitSettings() settings.dynarec.idleskip = true; settings.dynarec.unstable_opt = false; settings.dynarec.safemode = true; + settings.dynarec.disable_vmem32 = false; settings.dreamcast.cable = 3; // TV composite settings.dreamcast.region = 3; // default settings.dreamcast.broadcast = 4; // default @@ -534,6 +545,7 @@ void LoadSettings(bool game_specific) settings.dynarec.idleskip = cfgLoadBool(config_section, "Dynarec.idleskip", settings.dynarec.idleskip); settings.dynarec.unstable_opt = cfgLoadBool(config_section, "Dynarec.unstable-opt", settings.dynarec.unstable_opt); settings.dynarec.safemode = cfgLoadBool(config_section, "Dynarec.safe-mode", settings.dynarec.safemode); + settings.dynarec.disable_vmem32 = cfgLoadBool(config_section, "Dynarec.DisableVmem32", settings.dynarec.disable_vmem32); //disable_nvmem can't be loaded, because nvmem init is before cfg load settings.dreamcast.cable = cfgLoadInt(config_section, "Dreamcast.Cable", settings.dreamcast.cable); settings.dreamcast.region = cfgLoadInt(config_section, "Dreamcast.Region", settings.dreamcast.region); @@ -670,6 +682,7 @@ void SaveSettings() cfgSaveBool("config", "Dynarec.unstable-opt", settings.dynarec.unstable_opt); if (!safemode_game || !settings.dynarec.safemode) cfgSaveBool("config", "Dynarec.safe-mode", settings.dynarec.safemode); + cfgSaveBool("config", "Dynarec.DisableVmem32", settings.dynarec.disable_vmem32); cfgSaveInt("config", "Dreamcast.Language", settings.dreamcast.language); cfgSaveBool("config", "aica.LimitFPS", settings.aica.LimitFPS); cfgSaveBool("config", "aica.NoBatch", settings.aica.NoBatch); diff --git a/core/rec-ARM64/rec_arm64.cpp b/core/rec-ARM64/rec_arm64.cpp index 74bb3bf1a..a76f796b6 100644 --- a/core/rec-ARM64/rec_arm64.cpp +++ b/core/rec-ARM64/rec_arm64.cpp @@ -39,6 +39,7 @@ using namespace vixl::aarch64; #include "hw/sh4/dyna/ngen.h" #include "hw/sh4/sh4_mem.h" #include "hw/sh4/sh4_rom.h" +#include "hw/mem/vmem32.h" #include "arm64_regalloc.h" #undef do_sqw_nommu @@ -185,8 +186,8 @@ void ngen_mainloop(void* v_cntx) "stp x29, x30, [sp, #144] \n\t" 
"stp %[cntx], %[cycle_counter], [sp, #-16]! \n\t" // Push context, cycle_counter address - "mov w27, %[_SH4_TIMESLICE] \n\t" - "str w27, [%[cycle_counter]] \n\t" + "mov w1, %[_SH4_TIMESLICE] \n\t" + "str w1, [%[cycle_counter]] \n\t" "mov x0, %[jmp_env] \n\t" // SETJMP "bl setjmp \n\t" @@ -195,15 +196,17 @@ void ngen_mainloop(void* v_cntx) "ldr x28, [sp] \n\t" // Set context // w29 is next_pc "ldr w29, [x28, %[pc]] \n\t" + // x27 is vmem32_base + "ldr x27, [x28, %[vmem32_base]] \n\t" "b no_update \n" ".hidden intc_sched \n\t" ".globl intc_sched \n\t" "intc_sched: \n\t" - "ldr x27, [sp, #8] \n\t" // &cycle_counter - "ldr w0, [x27] \n\t" // cycle_counter + "ldr x1, [sp, #8] \n\t" // &cycle_counter + "ldr w0, [x1] \n\t" // cycle_counter "add w0, w0, %[_SH4_TIMESLICE] \n\t" - "str w0, [x27] \n\t" + "str w0, [x1] \n\t" "mov x29, lr \n\t" // Trashing pc here but it will be reset at the end of the block or in DoInterrupts "bl UpdateSystem \n\t" "mov lr, x29 \n\t" @@ -260,7 +263,8 @@ void ngen_mainloop(void* v_cntx) [RCB_SIZE] "i" (sizeof(Sh4RCB) >> 16), [SH4CTX_SIZE] "i" (sizeof(Sh4Context)), [jmp_env] "r"(reinterpret_cast(jmp_env)), - [cycle_counter] "r"(reinterpret_cast(&cycle_counter)) + [cycle_counter] "r"(reinterpret_cast(&cycle_counter)), + [vmem32_base] "i"(offsetof(Sh4Context, vmem32_base)) : "memory" ); } @@ -476,10 +480,10 @@ public: regalloc.DoAlloc(block); // scheduler - Mov(x27, reinterpret_cast(&cycle_counter)); - Ldr(w0, MemOperand(x27)); + Mov(x1, reinterpret_cast(&cycle_counter)); + Ldr(w0, MemOperand(x1)); Subs(w0, w0, block->guest_cycles); - Str(w0, MemOperand(x27)); + Str(w0, MemOperand(x1)); Label cycles_remaining; B(&cycles_remaining, pl); GenCallRuntime(intc_sched); @@ -568,11 +572,11 @@ public: break; case shop_readm: - GenReadMemory(op, i); + GenReadMemory(op, i, optimise); break; case shop_writem: - GenWriteMemory(op, i); + GenWriteMemory(op, i, optimise); break; case shop_sync_sr: @@ -1073,10 +1077,10 @@ public: void GenWriteMemorySlow(const shil_opcode& op) { + Instruction *start_instruction = GetCursorAddress(); if (mmu_enabled()) Mov(*call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc - Instruction *start_instruction = GetCursorAddress(); u32 size = op.flags & 0x7f; switch (size) { @@ -1117,7 +1121,10 @@ public: void InitializeRewrite(RuntimeBlockInfo *block, size_t opid) { - regalloc.DoAlloc(block); + this->block = block; + // with full mmu, all regs are flushed before mem ops + if (!mmu_enabled()) + regalloc.DoAlloc(block); regalloc.current_opid = opid; } @@ -1308,14 +1315,14 @@ private: B(&code_label, cond); } - void GenReadMemory(const shil_opcode& op, size_t opid) + void GenReadMemory(const shil_opcode& op, size_t opid, bool optimise) { if (GenReadMemoryImmediate(op)) return; GenMemAddr(op, call_regs[0]); - if (GenReadMemoryFast(op, opid)) + if (optimise && GenReadMemoryFast(op, opid)) return; GenReadMemorySlow(op); @@ -1431,59 +1438,104 @@ private: bool GenReadMemoryFast(const shil_opcode& op, size_t opid) { // Direct memory access. Need to handle SIGSEGV and rewrite block as needed. 
See ngen_Rewrite() - if (!_nvmem_enabled() || mmu_enabled()) + if (!_nvmem_enabled() || (mmu_enabled() && !vmem32_enabled())) return false; Instruction *start_instruction = GetCursorAddress(); - // WARNING: the rewrite code relies on having two ops before the memory access + const XRegister* base_reg; + const XRegister* offset_reg; + // WARNING: the rewrite code relies on having two ops before the memory access (3 when mmu is enabled) // Update ngen_Rewrite (and perhaps read_memory_rewrite_size) if adding or removing code - Add(w1, *call_regs[0], sizeof(Sh4Context), LeaveFlags); - Bfc(w1, 29, 3); // addr &= ~0xE0000000 + if (!mmu_enabled()) + { + Add(w1, *call_regs[0], sizeof(Sh4Context), LeaveFlags); + Bfc(w1, 29, 3); // addr &= ~0xE0000000 + base_reg = &x28; + offset_reg = &x1; + } + else + { + u32 exception_pc = block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0); + // 3 ops before memory access + Mov(w8, exception_pc & 0xFFFF); + Movk(w8, exception_pc >> 16, 16); + Str(w8, sh4_context_mem_operand(&p_sh4rcb->cntx.exception_pc)); + base_reg = &x27; + offset_reg = call_regs64[0]; + } //printf("direct read memory access opid %d pc %p code addr %08x\n", opid, GetCursorAddress(), this->block->addr); this->block->memory_accesses[GetCursorAddress()] = (u32)opid; u32 size = op.flags & 0x7f; - switch(size) + if (regalloc.IsAllocAny(op.rd)) { - case 1: - Ldrsb(regalloc.MapRegister(op.rd), MemOperand(x28, x1, SXTW)); - break; + switch(size) + { + case 1: + Ldrsb(regalloc.MapRegister(op.rd), MemOperand(*base_reg, *offset_reg)); + break; - case 2: - Ldrsh(regalloc.MapRegister(op.rd), MemOperand(x28, x1, SXTW)); - break; + case 2: + Ldrsh(regalloc.MapRegister(op.rd), MemOperand(*base_reg, *offset_reg)); + break; - case 4: - if (!op.rd.is_r32f()) - Ldr(regalloc.MapRegister(op.rd), MemOperand(x28, x1)); - else - Ldr(regalloc.MapVRegister(op.rd), MemOperand(x28, x1)); - break; + case 4: + if (!op.rd.is_r32f()) + Ldr(regalloc.MapRegister(op.rd), MemOperand(*base_reg, *offset_reg)); + else + Ldr(regalloc.MapVRegister(op.rd), MemOperand(*base_reg, *offset_reg)); + break; - case 8: - Ldr(x1, MemOperand(x28, x1)); - break; - } + case 8: + Ldr(x1, MemOperand(*base_reg, *offset_reg)); + break; + } - if (size == 8) - { + if (size == 8) + { #ifdef EXPLODE_SPANS - verify(op.rd.count() == 2 && regalloc.IsAllocf(op.rd, 0) && regalloc.IsAllocf(op.rd, 1)); - Fmov(regalloc.MapVRegister(op.rd, 0), w1); - Lsr(x1, x1, 32); - Fmov(regalloc.MapVRegister(op.rd, 1), w1); + verify(op.rd.count() == 2 && regalloc.IsAllocf(op.rd, 0) && regalloc.IsAllocf(op.rd, 1)); + Fmov(regalloc.MapVRegister(op.rd, 0), w1); + Lsr(x1, x1, 32); + Fmov(regalloc.MapVRegister(op.rd, 1), w1); #else - Str(x1, sh4_context_mem_operand(op.rd.reg_ptr())); + Str(x1, sh4_context_mem_operand(op.rd.reg_ptr())); #endif + } + } + else + { + switch(size) + { + case 1: + Ldrsb(w1, MemOperand(*base_reg, *offset_reg)); + break; + + case 2: + Ldrsh(w1, MemOperand(*base_reg, *offset_reg)); + break; + + case 4: + Ldr(w1, MemOperand(*base_reg, *offset_reg)); + break; + + case 8: + Ldr(x1, MemOperand(*base_reg, *offset_reg)); + break; + } + if (size == 8) + Str(x1, sh4_context_mem_operand(op.rd.reg_ptr())); + else + Str(w1, sh4_context_mem_operand(op.rd.reg_ptr())); } EnsureCodeSize(start_instruction, read_memory_rewrite_size); return true; } - void GenWriteMemory(const shil_opcode& op, size_t opid) + void GenWriteMemory(const shil_opcode& op, size_t opid, bool optimise) { GenMemAddr(op, call_regs[0]); @@ -1502,7 +1554,7 @@ private: shil_param_to_host_reg(op.rs2, 
*call_regs64[1]); #endif } - if (GenWriteMemoryFast(op, opid)) + if (optimise && GenWriteMemoryFast(op, opid)) return; GenWriteMemorySlow(op); @@ -1511,15 +1563,31 @@ private: bool GenWriteMemoryFast(const shil_opcode& op, size_t opid) { // Direct memory access. Need to handle SIGSEGV and rewrite block as needed. See ngen_Rewrite() - if (!_nvmem_enabled() || mmu_enabled()) + if (!_nvmem_enabled() || (mmu_enabled() && !vmem32_enabled())) return false; Instruction *start_instruction = GetCursorAddress(); - // WARNING: the rewrite code relies on having two ops before the memory access + const XRegister* base_reg; + const XRegister* offset_reg; + // WARNING: the rewrite code relies on having two ops before the memory access (3 when mmu is enabled) // Update ngen_Rewrite (and perhaps write_memory_rewrite_size) if adding or removing code - Add(w7, *call_regs[0], sizeof(Sh4Context), LeaveFlags); - Bfc(w7, 29, 3); // addr &= ~0xE0000000 + if (!mmu_enabled()) + { + Add(w7, *call_regs[0], sizeof(Sh4Context), LeaveFlags); + Bfc(w7, 29, 3); // addr &= ~0xE0000000 + base_reg = &x28; + offset_reg = &x7; + } + else + { + u32 exception_pc = block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0); + Mov(w8, exception_pc & 0xFFFF); + Movk(w8, exception_pc >> 16, 16); + Str(w8, sh4_context_mem_operand(&p_sh4rcb->cntx.exception_pc)); + base_reg = &x27; + offset_reg = call_regs64[0]; + } //printf("direct write memory access opid %d pc %p code addr %08x\n", opid, GetCursorAddress(), this->block->addr); this->block->memory_accesses[GetCursorAddress()] = (u32)opid; @@ -1528,19 +1596,19 @@ private: switch(size) { case 1: - Strb(w1, MemOperand(x28, x7, SXTW)); + Strb(w1, MemOperand(*base_reg, *offset_reg)); break; case 2: - Strh(w1, MemOperand(x28, x7, SXTW)); + Strh(w1, MemOperand(*base_reg, *offset_reg)); break; case 4: - Str(w1, MemOperand(x28, x7)); + Str(w1, MemOperand(*base_reg, *offset_reg)); break; case 8: - Str(x1, MemOperand(x28, x7)); + Str(x1, MemOperand(*base_reg, *offset_reg)); break; } EnsureCodeSize(start_instruction, write_memory_rewrite_size); @@ -1699,7 +1767,7 @@ private: RuntimeBlockInfo* block = NULL; const int read_memory_rewrite_size = 6; // worst case for u64: add, bfc, ldr, fmov, lsr, fmov // FIXME rewrite size per read/write size? - const int write_memory_rewrite_size = 3; + const int write_memory_rewrite_size = 4; }; static Arm64Assembler* compiler; @@ -1755,7 +1823,7 @@ bool ngen_Rewrite(unat& host_pc, unat, unat) u32 opid = it->second; verify(opid < block->oplist.size()); const shil_opcode& op = block->oplist[opid]; - Arm64Assembler *assembler = new Arm64Assembler(code_ptr - 2); // Skip the 2 preceding ops (bic, add) + Arm64Assembler *assembler = new Arm64Assembler(code_ptr - 2 - (mmu_enabled() ? 1 : 0)); // Skip the 2 preceding ops (bic, add) assembler->InitializeRewrite(block, opid); if (op.op == shop_readm) assembler->GenReadMemorySlow(op); @@ -1763,11 +1831,16 @@ bool ngen_Rewrite(unat& host_pc, unat, unat) assembler->GenWriteMemorySlow(op); assembler->Finalize(true); delete assembler; - host_pc = (unat)(code_ptr - 2); + host_pc = (unat)(code_ptr - 2 - (mmu_enabled() ? 
1 : 0)); return true; } +void ngen_HandleException() +{ + longjmp(jmp_env, 1); +} + u32 DynaRBI::Relink() { if (mmu_enabled()) diff --git a/core/rec-x64/rec_x64.cpp b/core/rec-x64/rec_x64.cpp index 9723ffc43..848cc6d84 100644 --- a/core/rec-x64/rec_x64.cpp +++ b/core/rec-x64/rec_x64.cpp @@ -19,6 +19,7 @@ #include "hw/sh4/sh4_core.h" #include "hw/sh4/sh4_mem.h" #include "hw/sh4/sh4_rom.h" +#include "hw/mem/vmem32.h" #include "emitter/x86_emitter.h" #include "profiler/profiler.h" #include "oslib/oslib.h" @@ -130,7 +131,6 @@ WIN32_ONLY( ".seh_pushreg %r14 \n\t") "lea " _U "jmp_env(%rip), %rdi \n\t" #endif "call " _U "setjmp \n\t" -// "testl %rax, %rax \n\t" "1: \n\t" // run_loop "movq " _U "p_sh4rcb(%rip), %rax \n\t" @@ -219,13 +219,8 @@ static void ngen_blockcheckfail(u32 pc) { rdv_BlockCheckFail(pc); } -static u32 exception_raised; - -template -static T ReadMemNoEx(u32 addr, u32 pc) +static void handle_mem_exception(u32 exception_raised, u32 pc) { -#ifndef NO_MMU - T rv = mmu_ReadMemNoEx(addr, &exception_raised); if (exception_raised) { if (pc & 1) @@ -233,8 +228,19 @@ static T ReadMemNoEx(u32 addr, u32 pc) spc = pc - 1; else spc = pc; + cycle_counter += CPU_RATIO * 2; // probably more is needed but no easy way to find out longjmp(jmp_env, 1); } +} + +template +static T ReadMemNoEx(u32 addr, u32 pc) +{ +#ifndef NO_MMU + u32 exception_raised; + T rv = mmu_ReadMemNoEx(addr, &exception_raised); + handle_mem_exception(exception_raised, pc); + return rv; #else // not used @@ -246,32 +252,30 @@ template static void WriteMemNoEx(u32 addr, T data, u32 pc) { #ifndef NO_MMU - exception_raised = mmu_WriteMemNoEx(addr, data); - if (exception_raised) - { - if (pc & 1) - // Delay slot - spc = pc - 1; - else - spc = pc; - longjmp(jmp_env, 1); - } + u32 exception_raised = mmu_WriteMemNoEx(addr, data); + handle_mem_exception(exception_raised, pc); #endif } +static void handle_sh4_exception(SH4ThrownException& ex, u32 pc) +{ + if (pc & 1) + { + // Delay slot + AdjustDelaySlotException(ex); + pc--; + } + Do_Exception(pc, ex.expEvn, ex.callVect); + cycle_counter += CPU_RATIO * 4; // probably more is needed + longjmp(jmp_env, 1); +} + static void interpreter_fallback(u16 op, OpCallFP *oph, u32 pc) { try { oph(op); } catch (SH4ThrownException& ex) { - if (pc & 1) - { - // Delay slot - AdjustDelaySlotException(ex); - pc--; - } - Do_Exception(pc, ex.expEvn, ex.callVect); - longjmp(jmp_env, 1); + handle_sh4_exception(ex, pc); } } @@ -279,16 +283,8 @@ static void do_sqw_mmu_no_ex(u32 addr, u32 pc) { try { do_sqw_mmu(addr); - exception_raised = 0; } catch (SH4ThrownException& ex) { - if (pc & 1) - { - // Delay slot - AdjustDelaySlotException(ex); - pc--; - } - Do_Exception(pc, ex.expEvn, ex.callVect); - exception_raised = 1; + handle_sh4_exception(ex, pc); } } @@ -300,7 +296,9 @@ static void do_sqw_nommu_local(u32 addr, u8* sqb) class BlockCompiler : public Xbyak::CodeGenerator { public: - BlockCompiler() : Xbyak::CodeGenerator(emit_FreeSpace(), emit_GetCCPtr()), regalloc(this) + BlockCompiler() : BlockCompiler((u8 *)emit_GetCCPtr()) {} + + BlockCompiler(u8 *code_ptr) : Xbyak::CodeGenerator(emit_FreeSpace(), code_ptr), regalloc(this) { #if HOST_OS == OS_WINDOWS call_regs.push_back(ecx); @@ -333,17 +331,11 @@ public: void compile(RuntimeBlockInfo* block, bool force_checks, bool reset, bool staging, bool optimise) { //printf("X86_64 compiling %08x to %p\n", block->addr, emit_GetCCPtr()); + current_opid = -1; if (force_checks) { CheckBlock(block); } - regalloc.DoAlloc(block); - sub(dword[rip + &cycle_counter], 
block->guest_cycles); -#ifdef PROFILING - mov(rax, (uintptr_t)&guest_cpu_cycles); - mov(ecx, block->guest_cycles); - add(qword[rax], rcx); -#endif #ifdef _WIN32 sub(rsp, 0x28); // 32-byte shadow space + 8 byte alignment #else @@ -364,6 +356,13 @@ public: jmp(exit_block, T_NEAR); L(fpu_enabled); } + sub(dword[rip + &cycle_counter], block->guest_cycles); +#ifdef PROFILING + mov(rax, (uintptr_t)&guest_cpu_cycles); + mov(ecx, block->guest_cycles); + add(qword[rax], rcx); +#endif + regalloc.DoAlloc(block); for (current_opid = 0; current_opid < block->oplist.size(); current_opid++) { @@ -440,98 +439,7 @@ public: break; case shop_readm: - { - u32 size = op.flags & 0x7f; - bool immediate_address = op.rs1.is_imm(); - u32 addr = op.rs1._imm; - if (immediate_address && mmu_enabled()) - { - if ((op.rs1._imm >> 12) != (block->vaddr >> 12)) - { - // When full mmu is on, only consider addresses in the same 4k page - immediate_address = false; - } - else - { - u32 paddr; - u32 rv; - if (size == 2) - rv = mmu_data_translation(addr, paddr); - else if (size == 4) - rv = mmu_data_translation(addr, paddr); - else - die("Invalid immediate size"); - if (rv != MMU_ERROR_NONE) - immediate_address = false; - else - addr = paddr; - } - } - if (immediate_address) - { - bool isram = false; - void* ptr = _vmem_read_const(addr, isram, size); - - if (isram) - { - // Immediate pointer to RAM: super-duper fast access - mov(rax, reinterpret_cast(ptr)); - switch (size) - { - case 2: - if (regalloc.IsAllocg(op.rd)) - movsx(regalloc.MapRegister(op.rd), word[rax]); - else - { - movsx(eax, word[rax]); - mov(rcx, (uintptr_t)op.rd.reg_ptr()); - mov(dword[rcx], eax); - } - break; - - case 4: - if (regalloc.IsAllocg(op.rd)) - mov(regalloc.MapRegister(op.rd), dword[rax]); - else if (regalloc.IsAllocf(op.rd)) - movd(regalloc.MapXRegister(op.rd), dword[rax]); - else - { - mov(eax, dword[rax]); - mov(rcx, (uintptr_t)op.rd.reg_ptr()); - mov(dword[rcx], eax); - } - break; - - default: - die("Invalid immediate size"); - break; - } - } - else - { - // Not RAM: the returned pointer is a memory handler - mov(call_regs[0], addr); - - switch(size) - { - case 2: - GenCall((void (*)())ptr); - movsx(ecx, ax); - break; - - case 4: - GenCall((void (*)())ptr); - mov(ecx, eax); - break; - - default: - die("Invalid immediate size"); - break; - } - host_reg_to_shil_param(op.rd, ecx); - } - } - else + if (!GenReadMemImmediate(op, block)) { // Not an immediate address shil_param_to_host_reg(op.rs1, call_regs[0]); @@ -547,47 +455,10 @@ public: add(call_regs[0], dword[rax]); } } - if (mmu_enabled()) - mov(call_regs[1], block->vaddr + op.guest_offs - (op.delay_slot ? 
1 : 0)); // pc - - if (size == 1) { - if (!mmu_enabled()) - GenCall(ReadMem8); - else - GenCall(ReadMemNoEx); - movsx(ecx, al); - } - else if (size == 2) { - if (!mmu_enabled()) - GenCall(ReadMem16); - else - GenCall(ReadMemNoEx); - movsx(ecx, ax); - } - else if (size == 4) { - if (!mmu_enabled()) - GenCall(ReadMem32); - else - GenCall(ReadMemNoEx); - mov(ecx, eax); - } - else if (size == 8) { - if (!mmu_enabled()) - GenCall(ReadMem64); - else - GenCall(ReadMemNoEx); - mov(rcx, rax); - } - else { - die("1..8 bytes"); - } - -// if (mmu_enabled()) -// { -// test(dword[(void *)&exception_raised], 1); -// jnz(exit_block, T_NEAR); -// } + if (!optimise || !GenReadMemoryFast(op, block)) + GenReadMemorySlow(op, block); + u32 size = op.flags & 0x7f; if (size != 8) host_reg_to_shil_param(op.rd, ecx); else { @@ -606,12 +477,10 @@ public: } } } - } - break; + break; case shop_writem: { - u32 size = op.flags & 0x7f; shil_param_to_host_reg(op.rs1, call_regs[0]); if (!op.rs3.is_null()) { @@ -626,6 +495,7 @@ public: } } + u32 size = op.flags & 0x7f; if (size != 8) shil_param_to_host_reg(op.rs2, call_regs[1]); else { @@ -644,42 +514,8 @@ public: mov(call_regs64[1], qword[rax]); } } - if (mmu_enabled()) - mov(call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc - - if (size == 1) { - if (!mmu_enabled()) - GenCall(WriteMem8); - else - GenCall(WriteMemNoEx); - } - else if (size == 2) { - if (!mmu_enabled()) - GenCall(WriteMem16); - else - GenCall(WriteMemNoEx); - } - else if (size == 4) { - if (!mmu_enabled()) - GenCall(WriteMem32); - else - GenCall(WriteMemNoEx); - } - else if (size == 8) { - if (!mmu_enabled()) - GenCall(WriteMem64); - else - GenCall(WriteMemNoEx); - } - else { - die("1..8 bytes"); - } - -// if (mmu_enabled()) -// { -// test(dword[(void *)&exception_raised], 1); -// jnz(exit_block, T_NEAR); -// } + if (!optimise || !GenWriteMemoryFast(op, block)) + GenWriteMemorySlow(op, block); } break; @@ -729,8 +565,8 @@ public: mov(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1)); \ if (op.rs2.is_imm()) \ natop(regalloc.MapRegister(op.rd), op.rs2._imm); \ - else if (op.rs2.is_reg()) \ - natop(regalloc.MapRegister(op.rd), Xbyak::Reg8(regalloc.MapRegister(op.rs2).getIdx())); + else \ + die("Unsupported operand"); case shop_shl: SHIFT_OP(shl) break; @@ -925,9 +761,6 @@ public: mov(call_regs[1], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc GenCall(do_sqw_mmu_no_ex); - - test(dword[(void *)&exception_raised], 1); - jnz(exit_block, T_NEAR); } else { @@ -1163,6 +996,7 @@ public: regalloc.OpEnd(&op); } regalloc.Cleanup(); + current_opid = -1; mov(rax, (size_t)&next_pc); @@ -1242,6 +1076,118 @@ public: emit_Skip(getSize()); } + void GenReadMemorySlow(const shil_opcode& op, RuntimeBlockInfo* block) + { + const u8 *start_addr = getCurr(); + if (mmu_enabled()) + mov(call_regs[1], block->vaddr + op.guest_offs - (op.delay_slot ? 
1 : 0)); // pc + + u32 size = op.flags & 0x7f; + switch (size) { + case 1: + if (!mmu_enabled()) + GenCall(ReadMem8); + else + GenCall(ReadMemNoEx); + movsx(ecx, al); + break; + case 2: + if (!mmu_enabled()) + GenCall(ReadMem16); + else + GenCall(ReadMemNoEx); + movsx(ecx, ax); + break; + + case 4: + if (!mmu_enabled()) + GenCall(ReadMem32); + else + GenCall(ReadMemNoEx); + mov(ecx, eax); + break; + case 8: + if (!mmu_enabled()) + GenCall(ReadMem64); + else + GenCall(ReadMemNoEx); + mov(rcx, rax); + break; + default: + die("1..8 bytes"); + } + + if (mmu_enabled()) + { + Xbyak::Label quick_exit; + if (getCurr() - start_addr <= read_mem_op_size - 6) + jmp(quick_exit, T_NEAR); + while (getCurr() - start_addr < read_mem_op_size) + nop(); + L(quick_exit); + verify(getCurr() - start_addr == read_mem_op_size); + } + } + + void GenWriteMemorySlow(const shil_opcode& op, RuntimeBlockInfo* block) + { + const u8 *start_addr = getCurr(); + if (mmu_enabled()) + mov(call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc + + u32 size = op.flags & 0x7f; + switch (size) { + case 1: + if (!mmu_enabled()) + GenCall(WriteMem8); + else + GenCall(WriteMemNoEx); + break; + case 2: + if (!mmu_enabled()) + GenCall(WriteMem16); + else + GenCall(WriteMemNoEx); + break; + case 4: + if (!mmu_enabled()) + GenCall(WriteMem32); + else + GenCall(WriteMemNoEx); + break; + case 8: + if (!mmu_enabled()) + GenCall(WriteMem64); + else + GenCall(WriteMemNoEx); + break; + default: + die("1..8 bytes"); + } + if (mmu_enabled()) + { + Xbyak::Label quick_exit; + if (getCurr() - start_addr <= write_mem_op_size - 6) + jmp(quick_exit, T_NEAR); + while (getCurr() - start_addr < write_mem_op_size) + nop(); + L(quick_exit); + verify(getCurr() - start_addr == write_mem_op_size); + } + } + + void InitializeRewrite(RuntimeBlockInfo *block, size_t opid) + { + // shouldn't be necessary since all regs are flushed before mem access when mmu is enabled + //regalloc.DoAlloc(block); + regalloc.current_opid = opid; + } + + void FinalizeRewrite() + { + ready(); + } + void ngen_CC_Start(const shil_opcode& op) { CC_pars.clear(); @@ -1346,16 +1292,188 @@ private: typedef void (BlockCompiler::*X64BinaryOp)(const Xbyak::Operand&, const Xbyak::Operand&); typedef void (BlockCompiler::*X64BinaryFOp)(const Xbyak::Xmm&, const Xbyak::Operand&); + bool GenReadMemImmediate(const shil_opcode& op, RuntimeBlockInfo* block) + { + if (!op.rs1.is_imm()) + return false; + u32 size = op.flags & 0x7f; + u32 addr = op.rs1._imm; + if (mmu_enabled()) + { + if ((addr >> 12) != (block->vaddr >> 12)) + // When full mmu is on, only consider addresses in the same 4k page + return false; + + u32 paddr; + u32 rv; + if (size == 2) + rv = mmu_data_translation(addr, paddr); + else if (size == 4) + rv = mmu_data_translation(addr, paddr); + else + die("Invalid immediate size"); + if (rv != MMU_ERROR_NONE) + return false; + + addr = paddr; + } + bool isram = false; + void* ptr = _vmem_read_const(addr, isram, size); + + if (isram) + { + // Immediate pointer to RAM: super-duper fast access + mov(rax, reinterpret_cast(ptr)); + switch (size) + { + case 2: + if (regalloc.IsAllocg(op.rd)) + movsx(regalloc.MapRegister(op.rd), word[rax]); + else + { + movsx(eax, word[rax]); + mov(rcx, (uintptr_t)op.rd.reg_ptr()); + mov(dword[rcx], eax); + } + break; + + case 4: + if (regalloc.IsAllocg(op.rd)) + mov(regalloc.MapRegister(op.rd), dword[rax]); + else if (regalloc.IsAllocf(op.rd)) + movd(regalloc.MapXRegister(op.rd), dword[rax]); + else + { + mov(eax, dword[rax]); + mov(rcx, 
(uintptr_t)op.rd.reg_ptr()); + mov(dword[rcx], eax); + } + break; + + default: + die("Invalid immediate size"); + break; + } + } + else + { + // Not RAM: the returned pointer is a memory handler + mov(call_regs[0], addr); + + switch(size) + { + case 2: + GenCall((void (*)())ptr); + movsx(ecx, ax); + break; + + case 4: + GenCall((void (*)())ptr); + mov(ecx, eax); + break; + + default: + die("Invalid immediate size"); + break; + } + host_reg_to_shil_param(op.rd, ecx); + } + + return true; + } + + bool GenReadMemoryFast(const shil_opcode& op, RuntimeBlockInfo* block) + { + if (!mmu_enabled() || !vmem32_enabled()) + return false; + const u8 *start_addr = getCurr(); + + mov(rax, (uintptr_t)&p_sh4rcb->cntx.exception_pc); + mov(dword[rax], block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0)); + + mov(rax, (uintptr_t)p_sh4rcb->cntx.vmem32_base); + + u32 size = op.flags & 0x7f; + verify(getCurr() - start_addr == 26); + + block->memory_accesses[(void*)getCurr()] = (u32)current_opid; + switch (size) + { + case 1: + movsx(ecx, byte[rax + call_regs64[0]]); + break; + + case 2: + movsx(ecx, word[rax + call_regs64[0]]); + break; + + case 4: + mov(ecx, dword[rax + call_regs64[0]]); + break; + + case 8: + mov(rcx, qword[rax + call_regs64[0]]); + break; + + default: + die("1..8 bytes"); + } + + while (getCurr() - start_addr < read_mem_op_size) + nop(); + verify(getCurr() - start_addr == read_mem_op_size); + + return true; + } + + bool GenWriteMemoryFast(const shil_opcode& op, RuntimeBlockInfo* block) + { + if (!mmu_enabled() || !vmem32_enabled()) + return false; + const u8 *start_addr = getCurr(); + + mov(rax, (uintptr_t)&p_sh4rcb->cntx.exception_pc); + mov(dword[rax], block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0)); + + mov(rax, (uintptr_t)p_sh4rcb->cntx.vmem32_base); + + u32 size = op.flags & 0x7f; + verify(getCurr() - start_addr == 26); + + block->memory_accesses[(void*)getCurr()] = (u32)current_opid; + switch (size) + { + case 1: + mov(byte[rax + call_regs64[0] + 0], Xbyak::Reg8(call_regs[1].getIdx(), call_regs[1] == edi || call_regs[1] == esi)); + break; + + case 2: + mov(word[rax + call_regs64[0]], Xbyak::Reg16(call_regs[1].getIdx())); + break; + + case 4: + mov(dword[rax + call_regs64[0]], call_regs[1]); + break; + + case 8: + mov(qword[rax + call_regs64[0]], call_regs64[1]); + break; + + default: + die("1..8 bytes"); + } + + while (getCurr() - start_addr < write_mem_op_size) + nop(); + verify(getCurr() - start_addr == write_mem_op_size); + + return true; + } + void CheckBlock(RuntimeBlockInfo* block) { mov(call_regs[0], block->addr); -// if (mmu_enabled() && block->asid != 0xFFFFFFFF) -// { -// mov(rax, (uintptr_t)&CCN_PTEH.reg_data); -// cmp(byte[rax], block->asid); -// jne(reinterpret_cast(&ngen_blockcheckfail)); -// } - // FIXME Neither of these tests should be necessary + // FIXME This test shouldn't be necessary // However the decoder makes various assumptions about the current PC value, which are simply not // true in a virtualized memory model. So this can only work if virtual and phy addresses are the // same at compile and run times. 
@@ -1424,10 +1542,10 @@ private: void GenCall(Ret(*function)(Params...)) { #ifndef _WIN32 - bool xmm8_mapped = regalloc.IsMapped(xmm8, current_opid); - bool xmm9_mapped = regalloc.IsMapped(xmm9, current_opid); - bool xmm10_mapped = regalloc.IsMapped(xmm10, current_opid); - bool xmm11_mapped = regalloc.IsMapped(xmm11, current_opid); + bool xmm8_mapped = current_opid != -1 && regalloc.IsMapped(xmm8, current_opid); + bool xmm9_mapped = current_opid != -1 && regalloc.IsMapped(xmm9, current_opid); + bool xmm10_mapped = current_opid != -1 && regalloc.IsMapped(xmm10, current_opid); + bool xmm11_mapped = current_opid != -1 && regalloc.IsMapped(xmm11, current_opid); // Need to save xmm registers as they are not preserved in linux/mach int offset = 0; @@ -1587,11 +1705,15 @@ private: static const u32 float_sign_mask; static const u32 float_abs_mask; static const f32 cvtf2i_pos_saturation; + static const u32 read_mem_op_size; + static const u32 write_mem_op_size; }; const u32 BlockCompiler::float_sign_mask = 0x80000000; const u32 BlockCompiler::float_abs_mask = 0x7fffffff; const f32 BlockCompiler::cvtf2i_pos_saturation = 2147483520.0f; // IEEE 754: 0x4effffff; +const u32 BlockCompiler::read_mem_op_size = 30; +const u32 BlockCompiler::write_mem_op_size = 30; void X64RegAlloc::Preload(u32 reg, Xbyak::Operand::Code nreg) { @@ -1641,4 +1763,47 @@ void ngen_CC_Call(shil_opcode* op, void* function) void ngen_CC_Finish(shil_opcode* op) { } + +bool ngen_Rewrite(unat& host_pc, unat, unat) +{ + if (!mmu_enabled() || !vmem32_enabled()) + return false; + + //printf("ngen_Rewrite pc %p\n", host_pc); + RuntimeBlockInfo *block = bm_GetBlock((void *)host_pc); + if (block == NULL) + { + printf("ngen_Rewrite: Block at %p not found\n", (void *)host_pc); + return false; + } + u8 *code_ptr = (u8*)host_pc; + auto it = block->memory_accesses.find(code_ptr); + if (it == block->memory_accesses.end()) + { + printf("ngen_Rewrite: memory access at %p not found (%lu entries)\n", code_ptr, block->memory_accesses.size()); + return false; + } + u32 opid = it->second; + verify(opid < block->oplist.size()); + const shil_opcode& op = block->oplist[opid]; + + BlockCompiler *assembler = new BlockCompiler(code_ptr - 26); + assembler->InitializeRewrite(block, opid); + if (op.op == shop_readm) + assembler->GenReadMemorySlow(op, block); + else + assembler->GenWriteMemorySlow(op, block); + assembler->FinalizeRewrite(); + verify(block->host_code_size >= assembler->getSize()); + delete assembler; + block->memory_accesses.erase(it); + host_pc = (unat)(code_ptr - 26); + + return true; +} + +void ngen_HandleException() +{ + longjmp(jmp_env, 1); +} #endif diff --git a/core/rend/TexCache.cpp b/core/rend/TexCache.cpp index b0480c273..dba4825fd 100644 --- a/core/rend/TexCache.cpp +++ b/core/rend/TexCache.cpp @@ -7,6 +7,8 @@ #include "TexCache.h" #include "hw/pvr/pvr_regs.h" #include "hw/mem/_vmem.h" +#include "hw/mem/vmem32.h" +#include "hw/sh4/modules/mmu.h" #include "deps/xbrz/xbrz.h" #include "deps/xxhash/xxhash.h" @@ -213,6 +215,8 @@ vram_block* libCore_vramlock_Lock(u32 start_offset64,u32 end_offset64,void* user if (_nvmem_enabled() && VRAM_SIZE == 0x800000) { vram.LockRegion(block->start + VRAM_SIZE, block->len); } + if (mmu_enabled()) + vmem32_protect_vram(block); vramlock_list_add(block); @@ -222,11 +226,8 @@ vram_block* libCore_vramlock_Lock(u32 start_offset64,u32 end_offset64,void* user return block; } - -bool VramLockedWrite(u8* address) +bool VramLockedWriteOffset(size_t offset) { - size_t offset=address-vram.data; - if (offset, 
u16>(PixelBuffer* pb,u8* p_in, #define tex1555_VQ32 texture_VQ, u32> #define tex4444_VQ32 texture_VQ, u32> -#define Is_64_Bit(addr) ((addr &0x1000000)==0) - -//vram_block, vramLockCBFP on plugin headers - - -u32 vramlock_ConvAddrtoOffset64(u32 Address); -u32 vramlock_ConvOffset32toOffset64(u32 offset32); - -void vramlock_Unlock_block(vram_block* block); -vram_block* vramlock_Lock_32(u32 start_offset32,u32 end_offset32,void* userdata); -vram_block* vramlock_Lock_64(u32 start_offset64,u32 end_offset64,void* userdata); - -void vram_LockedWrite(u32 offset64); - void DePosterize(u32* source, u32* dest, int width, int height); void UpscalexBRZ(int factor, u32* source, u32* dest, int width, int height, bool has_alpha); diff --git a/core/types.h b/core/types.h index e082e13b0..ec6ed7a43 100644 --- a/core/types.h +++ b/core/types.h @@ -736,6 +736,7 @@ struct settings_t bool unstable_opt; bool safemode; bool disable_nvmem; + bool disable_vmem32; } dynarec; struct diff --git a/core/windows/winmain.cpp b/core/windows/winmain.cpp index d54e44a97..5250a0443 100644 --- a/core/windows/winmain.cpp +++ b/core/windows/winmain.cpp @@ -1,6 +1,7 @@ #include "oslib\oslib.h" #include "oslib\audiostream.h" #include "imgread\common.h" +#include "hw\mem\vmem32.h" #include "xinput_gamepad.h" #include "win_keyboard.h" @@ -141,6 +142,11 @@ LONG ExeptionHandler(EXCEPTION_POINTERS *ExceptionInfo) u8* address=(u8*)pExceptionRecord->ExceptionInformation[1]; //printf("[EXC] During access to : 0x%X\n", address); +#if !defined(NO_MMU) && defined(HOST_64BIT_CPU) + bool write = false; // TODO? + if (vmem32_handle_signal(ep->ContextRecord->Rcx, write)) + return EXCEPTION_CONTINUE_EXECUTION; +#endif if (VramLockedWrite(address)) { @@ -152,7 +158,8 @@ LONG ExeptionHandler(EXCEPTION_POINTERS *ExceptionInfo) return EXCEPTION_CONTINUE_EXECUTION; } #endif -#if FEAT_SHREC == DYNAREC_JIT && HOST_CPU == CPU_X86 +#if FEAT_SHREC == DYNAREC_JIT +#if HOST_CPU == CPU_X86 else if ( ngen_Rewrite((unat&)ep->ContextRecord->Eip,*(unat*)ep->ContextRecord->Esp,ep->ContextRecord->Eax) ) { //remove the call from call stack @@ -161,6 +168,11 @@ LONG ExeptionHandler(EXCEPTION_POINTERS *ExceptionInfo) ep->ContextRecord->Ecx=ep->ContextRecord->Eax; return EXCEPTION_CONTINUE_EXECUTION; } +#elif HOST_CPU == CPU_X64 + else if (dyna_cde && ngen_Rewrite((unat&)ep->ContextRecord->Rip, 0, 0)) + { + return EXCEPTION_CONTINUE_EXECUTION; + } #endif else { @@ -576,7 +588,7 @@ _In_opt_ PVOID Context // (DWORD)((u8 *)__gnat_SEH_error_handler - CodeCache); /* Set its scope to the entire program. */ Table[0].BeginAddress = 0;// (CodeCache - (u8*)__ImageBase); - Table[0].EndAddress = /*(CodeCache - (u8*)__ImageBase) +*/ CODE_SIZE; + Table[0].EndAddress = /*(CodeCache - (u8*)__ImageBase) +*/ CODE_SIZE + TEMP_CODE_SIZE; Table[0].UnwindData = (DWORD)((u8 *)unwind_info - CodeCache); printf("TABLE CALLBACK\n"); //for (;;); @@ -605,13 +617,13 @@ void setup_seh() { //(DWORD)((u8 *)__gnat_SEH_error_handler - CodeCache); /* Set its scope to the entire program. */ Table[0].BeginAddress = 0;// (CodeCache - (u8*)__ImageBase); - Table[0].EndAddress = /*(CodeCache - (u8*)__ImageBase) +*/ CODE_SIZE; + Table[0].EndAddress = /*(CodeCache - (u8*)__ImageBase) +*/ CODE_SIZE + TEMP_CODE_SIZE; Table[0].UnwindData = (DWORD)((u8 *)unwind_info - CodeCache); /* Register the unwind information. 
*/ RtlAddFunctionTable(Table, 1, (DWORD64)CodeCache); #endif - //verify(RtlInstallFunctionTableCallback((unat)CodeCache | 0x3, (DWORD64)CodeCache, CODE_SIZE, seh_callback, 0, 0)); + //verify(RtlInstallFunctionTableCallback((unat)CodeCache | 0x3, (DWORD64)CodeCache, CODE_SIZE + TEMP_CODE_SIZE, seh_callback, 0, 0)); } #endif
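
The core mechanism of this patch (reserve the whole 4GB guest space with no access, then map pages on demand from the fault handler so the x64/arm64 fast paths can read and write guest memory with a single host load/store off vmem32_base) is spread across vmem32.cpp, common.cpp, winmain.cpp and both dynarecs. Below is a minimal, self-contained sketch of the same reserve-then-map-on-fault idea, not part of the patch: it assumes Linux/x86-64 and uses plain anonymous memory instead of the nvmem file; guest_space, SPACE_SIZE and fault_handler are illustrative names only.

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <csignal>
#include <sys/mman.h>

static uint8_t *guest_space;                 // plays the role of vmem32_base
static const size_t SPACE_SIZE = 64 * 1024;  // the real code reserves 4GB (VMEM32_SIZE)

static void fault_handler(int, siginfo_t *si, void *)
{
	uint8_t *addr = (uint8_t *)si->si_addr;
	if (addr < guest_space || addr >= guest_space + SPACE_SIZE)
		abort();                             // not ours; vmem32_handle_signal() returns false in this case
	// Map the missing page in place, as vmem32_map_address() does with the nvmem file
	void *page = (void *)((uintptr_t)addr & ~(uintptr_t)4095);
	mmap(page, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
}

int main()
{
	// Reserve the whole range up front, access-protected (cf. vmem32_init)
	guest_space = (uint8_t *)mmap(NULL, SPACE_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);

	struct sigaction sa = {};
	sa.sa_sigaction = fault_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSEGV, &sa, NULL);

	guest_space[0x1000] = 42;                // faults once, gets mapped, then runs at memory speed
	printf("%d\n", guest_space[0x1000]);
	return 0;
}

The patch applies the same idea per SH4 TLB entry: the first access to a page costs a fault plus a vmem32_map_buffer() call against the shared memory file, and every later access from the generated fast paths is a direct host memory access, with vmem32_flush_mmu() tearing the mappings down when the ASID changes or the TLB is flushed.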