From 31debefbe10eb726e0d2dbe0610de5f327e81534 Mon Sep 17 00:00:00 2001 From: Flyinghead Date: Tue, 24 Jan 2023 09:21:46 +0100 Subject: [PATCH] release dynarec mem allocs when terminating, including arm and dsp Issue #453 --- core/emulator.cpp | 1 + core/hw/aica/dsp.cpp | 4 +++ core/hw/aica/dsp.h | 1 + core/hw/aica/dsp_arm32.cpp | 4 +++ core/hw/aica/dsp_arm64.cpp | 16 ++++++++++++ core/hw/aica/dsp_x64.cpp | 12 +++++++++ core/hw/aica/dsp_x86.cpp | 5 ++++ core/hw/arm7/arm7.cpp | 7 ++++++ core/hw/arm7/arm7.h | 1 + core/hw/arm7/arm7_rec.cpp | 15 ++++++++++-- core/hw/arm7/arm7_rec.h | 1 + core/hw/mem/_vmem.h | 8 ++++-- core/hw/sh4/dyna/driver.cpp | 23 ++++++++++------- core/linux/posix_vmem.cpp | 30 +++++++++++++++++------ core/windows/win_vmem.cpp | 49 ++++++++++++++++++++++++------------- 15 files changed, 140 insertions(+), 37 deletions(-) diff --git a/core/emulator.cpp b/core/emulator.cpp index 2c0ea2b40..903792af3 100644 --- a/core/emulator.cpp +++ b/core/emulator.cpp @@ -663,6 +663,7 @@ void Emulator::term() sh4_cpu.Term(); custom_texture.Terminate(); // lr: avoid deadlock on exit (win32) reios_term(); + aicaarm::term(); libAICA_Term(); pvr::term(); mem_Term(); diff --git a/core/hw/aica/dsp.cpp b/core/hw/aica/dsp.cpp index 613b872ef..c9b4f9f4d 100644 --- a/core/hw/aica/dsp.cpp +++ b/core/hw/aica/dsp.cpp @@ -100,6 +100,9 @@ void DecodeInst(const u32 *IPtr, Instruction *i) void recInit() { } +void recTerm() { +} + void recompile() { } #endif @@ -124,6 +127,7 @@ void writeProg(u32 addr) void term() { state.stopped = true; + recTerm(); } void step() diff --git a/core/hw/aica/dsp.h b/core/hw/aica/dsp.h index ff3b9453a..86b2ec238 100644 --- a/core/hw/aica/dsp.h +++ b/core/hw/aica/dsp.h @@ -62,6 +62,7 @@ void step(); void writeProg(u32 addr); void recInit(); +void recTerm(); void runStep(); void recompile(); diff --git a/core/hw/aica/dsp_arm32.cpp b/core/hw/aica/dsp_arm32.cpp index 48f8f03f9..066c37e4a 100644 --- a/core/hw/aica/dsp_arm32.cpp +++ b/core/hw/aica/dsp_arm32.cpp @@ -411,6 +411,10 @@ void recInit() verify(rc); } +void recTerm() +{ +} + void runStep() { ((void (*)())DynCode)(); diff --git a/core/hw/aica/dsp_arm64.cpp b/core/hw/aica/dsp_arm64.cpp index 8ea2d6f0c..e703c78a0 100644 --- a/core/hw/aica/dsp_arm64.cpp +++ b/core/hw/aica/dsp_arm64.cpp @@ -461,6 +461,22 @@ void recInit() #endif } + +void recTerm() +{ +#if defined(TARGET_IPHONE) || defined(TARGET_ARM_MAC) + DynCode = nullptr; +#endif +#ifdef FEAT_NO_RWX_PAGES + if (pCodeBuffer != nullptr) + vmem_platform_release_jit_block(DynCode, pCodeBuffer, CodeSize); +#else + if (pCodeBuffer != nullptr && pCodeBuffer != DynCode) + vmem_platform_release_jit_block(pCodeBuffer, CodeSize); +#endif + pCodeBuffer = nullptr; +} + void runStep() { ((void (*)())DynCode)(); diff --git a/core/hw/aica/dsp_x64.cpp b/core/hw/aica/dsp_x64.cpp index d7b8b07dd..f6525c95a 100644 --- a/core/hw/aica/dsp_x64.cpp +++ b/core/hw/aica/dsp_x64.cpp @@ -428,6 +428,18 @@ void recInit() die("vmem_platform_prepare_jit_block failed in x64 dsp"); } +void recTerm() +{ +#ifdef FEAT_NO_RWX_PAGES + if (pCodeBuffer != nullptr) + vmem_platform_release_jit_block(CodeBuffer, pCodeBuffer, CodeBufferSize); +#else + if (pCodeBuffer != nullptr && pCodeBuffer != CodeBuffer) + vmem_platform_release_jit_block(pCodeBuffer, CodeBufferSize); +#endif + pCodeBuffer = nullptr; +} + void runStep() { ((void (*)())&pCodeBuffer[0])(); diff --git a/core/hw/aica/dsp_x86.cpp b/core/hw/aica/dsp_x86.cpp index 9598d921f..d4d9b71bb 100644 --- a/core/hw/aica/dsp_x86.cpp +++ b/core/hw/aica/dsp_x86.cpp @@ -383,6 +383,11 @@ void recInit() die("mprotect failed in x86 dsp"); } +void recTerm() +{ + pCodeBuffer = nullptr; +} + void runStep() { ((void (*)())&CodeBuffer[0])(); diff --git a/core/hw/arm7/arm7.cpp b/core/hw/arm7/arm7.cpp index 531f48666..02434bbd5 100644 --- a/core/hw/arm7/arm7.cpp +++ b/core/hw/arm7/arm7.cpp @@ -104,6 +104,13 @@ void aicaarm::init() } } +void aicaarm::term() +{ +#if FEAT_AREC != DYNAREC_NONE + recompiler::term(); +#endif +} + static void CPUSwitchMode(int mode, bool saveState) { CPUUpdateCPSR(); diff --git a/core/hw/arm7/arm7.h b/core/hw/arm7/arm7.h index e98d84a72..7d7dd660f 100644 --- a/core/hw/arm7/arm7.h +++ b/core/hw/arm7/arm7.h @@ -4,6 +4,7 @@ namespace aicaarm { void init(); +void term(); void reset(); void run(u32 samples); void enable(bool enabled); diff --git a/core/hw/arm7/arm7_rec.cpp b/core/hw/arm7/arm7_rec.cpp index c0af21bc2..f432a6ff7 100644 --- a/core/hw/arm7/arm7_rec.cpp +++ b/core/hw/arm7/arm7_rec.cpp @@ -674,8 +674,6 @@ void init() #endif verify(rc); - icPtr = ICache; - for (int i = 0; i < 256; i++) { int count = 0; @@ -685,6 +683,19 @@ void init() cpuBitsSet[i] = count; } + flush(); +} + +void term() +{ +#ifdef FEAT_NO_RWX_PAGES + if (ICache != nullptr) + vmem_platform_release_jit_block(ARM7_TCB, ICache, ICacheSize); +#else + if (ICache != nullptr && ICache != ARM7_TCB) + vmem_platform_release_jit_block(ICache, ICacheSize); +#endif + ICache = nullptr; } template diff --git a/core/hw/arm7/arm7_rec.h b/core/hw/arm7/arm7_rec.h index 9cdd96ba4..c079b474e 100644 --- a/core/hw/arm7/arm7_rec.h +++ b/core/hw/arm7/arm7_rec.h @@ -421,6 +421,7 @@ protected: namespace recompiler { void init(); +void term(); void flush(); void compile(); void *getMemOp(bool load, bool byte); diff --git a/core/hw/mem/_vmem.h b/core/hw/mem/_vmem.h index 996710cdc..8e73828f8 100644 --- a/core/hw/mem/_vmem.h +++ b/core/hw/mem/_vmem.h @@ -20,14 +20,18 @@ void vmem_platform_create_mappings(const vmem_mapping *vmem_maps, unsigned numma void vmem_platform_destroy(); // Given a block of data in the .text section, prepares it for JIT action. // both code_area and size are page aligned. Returns success. -bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code_area_rwx); +bool vmem_platform_prepare_jit_block(void *code_area, size_t size, void **code_area_rwx); // Same as above but uses two address spaces one with RX and RW protections. // Note: this function doesnt have to be implemented, it's a fallback for the above one. -bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code_area_rw, ptrdiff_t *rx_offset); +bool vmem_platform_prepare_jit_block(void *code_area, size_t size, void **code_area_rw, ptrdiff_t *rx_offset); // This might not need an implementation (ie x86/64 cpus). void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end); // Change a code buffer permissions from r-x to/from rw- void vmem_platform_jit_set_exec(void* code, size_t size, bool enable); +// Release a jit block previously allocated by vmem_platform_prepare_jit_block +void vmem_platform_release_jit_block(void *code_area, size_t size); +// Release a jit block previously allocated by vmem_platform_prepare_jit_block (with dual RW and RX areas) +void vmem_platform_release_jit_block(void *code_area1, void *code_area2, size_t size); // Note: if you want to disable vmem magic in any given platform, implement the // above functions as empty functions and make vmem_platform_init return false. diff --git a/core/hw/sh4/dyna/driver.cpp b/core/hw/sh4/dyna/driver.cpp index 21603481d..f19c9b4ce 100644 --- a/core/hw/sh4/dyna/driver.cpp +++ b/core/hw/sh4/dyna/driver.cpp @@ -23,7 +23,7 @@ #if defined(_WIN32) || FEAT_SHREC != DYNAREC_JIT || defined(TARGET_IPHONE) || defined(TARGET_ARM_MAC) static u8 *SH4_TCB; #else -static u8 SH4_TCB[CODE_SIZE + TEMP_CODE_SIZE + 4096] +alignas(4096) static u8 SH4_TCB[CODE_SIZE + TEMP_CODE_SIZE] #if defined(__unix__) || defined(__SWITCH__) __attribute__((section(".text"))); #elif defined(__APPLE__) @@ -394,24 +394,20 @@ static void recSh4_Init() Get_Sh4Interpreter(&sh4Interp); sh4Interp.Init(); bm_Init(); - if (_nvmem_enabled()) verify(mem_b.data == ((u8*)p_sh4rcb->sq_buffer + 512 + 0x0C000000)); - // Prepare some pointer to the pre-allocated code cache: - void *candidate_ptr = (void*)(((unat)SH4_TCB + 4095) & ~4095); - // Call the platform-specific magic to make the pages RWX - CodeCache = NULL; + CodeCache = nullptr; #ifdef FEAT_NO_RWX_PAGES - bool rc = vmem_platform_prepare_jit_block(candidate_ptr, CODE_SIZE + TEMP_CODE_SIZE, (void**)&CodeCache, &cc_rx_offset); + bool rc = vmem_platform_prepare_jit_block(SH4_TCB, CODE_SIZE + TEMP_CODE_SIZE, (void**)&CodeCache, &cc_rx_offset); #else - bool rc = vmem_platform_prepare_jit_block(candidate_ptr, CODE_SIZE + TEMP_CODE_SIZE, (void**)&CodeCache); + bool rc = vmem_platform_prepare_jit_block(SH4_TCB, CODE_SIZE + TEMP_CODE_SIZE, (void**)&CodeCache); #endif verify(rc); // Ensure the pointer returned is non-null - verify(CodeCache != NULL); + verify(CodeCache != nullptr); TempCodeCache = CodeCache + CODE_SIZE; ngen_init(); @@ -421,6 +417,15 @@ static void recSh4_Init() static void recSh4_Term() { INFO_LOG(DYNAREC, "recSh4 Term"); +#ifdef FEAT_NO_RWX_PAGES + if (CodeCache != nullptr) + vmem_platform_release_jit_block(CodeCache, (u8 *)CodeCache + cc_rx_offset, CODE_SIZE + TEMP_CODE_SIZE); +#else + if (CodeCache != nullptr && CodeCache != SH4_TCB) + vmem_platform_release_jit_block(CodeCache, CODE_SIZE + TEMP_CODE_SIZE); +#endif + CodeCache = nullptr; + TempCodeCache = nullptr; bm_Term(); sh4Interp.Term(); } diff --git a/core/linux/posix_vmem.cpp b/core/linux/posix_vmem.cpp index ef63959fe..022073e74 100644 --- a/core/linux/posix_vmem.cpp +++ b/core/linux/posix_vmem.cpp @@ -150,7 +150,6 @@ static int allocate_shared_filemem(unsigned size) { // Implement vmem initialization for RAM, ARAM, VRAM and SH4 context, fpcb etc. int vmem_fd = -1; -static int shmem_fd2 = -1; static void *reserved_base; static size_t reserved_size; @@ -242,7 +241,7 @@ void vmem_platform_create_mappings(const vmem_mapping *vmem_maps, unsigned numma } // Prepares the code region for JIT operations, thus marking it as RWX -bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code_area_rwx) +bool vmem_platform_prepare_jit_block(void *code_area, size_t size, void **code_area_rwx) { // Try to map is as RWX, this fails apparently on OSX (and perhaps other systems?) if (code_area != nullptr && mem_region_set_exec(code_area, size)) @@ -278,10 +277,16 @@ bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code return true; } +void vmem_platform_release_jit_block(void *code_area, size_t size) +{ + munmap(code_area, size); +} + // Use two addr spaces: need to remap something twice, therefore use allocate_shared_filemem() -bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code_area_rw, ptrdiff_t *rx_offset) { - shmem_fd2 = allocate_shared_filemem(size); - if (shmem_fd2 < 0) +bool vmem_platform_prepare_jit_block(void *code_area, size_t size, void **code_area_rw, ptrdiff_t *rx_offset) +{ + int fd = allocate_shared_filemem(size); + if (fd < 0) return false; // Need to unmap the section we are about to use (it might be already unmapped but nevertheless...) @@ -289,20 +294,31 @@ bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code // Map the RX bits on the code_area, for proximity, as usual. void *ptr_rx = mmap(code_area, size, PROT_READ | PROT_EXEC, - MAP_SHARED | MAP_NOSYNC | MAP_FIXED, shmem_fd2, 0); + MAP_SHARED | MAP_NOSYNC | MAP_FIXED, fd, 0); if (ptr_rx != code_area) + { + close(fd); return false; + } // Now remap the same memory as RW in some location we don't really care at all. void *ptr_rw = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_NOSYNC, shmem_fd2, 0); + MAP_SHARED | MAP_NOSYNC, fd, 0); *code_area_rw = ptr_rw; *rx_offset = (char*)ptr_rx - (char*)ptr_rw; + close(fd); INFO_LOG(DYNAREC, "Info: Using NO_RWX mode, rx ptr: %p, rw ptr: %p, offset: %ld", ptr_rx, ptr_rw, (long)*rx_offset); return (ptr_rw != MAP_FAILED); } + +void vmem_platform_release_jit_block(void *code_area1, void *code_area2, size_t size) +{ + // keep code_area1 (RX) mapped since it's statically allocated + munmap(code_area2, size); +} + #endif // !__SWITCH__ void vmem_platform_jit_set_exec(void* code, size_t size, bool enable) { diff --git a/core/windows/win_vmem.cpp b/core/windows/win_vmem.cpp index 713dddd75..59e327101 100644 --- a/core/windows/win_vmem.cpp +++ b/core/windows/win_vmem.cpp @@ -57,7 +57,7 @@ bool vmem_platform_init(void **vmem_base_addr, void **sh4rcb_addr, size_t ramSiz mapped_regions.reserve(32); // First let's try to allocate the in-memory file - mem_handle = CreateFileMapping(INVALID_HANDLE_VALUE, 0, PAGE_READWRITE, 0, ramSize, 0); + mem_handle = CreateFileMapping(INVALID_HANDLE_VALUE, 0, PAGE_READWRITE, 0, (DWORD)ramSize, 0); // Now allocate the actual address space (it will be 64KB aligned on windows). unsigned memsize = 512*1024*1024 + sizeof(Sh4RCB) + ARAM_SIZE_MAX; @@ -114,7 +114,7 @@ void vmem_platform_create_mappings(const vmem_mapping *vmem_maps, unsigned numma unmapped_regions.clear(); for (unsigned i = 0; i < nummaps; i++) { - unsigned address_range_size = vmem_maps[i].end_address - vmem_maps[i].start_address; + size_t address_range_size = vmem_maps[i].end_address - vmem_maps[i].start_address; DWORD protection = vmem_maps[i].allow_writes ? (FILE_MAP_READ | FILE_MAP_WRITE) : FILE_MAP_READ; if (!vmem_maps[i].memsize) { @@ -125,14 +125,14 @@ void vmem_platform_create_mappings(const vmem_mapping *vmem_maps, unsigned numma } else { // Calculate the number of mirrors - unsigned num_mirrors = (address_range_size) / vmem_maps[i].memsize; + unsigned num_mirrors = (unsigned)(address_range_size / vmem_maps[i].memsize); verify((address_range_size % vmem_maps[i].memsize) == 0 && num_mirrors >= 1); // Remap the views one by one for (unsigned j = 0; j < num_mirrors; j++) { - unsigned offset = vmem_maps[i].start_address + j * vmem_maps[i].memsize; + size_t offset = vmem_maps[i].start_address + j * vmem_maps[i].memsize; - void *ptr = MapViewOfFileEx(mem_handle, protection, 0, vmem_maps[i].memoffset, + void *ptr = MapViewOfFileEx(mem_handle, protection, 0, (DWORD)vmem_maps[i].memoffset, vmem_maps[i].memsize, &virt_ram_base[offset]); verify(ptr == &virt_ram_base[offset]); mapped_regions.push_back(ptr); @@ -142,15 +142,14 @@ void vmem_platform_create_mappings(const vmem_mapping *vmem_maps, unsigned numma #endif } -typedef void* (*mapper_fn) (void *addr, unsigned size); - -// This is a templated function since it's used twice -static void* vmem_platform_prepare_jit_block_template(void *code_area, unsigned size, mapper_fn mapper) { +template +static void *vmem_platform_prepare_jit_block_template(size_t size, Mapper mapper) +{ // Several issues on Windows: can't protect arbitrary pages due to (I guess) the way // kernel tracks mappings, so only stuff that has been allocated with VirtualAlloc can be // protected (the entire allocation IIUC). - // Strategy: ignore code_area and allocate a new one. Protect it properly. + // Strategy: Allocate a new region. Protect it properly. // More issues: the area should be "close" to the .text stuff so that code gen works. // Remember that on x64 we have 4 byte jump/load offset immediates, no issues on x86 :D @@ -177,7 +176,7 @@ static void* vmem_platform_prepare_jit_block_template(void *code_area, unsigned return NULL; } -static void* mem_alloc(void *addr, unsigned size) +static void* mem_alloc(void *addr, size_t size) { #ifdef TARGET_UWP // rwx is not allowed. Need to switch between r-x and rw- @@ -188,9 +187,10 @@ static void* mem_alloc(void *addr, unsigned size) } // Prepares the code region for JIT operations, thus marking it as RWX -bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code_area_rwx) { +bool vmem_platform_prepare_jit_block(void *, size_t size, void **code_area_rwx) +{ // Get the RWX page close to the code_area - void *ptr = vmem_platform_prepare_jit_block_template(code_area, size, &mem_alloc); + void *ptr = vmem_platform_prepare_jit_block_template(size, mem_alloc); if (!ptr) return false; @@ -202,8 +202,12 @@ bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code return true; } +void vmem_platform_release_jit_block(void *code_area, size_t) +{ + VirtualFree(code_area, 0, MEM_RELEASE); +} -static void* mem_file_map(void *addr, unsigned size) +static void* mem_file_map(void *addr, size_t size) { // Maps the entire file at the specified addr. void *ptr = VirtualAlloc(addr, size, MEM_RESERVE, PAGE_NOACCESS); @@ -221,12 +225,12 @@ static void* mem_file_map(void *addr, unsigned size) } // Use two addr spaces: need to remap something twice, therefore use CreateFileMapping() -bool vmem_platform_prepare_jit_block(void* code_area, unsigned size, void** code_area_rw, ptrdiff_t* rx_offset) +bool vmem_platform_prepare_jit_block(void *, size_t size, void** code_area_rw, ptrdiff_t* rx_offset) { - mem_handle2 = CreateFileMapping(INVALID_HANDLE_VALUE, 0, PAGE_EXECUTE_READWRITE, 0, size, 0); + mem_handle2 = CreateFileMapping(INVALID_HANDLE_VALUE, 0, PAGE_EXECUTE_READWRITE, 0, (DWORD)size, 0); // Get the RX page close to the code_area - void* ptr_rx = vmem_platform_prepare_jit_block_template(code_area, size, &mem_file_map); + void* ptr_rx = vmem_platform_prepare_jit_block_template(size, mem_file_map); if (!ptr_rx) return false; @@ -244,6 +248,17 @@ bool vmem_platform_prepare_jit_block(void* code_area, unsigned size, void** code return (ptr_rw != NULL); } +void vmem_platform_release_jit_block(void *code_area1, void *code_area2, size_t) +{ + UnmapViewOfFile(code_area1); + UnmapViewOfFile(code_area2); + // FIXME the same handle is used for all allocations, and thus leaks. + // And the last opened handle is closed multiple times. + // But windows doesn't need separate RW and RX areas except perhaps UWP + // instead of switching back and forth between RX and RW + CloseHandle(mem_handle2); +} + void vmem_platform_jit_set_exec(void* code, size_t size, bool enable) { #ifdef TARGET_UWP