// Implementation of the vmem related function for POSIX-like platforms.
// There's some minimal amount of platform specific hacks to support
// Android and OSX since they are slightly different in some areas.

#include "types.h"

#ifndef __SWITCH__
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>

#include "hw/mem/_vmem.h"
#include "hw/sh4/sh4_if.h"
#include "stdclass.h"

#ifndef MAP_NOSYNC
#define MAP_NOSYNC 0 //missing from linux :/ -- could be the cause of android slowness ?
#endif

#ifdef __ANDROID__
#include <linux/ashmem.h>

#ifndef ASHMEM_DEVICE
#define ASHMEM_DEVICE "/dev/ashmem"
#undef PAGE_MASK
#define PAGE_MASK (PAGE_SIZE-1)
#else
#define PAGE_SIZE 4096
#define PAGE_MASK (PAGE_SIZE-1)
#endif

// Only available in SDK 26+. Required in SDK 29+ (android 10)
extern "C" int __attribute__((weak)) ASharedMemory_create(const char*, size_t);

// Android specific ashmem-device stuff for creating shared memory regions
int ashmem_create_region(const char *name, size_t size)
{
	if (ASharedMemory_create != nullptr)
		return ASharedMemory_create(name, size);

	int fd = open(ASHMEM_DEVICE, O_RDWR);
	if (fd < 0)
		return -1;

	if (ioctl(fd, ASHMEM_SET_SIZE, size) < 0)
	{
		close(fd);
		return -1;
	}
	return fd;
}
#endif	// #ifdef __ANDROID__

bool mem_region_lock(void *start, size_t len)
{
	size_t inpage = (uintptr_t)start & PAGE_MASK;
	if (mprotect((u8*)start - inpage, len + inpage, PROT_READ))
		die("mprotect failed...");
	return true;
}

bool mem_region_unlock(void *start, size_t len)
{
	size_t inpage = (uintptr_t)start & PAGE_MASK;
	if (mprotect((u8*)start - inpage, len + inpage, PROT_READ | PROT_WRITE))
		// Add some way to see why it failed? gdb> info proc mappings
		die("mprotect failed...");
	return true;
}

bool mem_region_set_exec(void *start, size_t len)
{
	size_t inpage = (uintptr_t)start & PAGE_MASK;
	int protFlags = PROT_READ | PROT_EXEC;
#ifndef TARGET_IPHONE
	protFlags |= PROT_WRITE;
#endif
	if (mprotect((u8*)start - inpage, len + inpage, protFlags))
	{
		WARN_LOG(VMEM, "mem_region_set_exec: mprotect failed. errno %d", errno);
		return false;
	}
	return true;
}

static void *mem_region_reserve(void *start, size_t len)
{
	void *p = mmap(start, len, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (p == MAP_FAILED)
	{
		perror("mmap");
		return NULL;
	}
	else
		return p;
}

static bool mem_region_release(void *start, size_t len)
{
	return munmap(start, len) == 0;
}

static void *mem_region_map_file(void *file_handle, void *dest, size_t len, size_t offset, bool readwrite)
{
	int flags = MAP_SHARED | MAP_NOSYNC | (dest != NULL ? MAP_FIXED : 0);
	void *p = mmap(dest, len, PROT_READ | (readwrite ? PROT_WRITE : 0), flags, (int)(uintptr_t)file_handle, offset);
	if (p == MAP_FAILED)
	{
		perror("mmap");
		return NULL;
	}
	else
		return p;
}

// Allocates memory via a fd on shmem/ashmem or even a file on disk
static int allocate_shared_filemem(unsigned size)
{
	int fd = -1;
#if defined(__ANDROID__)
	// Use Android's specific shmem stuff.
	fd = ashmem_create_region("RAM", size);
#else
	#if !defined(__APPLE__)
	fd = shm_open("/dcnzorz_mem", O_CREAT | O_EXCL | O_RDWR, S_IREAD | S_IWRITE);
	shm_unlink("/dcnzorz_mem");
	#endif

	// if shmem does not work (or using OSX) fallback to a regular file on disk
	if (fd < 0)
	{
		std::string path = get_writable_data_path("dcnzorz_mem");
		fd = open(path.c_str(), O_CREAT|O_RDWR|O_TRUNC, S_IRWXU|S_IRWXG|S_IRWXO);
		unlink(path.c_str());
	}
	// If we can't open the file, fallback to slow mem.
	if (fd < 0)
		return -1;

	// Finally make the file as big as we need!
	if (ftruncate(fd, size))
	{
		// Can't get as much memory as needed, fallback.
		close(fd);
		return -1;
	}
#endif

	return fd;
}

// Implement vmem initialization for RAM, ARAM, VRAM and SH4 context, fpcb etc.
// The function supports allocating 512MB or 4GB addr spaces.

int vmem_fd = -1;
static int shmem_fd2 = -1;

static void *reserved_base;
static size_t reserved_size;

// vmem_base_addr points to an address space of 512MB (or 4GB) that can be used for fast memory ops.
// In negative offsets of the pointer (up to FPCB size, usually 65/129MB) the context and jump table
// can be found. If the platform init returns error, the user is responsible for initializing the
// memory using a fallback (that is, regular mallocs and falling back to slow memory JIT).
VMemType vmem_platform_init(void **vmem_base_addr, void **sh4rcb_addr)
{
	// First let's try to allocate the shm-backed memory
	vmem_fd = allocate_shared_filemem(RAM_SIZE_MAX + VRAM_SIZE_MAX + ARAM_SIZE_MAX);
	if (vmem_fd < 0)
		return MemTypeError;

	// Now try to allocate a contiguous piece of memory.
	VMemType rv;
#if HOST_CPU == CPU_X64 || HOST_CPU == CPU_ARM64
	reserved_size = 0x100000000L + sizeof(Sh4RCB) + 0x10000;	// 4GB + context size + 64K padding
	reserved_base = mem_region_reserve(NULL, reserved_size);
	rv = MemType4GB;
#endif
	if (reserved_base == NULL)
	{
		reserved_size = 512*1024*1024 + sizeof(Sh4RCB) + ARAM_SIZE_MAX + 0x10000;
		reserved_base = mem_region_reserve(NULL, reserved_size);
		if (!reserved_base)
		{
			close(vmem_fd);
			return MemTypeError;
		}
		rv = MemType512MB;
	}

	// Align pointer to 64KB too, some Linaro bug (no idea but let's just be safe I guess).
	uintptr_t ptrint = (uintptr_t)reserved_base;
	ptrint = (ptrint + 0x10000 - 1) & (~0xffff);
	*sh4rcb_addr = (void*)ptrint;
	*vmem_base_addr = (void*)(ptrint + sizeof(Sh4RCB));
	const size_t fpcb_size = sizeof(((Sh4RCB *)NULL)->fpcb);
	void *sh4rcb_base_ptr = (void*)(ptrint + fpcb_size);

	// Now map the memory for the SH4 context, do not include FPCB on purpose (paged on demand).
	mem_region_unlock(sh4rcb_base_ptr, sizeof(Sh4RCB) - fpcb_size);

	return rv;
}

// Just tries to wipe as much as possible in the relevant area.
void vmem_platform_destroy()
{
	if (reserved_base != NULL)
		mem_region_release(reserved_base, reserved_size);
}

// Resets a chunk of memory by deleting its data and setting its protection back.
void vmem_platform_reset_mem(void *ptr, unsigned size_bytes)
{
	// Mark them as non accessible.
	mprotect(ptr, size_bytes, PROT_NONE);
	// Tell the kernel to flush'em all (FIXME: perhaps unmap+mmap 'd be better?)
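	// MADV_DONTNEED lets the kernel drop the resident pages right away; where
	// available, MADV_REMOVE (Linux) also releases the backing store for the
	// range, and MADV_FREE is the lazy-free fallback on systems without it (e.g. OSX).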
	madvise(ptr, size_bytes, MADV_DONTNEED);
#if defined(MADV_REMOVE)
	madvise(ptr, size_bytes, MADV_REMOVE);
#elif defined(MADV_FREE)
	madvise(ptr, size_bytes, MADV_FREE);
#endif
}

// Allocates a bunch of memory (page aligned and page-sized)
void vmem_platform_ondemand_page(void *address, unsigned size_bytes)
{
	verify(mem_region_unlock(address, size_bytes));
}

// Creates mappings to the underlying file including mirroring sections
void vmem_platform_create_mappings(const vmem_mapping *vmem_maps, unsigned nummaps)
{
	for (unsigned i = 0; i < nummaps; i++)
	{
		// Ignore unmapped stuff, it is already reserved as PROT_NONE
		if (!vmem_maps[i].memsize)
			continue;

		// Calculate the number of mirrors
		u64 address_range_size = vmem_maps[i].end_address - vmem_maps[i].start_address;
		unsigned num_mirrors = (address_range_size) / vmem_maps[i].memsize;
		verify((address_range_size % vmem_maps[i].memsize) == 0 && num_mirrors >= 1);

		for (unsigned j = 0; j < num_mirrors; j++)
		{
			u64 offset = vmem_maps[i].start_address + j * vmem_maps[i].memsize;
			verify(mem_region_map_file((void*)(uintptr_t)vmem_fd, &virt_ram_base[offset],
					vmem_maps[i].memsize, vmem_maps[i].memoffset, vmem_maps[i].allow_writes) != NULL);
		}
	}
}

// Prepares the code region for JIT operations, thus marking it as RWX
bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code_area_rwx)
{
	// Try to map it as RWX, this fails apparently on OSX (and perhaps other systems?)
	if (code_area != nullptr && mem_region_set_exec(code_area, size))
	{
		// Pointer location should be same:
		*code_area_rwx = code_area;
		return true;
	}

#ifndef TARGET_ARM_MAC
	void *ret_ptr = MAP_FAILED;
	if (code_area != nullptr)
	{
		// Well it failed, use another approach, unmap the memory area and remap it back.
		// Seems it works well on Darwin according to reicast code :P
		munmap(code_area, size);
		ret_ptr = mmap(code_area, size, PROT_READ | PROT_WRITE | PROT_EXEC,
				MAP_FIXED | MAP_PRIVATE | MAP_ANON, -1, 0);
	}
	if (ret_ptr == MAP_FAILED)
	{
		// mmap at the requested code_area location failed, so let the OS pick one for us
		ret_ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC,
				MAP_PRIVATE | MAP_ANON, -1, 0);
		if (ret_ptr == MAP_FAILED)
			return false;
	}
#else
	// MAP_JIT and toggleable write protection is required on Apple Silicon
	// Cannot use MAP_FIXED with MAP_JIT
	void *ret_ptr = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC,
			MAP_PRIVATE | MAP_ANON | MAP_JIT, -1, 0);
	if (ret_ptr == MAP_FAILED)
		return false;
#endif
	*code_area_rwx = ret_ptr;
	return true;
}

// Use two addr spaces: need to remap something twice, therefore use allocate_shared_filemem()
bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code_area_rw, ptrdiff_t *rx_offset)
{
	shmem_fd2 = allocate_shared_filemem(size);
	if (shmem_fd2 < 0)
		return false;

	// Need to unmap the section we are about to use (it might be already unmapped but nevertheless...)
	munmap(code_area, size);

	// Map the RX bits on the code_area, for proximity, as usual.
	void *ptr_rx = mmap(code_area, size, PROT_READ | PROT_EXEC,
			MAP_SHARED | MAP_NOSYNC | MAP_FIXED, shmem_fd2, 0);
	if (ptr_rx != code_area)
		return false;

	// Now remap the same memory as RW in some location we don't really care at all.
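	// Both views are backed by the same shared fd, so anything written through
	// the RW mapping below becomes immediately visible through the RX mapping above.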
	void *ptr_rw = mmap(NULL, size, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_NOSYNC, shmem_fd2, 0);
	*code_area_rw = ptr_rw;
	*rx_offset = (char*)ptr_rx - (char*)ptr_rw;
	INFO_LOG(DYNAREC, "Info: Using NO_RWX mode, rx ptr: %p, rw ptr: %p, offset: %ld", ptr_rx, ptr_rw, (long)*rx_offset);

	return (ptr_rw != MAP_FAILED);
}
#endif	// !__SWITCH__

// Some OSes restrict cache flushing, cause why not right? :D

#if HOST_CPU == CPU_ARM64

#if defined(__APPLE__)
#include <libkern/OSCacheControl.h>
#endif

// Code borrowed from Dolphin https://github.com/dolphin-emu/dolphin
static void Arm64_CacheFlush(void* start, void* end)
{
	if (start == end)
		return;

#if defined(__APPLE__)
	// Header file says this is equivalent to: sys_icache_invalidate(start, end - start);
	sys_cache_control(kCacheFunctionPrepareForExecution, start, (uintptr_t)end - (uintptr_t)start);
#else
	// Don't rely on GCC's __clear_cache implementation, as it caches
	// icache/dcache cache line sizes, that can vary between cores on
	// big.LITTLE architectures.
	u64 addr, ctr_el0;
	static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff;
	size_t isize, dsize;

	__asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0));
	isize = 4 << ((ctr_el0 >> 0) & 0xf);
	dsize = 4 << ((ctr_el0 >> 16) & 0xf);

	// use the global minimum cache line size
	icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize;
	dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize;

	addr = (u64)start & ~(u64)(dsize - 1);
	for (; addr < (u64)end; addr += dsize)
		// use "civac" instead of "cvau", as this is the suggested workaround for
		// Cortex-A53 errata 819472, 826319, 827319 and 824069.
		__asm__ volatile("dc civac, %0" : : "r"(addr) : "memory");
	__asm__ volatile("dsb ish" : : : "memory");

	addr = (u64)start & ~(u64)(isize - 1);
	for (; addr < (u64)end; addr += isize)
		__asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory");

	__asm__ volatile("dsb ish" : : : "memory");
	__asm__ volatile("isb" : : : "memory");
#endif
}

void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end)
{
	Arm64_CacheFlush(dcache_start, dcache_end);

	// Don't risk it and flush and invalidate icache&dcache for both ranges just in case.
	if (icache_start != dcache_start)
		Arm64_CacheFlush(icache_start, icache_end);
}

#elif HOST_CPU == CPU_ARM

#if defined(__APPLE__)
#include <libkern/OSCacheControl.h>

static void CacheFlush(void* code, void* pEnd)
{
	sys_dcache_flush(code, (u8*)pEnd - (u8*)code + 1);
	sys_icache_invalidate(code, (u8*)pEnd - (u8*)code + 1);
}
#elif !defined(ARMCC)

#ifdef __ANDROID__
#include <sys/syscall.h>	// for cache flushing.
#endif

static void CacheFlush(void* code, void* pEnd)
{
#if !defined(__ANDROID__)
#ifdef __GNUC__
	__builtin___clear_cache((char *)code, (char *)pEnd);
#else
	__clear_cache((void*)code, pEnd);
#endif
#else // defined(__ANDROID__)
	void* start = code;
	size_t size = (u8*)pEnd - (u8*)start + 4;

	// Ideally, we would call
	//   syscall(__ARM_NR_cacheflush, start,
	//           reinterpret_cast<intptr_t>(start) + size, 0);
	// however, syscall(int, ...) is not supported on all platforms, especially
	// not when using EABI, so we call the __ARM_NR_cacheflush syscall directly.

	register uint32_t beg asm("a1") = reinterpret_cast<uint32_t>(start);
	register uint32_t end asm("a2") = reinterpret_cast<uint32_t>(start) + size;
	register uint32_t flg asm("a3") = 0;

#ifdef __ARM_EABI__
	#if defined (__arm__) && !defined(__thumb__)
		// __arm__ may be defined in thumb mode.
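		// Under the EABI the syscall number is passed in r7; in ARM mode r7 can be
		// used directly here (the thumb case below has to save/restore it).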
		register uint32_t scno asm("r7") = __ARM_NR_cacheflush;
		asm volatile(
			"svc 0x0"
			: "=r" (beg)
			: "0" (beg), "r" (end), "r" (flg), "r" (scno));
	#else
		// r7 is reserved by the EABI in thumb mode.
		asm volatile(
			"@   Enter ARM Mode  \n\t"
			"adr r3, 1f      \n\t"
			"bx  r3          \n\t"
			".ALIGN 4        \n\t"
			".ARM            \n"
			"1:  push {r7}   \n\t"
			"mov r7, %4      \n\t"
			"svc 0x0         \n\t"
			"pop {r7}        \n\t"
			"@   Enter THUMB Mode\n\t"
			"adr r3, 2f+1    \n\t"
			"bx  r3          \n\t"
			".THUMB          \n"
			"2:              \n\t"
			: "=r" (beg)
			: "0" (beg), "r" (end), "r" (flg), "r" (__ARM_NR_cacheflush)
			: "r3");
	#endif	// !defined (__arm__) || defined(__thumb__)
#else	// ! __ARM_EABI__
	#if defined (__arm__) && !defined(__thumb__)
		// __arm__ may be defined in thumb mode.
		asm volatile(
			"svc %1"
			: "=r" (beg)
			: "i" (__ARM_NR_cacheflush), "0" (beg), "r" (end), "r" (flg));
	#else
		// Do not use the value of __ARM_NR_cacheflush in the inline assembly
		// below, because the thumb mode value would be used, which would be
		// wrong, since we switch to ARM mode before executing the svc instruction
		asm volatile(
			"@   Enter ARM Mode  \n\t"
			"adr r3, 1f      \n\t"
			"bx  r3          \n\t"
			".ALIGN 4        \n\t"
			".ARM            \n"
			"1:  svc 0x9f0002 \n"
			"@   Enter THUMB Mode\n\t"
			"adr r3, 2f+1    \n\t"
			"bx  r3          \n\t"
			".THUMB          \n"
			"2:              \n\t"
			: "=r" (beg)
			: "0" (beg), "r" (end), "r" (flg)
			: "r3");
	#endif	// !defined (__arm__) || defined(__thumb__)
#endif	// !__ARM_EABI__

#if 0
	const int syscall = 0xf0002;
	__asm __volatile (
		"mov	r0, %0\n"
		"mov	r1, %1\n"
		"mov	r7, %2\n"
		"mov	r2, #0x0\n"
		"svc	0x00000000\n"
		:
		: "r" (code), "r" (pEnd), "r" (syscall)
		: "r0", "r1", "r7");
#endif
#endif // defined(__ANDROID__)
}
#else	// defined(ARMCC)
asm static void CacheFlush(void* code, void* pEnd)
{
	ARM
	push {r7}
	//add r1, r1, r0
	mov r7, #0xf0000
	add r7, r7, #0x2
	mov r2, #0x0
	svc #0x0
	pop {r7}
	bx lr
}
#endif

void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end)
{
	CacheFlush(icache_start, icache_end);
}

#endif	// #if HOST_CPU == CPU_ARM