From aa4fc8dd601e5a832318bc907115a40ac7535a04 Mon Sep 17 00:00:00 2001
From: David Guillen Fandos
Date: Sun, 12 May 2019 23:25:43 +0200
Subject: [PATCH] Move cache invalidation to vmem_platform and implement
 NO_RWX on arm64

Not functional yet: this commit breaks arm64 on Android, for instance.
Rewrites are not figured out yet; they will be pushed in the next commit.
---
 core/hw/aica/dsp_arm64.cpp   | 14 +++----
 core/hw/arm7/arm64.cpp       |  8 ++--
 core/hw/mem/_vmem.cpp        |  1 +
 core/hw/mem/_vmem.h          |  2 +
 core/linux/posix_vmem.cpp    | 56 +++++++++++++++++++++++++++-
 core/rec-ARM64/rec_arm64.cpp | 71 ++++++++++++------------------------
 core/windows/win_vmem.cpp    |  2 +-
 7 files changed, 95 insertions(+), 59 deletions(-)

diff --git a/core/hw/aica/dsp_arm64.cpp b/core/hw/aica/dsp_arm64.cpp
index ba5515ed4..cac7b4c71 100644
--- a/core/hw/aica/dsp_arm64.cpp
+++ b/core/hw/aica/dsp_arm64.cpp
@@ -27,7 +27,7 @@
 #include "deps/vixl/aarch64/macro-assembler-aarch64.h"
 using namespace vixl::aarch64;
 
-extern void Arm64CacheFlush(void* start, void* end);
+extern void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end);
 
 class DSPAssembler : public MacroAssembler
 {
@@ -54,9 +54,9 @@ public:
 			Stp(xzr, xzr, MemOperand(x0, 48));
 			Ret();
 			FinalizeCode();
-#ifdef _ANDROID
-			Arm64CacheFlush(GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
-#endif
+			vmem_platform_flush_cache(
+				GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>(),
+				GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
 
 			return;
 		}
@@ -387,9 +387,9 @@ public:
 #endif
 
 		FinalizeCode();
-#ifdef _ANDROID
-		Arm64CacheFlush(GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
-#endif
+		vmem_platform_flush_cache(
+			GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>(),
+			GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
 	}
 
 private:
diff --git a/core/hw/arm7/arm64.cpp b/core/hw/arm7/arm64.cpp
index 89e57ab66..e712855bd 100644
--- a/core/hw/arm7/arm64.cpp
+++ b/core/hw/arm7/arm64.cpp
@@ -28,7 +28,7 @@ using namespace vixl::aarch64;
 
 //#include "deps/vixl/aarch32/disasm-aarch32.h"
 
-extern void Arm64CacheFlush(void* start, void* end);
+extern void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end);
 extern u32 arm_single_op(u32 opcode);
 extern "C" void arm_dispatch();
 extern "C" void arm_exit();
@@ -41,7 +41,7 @@ extern reg_pair arm_Reg[RN_ARM_REG_COUNT];
 MacroAssembler *assembler;
 
 extern "C" void armFlushICache(void *bgn, void *end) {
-	Arm64CacheFlush(bgn, end);
+	vmem_platform_flush_cache(bgn, end, bgn, end);
 }
 
 static MemOperand arm_reg_operand(u32 regn)
@@ -143,7 +143,9 @@ void armv_end(void* codestart, u32 cycl)
 
 	assembler->FinalizeCode();
 	verify(assembler->GetBuffer()->GetCursorOffset() <= assembler->GetBuffer()->GetCapacity());
-	Arm64CacheFlush(codestart, assembler->GetBuffer()->GetEndAddress<void*>());
+	vmem_platform_flush_cache(
+		codestart, assembler->GetBuffer()->GetEndAddress<void*>(),
+		codestart, assembler->GetBuffer()->GetEndAddress<void*>());
 	icPtr += assembler->GetBuffer()->GetSizeInBytes();
 
 #if 0
diff --git a/core/hw/mem/_vmem.cpp b/core/hw/mem/_vmem.cpp
index 3fbfdf6b5..cabe7941c 100644
--- a/core/hw/mem/_vmem.cpp
+++ b/core/hw/mem/_vmem.cpp
@@ -469,6 +469,7 @@ bool _vmem_reserve() {
 	}
 	else {
 		printf("Info: nvmem is enabled, with addr space of size %s\n", vmemstatus == MemType4GB ? "4GB" : "512MB");
+		printf("Info: p_sh4rcb: %p virt_ram_base: %p\n", p_sh4rcb, virt_ram_base);
 		// Map the different parts of the memory file into the new memory range we got.
 		#define MAP_RAM_START_OFFSET  0
 		#define MAP_VRAM_START_OFFSET (MAP_RAM_START_OFFSET+RAM_SIZE)
diff --git a/core/hw/mem/_vmem.h b/core/hw/mem/_vmem.h
index 394194146..dca2c09e3 100644
--- a/core/hw/mem/_vmem.h
+++ b/core/hw/mem/_vmem.h
@@ -30,6 +30,8 @@ bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code
 // Same as above but uses two address spaces one with RX and RW protections.
 // Note: this function doesnt have to be implemented, it's a fallback for the above one.
 bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code_area_rw, uintptr_t *rx_offset);
+// This might not need an implementation (e.g. x86/x64 CPUs).
+void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end);
 
 // Note: if you want to disable vmem magic in any given platform, implement the
 // above functions as empty functions and make vmem_platform_init return MemTypeError.
diff --git a/core/linux/posix_vmem.cpp b/core/linux/posix_vmem.cpp
index 8547955e1..9a573cdae 100644
--- a/core/linux/posix_vmem.cpp
+++ b/core/linux/posix_vmem.cpp
@@ -215,9 +215,63 @@ bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code
 
 	*code_area_rw = ptr_rw;
 	*rx_offset = (char*)ptr_rx - (char*)ptr_rw;
 
-	printf("Info: Using NO_RWX mode, rx ptr: %p, rw ptr: %p, offset: %p\n", ptr_rx, ptr_rw, *rx_offset);
+	printf("Info: Using NO_RWX mode, rx ptr: %p, rw ptr: %p, offset: %lu\n", ptr_rx, ptr_rw, (unsigned long)*rx_offset);
 
 	return (ptr_rw != MAP_FAILED);
 }
 
+
+// Some OSes restrict cache flushing, cause why not right? :D
+
+#if HOST_CPU == CPU_ARM64
+
+// Code borrowed from Dolphin https://github.com/dolphin-emu/dolphin
+static void Arm64_CacheFlush(void* start, void* end) {
+	if (start == end)
+		return;
+
+#if HOST_OS == OS_DARWIN
+	// Header file says this is equivalent to: sys_icache_invalidate(start, end - start);
+	sys_cache_control(kCacheFunctionPrepareForExecution, start, end - start);
+#else
+	// Don't rely on GCC's __clear_cache implementation, as it caches
+	// icache/dcache cache line sizes, that can vary between cores on
+	// big.LITTLE architectures.
+	u64 addr, ctr_el0;
+	static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff;
+	size_t isize, dsize;
+
+	__asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0));
+	isize = 4 << ((ctr_el0 >> 0) & 0xf);
+	dsize = 4 << ((ctr_el0 >> 16) & 0xf);
+
+	// use the global minimum cache line size
+	icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize;
+	dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize;
+
+	addr = (u64)start & ~(u64)(dsize - 1);
+	for (; addr < (u64)end; addr += dsize)
+		// use "civac" instead of "cvau", as this is the suggested workaround for
+		// Cortex-A53 errata 819472, 826319, 827319 and 824069.
+		__asm__ volatile("dc civac, %0" : : "r"(addr) : "memory");
+	__asm__ volatile("dsb ish" : : : "memory");
+
+	addr = (u64)start & ~(u64)(isize - 1);
+	for (; addr < (u64)end; addr += isize)
+		__asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory");
+
+	__asm__ volatile("dsb ish" : : : "memory");
+	__asm__ volatile("isb" : : : "memory");
+#endif
+}
+
+
+void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end) {
+	Arm64_CacheFlush(dcache_start, dcache_end);
+
+	// Don't risk it: flush and invalidate icache & dcache for both ranges, just in case.
+	if (icache_start != dcache_start)
+		Arm64_CacheFlush(icache_start, icache_end);
+}
+
+#endif // #if HOST_CPU == CPU_ARM64
diff --git a/core/rec-ARM64/rec_arm64.cpp b/core/rec-ARM64/rec_arm64.cpp
index 2a9108349..03795ab13 100644
--- a/core/rec-ARM64/rec_arm64.cpp
+++ b/core/rec-ARM64/rec_arm64.cpp
@@ -45,12 +45,11 @@ using namespace vixl::aarch64;
 extern "C" void no_update();
 extern "C" void intc_sched();
 extern "C" void ngen_blockcheckfail(u32 pc);
-
 extern "C" void ngen_LinkBlock_Generic_stub();
 extern "C" void ngen_LinkBlock_cond_Branch_stub();
 extern "C" void ngen_LinkBlock_cond_Next_stub();
-
 extern "C" void ngen_FailedToFindBlock_();
+extern void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end);
 
 struct DynaRBI : RuntimeBlockInfo
 {
@@ -61,47 +60,6 @@ struct DynaRBI : RuntimeBlockInfo
 	}
 };
 
-// Code borrowed from Dolphin https://github.com/dolphin-emu/dolphin
-void Arm64CacheFlush(void* start, void* end)
-{
-	if (start == end)
-		return;
-
-#if HOST_OS == OS_DARWIN
-	// Header file says this is equivalent to: sys_icache_invalidate(start, end - start);
-	sys_cache_control(kCacheFunctionPrepareForExecution, start, end - start);
-#else
-	// Don't rely on GCC's __clear_cache implementation, as it caches
-	// icache/dcache cache line sizes, that can vary between cores on
-	// big.LITTLE architectures.
-	u64 addr, ctr_el0;
-	static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff;
-	size_t isize, dsize;
-
-	__asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0));
-	isize = 4 << ((ctr_el0 >> 0) & 0xf);
-	dsize = 4 << ((ctr_el0 >> 16) & 0xf);
-
-	// use the global minimum cache line size
-	icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize;
-	dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize;
-
-	addr = (u64)start & ~(u64)(dsize - 1);
-	for (; addr < (u64)end; addr += dsize)
-		// use "civac" instead of "cvau", as this is the suggested workaround for
-		// Cortex-A53 errata 819472, 826319, 827319 and 824069.
-		__asm__ volatile("dc civac, %0" : : "r"(addr) : "memory");
-	__asm__ volatile("dsb ish" : : : "memory");
-
-	addr = (u64)start & ~(u64)(isize - 1);
-	for (; addr < (u64)end; addr += isize)
-		__asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory");
-
-	__asm__ volatile("dsb ish" : : : "memory");
-	__asm__ volatile("isb" : : : "memory");
-#endif
-}
-
 double host_cpu_time;
 u64 guest_cpu_cycles;
 
@@ -147,7 +105,7 @@ __asm__
 "ngen_LinkBlock_Shared_stub:	\n\t"
 	"mov x0, lr					\n\t"
 	"sub x0, x0, #4				\n\t"	// go before the call
-	"bl rdv_LinkBlock			\n\t"
+	"bl rdv_LinkBlock			\n\t"	// returns an RX addr
 	"br x0						\n"
 
 ".hidden ngen_FailedToFindBlock_	\n\t"
@@ -1013,7 +971,7 @@ public:
 
 			Ldr(w29, sh4_context_mem_operand(&next_pc));
 
-			GenBranch(no_update);
+			GenBranchRuntime(no_update);
 			break;
 
 		default:
@@ -1038,7 +996,12 @@ public:
 			emit_Skip(block->host_code_size);
 		}
 
-		Arm64CacheFlush(GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
+
+		// Flush and invalidate caches
+		vmem_platform_flush_cache(
+			CC_RW2RX(GetBuffer()->GetStartAddress<void*>()), CC_RW2RX(GetBuffer()->GetEndAddress<void*>()),
+			GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
+
 #if 0
 //		if (rewrite)
 		{
@@ -1060,10 +1023,13 @@ public:
 	}
 
 private:
+	// Runtime branches/calls need to be adjusted if rx space is different from rw space.
+	// Therefore can't mix GenBranch with GenBranchRuntime!
+
 	template <typename R, typename... P>
 	void GenCallRuntime(R (*function)(P...))
 	{
-		ptrdiff_t offset = reinterpret_cast<uintptr_t>(function) - GetBuffer()->GetStartAddress<uintptr_t>();
+		ptrdiff_t offset = reinterpret_cast<uintptr_t>(function) - reinterpret_cast<uintptr_t>(CC_RW2RX(GetBuffer()->GetStartAddress<void*>()));
 		verify(offset >= -128 * 1024 * 1024 && offset <= 128 * 1024 * 1024);
 		verify((offset & 3) == 0);
 		Label function_label;
@@ -1071,6 +1037,17 @@ private:
 		Bl(&function_label);
 	}
 
+	template <typename R, typename... P>
+	void GenBranchRuntime(R (*target)(P...))
+	{
+		ptrdiff_t offset = reinterpret_cast<uintptr_t>(target) - reinterpret_cast<uintptr_t>(CC_RW2RX(GetBuffer()->GetStartAddress<void*>()));
+		verify(offset >= -128 * 1024 * 1024 && offset <= 128 * 1024 * 1024);
+		verify((offset & 3) == 0);
+		Label target_label;
+		BindToOffset(&target_label, offset);
+		B(&target_label);
+	}
+
 	template <typename R, typename... P>
 	void GenBranch(R (*code)(P...), Condition cond = al)
 	{
diff --git a/core/windows/win_vmem.cpp b/core/windows/win_vmem.cpp
index d62d138cf..edb4b21cd 100644
--- a/core/windows/win_vmem.cpp
+++ b/core/windows/win_vmem.cpp
@@ -183,7 +183,7 @@ bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code
 
 	*code_area_rw = ptr_rw;
 	*rx_offset = (char*)ptr_rx - (char*)ptr_rw;
-	printf("Info: Using NO_RWX mode, rx ptr: %p, rw ptr: %p, offset: %p\n", ptr_rx, ptr_rw, *rx_offset);
+	printf("Info: Using NO_RWX mode, rx ptr: %p, rw ptr: %p, offset: %lu\n", ptr_rx, ptr_rw, (unsigned long)*rx_offset);
 
 	return (ptr_rw != NULL);
 }